From b25f2d6213bef603810d492e1a909cf32f43d886 Mon Sep 17 00:00:00 2001
From: "Matthijs S. Berends" <m.s.berends@umcg.nl>
Date: Thu, 28 Mar 2019 21:33:28 +0100
Subject: [PATCH] v0.6.1

---
 DESCRIPTION                      |  4 +-
 NEWS.md                          |  6 +++
 R/eucast_rules.R                 | 76 +++++++++++++-------------------
 R/guess_ab_col.R                 |  6 +--
 R/mo.R                           |  6 ++-
 R/mo_history.R                   | 59 +++++++++++++------------
 man/as.mo.Rd                     |  6 ++-
 reproduction_of_antibiotics.R    | 58 ++++++++++++++++++++++++
 tests/testthat/test-mo_history.R |  2 +-
 vignettes/benchmarks.Rmd         |  2 +-
 10 files changed, 143 insertions(+), 82 deletions(-)
 create mode 100644 reproduction_of_antibiotics.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 598c0391c..71b8c4a23 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: AMR
-Version: 0.6.0
-Date: 2019-03-27
+Version: 0.6.1
+Date: 2019-03-28
 Title: Antimicrobial Resistance Analysis
 Authors@R: c(
     person(
diff --git a/NEWS.md b/NEWS.md
index ff88a3497..f83415d80 100755
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,9 @@
+# AMR 0.6.1
+
+#### Changed
+* Fixed a critical bug when using `eucast_rules()` with `verbose = TRUE`
+* Coercion of microbial IDs are now written to the package namespace instead of the user's home folder, to comply with the CRAN policy
+
 # AMR 0.6.0
 
 **New website!**
diff --git a/R/eucast_rules.R b/R/eucast_rules.R
index 4739fbea2..d6022f2e3 100755
--- a/R/eucast_rules.R
+++ b/R/eucast_rules.R
@@ -454,10 +454,10 @@ eucast_rules <- function(tbl,
           stop(e, call. = FALSE)
         }
       )
-      suppressMessages(
-        suppressWarnings(
-          tbl[rows, cols] <<- to
-      ))
+      # suppressMessages(
+      #   suppressWarnings(
+      #     tbl[rows, cols] <<- to
+      #   ))
 
       after <- as.character(unlist(as.list(tbl_original[rows, cols])))
 
@@ -489,46 +489,22 @@ eucast_rules <- function(tbl,
         number_newly_changed_to_R
 
       if (verbose == TRUE) {
-        for (r in 1:length(rows)) {
-          for (c in 1:length(cols)) {
-            old <- before_df[rows[r], cols[c]]
-            new <- tbl[rows[r], cols[c]]
-            if (!identical(old, new)) {
-              verbose_new <- data.frame(row = rows[r],
-                                        col = cols[c],
-                                        mo = tbl_original[rows[r], col_mo],
-                                        mo_fullname = "",
-                                        old = old,
-                                        new = new,
-                                        rule_source = strip_style(rule[1]),
-                                        rule_group = strip_style(rule[2]),
-                                        stringsAsFactors = FALSE)
-              verbose_info <<- rbind(verbose_info, verbose_new)
-            }
-          }
+        old <- as.data.frame(tbl_bak, stringsAsFactors = FALSE)[rows,]
+        new <- as.data.frame(tbl, stringsAsFactors = FALSE)[rows,]
+        MOs <- as.data.frame(tbl_original, stringsAsFactors = FALSE)[rows, col_mo][[1]]
+        for (i in 1:length(cols)) {
+          verbose_new <- data.frame(row = rows,
+                                    col = cols[i],
+                                    mo = MOs,
+                                    mo_fullname = "",
+                                    old = as.character(old[, cols[i]]),
+                                    new = as.character(new[, cols[i]]),
+                                    rule_source = strip_style(rule[1]),
+                                    rule_group = strip_style(rule[2]),
+                                    stringsAsFactors = FALSE)
+          colnames(verbose_new) <- c("row", "col", "mo", "mo_fullname", "old", "new", "rule_source", "rule_group")
+          verbose_info <<- rbind(verbose_info, verbose_new)
         }
-        # verbose_new <- data.frame(row = integer(0),
-        #                           col = character(0),
-        #                           old = character(0),
-        #                           new = character(0),
-        #                           rule_source = character(0),
-        #                           rule_group = character(0),
-        #                           stringsAsFactors = FALSE)
-        # a <<- rule
-        # for (i in 1:length(cols)) {
-        #   # add new row for every affected column
-        #   verbose_new <- data.frame(rule_type = strip_style(rule[1]),
-        #                             rule_set = strip_style(rule[2]),
-        #                             force_to = to,
-        #                             found = length(before),
-        #                             changed = sum(before != after, na.rm = TRUE),
-        #                             target_column = cols[i],
-        #                             stringsAsFactors = FALSE)
-        #   verbose_new$target_rows <- list(unname(rows))
-        #   rownames(verbose_new) <- NULL
-        #   verbose_info <<- rbind(verbose_info, verbose_new)
-        # }
-
       }
     }
   }
@@ -543,6 +519,7 @@ eucast_rules <- function(tbl,
 
   # save original table
   tbl_original <- tbl
+  tbl_bak <- tbl
 
   # join to microorganisms data set
   tbl <- tbl %>%
@@ -1886,9 +1863,9 @@ eucast_rules <- function(tbl,
       format(x, big.mark = big.mark, decimal.mark = decimal.mark)
     }
     cat(bold(paste('\n=> EUCAST rules', paste0(wouldve, 'affected'),
-             number_affected_rows %>% length() %>% formatnr(),
-             'out of', nrow(tbl_original) %>% formatnr(),
-             'rows\n')))
+                   number_affected_rows %>% length() %>% formatnr(),
+                   'out of', nrow(tbl_original) %>% formatnr(),
+                   'rows\n')))
     total_added <- number_added_S + number_added_I + number_added_R
     total_changed <- number_changed_to_S + number_changed_to_I + number_changed_to_R
     cat(colour(paste0("   -> ", wouldve, "added ",
@@ -1905,6 +1882,9 @@ eucast_rules <- function(tbl,
                                formatnr(number_changed_to_I), " to I; ",
                                formatnr(number_changed_to_R), " to R)"),
                       "\n")))
+    if (verbose == FALSE) {
+      cat(paste("Use", bold("verbose = TRUE"), "to get a data.frame with all specified edits.\n"))
+    }
   }
 
   if (verbose == TRUE) {
@@ -1913,6 +1893,9 @@ eucast_rules <- function(tbl,
         verbose_info$mo_fullname <- mo_fullname(verbose_info$mo)
       )
     )
+    verbose_info <- verbose_info %>%
+      filter(!is.na(new) & !identical(old, new)) %>%
+      arrange(row)
     return(verbose_info)
   }
 
@@ -1932,3 +1915,4 @@ interpretive_reading <- function(...) {
   .Deprecated("eucast_rules")
   eucast_rules(...)
 }
+
diff --git a/R/guess_ab_col.R b/R/guess_ab_col.R
index 1c3e3b290..a94a971a1 100755
--- a/R/guess_ab_col.R
+++ b/R/guess_ab_col.R
@@ -54,7 +54,7 @@ guess_ab_col <- function(tbl = NULL, col = NULL, verbose = FALSE) {
   if (is.null(tbl) & is.null(col)) {
     return(as.name("guess_ab_col"))
   }
-  #stop("This function should not be called directly.")
+
   if (length(col) > 1) {
     warning("argument 'col' has length > 1 and only the first element will be used")
     col <- col[1]
@@ -114,7 +114,7 @@ guess_ab_col <- function(tbl = NULL, col = NULL, verbose = FALSE) {
 
   if (length(ab_result) == 0) {
     if (verbose == TRUE) {
-      message('no result found for col "', col, '"')
+      message('no column found for input "', col, '"')
     }
     return(NULL)
   } else {
@@ -124,7 +124,7 @@ guess_ab_col <- function(tbl = NULL, col = NULL, verbose = FALSE) {
     }
     if (length(result) == 0) {
       if (verbose == TRUE) {
-        message('no result found for col "', col, '"')
+        message('no column found for input "', col, '"')
       }
       return(NULL)
     }
diff --git a/R/mo.R b/R/mo.R
index b65baff98..b30aed529 100755
--- a/R/mo.R
+++ b/R/mo.R
@@ -61,7 +61,11 @@
 #' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}).
 #'
 #' \strong{Self-learning algoritm} \cr
-#' The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 80-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}.
+#' The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge.
+#'
+#' Usually, any guess after the first try runs 80-95\% faster than the first try.
+#'
+#' For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a next version.
 #'
 #' \strong{Intelligent rules} \cr
 #' This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:
diff --git a/R/mo_history.R b/R/mo_history.R
index 9217eab9a..b17cbd76d 100644
--- a/R/mo_history.R
+++ b/R/mo_history.R
@@ -19,10 +19,9 @@
 # Visit our website for more info: https://msberends.gitab.io/AMR.     #
 # ==================================================================== #
 
-# print successful as.mo coercions to file, not uncertain ones
+# print successful as.mo coercions to AMR environment
 #' @importFrom dplyr %>% distinct filter
 set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
-  file_location <- base::path.expand('~/.Rhistory_mo')
   if (base::interactive() | force == TRUE) {
     mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
     df <- data.frame(x, mo, stringsAsFactors = FALSE) %>%
@@ -37,12 +36,17 @@ set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
       # save package version too, as both the as.mo() algorithm and the reference data set may change
       if (NROW(mo_hist[base::which(mo_hist$x == x[i] &
                                    mo_hist$uncertainty_level >= uncertainty_level &
-                                   mo_hist$package_version == utils::packageVersion("AMR")),]) == 0) {
-        base::write(x = c(x[i], mo[i], uncertainty_level, base::as.character(utils::packageVersion("AMR"))),
-                    file = file_location,
-                    ncolumns = 4,
-                    append = TRUE,
-                    sep = "\t")
+                                   mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) {
+        assign(x = "mo_history",
+               value = rbind(mo_hist,
+                             data.frame(
+                               x = x[i],
+                               mo = mo[i],
+                               uncertainty_level = uncertainty_level,
+                               package_v = base::as.character(utils::packageVersion("AMR")),
+                               stringsAsFactors = FALSE)
+                             ),
+               envir = asNamespace("AMR"))
       }
     }
   }
@@ -50,35 +54,35 @@ set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
 }
 
 get_mo_history <- function(x, uncertainty_level, force = FALSE) {
-  file_read <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
-  if (base::is.null(file_read)) {
+  history <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
+  if (base::is.null(history)) {
     NA
   } else {
     data.frame(x = toupper(x), stringsAsFactors = FALSE) %>%
-      left_join(file_read, by = "x") %>%
+      left_join(history, by = "x") %>%
       pull(mo)
   }
 }
 
 #' @importFrom dplyr %>% filter distinct
 read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE) {
-  file_location <- base::path.expand('~/.Rhistory_mo')
-  if (!base::file.exists(file_location) | (!base::interactive() & force == FALSE)) {
+  if ((!base::interactive() & force == FALSE)) {
     return(NULL)
   }
   uncertainty_level_param <- uncertainty_level
-  file_read <- utils::read.table(file = file_location,
-                                 header = FALSE,
-                                 sep = "\t",
-                                 col.names = c("x", "mo", "uncertainty_level", "package_version"),
-                                 stringsAsFactors = FALSE)
+
+  history <- tryCatch(get("mo_history", envir = asNamespace("AMR")),
+                        error = function(e) NULL)
+  if (is.null(history)) {
+    return(NULL)
+  }
   # Below: filter on current package version.
   # Even current fullnames may be replaced by new taxonomic names, so new versions of
   # the Catalogue of Life must not lead to data corruption.
 
   if (unfiltered == FALSE) {
-    file_read <- file_read %>%
-      filter(package_version == utils::packageVersion("AMR"),
+    history <- history %>%
+      filter(package_v == as.character(utils::packageVersion("AMR")),
              # only take unknowns if uncertainty_level_param is higher
              ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) |
                 (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>%
@@ -86,10 +90,10 @@ read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = F
       distinct(x, mo, .keep_all = TRUE)
   }
 
-  if (nrow(file_read) == 0) {
+  if (nrow(history) == 0) {
     NULL
   } else {
-    file_read
+    history
   }
 }
 
@@ -98,20 +102,21 @@ read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = F
 #' @importFrom utils menu
 #' @export
 clean_mo_history <- function(...) {
-  file_location <- base::path.expand('~/.Rhistory_mo')
-  if (file.exists(file_location)) {
+  if (!is.null(read_mo_history())) {
     if (interactive() & !isTRUE(list(...)$force)) {
       q <- menu(title = paste("This will remove all",
                               format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","),
-                              "previously determined microbial IDs. Are you sure?"),
+                              "microbial IDs determined previously in this session. Are you sure?"),
                 choices = c("Yes", "No"),
                 graphics = FALSE)
       if (q != 1) {
         return(invisible())
       }
     }
-    unlink(file_location)
-    cat(red("File", file_location, "removed."))
+    assign(x = "mo_history",
+           value = NULL,
+           envir = asNamespace("AMR"))
+    cat(red("History removed."))
   }
 }
 
diff --git a/man/as.mo.Rd b/man/as.mo.Rd
index 69fc3cf6a..56fb4f437 100644
--- a/man/as.mo.Rd
+++ b/man/as.mo.Rd
@@ -72,7 +72,11 @@ Use the \code{\link{mo_property}_*} functions to get properties based on the ret
 The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}).
 
 \strong{Self-learning algoritm} \cr
-The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 80-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}.
+The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge.
+
+Usually, any guess after the first try runs 80-95\% faster than the first try.
+
+For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a next version.
 
 \strong{Intelligent rules} \cr
 This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:
diff --git a/reproduction_of_antibiotics.R b/reproduction_of_antibiotics.R
new file mode 100644
index 000000000..b1a657145
--- /dev/null
+++ b/reproduction_of_antibiotics.R
@@ -0,0 +1,58 @@
+
+# WORK IN PROGRESS --------------------------------------------------------
+
+# vector with official names, return vector with CIDs
+get_CID <- function(ab) {
+  CID <- rep(NA_integer_, length(ab))
+  p <- progress_estimated(n = length(ab), min_time = 0)
+  for (i in 1:length(ab)) {
+    p$tick()$print()
+
+    CID[i] <- tryCatch(
+      data.table::fread(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
+                               ab[i],
+                               "/cids/TXT?name_type=complete"),
+                        showProgress = FALSE)[[1]][1],
+      error = function(e) NA_integer_)
+    Sys.sleep(0.2)
+  }
+  CID
+}
+
+# returns vector with synonyms (brand names) for a single CID
+get_synonyms <- function(CID, clean = TRUE) {
+  p <- progress_estimated(n = length(CID), min_time = 0)
+  p$tick()$print()
+
+  synonyms_txt <- tryCatch(
+    data.table::fread(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastidentity/cid/",
+                                           CID,
+                                           "/synonyms/TXT"),
+                                    sep = "\n",
+                                    showProgress = FALSE)[[1]],
+    error = function(e) NA_character_)
+
+  if (clean == TRUE) {
+    # remove txt between brackets
+    synonyms_txt <- trimws(gsub("[(].*[)]", "", gsub("[[].*[]]", "", synonyms_txt)))
+    # only length 6 to 20 and no txt with reading marks or numbers
+    synonyms_txt <- synonyms_txt[nchar(synonyms_txt) %in% c(6:20)
+                                 & !synonyms_txt %like% "[-&{},_0-9]"]
+    synonyms_txt <- unlist(strsplit(synonyms_txt,  ";", fixed = TRUE))
+  }
+  synonyms_txt <- synonyms_txt[tolower(synonyms_txt) %in% unique(tolower(synonyms_txt))]
+  sort(synonyms_txt)
+}
+
+CIDs <- get_CID(antibiotics$official)
+synonyms <- character(length(CIDs))
+p <- progress_estimated(n = length(synonyms), min_time = 0)
+for (i in 365:length(synonyms)) {
+  #p$tick()$print()
+  if (!is.na(CIDs[i])) {
+    synonyms[i] <- paste(get_synonyms(CIDs[i]), collapse = "|")
+  }
+}
+
+antibiotics$cid <- CIDs
+antibiotics$trade_name <- synonyms
diff --git a/tests/testthat/test-mo_history.R b/tests/testthat/test-mo_history.R
index bf93955f1..486959b41 100644
--- a/tests/testthat/test-mo_history.R
+++ b/tests/testthat/test-mo_history.R
@@ -40,5 +40,5 @@ test_that("mo_history works", {
   expect_equal(as.character(as.mo("testsubject", force_mo_history = TRUE)), "B_ESCHR_COL")
 
   expect_equal(colnames(read_mo_history(force = TRUE)),
-               c("x", "mo", "uncertainty_level", "package_version"))
+               c("x", "mo", "uncertainty_level", "package_v"))
 })
diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd
index d0b20e657..24e07e4e6 100755
--- a/vignettes/benchmarks.Rmd
+++ b/vignettes/benchmarks.Rmd
@@ -103,7 +103,7 @@ boxplot(microbenchmark(
         main = "Benchmarks per prevalence")
 ```
 
-The highest outliers are the first times. All next determinations were done in only thousands of seconds.
+The highest outliers are the first times. All next determinations were done in only thousands of seconds. For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a next version.
 
 Still, uncommon microorganisms take a lot more time than common microorganisms, especially the first time. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetitive results** and **already precalculated results**.