(v0.7.1.9073) as.mo() self-learning algorithm

2025-08-25 07:02:10 +02:00 · 2019-09-15 22:57:30 +02:00
parent cd178ee569
commit 398c5bdc4f
31 changed files with 1030 additions and 2360 deletions
--- a/R/like.R
+++ b/R/like.R
@@ -21,7 +21,7 @@

 #' Pattern Matching
 #'
-#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors.
+#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive (use \code{a \%like_case\% b} for case-sensitive matching). Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors, or can both have the same length to iterate over all cases.
 #' @inheritParams base::grepl
 #' @return A \code{logical} vector
 #' @name like
@@ -53,14 +53,14 @@
 #'   left_join_microorganisms() %>%
 #'   filter(genus %like% '^ent') %>%
 #'   freq(genus, species)
-like <- function(x, pattern) {
+like <- function(x, pattern, ignore.case = TRUE) {
  if (length(pattern) > 1) {
    if (length(x) != length(pattern)) {
      if (length(x) == 1) {
        x <- rep(x, length(pattern))
      }
      # return TRUE for every 'x' that matches any 'pattern', FALSE otherwise
-      res <- sapply(pattern, function(pttrn) x %like% pttrn)
+      res <- sapply(pattern, function(pttrn) base::grepl(pttrn, x, ignore.case = ignore.case))
      res2 <- as.logical(rowSums(res))
      # get only first item of every hit in pattern
      res2[duplicated(res)] <- FALSE
@@ -71,9 +71,9 @@ like <- function(x, pattern) {
      res <- vector(length = length(pattern))
      for (i in 1:length(res)) {
        if (is.factor(x[i])) {
-          res[i] <- as.integer(x[i]) %in% base::grep(pattern[i], levels(x[i]), ignore.case = TRUE)
+          res[i] <- as.integer(x[i]) %in% base::grep(pattern[i], levels(x[i]), ignore.case = ignore.case)
        } else {
-          res[i] <- base::grepl(pattern[i], x[i], ignore.case = TRUE)
+          res[i] <- base::grepl(pattern[i], x[i], ignore.case = ignore.case)
        }
      }
      return(res)
@@ -82,16 +82,24 @@ like <- function(x, pattern) {

  # the regular way how grepl works; just one pattern against one or more x
  if (is.factor(x)) {
-    as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = TRUE)
+    as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = ignore.case)
  } else {
-    tryCatch(base::grepl(pattern, x, ignore.case = TRUE),
+    tryCatch(base::grepl(pattern, x, ignore.case = ignore.case),
             error = function(e) ifelse(test = grepl("Invalid regexp", e$message),
                                        # try with perl = TRUE:
-                                        yes = return(base::grepl(pattern, x, ignore.case = TRUE, perl = TRUE)),
+                                        yes = return(base::grepl(pattern, x, ignore.case = ignore.case, perl = TRUE)),
                                        no = stop(e$message)))
  }
 }

 #' @rdname like
 #' @export
-"%like%" <- like
+"%like%" <- function(x, pattern) {
+  like(x, pattern, ignore.case = TRUE)
+}
+
+#' @rdname like
+#' @export
+"%like_case%" <- function(x, pattern) {
+  like(x, pattern, ignore.case = FALSE)
+}
--- a/R/misc.R
+++ b/R/misc.R
@@ -41,8 +41,8 @@ percent <- function(x, round = 1, force_zero = FALSE, decimal.mark = getOption("
      big.mark <- " "
    }
  }
-  x <- percent_clean(x = x, round = round, force_zero = force_zero, 
-                     decimal.mark = decimal.mark, big.mark = big.mark, ...)
+  percent_clean(x = x, round = round, force_zero = force_zero, 
+                decimal.mark = decimal.mark, big.mark = big.mark, ...)
 }

 #' @importFrom crayon blue bold red
--- a/R/mo.R
+++ b/R/mo.R
--- a/R/mo2.R
+++ b/R/mo2.R
--- a/R/mo_history.R
+++ b/R/mo_history.R
@@ -19,116 +19,152 @@
 # Visit our website for more info: https://msberends.gitlab.io/AMR.    #
 # ==================================================================== #

-# print successful as.mo coercions to AMR environment
-#' @importFrom dplyr %>% distinct filter
-set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
-  # disable function for now
-  return(base::invisible())
+mo_history_file <- file.path(file.path(system.file(package = "AMR"), "mo_history"), "mo_history.csv")

-  # if (base::interactive() | force == TRUE) {
-  #   mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
-  #   df <- data.frame(x, mo, stringsAsFactors = FALSE) %>%
-  #     distinct(x, .keep_all = TRUE) %>%
-  #     filter(!is.na(x) & !is.na(mo))
-  #   if (nrow(df) == 0) {
-  #     return(base::invisible())
-  #   }
-  #   x <- toupper(df$x)
-  #   mo <- df$mo
-  #   for (i in 1:length(x)) {
-  #     # save package version too, as both the as.mo() algorithm and the reference data set may change
-  #     if (NROW(mo_hist[base::which(mo_hist$x == x[i] &
-  #                                  mo_hist$uncertainty_level >= uncertainty_level &
-  #                                  mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) {
-  #       tryCatch(
-  #         assign(x = "mo_history",
-  #                value = rbind(mo_hist,
-  #                              data.frame(
-  #                                x = x[i],
-  #                                mo = mo[i],
-  #                                uncertainty_level = uncertainty_level,
-  #                                package_v = base::as.character(utils::packageVersion("AMR")),
-  #                                stringsAsFactors = FALSE)),
-  #                envir = asNamespace("AMR")),
-  #         error = function(e) invisible())
-  #     }
-  #   }
-  # }
-  # return(base::invisible())
+# print successful as.mo coercions to a options entry
+#' @importFrom dplyr %>% distinct filter
+set_mo_history <- function(x, mo, uncertainty_level, force = FALSE, disable = FALSE) {
+  if (isTRUE(disable)) {
+    return(base::invisible())
+  }
+  
+  if (base::interactive() | force == TRUE) {
+    mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
+    df <- data.frame(x, mo, stringsAsFactors = FALSE) %>%
+      distinct(x, .keep_all = TRUE) %>%
+      filter(!is.na(x) & !is.na(mo))
+    if (nrow(df) == 0) {
+      return(base::invisible())
+    }
+    x <- toupper(df$x)
+    mo <- df$mo
+    for (i in 1:length(x)) {
+      # save package version too, as both the as.mo() algorithm and the reference data set may change
+      if (NROW(mo_hist[base::which(mo_hist$x == x[i] &
+                                   mo_hist$uncertainty_level >= uncertainty_level &
+                                   mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) {
+        # # Not using the file system:        
+        # tryCatch(options(mo_remembered_results = rbind(mo_hist,
+        #                                                data.frame(
+        #                                                  x = x[i],
+        #                                                  mo = mo[i],
+        #                                                  uncertainty_level = uncertainty_level,
+        #                                                  package_v = base::as.character(utils::packageVersion("AMR")),
+        #                                                  stringsAsFactors = FALSE))),
+        #          error = function(e) base::invisible())
+        # # don't remember more than 1,000 different input values
+        # if (tryCatch(nrow(getOption("mo_remembered_results")), error = function(e) 1001) > 1000) {
+        #   return(base::invisible())
+        # }
+        if (is.null(mo_hist)) {
+          message(blue(paste0("NOTE: results are saved to ", mo_history_file, ".")))
+        }
+        tryCatch(write.csv(rbind(mo_hist,
+                                 data.frame(
+                                   x = x[i],
+                                   mo = mo[i],
+                                   uncertainty_level = uncertainty_level,
+                                   package_v = base::as.character(utils::packageVersion("AMR")),
+                                   stringsAsFactors = FALSE)),
+                           file = mo_history_file, row.names = FALSE),
+                 error = function(e) base::invisible())
+      }
+    }
+  }
+  return(base::invisible())
 }

-get_mo_history <- function(x, uncertainty_level, force = FALSE) {
-  # disable function for now
-  return(NA)
+get_mo_history <- function(x, uncertainty_level, force = FALSE, disable = FALSE) {
+  if (isTRUE(disable)) {
+    return(to_class_mo(NA))
+  }

-  # history <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
-  # if (base::is.null(history)) {
-  #   NA
-  # } else {
-  #   data.frame(x = toupper(x), stringsAsFactors = FALSE) %>%
-  #     left_join(history, by = "x") %>%
-  #     pull(mo)
-  # }
+  history <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
+  if (base::is.null(history)) {
+    result <- NA
+  } else {
+    result <- data.frame(x = toupper(x), stringsAsFactors = FALSE) %>%
+      left_join(history, by = "x") %>%
+      pull(mo)
+  }
+  to_class_mo(result)
 }

 #' @importFrom dplyr %>% filter distinct
-read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE) {
-  # disable function for now
-  return(NULL)
+read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE, disable = FALSE) {
+  if (isTRUE(disable)) {
+    return(NULL)
+  }

-  # if ((!base::interactive() & force == FALSE)) {
-  #   return(NULL)
-  # }
-  # uncertainty_level_param <- uncertainty_level
-  #
-  # history <- tryCatch(get("mo_history", envir = asNamespace("AMR")),
-  #                       error = function(e) NULL)
-  # if (is.null(history)) {
-  #   return(NULL)
-  # }
-  # # Below: filter on current package version.
-  # # Even current fullnames may be replaced by new taxonomic names, so new versions of
-  # # the Catalogue of Life must not lead to data corruption.
-  #
-  # if (unfiltered == FALSE) {
-  #   history <- history %>%
-  #     filter(package_v == as.character(utils::packageVersion("AMR")),
-  #            # only take unknowns if uncertainty_level_param is higher
-  #            ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) |
-  #               (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>%
-  #     arrange(desc(uncertainty_level)) %>%
-  #     distinct(x, mo, .keep_all = TRUE)
-  # }
-  #
-  # if (nrow(history) == 0) {
-  #   NULL
-  # } else {
-  #   history
-  # }
+  if ((!base::interactive() & force == FALSE)) {
+    return(NULL)
+  }
+  uncertainty_level_param <- uncertainty_level
+  
+  # # Not using the file system:
+  # history <- tryCatch(getOption("mo_remembered_results"),
+  #                     error = function(e) NULL)
+  history <- tryCatch(read.csv(mo_history_file, stringsAsFactors = FALSE),
+                      warning = function(w) invisible(),
+                      error = function(e) NULL)
+  if (is.null(history)) {
+    return(NULL)
+  }
+  # Below: filter on current package version.
+  # Even current fullnames may be replaced by new taxonomic names, so new versions of
+  # the Catalogue of Life must not lead to data corruption.
+
+  if (unfiltered == FALSE) {
+    history <- history %>%
+      filter(package_v == as.character(utils::packageVersion("AMR")),
+             # only take unknowns if uncertainty_level_param is higher
+             ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) |
+                (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>%
+      arrange(desc(uncertainty_level)) %>%
+      distinct(x, mo, .keep_all = TRUE)
+  }
+
+  if (nrow(history) == 0) {
+    NULL
+  } else {
+    history
+  }
 }

-# @rdname as.mo
-# @importFrom crayon red
-# @importFrom utils menu
-# @export
-clean_mo_history <- function(...) {
-  # if (!is.null(read_mo_history())) {
-  #   if (interactive() & !isTRUE(list(...)$force)) {
-  #     q <- menu(title = paste("This will remove all",
-  #                             format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","),
-  #                             "microbial IDs determined previously in this session. Are you sure?"),
-  #               choices = c("Yes", "No"),
-  #               graphics = FALSE)
-  #     if (q != 1) {
-  #       return(invisible())
-  #     }
-  #   }
-  #   tryCatch(
-  #     assign(x = "mo_history",
-  #            value = NULL,
-  #            envir = asNamespace("AMR")),
-  #     error = function(e) invisible())
-  #   cat(red("History removed."))
-  # }
+#' @rdname as.mo
+#' @importFrom crayon red
+#' @importFrom utils menu
+#' @export
+clear_mo_history <- function(...) {
+  if (!is.null(read_mo_history())) {
+    if (interactive() & !isTRUE(list(...)$force)) {
+      q <- menu(title = paste("This will clear all",
+                              format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","),
+                              "previously determined microbial IDs. Are you sure?"),
+                choices = c("Yes", "No"),
+                graphics = FALSE)
+      if (q != 1) {
+        return(invisible())
+      }
+    }
+    # # Not using the file system:
+    # success <- tryCatch(options(mo_remembered_results = NULL),
+    #                     error = function(e) FALSE)
+    success <- create_blank_mo_history()
+    if (!isFALSE(success)) {
+      cat(red(paste("File", mo_history_file, "cleared.")))
+    }
+  }
 }

+create_blank_mo_history <- function() {
+  tryCatch(
+    write.csv(x = data.frame(x = character(0),
+                             mo = character(0),
+                             uncertainty_level = integer(0),
+                             package_v = character(0),
+                             stringsAsFactors = FALSE),
+              file = mo_history_file),
+    warning = function(w) invisible(),
+    error = function(e) TRUE)
+}
--- a/R/mo_property.R
+++ b/R/mo_property.R
@@ -408,7 +408,7 @@ mo_validate <- function(x, property, ...) {

  if (!"AMR" %in% base::.packages()) {
    require("AMR")
-    # check onLoad() in R/zzz.R: data tables are created there.
+    # check .onLoad() in R/zzz.R: data tables are created there.
  }

  # try to catch an error when inputting an invalid parameter
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -26,7 +26,8 @@

  # register data
  microorganisms.oldDT <- as.data.table(AMR::microorganisms.old)
-  microorganisms.oldDT$fullname_lower <- tolower(microorganisms.oldDT$fullname)
+  # for fullname_lower: keep only dots, letters, numbers, slashes, spaces and dashes
+  microorganisms.oldDT$fullname_lower <- gsub("[^.a-z0-9/ \\-]+", "", tolower(microorganisms.oldDT$fullname))
  setkey(microorganisms.oldDT, col_id, fullname)

  assign(x = "microorganismsDT",
@@ -81,7 +82,8 @@
 #' @importFrom data.table as.data.table setkey
 make_DT <- function() {
  microorganismsDT <- as.data.table(AMR::microorganisms)
-  microorganismsDT$fullname_lower <- tolower(microorganismsDT$fullname)
+  # for fullname_lower: keep only dots, letters, numbers, slashes, spaces and dashes
+  microorganismsDT$fullname_lower <- gsub("[^.a-z0-9/ \\-]+", "", tolower(microorganismsDT$fullname))
  setkey(microorganismsDT,
         prevalence,
         kingdom,