set_mo_source

2025-11-25 00:51:09 +01:00 · 2019-01-21 15:53:01 +01:00
parent c6c3088e9f
commit 46dcc7e2e8
36 changed files with 1134 additions and 261 deletions
--- a/R/data.R
+++ b/R/data.R
@@ -148,7 +148,7 @@
 #' }
 #' @source [3] Integrated Taxonomic Information System (ITIS) on-line database, \url{https://www.itis.gov}.
 #' @inheritSection AMR Read more on our website!
-#' @seealso \code{\link{as.mo}} \code{\link{mo_property}} \code{\link{microorganisms.umcg}}
+#' @seealso \code{\link{as.mo}} \code{\link{mo_property}} \code{\link{microorganisms.codes}}
 "microorganisms"

 #' Data set with old taxonomic data from ITIS
@@ -167,29 +167,17 @@
 #' @seealso \code{\link{as.mo}} \code{\link{mo_property}} \code{\link{microorganisms}}
 "microorganisms.old"

-#' Translation table for UMCG
+#' Translation table for microorganism codes
 #'
-#' A data set containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{MO}s with \code{\link{guess_mo}}.
-#' @format A \code{\link{data.frame}} with 1,095 observations and 2 variables:
+#' A data set containing commonly used codes for microorganisms. Define your own with \code{\link{set_mo_source}}.
+#' @format A \code{\link{data.frame}} with 3,303 observations and 2 variables:
 #' \describe{
-#'   \item{\code{umcg}}{Code of microorganism according to UMCG MMB}
-#'   \item{\code{certe}}{Code of microorganism according to Certe MMB}
-#' }
-#' @inheritSection AMR Read more on our website!
-#' @seealso \code{\link{as.mo}} \code{\link{microorganisms.certe}} \code{\link{microorganisms}}
-"microorganisms.umcg"
-
-#' Translation table for Certe
-#'
-#' A data set containing all bacteria codes of Certe MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{MO}s with \code{\link{guess_mo}}.
-#' @format A \code{\link{data.frame}} with 2,665 observations and 2 variables:
-#' \describe{
-#'   \item{\code{certe}}{Code of microorganism according to Certe MMB}
+#'   \item{\code{certe}}{Commonly used code of a microorganism}
 #'   \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
 #' }
 #' @inheritSection AMR Read more on our website!
 #' @seealso \code{\link{as.mo}} \code{\link{microorganisms}}
-"microorganisms.certe"
+"microorganisms.codes"

 #' Data set with 2000 blood culture isolates of septic patients
 #'
--- a/R/globals.R
+++ b/R/globals.R
@@ -54,11 +54,10 @@ globalVariables(c(".",
                  "median",
                  "mic",
                  "microorganisms",
-                  "microorganisms.certe",
+                  "microorganisms.codes",
                  "microorganisms.old",
                  "microorganisms.oldDT",
                  "microorganisms.prevDT",
-                  "microorganisms.umcg",
                  "microorganisms.unprevDT",
                  "microorganismsDT",
                  "mo",
--- a/R/mo.R
+++ b/R/mo.R
@@ -30,7 +30,7 @@
 #'
 #'   This excludes \emph{Enterococci} at default (who are in group D), use \code{Lancefield = "all"} to also categorise all \emph{Enterococci} as group D.
 #' @param allow_uncertain a logical to indicate whether the input should be checked for less possible results, see Details
-#' @param reference_df a \code{data.frame} to use for extra reference when translating \code{x} to a valid \code{mo}. The first column can be any microbial name, code or ID (used in your analysis or organisation), the second column must be a valid \code{mo} as found in the \code{\link{microorganisms}} data set.
+#' @param reference_df a \code{data.frame} to use for extra reference when translating \code{x} to a valid \code{mo}. See \code{\link{set_mo_source}} and \code{\link{get_mo_source}} to automate the usage of your own codes (e.g. used in your analysis or organisation).
 #' @rdname as.mo
 #' @aliases mo
 #' @keywords mo Becker becker Lancefield lancefield guess
@@ -139,7 +139,7 @@
 #' df <- df %>%
 #'   mutate(mo = as.mo(paste(genus, species)))
 #' }
-as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, reference_df = NULL) {
+as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, reference_df = get_mo_source()) {
  mo <- mo_validate(x = x, property = "mo",
                    Becker = Becker, Lancefield = Lancefield,
                    allow_uncertain = allow_uncertain, reference_df = reference_df)
@@ -152,11 +152,11 @@ is.mo <- function(x) {
  identical(class(x), "mo")
 }

-#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated
+#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated filter
 #' @importFrom data.table data.table as.data.table setkey
 #' @importFrom crayon magenta red italic
 exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
-                       allow_uncertain = TRUE, reference_df = NULL,
+                       allow_uncertain = TRUE, reference_df = get_mo_source(),
                       property = "mo", clear_options = TRUE) {

  if (!"AMR" %in% base::.packages()) {
@@ -206,11 +206,16 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
    if (!is.data.frame(reference_df) | NCOL(reference_df) < 2) {
      stop('`reference_df` must be a data.frame with at least two columns.', call. = FALSE)
    }
-    # remove factors, just keep characters
+    if (!"mo" %in% colnames(reference_df)) {
+      stop("`reference_df` must contain a column `mo` with values from the 'microorganisms' data set.", call. = FALSE)
+    }
+    reference_df <- reference_df %>% filter(!is.na(mo))
+    # # remove factors, just keep characters
    suppressWarnings(
      reference_df[] <- lapply(reference_df, as.character)
    )
  }
+
  if (all(identical(trimws(x_input), "") | is.na(x_input))) {
    # all empty
    if (property == "mo") {
@@ -220,24 +225,25 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
    } else {
      return(rep(NA_character_, length(x_input)))
    }
-  } else if (all(x %in% microorganismsDT[["mo"]])) {
-    # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL")
-    x <- microorganismsDT[data.table(mo = x), on = "mo", ..property][[1]]
-  } else if (!is.null(reference_df)
-             & all(x %in% reference_df[, 1])
-             & all(reference_df[, 2] %in% microorganismsDT[["mo"]])) {
-    # manually defined reference
+
+  } else if (all(x %in% reference_df[, 1])
+             & all(reference_df[, "mo"] %in% microorganismsDT[["mo"]])) {
+    # all in reference df
    colnames(reference_df)[1] <- "x"
-    colnames(reference_df)[2] <- "mo"
    suppressWarnings(
      x <- data.frame(x = x, stringsAsFactors = FALSE) %>%
        left_join(reference_df, by = "x") %>%
        left_join(microorganisms, by = "mo") %>%
        pull(property)
    )
-  } else if (all(toupper(x) %in% microorganisms.certe[, "certe"])) {
-    # old Certe codes
-    y <- as.data.table(microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ]
+
+  } else if (all(x %in% microorganismsDT[["mo"]])) {
+    # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL")
+    x <- microorganismsDT[data.table(mo = x), on = "mo", ..property][[1]]
+
+  } else if (all(toupper(x) %in% microorganisms.codes[, "code"])) {
+    # commonly used MO codes
+    y <- as.data.table(microorganisms.codes)[data.table(code = toupper(x)), on = "code", ]
    x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]]

  } else if (!all(x %in% microorganismsDT[[property]])) {
@@ -419,28 +425,16 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
      }

      # TRY OTHER SOURCES ----
-      if (toupper(x_backup[i]) %in% microorganisms.certe[, 1]) {
-        mo_found <- microorganisms.certe[toupper(x_backup[i]) == microorganisms.certe[, 1], 2][1L]
+      if (toupper(x_backup[i]) %in% microorganisms.codes[, 1]) {
+        mo_found <- microorganisms.codes[toupper(x_backup[i]) == microorganisms.codes[, 1], "mo"][1L]
        if (length(mo_found) > 0) {
          x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L]
          next
        }
      }
-      if (x_backup[i] %in% microorganisms.umcg[, 1]) {
-        mo_umcg <- microorganisms.umcg[microorganisms.umcg[, 1] == x_backup[i], 2]
-        mo_found <- microorganisms.certe[microorganisms.certe[, 1] == mo_umcg, 2][1L]
-        if (length(mo_found) == 0) {
-          # not found
-          x[i] <- NA_character_
-          failures <- c(failures, x_backup[i])
-        } else {
-          x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L]
-        }
-        next
-      }
      if (!is.null(reference_df)) {
        if (x_backup[i] %in% reference_df[, 1]) {
-          ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2]
+          ref_mo <- reference_df[reference_df[, 1] == x_backup[i], "mo"]
          if (ref_mo %in% microorganismsDT[, mo]) {
            x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L]
            next
--- a/R/mo_source.R
+++ b/R/mo_source.R
@@ -0,0 +1,174 @@
+# ==================================================================== #
+# TITLE                                                                #
+# Antimicrobial Resistance (AMR) Analysis                              #
+#                                                                      #
+# SOURCE                                                               #
+# https://gitlab.com/msberends/AMR                                     #
+#                                                                      #
+# LICENCE                                                              #
+# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl)  #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+#                                                                      #
+# This R package was created for academic research and was publicly    #
+# released in the hope that it will be useful, but it comes WITHOUT    #
+# ANY WARRANTY OR LIABILITY.                                           #
+# Visit our website for more info: https://msberends.gitab.io/AMR.     #
+# ==================================================================== #
+
+#' Use predefined reference data set
+#'
+#' These functions can be used to predefine your own reference to be used in \code{\link{as.mo}} and consequently all \code{mo_*} functions like \code{\link{mo_genus}} and \code{\link{mo_gramstain}}.
+#' @param path location of your reference file, see Details
+#' @rdname mo_source
+#' @name mo_source
+#' @aliases set_mo_source get_mo_source
+#' @details The reference file can be a text file seperated with commas (CSV) or pipes, an Excel file (old 'xls' format or new 'xlsx' format) or an R object file (extension '.rds'). To use an Excel file, you need to have the \code{readxl} package installed.
+#'
+#' \code{set_mo_source} will check the file for validity: it must be a \code{data.frame}, must have a column named \code{"mo"} which contains values from \code{microorganisms$mo} and must have a reference column with your own defined values. If all tests pass, \code{set_mo_source} will read the file into R and export it to \code{"~/.mo_source.rds"}. This compressed data file will then be used at default for MO determination (function \code{\link{as.mo}} and consequently all \code{mo_*} functions like \code{\link{mo_genus}} and \code{\link{mo_gramstain}}). The location of the original file will be saved as option with \code{\link{options}(mo_source = path)}. Its timestamp will be saved with \code{\link{options}(mo_source_datetime = ...)}.
+#'
+#' \code{get_mo_source} will return the data set by reading \code{"~/.mo_source.rds"} with \code{\link{readRDS}}. If the original file has changed (the file defined with \code{path}), it will call \code{set_mo_source} to update the data file automatically.
+#'
+#' Reading an Excel file (\code{.xlsx}) with only one row has a size of 8-9 kB. The compressed file will have a size of 0.1 kB and can be read by \code{get_mo_source} in only a couple of microseconds (a millionth of a second).
+#' @importFrom dplyr select everything
+#' @export
+#' @inheritSection AMR Read more on our website!
+#' @examples
+#' \dontrun{
+#'
+#' # imagine this Excel file (mo codes looked up in `microorganisms` data set):
+#' #            A                B
+#' # 1  our code            mo
+#' # 2  lab_mo_ecoli        B_ESCHR_COL
+#' # 3  lab_mo_kpneumoniae  B_KLBSL_PNE
+#'
+#' # 1. We save it as 'home/me/ourcodes.xlsx'
+#'
+#' # 2. We use it for input:
+#' set_mo_source("C:\path\ourcodes.xlsx")
+#' #> Created mo_source file '~/.mo_source.rds' from 'home/me/ourcodes.xlsx'.
+#'
+#' # 3. And use it in our functions:
+#' as.mo("lab_mo_ecoli")
+#' #> B_ESCHR_COL
+#'
+#' mo_genus("lab_mo_kpneumoniae")
+#' #> "Klebsiella"
+#'
+#' # 4. It will look for changes itself:
+#' # (add new row to the Excel file and save it)
+#'
+#' mo_genus("lab_mo_kpneumoniae")
+#' #> Updated mo_source file '~/.mo_source.rds' from 'home/me/ourcodes.xlsx'.
+#' #> "Klebsiella"
+#' }
+set_mo_source <- function(path) {
+
+  if (!is.character(path) | length(path) > 1) {
+    stop("`path` must be a character of length 1.")
+  }
+
+  if (path == "") {
+    options(mo_source = NULL)
+    options(mo_source_timestamp = NULL)
+    if (file.exists("~/.mo_source.rds")) {
+      unlink("~/.mo_source.rds")
+      message("Removed mo_source file '~/.mo_source.rds'.")
+    }
+    return(invisible())
+  }
+
+  if (!file.exists(path)) {
+    stop("File not found: ", path)
+  }
+
+  is_valid <- function(df) {
+    valid <- TRUE
+    if (!is.data.frame(df)) {
+      valid <- FALSE
+    } else if (!"mo" %in% colnames(df)) {
+      valid <- FALSE
+    } else if (!all(df$mo %in% AMR::microorganisms$mo)) {
+      valid <- FALSE
+    } else if (NCOL(df) < 2) {
+      valid <- FALSE
+    }
+    valid
+  }
+
+  if (path %like% '[.]rds$') {
+    df <- readRDS(path)
+
+  } else if (path %like% '[.]xlsx?$') {
+    # is Excel file (old or new)
+    if (!"readxl" %in% utils::installed.packages()) {
+      stop("Install the 'readxl' package first.")
+    }
+    if (path %like% '[.]xlsx$') {
+      df <- readxl::read_xlsx(path)
+    } else {
+      df <- readxl::read_xls(path)
+    }
+
+  } else {
+    # try comma first
+    try(
+      df <- utils::read.table(header = TRUE, sep = ",", stringsAsFactors = FALSE),
+      silent = TRUE)
+    if (!is_valid(df)) {
+      # try pipe
+      try(
+        df <- utils::read.table(header = TRUE, sep = "|", stringsAsFactors = FALSE),
+        silent = TRUE)
+    }
+  }
+
+  if (!is_valid(df)) {
+    stop("File must contain a column with self-defined values and a reference column `mo` with valid values from the `microorganisms` data set.")
+  }
+
+  if (colnames(df)[1] == "mo") {
+    # put mo to the end
+    df <- df %>% select(-"mo", everything(), "mo")
+  }
+
+  df <- as.data.frame(df, stringAsFactors = FALSE)
+
+  # success
+  if (file.exists("~/.mo_source.rds")) {
+    action <- "Updated"
+  } else {
+    action <- "Created"
+  }
+  saveRDS(df, "~/.mo_source.rds")
+  options(mo_source = path)
+  options(mo_source_timestamp = as.character(file.info(path)$mtime))
+  message(action, " mo_source file '~/.mo_source.rds' from '", path, "'.")
+}
+
+#' @rdname mo_source
+#' @export
+get_mo_source <- function() {
+
+  if (is.null(getOption("mo_source", NULL))) {
+    return(NULL)
+  } else {
+    old_time <- as.POSIXct(getOption("mo_source_timestamp"))
+    new_time <- as.POSIXct(as.character(file.info(getOption("mo_source", ""))$mtime))
+
+    if (is.na(new_time)) {
+      # source file was deleted, remove reference too
+      set_mo_source("")
+      return(NULL)
+    }
+    if (new_time != old_time) {
+      # set updated source
+      set_mo_source(getOption("mo_source"))
+    }
+  }
+
+  readRDS("~/.mo_source.rds")
+}