as.mo improvements

2026-01-11 11:54:34 +01:00 · 2019-02-25 15:52:32 +01:00
parent 0ec76cfa98
commit c506d2893b
13 changed files with 390 additions and 376 deletions
--- a/R/mo.R
+++ b/R/mo.R
@@ -97,7 +97,7 @@
 #' The artificial intelligence takes into account microbial prevalence of pathogens in humans. It uses three groups and every (sub)species is in the group it matches first. These groups are:
 #' \itemize{
 #'   \item{1 (most prevalent): class is Gammaproteobacteria \strong{or} genus is one of: \emph{Enterococcus}, \emph{Staphylococcus}, \emph{Streptococcus}.}
-#'   \item{2: phylum is one of: Proteobacteria, Firmicutes, Actinobacteria, Sarcomastigophora \strong{or} genus is one of: \emph{Aspergillus}, \emph{Bacteroides}, \emph{Candida}, \emph{Capnocytophaga}, \emph{Chryseobacterium}, \emph{Cryptococcus}, \emph{Elisabethkingia}, \emph{Flavobacterium}, \emph{Fusobacterium}, \emph{Giardia}, \emph{Leptotrichia}, \emph{Mycoplasma}, \emph{Prevotella}, \emph{Rhodotorula}, \emph{Treponema}, \emph{Trichophyton}.}
+#'   \item{2: phylum is one of: Proteobacteria, Firmicutes, Actinobacteria, Sarcomastigophora \strong{or} genus is one of: \emph{Aspergillus}, \emph{Bacteroides}, \emph{Candida}, \emph{Capnocytophaga}, \emph{Chryseobacterium}, \emph{Cryptococcus}, \emph{Elisabethkingia}, \emph{Flavobacterium}, \emph{Fusobacterium}, \emph{Giardia}, \emph{Leptotrichia}, \emph{Mycoplasma}, \emph{Prevotella}, \emph{Rhodotorula}, \emph{Treponema}, \emph{Trichophyton}, \emph{Ureaplasma}.}
 #'   \item{3 (least prevalent): all others.}
 #' }
 #'
@@ -167,10 +167,9 @@
 #' }
 as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, reference_df = get_mo_source()) {
  # will be checked for mo class in validation
-  mo <- mo_validate(x = x, property = "mo",
-                    Becker = Becker, Lancefield = Lancefield,
-                    allow_uncertain = allow_uncertain, reference_df = reference_df)
-  structure(.Data = mo, class = "mo")
+  mo_validate(x = x, property = "mo",
+              Becker = Becker, Lancefield = Lancefield,
+              allow_uncertain = allow_uncertain, reference_df = reference_df)
 }

 #' @rdname as.mo
@@ -229,7 +228,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
  x <- x[!is.na(x) & !is.null(x) & !identical(x, "")]

  # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life)
-  if (any(x %like% "^[BFP]_[A-Z]{3,7}")) {
+  if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) {
    leftpart <- gsub("^([BFP]_[A-Z]{3,7}).*", "\\1", x)
    if (any(leftpart %in% names(mo_codes_v0.5.0))) {
      rightpart <- gsub("^[BFP]_[A-Z]{3,7}(.*)", "\\1", x)
@@ -256,40 +255,52 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
  # all empty
  if (all(identical(trimws(x_input), "") | is.na(x_input))) {
    if (property == "mo") {
-      return(structure(rep(NA_character_, length(x_input)), class = "mo"))
+      return(structure(rep(NA_character_, length(x_input)),
+                       class = "mo"))
    } else {
      return(rep(NA_character_, length(x_input)))
    }

  } else if (all(x %in% reference_df[, 1])
-             & all(reference_df[, "mo"] %in% microorganismsDT[, "mo"][[1]])) {
+             & all(reference_df[, "mo"] %in% AMR::microorganisms$mo)) {
    # all in reference df
    colnames(reference_df)[1] <- "x"
    suppressWarnings(
      x <- data.frame(x = x, stringsAsFactors = FALSE) %>%
        left_join(reference_df, by = "x") %>%
-        left_join(microorganisms, by = "mo") %>%
+        left_join(AMR::microorganisms, by = "mo") %>%
        pull(property)
    )

-  } else if (all(x %in% microorganismsDT[, "mo"][[1]])) {
+  } else if (all(x %in% AMR::microorganisms$mo)) {
    # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL")
-    x <- microorganismsDT[data.table(mo = x), on = "mo", ..property][[1]]
+    y <- microorganismsDT[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]]
+    if (any(is.na(y))) {
+      y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(mo = x[is.na(y)]), on = "mo", ..property][[1]]
+    }
+    if (any(is.na(y))) {
+      y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(mo = x[is.na(y)]), on = "mo", ..property][[1]]
+    }
+    x <- y

-  } else if (all(x %in% microorganismsDT[prevalence == 1, "fullname"][[1]])) {
+  } else if (all(x %in% AMR::microorganisms$fullname)) {
    # we need special treatment for very prevalent full names, they are likely!
    # e.g. as.mo("Staphylococcus aureus")
-    x <- microorganismsDT[prevalence == 1][data.table(fullname = x), on = "fullname", ..property][[1]]
-  } else if (all(x %in% microorganismsDT[prevalence == 2, "fullname"][[1]])) {
-    # same for common full names, they are also likely
-    x <- microorganismsDT[prevalence == 2][data.table(fullname = x), on = "fullname", ..property][[1]]
+    y <- microorganismsDT[prevalence == 1][data.table(fullname = x), on = "fullname", ..property][[1]]
+    if (any(is.na(y))) {
+      y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(fullname = x[is.na(y)]), on = "fullname", ..property][[1]]
+    }
+    if (any(is.na(y))) {
+      y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(fullname = x[is.na(y)]), on = "fullname", ..property][[1]]
+    }
+    x <- y

-  } else if (all(toupper(x) %in% microorganisms.codes[, "code"])) {
+  } else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) {
    # commonly used MO codes
-    y <- as.data.table(microorganisms.codes)[data.table(code = toupper(x)), on = "code", ]
+    y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ]
    x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]]

-  } else if (!all(x %in% microorganismsDT[, ..property][[1]])) {
+  } else if (!all(x %in% AMR::microorganisms[, property])) {

    x_backup <- x

@@ -504,8 +515,8 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
      }

      # TRY OTHER SOURCES ----
-      if (toupper(x_backup[i]) %in% microorganisms.codes[, 1]) {
-        mo_found <- microorganisms.codes[toupper(x_backup[i]) == microorganisms.codes[, 1], "mo"][1L]
+      if (toupper(x_backup[i]) %in% AMR::microorganisms.codes[, 1]) {
+        mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L]
        if (length(mo_found) > 0) {
          x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L]
          next
--- a/R/mo_property.R
+++ b/R/mo_property.R
@@ -494,7 +494,7 @@ mo_validate <- function(x, property, ...) {
    # check onLoad() in R/zzz.R: data tables are created there.
  }

-  if (!all(x %in% microorganismsDT[, ..property][[1]])
+  if (!all(x %in% microorganisms[, property])
      | Becker %in% c(TRUE, "all")
      | Lancefield %in% c(TRUE, "all")) {
    exec_as.mo(x, property = property, ...)
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -30,6 +30,10 @@
    microorganisms.oldDT <- as.data.table(AMR::microorganisms.old)
    setkey(microorganisms.oldDT, col_id, fullname)

+    assign(x = "microorganisms",
+           value = make(),
+           envir = asNamespace("AMR"))
+
    assign(x = "microorganismsDT",
           value = make_DT(),
           envir = asNamespace("AMR"))
@@ -45,9 +49,8 @@
 }

 #' @importFrom dplyr mutate case_when
-#' @importFrom data.table as.data.table setkey
-make_DT <- function() {
-  microorganismsDT <- AMR::microorganisms %>%
+make <- function() {
+  AMR::microorganisms %>%
    mutate(prevalence = case_when(
      class == "Gammaproteobacteria"
      | genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
@@ -71,11 +74,16 @@ make_DT <- function() {
                     "Prevotella",
                     "Rhodotorula",
                     "Treponema",
-                     "Trichophyton")
+                     "Trichophyton",
+                     "Ureaplasma")
      ~ 2,
      TRUE ~ 3
-    )) %>%
-    as.data.table()
+    ))
+}
+
+#' @importFrom data.table as.data.table setkey
+make_DT <- function() {
+  microorganismsDT <- as.data.table(make())
  setkey(microorganismsDT,
         kingdom,
         prevalence,