new MOs, cleanup

2025-08-24 16:22:10 +02:00 · 2018-09-01 21:19:46 +02:00
parent 5965d3c794
commit 75fe4d401f
20 changed files with 166 additions and 179 deletions
--- a/R/atc.R
+++ b/R/atc.R
@@ -31,7 +31,7 @@
 #' In the ATC classification system, the active substances are classified in a hierarchy with five different levels.  The system has fourteen main anatomical/pharmacological groups or 1st levels. Each ATC main group is divided into 2nd levels which could be either pharmacological or therapeutic groups.  The 3rd and 4th levels are chemical, pharmacological or therapeutic subgroups and the 5th level is the chemical substance.  The 2nd, 3rd and 4th levels are often used to identify pharmacological subgroups when that is considered more appropriate than therapeutic or chemical subgroups.
 #'   Source: \url{https://www.whocc.no/atc/structure_and_principles/}
 #' @return Character (vector) with class \code{"act"}. Unknown values will return \code{NA}.
-#' @seealso \code{\link{antibiotics}} for the dataframe that is being used to determine ATC's.
+#' @seealso \code{\link{antibiotics}} for the dataframe that is being used to determine ATCs.
 #' @examples
 #' # These examples all return "J01FA01", the ATC code of Erythromycin:
 #' as.atc("J01FA01")
--- a/R/data.R
+++ b/R/data.R
@@ -16,10 +16,10 @@
 # GNU General Public License for more details.                         #
 # ==================================================================== #

-#' Dataset with 423 antibiotics
+#' Data set with 423 antibiotics
 #'
-#' A dataset containing all antibiotics with a J0 code and some other antimicrobial agents, with their DDD's. Except for trade names and abbreviations, all properties were downloaded from the WHO, see Source.
-#' @format A data.frame with 423 observations and 18 variables:
+#' A data set containing all antibiotics with a J0 code and some other antimicrobial agents, with their DDDs. Except for trade names and abbreviations, all properties were downloaded from the WHO, see Source.
+#' @format A \code{\link{tibble}} with 423 observations and 18 variables:
 #' \describe{
 #'   \item{\code{atc}}{ATC code, like \code{J01CR02}}
 #'   \item{\code{certe}}{Certe code, like \code{amcl}}
@@ -120,10 +120,10 @@
 #
 "antibiotics"

-#' Dataset with ~2650 microorganisms
+#' Data set with human pathogenic microorganisms
 #'
-#' A dataset containing 2,646 microorganisms. MO codes of the UMCG can be looked up using \code{\link{microorganisms.umcg}}.
-#' @format A data.frame with 2,646 observations and 12 variables:
+#' A data set containing 2,664 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}.
+#' @format A \code{\link{tibble}} with 2,664 observations and 12 variables:
 #' \describe{
 #'   \item{\code{mo}}{ID of microorganism}
 #'   \item{\code{bactsys}}{Bactsyscode of microorganism}
@@ -151,10 +151,10 @@
 #' @seealso \code{\link{guess_mo}} \code{\link{antibiotics}} \code{\link{microorganisms.umcg}}
 "microorganisms"

-#' Translation table for UMCG with ~1100 microorganisms
+#' Translation table for UMCG with ~1,100 microorganisms
 #'
-#' A dataset containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{mo}'s with \code{\link{guess_mo}}.
-#' @format A data.frame with 1090 observations and 2 variables:
+#' A data set containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{MO}s with \code{\link{guess_mo}}.
+#' @format A \code{\link{tibble}} with 1,090 observations and 2 variables:
 #' \describe{
 #'   \item{\code{umcg}}{Code of microorganism according to UMCG MMB}
 #'   \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
@@ -163,10 +163,10 @@
 #' @seealso \code{\link{guess_mo}} \code{\link{microorganisms}}
 "microorganisms.umcg"

-#' Dataset with 2000 blood culture isolates of septic patients
+#' Data set with 2000 blood culture isolates of septic patients
 #'
-#' An anonymised dataset containing 2000 microbial blood culture isolates with their full antibiograms found in septic patients in 4 different hospitals in the Netherlands, between 2001 and 2017. It is true, genuine data. This \code{data.frame} can be used to practice AMR analysis. For examples, press F1.
-#' @format A data.frame with 2000 observations and 49 variables:
+#' An anonymised data set containing 2,000 microbial blood culture isolates with their full antibiograms found in septic patients in 4 different hospitals in the Netherlands, between 2001 and 2017. It is true, genuine data. This \code{data.frame} can be used to practice AMR analysis. For examples, press F1.
+#' @format A \code{\link{tibble}} with 2,000 observations and 49 variables:
 #' \describe{
 #'   \item{\code{date}}{date of receipt at the laboratory}
 #'   \item{\code{hospital_id}}{ID of the hospital, from A to D}
@@ -185,13 +185,13 @@
 #' # PREPARATION #
 #' # ----------- #
 #'
-#' # Save this example dataset to an object, so we can edit it:
+#' # Save this example data set to an object, so we can edit it:
 #' my_data <- septic_patients
 #'
 #' # load the dplyr package to make data science A LOT easier
 #' library(dplyr)
 #'
-#' # Add first isolates to our dataset:
+#' # Add first isolates to our data set:
 #' my_data <- my_data %>%
 #'   mutate(first_isolates = first_isolate(my_data, "date", "patient_id", "mo"))
 #'
--- a/R/eucast.R
+++ b/R/eucast.R
@@ -280,8 +280,10 @@ EUCAST_rules <- function(tbl,
  }

  # join to microorganisms data set
+  col_mo_original <- NULL
  if (!tbl %>% pull(col_mo) %>% is.mo()) {
-    warning("Improve integrity of the `", col_mo, "` column by transforming it with 'as.mo'.")
+    col_mo_original <- tbl %>% pull(col_mo)
+    tbl[, col_mo] <- as.mo(tbl[, col_mo])
  }
  tbl <- tbl %>% left_join_microorganisms(by = col_mo, suffix = c("_tempmicroorganisms", ""))

@@ -685,6 +687,10 @@ EUCAST_rules <- function(tbl,
  tbl <- tbl %>% select(-c((tbl.ncol - microorganisms.ncol):tbl.ncol))
  # and remove added suffices
  colnames(tbl) <- gsub("_tempmicroorganisms", "", colnames(tbl))
+  # restore old col_mo values if needed
+  if (!is.null(col_mo_original)) {
+    tbl[, col_mo] <- col_mo_original
+  }

  if (info == TRUE) {
    cat('Done.\n\nEUCAST Expert rules applied to',
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -178,7 +178,7 @@ first_isolate <- function(tbl,

  if (!is.na(col_mo)) {
    if (!tbl %>% pull(col_mo) %>% is.mo()) {
-      warning("Improve integrity of the `", col_mo, "` column by transforming it with 'as.mo'.")
+      tbl[, col_mo] <- as.mo(tbl[, col_mo])
    }
    # join to microorganisms data set
    tbl <- tbl %>% left_join_microorganisms(by = col_mo)
@@ -311,7 +311,7 @@ first_isolate <- function(tbl,
    if (info == TRUE) {
      message('No isolates found.')
    }
-    # NA's where genus is unavailable
+    # NAs where genus is unavailable
    tbl <- tbl %>%
      mutate(real_first_isolate = if_else(genus == '', NA, FALSE))
    if (output_logical == FALSE) {
@@ -406,7 +406,7 @@ first_isolate <- function(tbl,
    all_first[which(all_first[, col_icu] == TRUE), 'real_first_isolate'] <- FALSE
  }

-  # NA's where genus is unavailable
+  # NAs where genus is unavailable
  all_first <- all_first %>%
    mutate(real_first_isolate = if_else(genus %in% c('', '(no MO)', NA), NA, real_first_isolate))

--- a/R/globals.R
+++ b/R/globals.R
@@ -16,61 +16,41 @@
 # GNU General Public License for more details.                         #
 # ==================================================================== #

-globalVariables(c('abname',
-                  'Antibiotic',
-                  'Interpretation',
-                  'Percentage',
-                  'bind_rows',
-                  'element_blank',
-                  'element_line',
-                  'theme',
-                  'theme_minimal',
-                  'antibiotic',
-                  'antibiotics',
-                  'atc',
-                  'bactid',
-                  'C_chisq_sim',
-                  'certe',
-                  'cnt',
-                  'count',
-                  'Count',
-                  'counts',
-                  'cum_count',
-                  'cum_percent',
-                  'date_lab',
-                  'days_diff',
-                  'fctlvl',
-                  'first_isolate_row_index',
-                  'Freq',
-                  'fullname',
-                  'genus',
-                  'gramstain',
-                  'item',
-                  'key_ab',
-                  'key_ab_lag',
-                  'key_ab_other',
-                  'labs',
-                  'median',
-                  'mic',
-                  'MIC',
-                  'microorganisms',
-                  'mocode',
-                  'n',
-                  'na.omit',
-                  'observations',
-                  'official',
-                  'other_pat_or_mo',
-                  'Pasted',
-                  'patient_id',
-                  'quantile',
-                  'R',
-                  'real_first_isolate',
-                  'S',
-                  'septic_patients',
-                  'species',
-                  'umcg',
-                  'value',
-                  'values',
-                  'View',
-                  'y',
-                  '.'))
+globalVariables(c(".",
+                  "antibiotic",
+                  "Antibiotic",
+                  "antibiotics",
+                  "cnt",
+                  "count",
+                  "Count",
+                  "cum_count",
+                  "cum_percent",
+                  "date_lab",
+                  "days_diff",
+                  "fctlvl",
+                  "first_isolate_row_index",
+                  "Freq",
+                  "genus",
+                  "gramstain",
+                  "Interpretation",
+                  "item",
+                  "key_ab",
+                  "key_ab_lag",
+                  "key_ab_other",
+                  "median",
+                  "mic",
+                  "microorganisms",
+                  "mo",
+                  "n",
+                  "observations",
+                  "other_pat_or_mo",
+                  "Pasted",
+                  "patient_id",
+                  "Percentage",
+                  "R",
+                  "real_first_isolate",
+                  "S",
+                  "septic_patients",
+                  "species",
+                  "value",
+                  "y"))
--- a/R/key_antibiotics.R
+++ b/R/key_antibiotics.R
@@ -140,6 +140,9 @@ key_antibiotics <- function(tbl,
                    GramNeg_4, GramNeg_5, GramNeg_6)
  gram_negative <- gram_negative[!is.na(gram_negative)]

+  if (!tbl %>% pull(col_mo) %>% is.mo()) {
+    tbl[, col_mo] <- as.mo(tbl[, col_mo])
+  }
  # join microorganisms
  tbl <- tbl %>% left_join_microorganisms(col_mo)

--- a/R/mo.R
+++ b/R/mo.R
@@ -91,7 +91,6 @@
 #' }
 as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {

-
  if (NCOL(x) == 2) {
    # support tidyverse selection like: df %>% select(colA, colB)
    # paste these columns together
@@ -131,74 +130,11 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
  x_species <- paste(x, 'species')
  # add start en stop regex
  x <- paste0('^', x, '$')
+  x_withspaces_all <- x_withspaces
  x_withspaces <- paste0('^', x_withspaces, '$')

  for (i in 1:length(x)) {

-    if (Becker == TRUE | Becker == "all") {
-      mo <- suppressWarnings(guess_mo(x_backup[i]))
-      if (mo %like% '^STA') {
-        # See Source. It's this figure:
-        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/
-        species <- left_join_microorganisms(mo)$species
-        if (species %in% c("arlettae", "auricularis", "capitis",
-                           "caprae", "carnosus", "cohnii", "condimenti",
-                           "devriesei", "epidermidis", "equorum",
-                           "fleurettii", "gallinarum", "haemolyticus",
-                           "hominis", "jettensis", "kloosii", "lentus",
-                           "lugdunensis", "massiliensis", "microti",
-                           "muscae", "nepalensis", "pasteuri", "petrasii",
-                           "pettenkoferi", "piscifermentans", "rostri",
-                           "saccharolyticus", "saprophyticus", "sciuri",
-                           "stepanovicii", "simulans", "succinus",
-                           "vitulinus", "warneri", "xylosus")) {
-          x[i] <- "STACNS"
-          next
-        } else if ((Becker == "all"  & species == "aureus")
-                   | species %in% c("simiae", "agnetis", "chromogenes",
-                                    "delphini", "felis", "lutrae",
-                                    "hyicus", "intermedius",
-                                    "pseudintermedius", "pseudointermedius",
-                                    "schleiferi")) {
-          x[i] <- "STACPS"
-          next
-        }
-      }
-    }
-
-    if (Lancefield == TRUE) {
-      mo <- suppressWarnings(guess_mo(x_backup[i]))
-      if (mo %like% '^STC') {
-        # See Source
-        species <- left_join_microorganisms(mo)$species
-        if (species == "pyogenes") {
-          x[i] <- "STCGRA"
-          next
-        }
-        if (species == "agalactiae") {
-          x[i] <- "STCGRB"
-          next
-        }
-        if (species %in% c("equisimilis", "equi",
-                           "zooepidemicus", "dysgalactiae")) {
-          x[i] <- "STCGRC"
-          next
-        }
-        if (species == "anginosus") {
-          x[i] <- "STCGRF"
-          next
-        }
-        if (species == "sanguis") {
-          x[i] <- "STCGRH"
-          next
-        }
-        if (species == "salivarius") {
-          x[i] <- "STCGRK"
-          next
-        }
-      }
-    }
-
    if (identical(x_trimmed[i], "")) {
      # empty values
      x[i] <- NA
@@ -206,12 +142,12 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      next
    }
    if (x_backup[i] %in% AMR::microorganisms$mo) {
-      # is already a valid mo
+      # is already a valid MO code
      x[i] <- x_backup[i]
      next
    }
    if (x_trimmed[i] %in% AMR::microorganisms$mo) {
-      # is already a valid mo
+      # is already a valid MO code
      x[i] <- x_trimmed[i]
      next
    }
@@ -303,6 +239,13 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      next
    }

+    # try fullname without start and stop regex, to also find subspecies, like "K. pneu rhino"
+    found <- MOs[which(gsub("[\\(\\)]", "", MOs$fullname) %like% x_withspaces_all[i]),]$mo
+    if (length(found) > 0) {
+      x[i] <- found[1L]
+      next
+    }
+
    # search for GLIMS code
    found <- AMR::microorganisms.umcg[which(toupper(AMR::microorganisms.umcg$umcg) == toupper(x_trimmed[i])),]$mo
    if (length(found) > 0) {
@@ -352,6 +295,57 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
            call. = FALSE)
  }

+  if (Becker == TRUE | Becker == "all") {
+    # See Source. It's this figure:
+    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/
+    CoNS <- MOs %>%
+      filter(genus == "Staphylococcus",
+             species %in% c("arlettae", "auricularis", "capitis",
+                            "caprae", "carnosus", "cohnii", "condimenti",
+                            "devriesei", "epidermidis", "equorum",
+                            "fleurettii", "gallinarum", "haemolyticus",
+                            "hominis", "jettensis", "kloosii", "lentus",
+                            "lugdunensis", "massiliensis", "microti",
+                            "muscae", "nepalensis", "pasteuri", "petrasii",
+                            "pettenkoferi", "piscifermentans", "rostri",
+                            "saccharolyticus", "saprophyticus", "sciuri",
+                            "stepanovicii", "simulans", "succinus",
+                            "vitulinus", "warneri", "xylosus")) %>%
+      pull(mo)
+    CoPS <- MOs %>%
+      filter(genus == "Staphylococcus",
+             species %in% c("simiae", "agnetis", "chromogenes",
+                            "delphini", "felis", "lutrae",
+                            "hyicus", "intermedius",
+                            "pseudintermedius", "pseudointermedius",
+                            "schleiferi")) %>%
+      pull(mo)
+    x[x %in% CoNS] <- "STACNS"
+    x[x %in% CoPS] <- "STACPS"
+    if (Becker == "all") {
+      x[x == "STAAUR"] <- "STACPS"
+    }
+  }
+
+  if (Lancefield == TRUE) {
+    # group A
+    x[x == "STCPYO"] <- "STCGRA" # S. pyogenes
+    # group B
+    x[x == "STCAGA"] <- "STCGRB" # S. agalactiae
+    # group C
+    S_groupC <- MOs %>% filter(genus == "Streptococcus",
+                               species %in% c("equisimilis", "equi",
+                                              "zooepidemicus", "dysgalactiae")) %>%
+      pull(mo)
+    x[x %in% S_groupC] <- "STCGRC" # S. agalactiae
+    # group F
+    x[x == "STCANG"] <- "STCGRF" # S. anginosus
+    # group H
+    x[x == "STCSAN"] <- "STCGRH" # S. sanguis
+    # group K
+    x[x == "STCSAL"] <- "STCGRK" # S. salivarius
+  }
+
  # left join the found results to the original input values (x_input)
  df_found <- data.frame(input = as.character(unique(x_input)),
                         found = x,