support A. species for as.mo, cleanup

2026-01-28 23:53:51 +01:00 · 2018-11-24 20:25:09 +01:00
parent 63ba4b2980
commit 9ddf6dc530
14 changed files with 126 additions and 93 deletions
--- a/4
+++ b/4
@@ -1,6 +1,6 @@
 Package: AMR
-Version: 0.4.0.9012
-Date: 2018-11-17
+Version: 0.4.0.9013
+Date: 2018-11-24
 Title: Antimicrobial Resistance Analysis
 Authors@R: c(
    person(
--- a/NEWS.md
+++ b/NEWS.md
@@ -23,6 +23,13 @@
 * Added column `kingdom` to the microorganisms data set, and function `mo_kingdom` to look up values
 * Tremendous speed improvement for `as.mo` (and subsequently all `mo_*` functions), as empty values wil be ignored *a priori*
 * Fewer than 3 characters as input for `as.mo` will return NA
+* Function `as.mo` (and all `mo_*` wrappers) now supports genus abbreviations with "species" attached
+  ```r
+  as.mo("E. species")        # B_ESCHR
+  mo_fullname("E. spp.")     # "Escherichia species"
+  as.mo("S. spp")            # B_STPHY
+  mo_fullname("S. species")  # "Staphylococcus species"
+  ```
 * Added parameter `combine_IR` (TRUE/FALSE) to functions `portion_df` and `count_df`, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible)
 * Fix for `portion_*(..., as_percent = TRUE)` when minimal number of isolates would not be met
 * Added parameter `also_single_tested` for `portion_*` and `count_*` functions to also include cases where not all antibiotics were tested but at least one of the tested antibiotics includes the target antimicribial interpretation, see `?portion`
@@ -45,9 +52,9 @@
  * Now prints in markdown at default in non-interactive sessions
  * No longer adds the factor level column and sorts factors on count again
  * Support for class `difftime`
-  * New parameter `na`, to choose with character to print for empty values
-  * New parameter `header` to turn it off (default when `markdown = TRUE`)
-  * New parameter `title` to replace the automatically set title
+  * New parameter `na`, to choose which character to print for empty values
+  * New parameter `header` to turn the header info off (default when `markdown = TRUE`)
+  * New parameter `title` to manually setbthe title of the frequency table
 * `first_isolate` now tries to find columns to use as input when parameters are left blank
 * Improvements for MDRO algorithm (function `mdro`)
 * Data set `septic_patients` is now a `data.frame`, not a tibble anymore
@@ -66,6 +73,7 @@
 * In `g.test`, when `sum(x)` is below 1000 or any of the expected values is below 5, Fisher's Exact Test will be suggested
 * `ab_name` will try to fall back on `as.atc` when no results are found
 * Removed the addin to view data sets
+* Percentages will now will rounded more logically (e.g. in `freq` function)

 #### Other
 * New dependency on package `crayon`, to support formatted text in the console
--- a/R/freq.R
+++ b/R/freq.R
@@ -313,16 +313,22 @@ frequency_tbl <- function(x,
    }
  }

-  na_txt <- paste0(NAs %>% length() %>% format(), ' = ',
-                   (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% percent(force_zero = TRUE, round = digits) %>%
-                     sub('NaN', '0', ., fixed = TRUE))
-  if (!na_txt %like% "^0 =") {
-    na_txt <- red(na_txt)
+  if (NROW(x) > 0) {
+    na_txt <- paste0(NAs %>% length() %>% format(), ' = ',
+                     (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% percent(force_zero = TRUE, round = digits) %>%
+                       sub('NaN', '0', ., fixed = TRUE))
+    if (!na_txt %like% "^0 =") {
+      na_txt <- red(na_txt)
+    } else {
+      na_txt <- green(na_txt)
+    }
+    na_txt <- paste0('(of which NA: ', na_txt, ')')
  } else {
-    na_txt <- green(na_txt)
+    na_txt <- ""
  }
+
  header_txt <- header_txt %>% paste0(markdown_line, '\nLength:    ', (NAs %>% length() + x %>% length()) %>% format(),
-                                      ' (of which NA: ', na_txt, ')')
+                                      ' ', na_txt)
  header_txt <- header_txt %>% paste0(markdown_line, '\nUnique:    ', x %>% n_distinct() %>% format())

  if (NROW(x) > 0 & any(class(x) == "character")) {
@@ -592,7 +598,12 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
      }
      title <- paste(title, group_var)
    }
-    title <- paste("Frequency table of", trimws(title))
+    title <- trimws(title)
+    if (title == "") {
+      title <- "Frequency table"
+    } else {
+      title <- paste("Frequency table of", trimws(title))
+    }
  } else {
    title <- opt$title
  }
--- a/R/globals.R
+++ b/R/globals.R
@@ -54,6 +54,8 @@ globalVariables(c(".",
                  "microorganisms.prevDT",
                  "microorganisms.unprevDT",
                  "microorganisms.oldDT",
+                  "microorganisms.certe",
+                  "microorganisms.umcg",
                  "mo",
                  "mo.old",
                  "n",
--- a/R/misc.R
+++ b/R/misc.R
@@ -28,12 +28,24 @@ addin_insert_like <- function() {

 # No export, no Rd
 percent <- function(x, round = 1, force_zero = FALSE, ...) {
-  val <- base::round(x * 100, digits = round)
-  if (force_zero == TRUE & any(val == as.integer(val) & !is.na(val))) {
-    val[val == as.integer(val)] <- paste0(val[val == as.integer(val)], ".", strrep(0, round))
+
+  # https://stackoverflow.com/a/12688836/4575331
+  round2 <- function(x, n) (trunc((abs(x) * 10 ^ n) + 0.5) / 10 ^ n) * sign(x)
+
+  val <- round2(x, round + 2) # round up 0.5
+  val <- round(x = val * 100, digits = round) # remove floating point error
+
+  if (force_zero == TRUE) {
+    if (any(val == as.integer(val) & !is.na(val))) {
+      # add zeroes to all integers
+      val[val == as.integer(as.character(val))] <- paste0(val[val == as.integer(val)], ".", strrep(0, round))
+    }
+    # add extra zeroes if needed
+    val_decimals <- nchar(gsub(".*[.](.*)", "\\1", as.character(val)))
+    val[val_decimals < round] <- paste0(val[val_decimals < round], strrep(0, max(0, round - val_decimals)))
  }
  pct <- base::paste0(val, "%")
-  pct[pct == "NA%"] <- NA_character_
+  pct[pct %in% c("NA%", "NaN%")] <- NA_character_
  pct
 }

--- a/R/mo.R
+++ b/R/mo.R
@@ -53,7 +53,7 @@
 #' \itemize{
 #'   \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
 #'   \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
-#'   \item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations}
+#'   \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
 #'   \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
 #' }
 #'
@@ -126,16 +126,15 @@
 #' library(dplyr)
 #' df$mo <- df %>%
 #'   select(microorganism_name) %>%
-#'   guess_mo()
+#'   as.mo()
 #'
 #' # and can even contain 2 columns, which is convenient for genus/species combinations:
 #' df$mo <- df %>%
 #'   select(genus, species) %>%
-#'   guess_mo()
-#'
-#' # same result:
+#'   as.mo()
+#' # although this works easier and does the same:
 #' df <- df %>%
-#'   mutate(mo = guess_mo(paste(genus, species)))
+#'   mutate(mo = as.mo(paste(genus, species)))
 #' }
 as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) {
  structure(mo_validate(x = x, property = "mo",
@@ -160,11 +159,14 @@ guess_mo <- as.mo
 #' @importFrom data.table data.table as.data.table setkey
 exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL, property = "mo") {

-  # These data.tables are available as data sets when the AMR package is loaded:
-  #   microorganismsDT        # this one is sorted by kingdom (B<F<P), prevalence, TSN
-  #   microorganisms.prevDT   # same as microorganismsDT, but with prevalence != 9999
-  #   microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999
-  #   microorganisms.oldDT    # old taxonomic names, sorted by name (genus+species), TSN
+  if (!"AMR" %in% base::.packages()) {
+    library("AMR")
+    # These data.tables are available as data sets when the AMR package is loaded:
+    #   microorganismsDT        # this one is sorted by kingdom (B<F<P), prevalence, TSN
+    #   microorganisms.prevDT   # same as microorganismsDT, but with prevalence != 9999
+    #   microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999
+    #   microorganisms.oldDT    # old taxonomic names, sorted by name (genus+species), TSN
+  }

  if (NCOL(x) == 2) {
    # support tidyverse selection like: df %>% select(colA, colB)
@@ -216,31 +218,35 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
    suppressWarnings(
      x <- data.frame(x = x, stringsAsFactors = FALSE) %>%
        left_join(reference_df, by = "x") %>%
-        left_join(AMR::microorganisms, by = "mo") %>%
+        left_join(microorganisms, by = "mo") %>%
        pull(property)
    )
-  } else if (all(toupper(x) %in% AMR::microorganisms.certe[, "certe"])) {
+  } else if (all(toupper(x) %in% microorganisms.certe[, "certe"])) {
    # old Certe codes
-    y <- as.data.table(AMR::microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ]
+    y <- as.data.table(microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ]
    x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]]

  } else if (!all(x %in% microorganismsDT[[property]])) {

    x_backup <- trimws(x, which = "both")
-    x_species <- paste(x_backup, "species")
+
+    # remove spp and species
+    x <- gsub(" +(spp.?|species)", "", x_backup)
+    x_species <- paste(x, "species")
    # translate to English for supported languages of mo_property
    x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x)
    # remove 'empty' genus and species values
    x <- gsub("(no MO)", "", x, fixed = TRUE)
-    # remove dots and other non-text in case of "E. coli" except spaces
-    x <- gsub("[^a-zA-Z0-9/ \\-]+", "", x)
+    # remove non-text in case of "E. coli" except dots and spaces
+    x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x)
+
    # but spaces before and after should be omitted
    x <- trimws(x, which = "both")
    x_trimmed <- x
    x_trimmed_species <- paste(x_trimmed, "species")
-    # replace space by regex sign
-    x_withspaces <- gsub(" ", ".* ", x, fixed = TRUE)
-    x <- gsub(" ", ".*", x, fixed = TRUE)
+    # replace space and dot by regex sign
+    x_withspaces <- gsub("[ .]+", ".* ", x)
+    x <- gsub("[ .]+", ".*", x)
    # add start en stop regex
    x <- paste0('^', x, '$')
    x_withspaces_start <- paste0('^', x_withspaces)
@@ -261,10 +267,28 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
        next
      }
      if (nchar(x_trimmed[i]) < 3) {
-        # fewer than 3 chars, add as failure
-        x[i] <- NA_character_
-        failures <- c(failures, x_backup[i])
-        next
+        # check if search term was like "A. species", then return first genus found with ^A
+        if (x_backup[i] %like% "species" | x_backup[i] %like% "spp[.]?") {
+          # get mo code of first hit
+          found <- microorganismsDT[fullname %like% x_withspaces_start[i], mo][[1]]
+          mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_")
+          found <- microorganismsDT[mo == mo_code, ..property][[1]]
+          # return first genus that begins with x_trimmed, e.g. when "E. spp."
+          if (length(found) > 0) {
+            x[i] <- found[1L]
+            next
+          } else {
+            # fewer than 3 chars, add as failure
+            x[i] <- NA_character_
+            failures <- c(failures, x_backup[i])
+            next
+          }
+        } else {
+          # fewer than 3 chars, add as failure
+          x[i] <- NA_character_
+          failures <- c(failures, x_backup[i])
+          next
+        }
      }

      # translate known trivial abbreviations to genus + species ----
@@ -353,15 +377,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
      }

      # TRY OTHER SOURCES ----
-      if (x_backup[i] %in% AMR::microorganisms.certe$certe) {
-        x[i] <- microorganismsDT[mo == AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == x_backup[i], 2], ..property][[1]][1L]
-        # x[i] <- exec_as.mo(x = AMR::microorganisms.certe[AMR::microorganisms.certe$certe == x_backup[i], "mo"],
-        #                    property = property)
-        # next
-      }
-      if (x_backup[i] %in% AMR::microorganisms.umcg[, 1]) {
-        mo_umcg <- AMR::microorganisms.umcg[AMR::microorganisms.umcg[, 1] == x_backup[i], 2]
-        mo_found <- AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == mo_umcg, 2]
+      if (x_backup[i] %in% microorganisms.umcg[, 1]) {
+        mo_umcg <- microorganisms.umcg[microorganisms.umcg[, 1] == x_backup[i], 2]
+        mo_found <- microorganisms.certe[microorganisms.certe[, 1] == mo_umcg, 2]
        if (length(mo_found) == 0) {
          # not found
          x[i] <- NA_character_
@@ -371,13 +389,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
        }
        next
      }
-      if (x_backup[i] %in% reference_df[, 1]) {
-        ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2]
-        if (ref_mo %in% microorganismsDT[, mo]) {
-          x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L]
-          next
-        } else {
-          warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE)
+      if (!is.null(reference_df)) {
+        if (x_backup[i] %in% reference_df[, 1]) {
+          ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2]
+          if (ref_mo %in% microorganismsDT[, mo]) {
+            x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L]
+            next
+          } else {
+            warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE)
+          }
        }
      }

--- a/R/zzz.R
+++ b/R/zzz.R
@@ -47,28 +47,3 @@ NULL
 .onLoad <- function(libname, pkgname) {
  backports::import(pkgname)
 }
-
-.onAttach <- function(libname, pkgname) {
-  # save data.tables to improve speed of as.mo:
-
-  # microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
-  # microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
-  #
-  # data.table::setkey(microorganismsDT, prevalence, tsn)
-  # data.table::setkey(microorganisms.oldDT, tsn, name)
-
-  base::assign(x = "microorganismsDT",
-               value = microorganismsDT,
-               envir = base::as.environment("package:AMR"))
-  base::assign(x = "microorganisms.prevDT",
-               value = microorganismsDT[prevalence != 9999,],
-               envir = base::as.environment("package:AMR"))
-  base::assign(x = "microorganisms.unprevDT",
-               value = microorganismsDT[prevalence == 9999,],
-               envir = base::as.environment("package:AMR"))
-
-  base::assign(x = "microorganisms.oldDT",
-               value = microorganisms.oldDT,
-               envir = base::as.environment("package:AMR"))
-
-}
--- a/README.md
+++ b/README.md
@@ -347,10 +347,8 @@ key_antibiotics(...)
 # Selection of first isolates of any patient
 first_isolate(...)

-# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr)
-rsi(...)
 # Predict resistance levels of antibiotics
-rsi_predict(...)
+resistance_predict(...)

 # Get name of antibiotic by ATC code
 abname(...)
--- a/data/microorganisms.prevDT.rda
+++ b/data/microorganisms.prevDT.rda
--- a/data/microorganisms.unprevDT.rda
+++ b/data/microorganisms.unprevDT.rda
--- a/man/as.mo.Rd
+++ b/man/as.mo.Rd
@@ -58,7 +58,7 @@ This function uses Artificial Intelligence (AI) to help getting fast and logical
 \itemize{
  \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
  \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
-  \item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations}
+  \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
  \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
 }

@@ -132,16 +132,15 @@ df$mo <- as.mo(df$microorganism_name)
 library(dplyr)
 df$mo <- df \%>\%
  select(microorganism_name) \%>\%
-  guess_mo()
+  as.mo()

 # and can even contain 2 columns, which is convenient for genus/species combinations:
 df$mo <- df \%>\%
  select(genus, species) \%>\%
-  guess_mo()
-
-# same result:
+  as.mo()
+# although this works easier and does the same:
 df <- df \%>\%
-  mutate(mo = guess_mo(paste(genus, species)))
+  mutate(mo = as.mo(paste(genus, species)))
 }
 }
 \seealso{
--- a/tests/testthat/test-misc.R
+++ b/tests/testthat/test-misc.R
@@ -5,6 +5,9 @@ test_that("percentages works", {
  expect_equal(percent(0.5), "50%")
  expect_equal(percent(0.500, force_zero = TRUE), "50.0%")
  expect_equal(percent(0.1234), "12.3%")
+  # round up 0.5
+  expect_equal(percent(0.0054), "0.5%")
+  expect_equal(percent(0.0055), "0.6%")
 })

 test_that("size format works", {
--- a/tests/testthat/test-mo.R
+++ b/tests/testthat/test-mo.R
@@ -195,6 +195,11 @@ test_that("as.mo works", {

  # TSN of prevalent and non prevalent ones
  expect_equal(mo_TSN(c("Gomphosphaeria aponina delicatula", "Escherichia coli")),
-                   c(717, 285))
+               c(717, 285))
+
+  expect_equal(mo_fullname(c("E. spp.",
+                             "E. spp",
+                             "E. species")),
+               rep("Escherichia species", 3))

 })
--- a/tests/testthat/test-read.4d.R
+++ b/tests/testthat/test-read.4d.R
@@ -18,7 +18,7 @@ test_that("read 4D works", {
  tf <- tempfile()
  write.table(test1, file = tf, quote = F, sep = "\t")

-  x <- read.4D(tf, skip = 0)
+  x <- read.4D(tf, skip = 0, info = TRUE)
  unlink(tf)

  expect_equal(ncol(x), 11)