fix for as.mo, added also_single_tested

2025-07-23 08:23:20 +02:00 · 2018-10-19 13:53:31 +02:00
parent 032c0d51ef
commit 86b03577a7
14 changed files with 128 additions and 66 deletions
--- a/4
+++ b/4
@ -1,6 +1,6 @@
 Package: AMR
-Version: 0.4.0.9004
-Date: 2018-10-17
+Version: 0.4.0.9005
+Date: 2018-10-19
 Title: Antimicrobial Resistance Analysis
 Authors@R: c(
    person(
--- a/5
+++ b/5
@ -175,6 +175,8 @@ importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
 importFrom(data.table,setkey)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,all_vars)
+importFrom(dplyr,any_vars)
 importFrom(dplyr,arrange)
 importFrom(dplyr,arrange_at)
 importFrom(dplyr,as_tibble)
@ -184,7 +186,9 @@ importFrom(dplyr,case_when)
 importFrom(dplyr,desc)
 importFrom(dplyr,everything)
 importFrom(dplyr,filter)
+importFrom(dplyr,filter_all)
 importFrom(dplyr,full_join)
+importFrom(dplyr,funs)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_at)
 importFrom(dplyr,group_vars)
@ -192,6 +196,7 @@ importFrom(dplyr,if_else)
 importFrom(dplyr,lag)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
+importFrom(dplyr,mutate_all)
 importFrom(dplyr,mutate_at)
 importFrom(dplyr,n_distinct)
 importFrom(dplyr,progress_estimated)
--- a/NEWS.md
+++ b/NEWS.md
@ -11,10 +11,11 @@
  * Better error handling when rules cannot be applied (i.e. new values could not be inserted)
  * The amount of affected values will now only be measured once per row/column combination
  * Data set `septic_patients` now reflects these changes
-* Empty values as input for `as.mo` will be processed faster
+* Tremendous speed improvement for `as.mo` (and consequently all `mo_*` functions), as empty values wil be ignored a priori
 * Fewer than 3 characters as input for `as.mo` will return NA
 * Added parameter `combine_IR` (TRUE/FALSE) to functions `portion_df` and `count_df`, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible)
 * Fix for `portion_*(..., as_percent = TRUE)` when minimal amount of isolates would not be met
+* Added parameter `also_single_tested` for `portion_*` and `count_*` functions to also include cases where not all antibiotics were tested but at least one of the tested antibiotics includes the target antimicribial interpretation, see `?portion`
 * Using `portion_*` functions now throws a warning when total available isolate is below parameter `minimum`
 * Functions `as.mo`, `as.rsi` and `as.mic` will not set package name as attribute anymore
 * Data set `septic_patients` is now a `data.frame`, not a tibble anymore
--- a/R/count.R
+++ b/R/count.R
@ -92,56 +92,61 @@
 #'   group_by(hospital_id) %>%
 #'   count_df(translate = FALSE)
 #'
-count_R <- function(...) {
+count_R <- function(..., also_single_tested = FALSE) {
  rsi_calc(...,
           type = "R",
           include_I = FALSE,
           minimum = 0,
           as_percent = FALSE,
+           also_single_tested = FALSE,
           only_count = TRUE)
 }

 #' @rdname count
 #' @export
-count_IR <- function(...) {
+count_IR <- function(..., also_single_tested = FALSE) {
  rsi_calc(...,
           type = "R",
           include_I = TRUE,
           minimum = 0,
           as_percent = FALSE,
+           also_single_tested = FALSE,
           only_count = TRUE)
 }

 #' @rdname count
 #' @export
-count_I <- function(...) {
+count_I <- function(..., also_single_tested = FALSE) {
  rsi_calc(...,
           type = "I",
           include_I = FALSE,
           minimum = 0,
           as_percent = FALSE,
+           also_single_tested = FALSE,
           only_count = TRUE)
 }

 #' @rdname count
 #' @export
-count_SI <- function(...) {
+count_SI <- function(..., also_single_tested = FALSE) {
  rsi_calc(...,
           type = "S",
           include_I = TRUE,
           minimum = 0,
           as_percent = FALSE,
+           also_single_tested = FALSE,
           only_count = TRUE)
 }

 #' @rdname count
 #' @export
-count_S <- function(...) {
+count_S <- function(..., also_single_tested = FALSE) {
  rsi_calc(...,
           type = "S",
           include_I = FALSE,
           minimum = 0,
           as_percent = FALSE,
+           also_single_tested = FALSE,
           only_count = TRUE)
 }

--- a/R/data.R
+++ b/R/data.R
@ -175,7 +175,7 @@
 #' Translation table for Certe
 #'
 #' A data set containing all bacteria codes of Certe MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{MO}s with \code{\link{guess_mo}}.
-#' @format A \code{\link{tibble}} with 2,664 observations and 2 variables:
+#' @format A \code{\link{tibble}} with 2,665 observations and 2 variables:
 #' \describe{
 #'   \item{\code{certe}}{Code of microorganism according to Certe MMB}
 #'   \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
--- a/R/eucast.R
+++ b/R/eucast.R
@ -361,15 +361,20 @@ EUCAST_rules <- function(tbl,
      changed_results <<- changed_results + sum(before != after, na.rm = TRUE) # will be reset at start of every rule

      if (verbose == TRUE) {
-        verbose_new <- data.frame(rule_type = rule[1],
-                                  rule_set = rule[2],
-                                  force_to = to,
-                                  found = length(before),
-                                  changed = sum(before != after, na.rm = TRUE),
-                                  stringsAsFactors = FALSE)
-        verbose_new$target_columns <- list(unname(cols))
-        verbose_new$target_rows <- list(unname(rows))
-        verbose_info <<- rbind(verbose_info, verbose_new)
+        for (i in 1:length(cols)) {
+          # add new row for every affected column
+          verbose_new <- data.frame(rule_type = rule[1],
+                                    rule_set = rule[2],
+                                    force_to = to,
+                                    found = length(before),
+                                    changed = sum(before != after, na.rm = TRUE),
+                                    target_column = cols[i],
+                                    stringsAsFactors = FALSE)
+          verbose_new$target_rows <- list(unname(rows))
+          rownames(verbose_new) <- NULL
+          verbose_info <<- rbind(verbose_info, verbose_new)
+        }
+
      }
    }
  }
@ -410,15 +415,15 @@ EUCAST_rules <- function(tbl,
  # since ampicillin ^= amoxicillin, get the first from the latter (not in original table)
  if (!is.na(ampi) & !is.na(amox)) {
    if (verbose == TRUE) {
-      cat("\n VERBOSE: transforming",
-          length(which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R"))),
-          "empty ampicillin fields to 'S' based on amoxicillin.")
-      cat("\n VERBOSE: transforming",
-          length(which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R"))),
-          "empty ampicillin fields to 'I' based on amoxicillin.")
-      cat("\n VERBOSE: transforming",
-          length(which(tbl[, amox] == "R" & !tbl[, ampi] %in% c("S", "I", "R"))),
-          "empty ampicillin fields to 'R' based on amoxicillin.\n")
+      cat(bgGreen("\n VERBOSE: transforming",
+                  length(which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R"))),
+                  "empty ampicillin fields to 'S' based on amoxicillin. "))
+      cat(bgGreen("\n VERBOSE: transforming",
+                  length(which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R"))),
+                  "empty ampicillin fields to 'I' based on amoxicillin. "))
+      cat(bgGreen("\n VERBOSE: transforming",
+                  length(which(tbl[, amox] == "R" & !tbl[, ampi] %in% c("S", "I", "R"))),
+                  "empty ampicillin fields to 'R' based on amoxicillin. \n"))
    }
    tbl[which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R")), ampi] <- "S"
    tbl[which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R")), ampi] <- "I"
--- a/R/mo.R
+++ b/R/mo.R
@ -195,7 +195,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
  if (all(x %in% AMR::microorganisms[, property])) {
    # already existing mo
  } else if (all(x %in% AMR::microorganisms[, "mo"])) {
-    # existing mo codes
+    # existing mo codes when not looking for property "mo"
    suppressWarnings(
      x <- data.frame(mo = x, stringsAsFactors = FALSE) %>%
        left_join(AMR::microorganisms, by = "mo") %>%
@ -661,13 +661,12 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
    x[x == MOs[mo == 'B_STRPTC_SAL', ..property][[1]][1L]] <- MOs[mo == 'B_STRPTC_GRK', ..property][[1]][1L]
  }

-  x_input_unique <- unique(x_input)
-  # fill in empty values again
-  x[is.na(x_input_unique) | is.null(x_input_unique) | identical(x_input_unique, "")] <- NA
+  # comply to x, which is also unique and without empty values
+  x_input_unique_nonempty <- unique(x_input[!is.na(x_input) & !is.null(x_input) & !identical(x_input, "")])

  # left join the found results to the original input values (x_input)
-  df_found <- data.frame(input = as.character(x_input_unique),
-                         found = x,
+  df_found <- data.frame(input = as.character(x_input_unique_nonempty),
+                         found = as.character(x),
                         stringsAsFactors = FALSE)
  df_input <- data.frame(input = as.character(x_input),
                         stringsAsFactors = FALSE)
--- a/R/portion.R
+++ b/R/portion.R
@ -24,6 +24,7 @@
 #' @param ... one or more vectors (or columns) with antibiotic interpretations. They will be transformed internally with \code{\link{as.rsi}} if needed. Use multiple columns to calculate (the lack of) co-resistance: the probability where one of two drugs have a resistant or susceptible result. See Examples.
 #' @param minimum the minimal amount of available isolates. Any number lower than \code{minimum} will return \code{NA} with a warning. The default number of \code{30} isolates is advised by the Clinical and Laboratory Standards Institute (CLSI) as best practice, see Source.
 #' @param as_percent a logical to indicate whether the output must be returned as a hundred fold with \% sign (a character). A value of \code{0.123456} will then be returned as \code{"12.3\%"}.
+#' @param also_single_tested a logical to indicate whether (in combination therapies) also observations should be included where not all antibiotics were tested, but at least one of the tested antibiotics contains a target interpretation (e.g. S in case of \code{portion_S} and R in case of \code{portion_R}). \strong{This would lead to selection bias in almost all cases.}
 #' @param data a \code{data.frame} containing columns with class \code{rsi} (see \code{\link{as.rsi}})
 #' @param translate_ab a column name of the \code{\link{antibiotics}} data set to translate the antibiotic abbreviations to, using \code{\link{abname}}. This can be set with \code{\link{getOption}("get_antibiotic_names")}.
 #' @param combine_IR a logical to indicate whether all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible)
@ -134,12 +135,14 @@
 #' }
 portion_R <- function(...,
                      minimum = 30,
-                      as_percent = FALSE) {
+                      as_percent = FALSE,
+                      also_single_tested = FALSE) {
  rsi_calc(...,
           type = "R",
           include_I = FALSE,
           minimum = minimum,
           as_percent = as_percent,
+           also_single_tested = also_single_tested,
           only_count = FALSE)
 }

@ -147,12 +150,14 @@ portion_R <- function(...,
 #' @export
 portion_IR <- function(...,
                       minimum = 30,
-                       as_percent = FALSE) {
+                       as_percent = FALSE,
+                       also_single_tested = FALSE) {
  rsi_calc(...,
           type = "R",
           include_I = TRUE,
           minimum = minimum,
           as_percent = as_percent,
+           also_single_tested = also_single_tested,
           only_count = FALSE)
 }

@ -160,12 +165,14 @@ portion_IR <- function(...,
 #' @export
 portion_I <- function(...,
                      minimum = 30,
-                      as_percent = FALSE) {
+                      as_percent = FALSE,
+                      also_single_tested = FALSE) {
  rsi_calc(...,
           type = "I",
           include_I = FALSE,
           minimum = minimum,
           as_percent = as_percent,
+           also_single_tested = also_single_tested,
           only_count = FALSE)
 }

@ -173,12 +180,14 @@ portion_I <- function(...,
 #' @export
 portion_SI <- function(...,
                       minimum = 30,
-                       as_percent = FALSE) {
+                       as_percent = FALSE,
+                       also_single_tested = FALSE) {
  rsi_calc(...,
           type = "S",
           include_I = TRUE,
           minimum = minimum,
           as_percent = as_percent,
+           also_single_tested = also_single_tested,
           only_count = FALSE)
 }

@ -186,12 +195,14 @@ portion_SI <- function(...,
 #' @export
 portion_S <- function(...,
                      minimum = 30,
-                      as_percent = FALSE) {
+                      as_percent = FALSE,
+                      also_single_tested = FALSE) {
  rsi_calc(...,
           type = "S",
           include_I = FALSE,
           minimum = minimum,
           as_percent = as_percent,
+           also_single_tested = also_single_tested,
           only_count = FALSE)
 }

--- a/R/rsi_calc.R
+++ b/R/rsi_calc.R
@ -16,12 +16,13 @@
 # GNU General Public License for more details.                         #
 # ==================================================================== #

-#' @importFrom dplyr %>% pull
+#' @importFrom dplyr %>% pull all_vars any_vars filter_all funs mutate_all
 rsi_calc <- function(...,
                     type,
                     include_I,
                     minimum,
                     as_percent,
+                     also_single_tested,
                     only_count) {

  if (!is.logical(include_I)) {
@ -33,6 +34,9 @@ rsi_calc <- function(...,
  if (!is.logical(as_percent)) {
    stop('`as_percent` must be logical', call. = FALSE)
  }
+  if (!is.logical(also_single_tested)) {
+    stop('`also_single_tested` must be logical', call. = FALSE)
+  }

  dots_df <- ...elt(1) # it needs this evaluation
  dots <- base::eval(base::substitute(base::alist(...)))
@ -67,23 +71,53 @@ rsi_calc <- function(...,
  }

  print_warning <- FALSE
-  # check integrity of columns: force rsi class
+
+  type_trans <- as.integer(as.rsi(type))
+  type_others <- setdiff(1:3, type_trans)
+
  if (is.data.frame(x)) {
+    rsi_integrity_check <- character(0)
    for (i in 1:ncol(x)) {
+      # check integrity of columns: force rsi class
      if (!is.rsi(x %>% pull(i))) {
-        x[, i] <- as.rsi(x[, i])
+        rsi_integrity_check <- c(rsi_integrity_check, x %>% pull(i) %>% as.character())
+        x[, i] <- suppressWarnings(as.rsi(x[, i])) # warning will be given later
        print_warning <- TRUE
      }
      x[, i] <- x %>% pull(i) %>% as.integer()
    }
-    x <- apply(X = x,
-               MARGIN = 1,
-               FUN = min)
+    if (length(rsi_integrity_check) > 0) {
+      # this will give a warning for invalid results, of all input columns (so only 1 warning)
+      rsi_integrity_check <- as.rsi(rsi_integrity_check)
+    }
+
+    if (include_I == TRUE) {
+      x <- x %>% mutate_all(funs(ifelse(. == 2, type_trans, .)))
+    }
+
+    if (also_single_tested == TRUE) {
+      # THE CHANCE THAT AT LEAST ONE RESULT IS type
+      found <- x %>% filter_all(any_vars(. == type_trans)) %>% nrow()
+      # THE CHANCE THAT AT LEAST ONE RESULT IS type OR ALL ARE TESTED
+      total <- found + x %>% filter_all(all_vars(. %in% type_others)) %>% nrow()
+    } else {
+      x <- apply(X = x,
+                 MARGIN = 1,
+                 FUN = min)
+      found <- sum(as.integer(x) == type_trans, na.rm = TRUE)
+      total <- length(x) - sum(is.na(x))
+    }
  } else {
    if (!is.rsi(x)) {
      x <- as.rsi(x)
      print_warning <- TRUE
    }
+    x <- as.integer(x)
+    if (include_I == TRUE) {
+      x[x == 2] <- type_trans
+    }
+    found <- sum(x == type_trans, na.rm = TRUE)
+    total <- length(x) - sum(is.na(x))
  }

  if (print_warning == TRUE) {
@ -91,21 +125,10 @@ rsi_calc <- function(...,
            call. = FALSE)
  }

-  if (type == "S") {
-    found <- sum(as.integer(x) <= 1 + include_I, na.rm = TRUE)
-  } else if (type == "I") {
-    found <- sum(as.integer(x) == 2, na.rm = TRUE)
-  } else if (type == "R") {
-    found <- sum(as.integer(x) >= 3 - include_I, na.rm = TRUE)
-  } else {
-    stop("invalid type")
-  }
-
  if (only_count == TRUE) {
    return(found)
  }

-  total <- length(x) - sum(is.na(x))
  if (total < minimum) {
    warning("Introducing NA: only ", total, " results available (minimum set to ", minimum, ").", call. = FALSE)
    result <- NA
--- a/data/microorganisms.certe.rda
+++ b/data/microorganisms.certe.rda
--- a/man/count.Rd
+++ b/man/count.Rd
@ -15,15 +15,15 @@
 Wickham H. \strong{Tidy Data.} The Journal of Statistical Software, vol. 59, 2014. \url{http://vita.had.co.nz/papers/tidy-data.html}
 }
 \usage{
-count_R(...)
+count_R(..., also_single_tested = FALSE)

-count_IR(...)
+count_IR(..., also_single_tested = FALSE)

-count_I(...)
+count_I(..., also_single_tested = FALSE)

-count_SI(...)
+count_SI(..., also_single_tested = FALSE)

-count_S(...)
+count_S(..., also_single_tested = FALSE)

 count_all(...)

@ -35,6 +35,8 @@ count_df(data, translate_ab = getOption("get_antibiotic_names",
 \arguments{
 \item{...}{one or more vectors (or columns) with antibiotic interpretations. They will be transformed internally with \code{\link{as.rsi}} if needed.}

+\item{also_single_tested}{a logical to indicate whether (in combination therapies) also observations should be included where not all antibiotics were tested, but at least one of the tested antibiotics contains a target interpretation (e.g. S in case of \code{portion_S} and R in case of \code{portion_R}). \strong{This would lead to selection bias in almost all cases.}}
+
 \item{data}{a \code{data.frame} containing columns with class \code{rsi} (see \code{\link{as.rsi}})}

 \item{translate_ab}{a column name of the \code{\link{antibiotics}} data set to translate the antibiotic abbreviations to, using \code{\link{abname}}. This can be set with \code{\link{getOption}("get_antibiotic_names")}.}
--- a/man/microorganisms.certe.Rd
+++ b/man/microorganisms.certe.Rd
@ -4,7 +4,7 @@
 \name{microorganisms.certe}
 \alias{microorganisms.certe}
 \title{Translation table for Certe}
-\format{A \code{\link{tibble}} with 2,664 observations and 2 variables:
+\format{A \code{\link{tibble}} with 2,665 observations and 2 variables:
 \describe{
  \item{\code{certe}}{Code of microorganism according to Certe MMB}
  \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
--- a/man/portion.Rd
+++ b/man/portion.Rd
@ -15,15 +15,20 @@
 Wickham H. \strong{Tidy Data.} The Journal of Statistical Software, vol. 59, 2014. \url{http://vita.had.co.nz/papers/tidy-data.html}
 }
 \usage{
-portion_R(..., minimum = 30, as_percent = FALSE)
+portion_R(..., minimum = 30, as_percent = FALSE,
+  also_single_tested = FALSE)

-portion_IR(..., minimum = 30, as_percent = FALSE)
+portion_IR(..., minimum = 30, as_percent = FALSE,
+  also_single_tested = FALSE)

-portion_I(..., minimum = 30, as_percent = FALSE)
+portion_I(..., minimum = 30, as_percent = FALSE,
+  also_single_tested = FALSE)

-portion_SI(..., minimum = 30, as_percent = FALSE)
+portion_SI(..., minimum = 30, as_percent = FALSE,
+  also_single_tested = FALSE)

-portion_S(..., minimum = 30, as_percent = FALSE)
+portion_S(..., minimum = 30, as_percent = FALSE,
+  also_single_tested = FALSE)

 portion_df(data, translate_ab = getOption("get_antibiotic_names",
  "official"), minimum = 30, as_percent = FALSE, combine_IR = FALSE)
@ -35,6 +40,8 @@ portion_df(data, translate_ab = getOption("get_antibiotic_names",

 \item{as_percent}{a logical to indicate whether the output must be returned as a hundred fold with \% sign (a character). A value of \code{0.123456} will then be returned as \code{"12.3\%"}.}

+\item{also_single_tested}{a logical to indicate whether (in combination therapies) also observations should be included where not all antibiotics were tested, but at least one of the tested antibiotics contains a target interpretation (e.g. S in case of \code{portion_S} and R in case of \code{portion_R}). \strong{This would lead to selection bias in almost all cases.}}
+
 \item{data}{a \code{data.frame} containing columns with class \code{rsi} (see \code{\link{as.rsi}})}

 \item{translate_ab}{a column name of the \code{\link{antibiotics}} data set to translate the antibiotic abbreviations to, using \code{\link{abname}}. This can be set with \code{\link{getOption}("get_antibiotic_names")}.}
--- a/tests/testthat/test-portion.R
+++ b/tests/testthat/test-portion.R
@ -17,6 +17,9 @@ test_that("portions works", {
  expect_equal(septic_patients %>% portion_S(amcl, gent),
               0.9210074,
               tolerance = 0.001)
+  expect_equal(septic_patients %>% portion_S(amcl, gent, also_single_tested = TRUE),
+               0.9239669,
+               tolerance = 0.001)

  # amcl+genta susceptibility around 92.1%
  expect_equal(suppressWarnings(rsi(septic_patients$amcl,
@ -66,6 +69,7 @@ test_that("portions works", {
  expect_error(portion_I("test", as_percent = "test"))
  expect_error(portion_S("test", minimum = "test"))
  expect_error(portion_S("test", as_percent = "test"))
+  expect_error(portion_S("test", also_single_tested = "test"))

  # check too low amount of isolates
  expect_identical(suppressWarnings(portion_R(septic_patients$amox, minimum = nrow(septic_patients) + 1)),