update to septic_patients, speed improvements

2025-11-30 22:42:00 +01:00 · 2018-07-25 14:17:04 +02:00
parent 03a3cb397b
commit d9e204031d
26 changed files with 273 additions and 233 deletions
--- a/R/bactid.R
+++ b/R/bactid.R
@@ -221,7 +221,6 @@ as.bactid <- function(x) {
  }
  class(x) <- "bactid"
  attr(x, 'package') <- 'AMR'
-  attr(x, 'package.version') <- packageDescription('AMR')$Version
  x
 }

--- a/R/classes.R
+++ b/R/classes.R
@@ -21,18 +21,18 @@
 #' This transforms a vector to a new class \code{rsi}, which is an ordered factor with levels \code{S < I < R}. Invalid antimicrobial interpretations will be translated as \code{NA} with a warning.
 #' @rdname as.rsi
 #' @param x vector
-#' @return Ordered factor with new class \code{rsi} and new attributes \code{package} and \code{package.version}
+#' @return Ordered factor with new class \code{rsi} and new attribute \code{package}
 #' @keywords rsi
 #' @export
 #' @importFrom dplyr %>%
-#' @importFrom utils packageDescription
+#' @seealso \code{\link{as.mic}}
 #' @examples
 #' rsi_data <- as.rsi(c(rep("S", 474), rep("I", 36), rep("R", 370)))
 #' rsi_data <- as.rsi(c(rep("S", 474), rep("I", 36), rep("R", 370), "A", "B", "C"))
 #' is.rsi(rsi_data)
 #'
 #' # this can also coerce combined MIC/RSI values:
-#' as.rsi("<= 0.002; R") # will return R
+#' as.rsi("<= 0.002; S") # will return S
 #'
 #' plot(rsi_data)    # for percentages
 #' barplot(rsi_data) # for frequencies
@@ -76,7 +76,6 @@ as.rsi <- function(x) {
    x <- x %>% factor(levels = c("S", "I", "R"), ordered = TRUE)
    class(x) <- c('rsi', 'ordered', 'factor')
    attr(x, 'package') <- 'AMR'
-    attr(x, 'package.version') <- packageDescription('AMR')$Version
    x
  }
 }
@@ -196,21 +195,21 @@ barplot.rsi <- function(height, ...) {

 #' Class 'mic'
 #'
-#' This transforms a vector to a new class\code{mic}, which is an ordered factor with valid MIC values as levels. Invalid MIC values will be translated as \code{NA} with a warning.
+#' This transforms a vector to a new class \code{mic}, which is an ordered factor with valid MIC values as levels. Invalid MIC values will be translated as \code{NA} with a warning.
 #' @rdname as.mic
 #' @param x vector
 #' @param na.rm a logical indicating whether missing values should be removed
-#' @return Ordered factor with new class \code{mic} and new attributes \code{package} and \code{package.version}
+#' @return Ordered factor with new class \code{mic} and new attribute \code{package}
 #' @keywords mic
 #' @export
 #' @importFrom dplyr %>%
-#' @importFrom utils packageDescription
+#' @seealso \code{\link{as.rsi}}
 #' @examples
 #' mic_data <- as.mic(c(">=32", "1.0", "1", "1.00", 8, "<=0.128", "8", "16", "16"))
 #' is.mic(mic_data)
 #'
 #' # this can also coerce combined MIC/RSI values:
-#' as.mic("<=0.002; R") # will return <=0.002
+#' as.mic("<=0.002; S") # will return <=0.002
 #'
 #' plot(mic_data)
 #' barplot(mic_data)
@@ -319,7 +318,6 @@ as.mic <- function(x, na.rm = FALSE) {
                ordered = TRUE)
    class(x) <- c('mic', 'ordered', 'factor')
    attr(x, 'package') <- 'AMR'
-    attr(x, 'package.version') <- packageDescription('AMR')$Version
    x
  }
 }
--- a/R/data.R
+++ b/R/data.R
@@ -252,7 +252,7 @@
 #'   \item{\code{type_nl}}{Type of microorganism in Dutch, like \code{"Bacterie"} and \code{"Schimmel/gist"}}
 #'   \item{\code{gramstain_nl}}{Gram of microorganism in Dutch, like \code{"Negatieve staven"}}
 #' }
-#' @source MOLIS (LIS of Certe) - \url{https://www.certe.nl}
+#  source MOLIS (LIS of Certe) - \url{https://www.certe.nl}
 #' @seealso \code{\link{guess_bactid}} \code{\link{antibiotics}} \code{\link{microorganisms.umcg}}
 "microorganisms"

@@ -264,14 +264,14 @@
 #'   \item{\code{mocode}}{Code of microorganism according to UMCG MMB}
 #'   \item{\code{bactid}}{Code of microorganism in \code{\link{microorganisms}}}
 #' }
-#' @source MOLIS (LIS of Certe) - \url{https://www.certe.nl} \cr \cr GLIMS (LIS of UMCG) - \url{https://www.umcg.nl}
+# source MOLIS (LIS of Certe) - \url{https://www.certe.nl} \cr \cr GLIMS (LIS of UMCG) - \url{https://www.umcg.nl}
 #' @seealso \code{\link{guess_bactid}} \code{\link{microorganisms}}
 "microorganisms.umcg"

 #' Dataset with 2000 blood culture isolates of septic patients
 #'
 #' An anonymised dataset containing 2000 microbial blood culture isolates with their antibiogram of septic patients found in 5 different hospitals in the Netherlands, between 2001 and 2017. This data.frame can be used to practice AMR analysis. For examples, press F1.
-#' @format A data.frame with 2000 observations and 47 variables:
+#' @format A data.frame with 2000 observations and 49 variables:
 #' \describe{
 #'   \item{\code{date}}{date of receipt at the laboratory}
 #'   \item{\code{hospital_id}}{ID of the hospital}
@@ -282,9 +282,9 @@
 #'   \item{\code{sex}}{sex of the patient}
 #'   \item{\code{patient_id}}{ID of the patient, first 10 characters of an SHA hash containing irretrievable information}
 #'   \item{\code{bactid}}{ID of microorganism, see \code{\link{microorganisms}}}
-#'   \item{\code{peni:mupi}}{38 different antibiotics with class \code{rsi} (see \code{\link{as.rsi}}); these column names occur in \code{\link{antibiotics}} and can be translated with \code{\link{abname}}}
+#'   \item{\code{peni:rifa}}{40 different antibiotics with class \code{rsi} (see \code{\link{as.rsi}}); these column names occur in \code{\link{antibiotics}} data set and can be translated with \code{\link{abname}}}
 #' }
-#' @source MOLIS (LIS of Certe) - \url{https://www.certe.nl}
+# source MOLIS (LIS of Certe) - \url{https://www.certe.nl}
 #' @examples
 #' # ----------- #
 #' # PREPARATION #
@@ -304,15 +304,15 @@
 #' # ANALYSIS #
 #' # -------- #
 #'
-#' # 1. Get the amoxicillin resistance percentages
-#' #    of E. coli, divided by hospital:
+#' # 1. Get the amoxicillin resistance percentages (p)
+#' #     and numbers (n) of E. coli, divided by hospital:
 #'
 #' my_data %>%
-#'   filter(bactid == "ESCCOL",
+#'   filter(bactid == guess_bactid("E. coli"),
 #'          first_isolates == TRUE) %>%
 #'   group_by(hospital_id) %>%
-#'   summarise(n = n(),
-#'             amoxicillin_resistance = rsi(amox))
+#'   summarise(n = n_rsi(amox),
+#'             p = resistance(amox))
 #'
 #'
 #' # 2. Get the amoxicillin/clavulanic acid resistance
@@ -322,6 +322,6 @@
 #'   filter(bactid == guess_bactid("E. coli"),
 #'          first_isolates == TRUE) %>%
 #'   group_by(year = format(date, "%Y")) %>%
-#'   summarise(n = n(),
-#'             amoxclav_resistance = rsi(amcl, minimum = 20))
+#'   summarise(n = n_rsi(amcl),
+#'             p = resistance(amcl, minimum = 20))
 "septic_patients"
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -41,7 +41,7 @@
 #' @details \strong{WHY THIS IS SO IMPORTANT} \cr
 #'     To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode \href{https://www.ncbi.nlm.nih.gov/pubmed/17304462}{[1]}. If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all \emph{S. aureus} isolates would be overestimated, because you included this MRSA more than once. It would be \href{https://en.wikipedia.org/wiki/Selection_bias}{selection bias}.
 #' @section Key antibiotics:
-#'     There are two ways to determine whether isolates can be included as first \emph{weighted} isolates: \cr
+#'     There are two ways to determine whether isolates can be included as first \emph{weighted} isolates which will give generally the same results: \cr
 #'
 #'     \strong{1. Using} \code{type = "keyantibiotics"} \strong{and parameter} \code{ignore_I} \cr
 #'     Any difference from S to R (or vice versa) will (re)select an isolate as a first weighted isolate. With \code{ignore_I = FALSE}, also differences from I to S|R (or vice versa) will lead to this. This is a reliable method and 30-35 times faster than method 2. \cr
@@ -65,6 +65,24 @@
 #'                 col_patient_id = "patient_id",
 #'                 col_bactid = "bactid")
 #'
+#' # Now let's see if first isolates matter:
+#' A <- my_patients %>%
+#'   group_by(hospital_id) %>%
+#'   summarise(count = n_rsi(gent), # gentamicin
+#'             resistance = resistance(gent))
+#'
+#' B <- my_patients %>%
+#'   filter(first_isolate == TRUE) %>%
+#'   group_by(hospital_id) %>%
+#'   summarise(count = n_rsi(gent), # gentamicin
+#'             resistance = resistance(gent))
+#'
+#' # Have a look at A and B. B is more reliable because every isolate is
+#' # counted once. Gentamicin resitance in hospital D seems to be 5%
+#' # higher than originally thought.
+#'
+#' ## OTHER EXAMPLES:
+#'
 #' \dontrun{
 #'
 #' # set key antibiotics to a new variable
@@ -153,7 +171,7 @@ first_isolate <- function(tbl,

  if (!is.na(col_bactid)) {
    if (!tbl %>% pull(col_bactid) %>% is.bactid()) {
-      tbl[, col_bactid] <- tbl %>% pull(col_bactid) %>% as.bactid()
+      warning("Improve integrity of the `", col_bactid, "` column by transforming it with 'as.bactid'.")
    }
    tbl <- tbl %>% left_join_microorganisms(by = col_bactid)
    col_genus <- "genus"
@@ -179,7 +197,6 @@ first_isolate <- function(tbl,
    filter_specimen <- ''
  }

-  weighted.notice <- ''
  # filter on specimen group and keyantibiotics when they are filled in
  if (!is.na(filter_specimen) & filter_specimen != '') {
    check_columns_existance(col_specimen, tbl)
@@ -317,7 +334,9 @@ first_isolate <- function(tbl,
                               (date_lab - lag(date_lab)) + lag(days_diff),
                               0))

+  weighted.notice <- ''
  if (col_keyantibiotics != '') {
+    weighted.notice <- 'weighted '
    if (info == TRUE) {
      if (type == 'keyantibiotics') {
        cat('[Criteria] Inclusion based on key antibiotics, ')
--- a/R/freq.R
+++ b/R/freq.R
@@ -415,7 +415,6 @@ frequency_tbl <- function(x,

  class(df) <- c('frequency_tbl', class(df))
  attr(df, 'package') <- 'AMR'
-  attr(df, 'package.version') <- packageDescription('AMR')$Version

  if (markdown == TRUE) {
    tbl_format <- 'markdown'
@@ -567,7 +566,6 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
 #' @export
 as.data.frame.frequency_tbl <- function(x, ...) {
  attr(x, 'package') <- NULL
-  attr(x, 'package.version') <- NULL
  attr(x, 'opt') <- NULL
  as.data.frame.data.frame(x, ...)
 }
@@ -578,7 +576,6 @@ as.data.frame.frequency_tbl <- function(x, ...) {
 #' @importFrom dplyr as_tibble
 as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) {
  attr(x, 'package') <- NULL
-  attr(x, 'package.version') <- NULL
  attr(x, 'opt') <- NULL
  as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames)
 }
--- a/R/join_microorganisms.R
+++ b/R/join_microorganisms.R
@@ -26,8 +26,8 @@
 #' df2 <- left_join_microorganisms(df, "bacteria_id")
 #' colnames(df2)
 inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
-    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.character(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -48,8 +48,8 @@ inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...
 #' @rdname join
 #' @export
 left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
-    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.character(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -70,8 +70,8 @@ left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...)
 #' @rdname join
 #' @export
 right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
-    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.character(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -92,8 +92,8 @@ right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...
 #' @rdname join
 #' @export
 full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
-    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.character(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -114,8 +114,8 @@ full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...)
 #' @rdname join
 #' @export
 semi_join_microorganisms <- function(x, by = 'bactid', ...) {
-  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
-    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.character(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -132,8 +132,8 @@ semi_join_microorganisms <- function(x, by = 'bactid', ...) {
 #' @rdname join
 #' @export
 anti_join_microorganisms <- function(x, by = 'bactid', ...) {
-  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
-    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.character(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
--- a/R/key_antibiotics.R
+++ b/R/key_antibiotics.R
@@ -42,19 +42,26 @@
 #' @importFrom dplyr %>% mutate if_else
 #' @seealso \code{\link{first_isolate}}
 #' @examples
-#' \dontrun{
+#' # septic_patients is a dataset available in the AMR package
+#' ?septic_patients
+#' my_patients <- septic_patients
+#'
+#' library(dplyr)
 #' # set key antibiotics to a new variable
-#' tbl$keyab <- key_antibiotics(tbl)
+#' my_patients <- my_patients %>%
+#'   mutate(keyab = key_antibiotics(.)) %>%
+#'   mutate(
+#'     # now calculate first isolates
+#'     first_regular = first_isolate(., "date", "patient_id", "bactid"),
+#'     # and first WEIGHTED isolates
+#'     first_weighted = first_isolate(., "date", "patient_id", "bactid",
+#'                                    col_keyantibiotics = "keyab")
+#'   )
 #'
-#' # add regular first isolates
-#' tbl$first_isolate <-
-#'   first_isolate(tbl)
+#' # Check the difference, in this data set it results in 7% more isolates:
+#' sum(my_patients$first_regular, na.rm = TRUE)
+#' sum(my_patients$first_weighted, na.rm = TRUE)
 #'
-#' # add first WEIGHTED isolates using key antibiotics
-#' tbl$first_isolate_weighed <-
-#'   first_isolate(tbl,
-#'                 col_keyantibiotics = 'keyab')
-#' }
 #'
 #' # output of the `key_antibiotics` function could be like this:
 #' strainA <- "SSSRR.S.R..S"
@@ -169,87 +176,80 @@ key_antibiotics_equal <- function(x,
                                  points_threshold = 2,
                                  info = FALSE) {
  # x is active row, y is lag
+
  type <- type[1]

  if (length(x) != length(y)) {
    stop('Length of `x` and `y` must be equal.')
  }

+  # only show progress bar on points or when at least 5000 isolates
+  info_needed <- info == TRUE & (type == "points" | length(x) > 5000)
+
  result <- logical(length(x))

-  if (type == "keyantibiotics") {
-    if (ignore_I == TRUE) {
-      # evaluation using regular expression will treat '.' as any character
-      # so I is actually ignored then
-      x <- gsub('I', '.', x, ignore.case = TRUE)
-      y <- gsub('I', '.', y, ignore.case = TRUE)
+  if (info_needed == TRUE) {
+    p <- dplyr::progress_estimated(length(x))
+  }
+
+  for (i in 1:length(x)) {
+
+    if (info_needed == TRUE) {
+      p$tick()$print()
    }

-    for (i in 1:length(x)) {
+    if (is.na(x[i])) {
+      x[i] <- ''
+    }
+    if (is.na(y[i])) {
+      y[i] <- ''
+    }
+
+    if (x[i] == y[i]) {
+
+      result[i] <- TRUE
+
+    } else if (nchar(x[i]) != nchar(y[i])) {
+
+      result[i] <- FALSE
+
+    } else {
+
      x_split <- strsplit(x[i], "")[[1]]
      y_split <- strsplit(y[i], "")[[1]]
-      y_split[x_split == "."] <- "."
-      x_split[y_split == "."] <- "."
-      x_checkfor <- paste(x_split, collapse = "")
-      y_checkfor <- paste(y_split, collapse = "")
-      result[i] <- nchar(x[i]) == nchar(y[i]) &
-        (x_checkfor %like% y_checkfor |
-           y_checkfor %like% x_checkfor)
-    }
-    return(result)

-  } else {
+      if (type == 'keyantibiotics') {

-    if (type != 'points') {
-      stop('`', type, '` is not a valid value for type, must be "points" or "keyantibiotics". See ?first_isolate.')
-    }
+        if (ignore_I == TRUE) {
+          x_split[x_split == "I"] <- "."
+          y_split[y_split == "I"] <- "."
+        }

-    if (info == TRUE) {
-      p <- dplyr::progress_estimated(length(x))
-    }
+        y_split[x_split == "."] <- "."
+        x_split[y_split == "."] <- "."

-    for (i in 1:length(x)) {
-
-      if (info == TRUE) {
-        p$tick()$print()
-      }
-
-      if (is.na(x[i])) {
-        x[i] <- ''
-      }
-      if (is.na(y[i])) {
-        y[i] <- ''
-      }
-
-      if (nchar(x[i]) != nchar(y[i])) {
-
-        result[i] <- FALSE
-
-      } else if (x[i] == '' & y[i] == '') {
-
-        result[i] <- TRUE
-
-      } else {
-
-        x2 <- strsplit(x[i], "")[[1]]
-        y2 <- strsplit(y[i], "")[[1]]
+        result[i] <- all(x_split == y_split)

+      } else if (type == 'points') {
        # count points for every single character:
        # - no change is 0 points
        # - I <-> S|R is 0.5 point
        # - S|R <-> R|S is 1 point
        # use the levels of as.rsi (S = 1, I = 2, R = 3)

-        suppressWarnings(x2 <- x2 %>% as.rsi() %>% as.double())
-        suppressWarnings(y2 <- y2 %>% as.rsi() %>% as.double())
+        suppressWarnings(x_split <- x_split %>% as.rsi() %>% as.double())
+        suppressWarnings(y_split <- y_split %>% as.rsi() %>% as.double())

-        points <- (x2 - y2) %>% abs() %>% sum(na.rm = TRUE)
-        result[i] <- ((points / 2) >= points_threshold)
+        points <- (x_split - y_split) %>% abs() %>% sum(na.rm = TRUE) / 2
+        result[i] <- points >= points_threshold
+
+      } else {
+        stop('`', type, '` is not a valid value for type, must be "points" or "keyantibiotics". See ?first_isolate.')
      }
    }
-    if (info == TRUE) {
-      cat('\n')
-    }
-    result
  }
+  if (info_needed == TRUE) {
+    cat('\n')
+  }
+  result
 }