(v1.4.0.9025) is_new_episode()

2025-08-25 13:42:10 +02:00 · 2020-11-23 21:50:27 +01:00
parent 363218da7e
commit b045b571a6
29 changed files with 706 additions and 366 deletions
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -71,41 +71,7 @@ addin_insert_in <- function() {

 # No export, no Rd
 addin_insert_like <- function() {
-  stop_ifnot_installed("rstudioapi")
-  # we want Ctrl/Cmd + L to iterate over %like%, %not_like% and %like_case%, so determine context first
-  
-  getSourceEditorContext <- import_fn("getSourceEditorContext", "rstudioapi")
-  insertText <- import_fn("insertText", "rstudioapi")
-  modifyRange <- import_fn("insertText", "rstudioapi")
-  document_range <- import_fn("document_range", "rstudioapi")
-  document_position <- import_fn("document_position", "rstudioapi")
-
-  context <- getSourceEditorContext()
-  current_row <- context$selection[[1]]$range$end[1]
-  current_col <- context$selection[[1]]$range$end[2]
-  current_row_txt <- context$contents[current_row]
-  
-  pos_preceded_by <- function(txt) {
-    substr(current_row_txt, current_col - nchar(txt), current_col) %like% paste0("^", txt)
-  }
-  replace_pos <- function(old, with) {
-    modifyRange(document_range(document_position(current_row, current_col - nchar(old)),
-                               document_position(current_row, current_col)),
-                text = with,
-                id = context$id)
-  }
-  
-  if (pos_preceded_by(" %like% ")) {
-    replace_pos(" %like% ", with = " %not_like% ")
-  } else if (pos_preceded_by(" %not_like% ")) {
-    replace_pos(" %not_like% ", with = " %like_case% ")
-  } else if (pos_preceded_by(" %like_case% ")) {
-    replace_pos(" %like_case% ", with = " %not_like_case% ")
-  } else if (pos_preceded_by(" %not_like_case% ")) {
-    replace_pos(" %not_like_case% ", with = " %like% ")
-  } else {
-    insertText(" %like% ")
-  }
+  import_fn("insertText", "rstudioapi")(" %like% ")
 }

 check_dataset_integrity <- function() {
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -25,7 +25,7 @@

 #' Determine first (weighted) isolates
 #'
-#' Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. To determine patient episodes not necessarily based on microorganisms, use [is_new_episode()] that also supports grouping with the `dplyr` package, see *Examples*.
+#' Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. To determine patient episodes not necessarily based on microorganisms, use [is_new_episode()] that also supports grouping with the `dplyr` package.
 #' @inheritSection lifecycle Stable lifecycle
 #' @param x,.data a [data.frame] containing isolates.
 #' @param col_date column name of the result date (or date that is was received on the lab), defaults to the first column with a date class
@@ -45,7 +45,7 @@
 #' @param info print progress
 #' @param include_unknown logical to determine whether 'unknown' microorganisms should be included too, i.e. microbial code `"UNKNOWN"`, which defaults to `FALSE`. For WHONET users, this means that all records with organism code `"con"` (*contamination*) will be excluded at default. Isolates with a microbial ID of `NA` will always be excluded as first isolate.
 #' @param ... parameters passed on to the [first_isolate()] function
-#' @details The [is_new_episode()] function is a wrapper around the [first_isolate()] function and can be used for data sets without isolates to just determine patient episodes based on any combination of grouping variables (using `dplyr`), please see *Examples*. Since it runs [first_isolate()] for every group, it is quite slow.
+#' @details The [first_isolate()] function is a wrapper around the [is_new_episode()] function, but more efficient for data sets containing microorganism codes or names.
 #' 
 #' All isolates with a microbial ID of `NA` will be excluded as first isolate.
 #' 
@@ -130,42 +130,6 @@
 #'   # Gentamicin resistance in hospital D appears to be 3.7% higher than
 #'   # when you (erroneously) would have used all isolates for analysis.
 #' }
-#' 
-#' # filtering based on any other condition -----------------------------------
-#' 
-#' if (require("dplyr")) {
-#'   # is_new_episode() can be used in dplyr verbs to determine patient
-#'   # episodes based on any (combination of) grouping variables:
-#'   example_isolates %>%
-#'     mutate(condition = sample(x = c("A", "B", "C"), 
-#'                               size = 2000,
-#'                               replace = TRUE)) %>% 
-#'     group_by(condition) %>%
-#'     mutate(new_episode = is_new_episode())
-#'   
-#'   example_isolates %>%
-#'     group_by(hospital_id) %>% 
-#'     summarise(patients = n_distinct(patient_id),
-#'               n_episodes_365 = sum(is_new_episode(episode_days = 365)),
-#'               n_episodes_60  = sum(is_new_episode(episode_days = 60)),
-#'               n_episodes_30  = sum(is_new_episode(episode_days = 30)))
-#'     
-#'     
-#'   # grouping on microorganisms leads to the same results as first_isolate():
-#'   x <- example_isolates %>%
-#'     filter_first_isolate(include_unknown = TRUE)
-#'     
-#'   y <- example_isolates %>%
-#'     group_by(mo) %>%
-#'     filter(is_new_episode())
-#' 
-#'   identical(x$patient_id, y$patient_id)
-#'   
-#'   # but now you can group on isolates and many more:
-#'   example_isolates %>%
-#'     group_by(mo, hospital_id, ward_icu) %>%
-#'     mutate(flag_episode = is_new_episode())
-#' }
 #' }
 first_isolate <- function(x,
                          col_date = NULL,
@@ -375,28 +339,6 @@ first_isolate <- function(x,
  scope.size <- nrow(x[which(x$newvar_row_index_sorted %in% c(row.start + 1:row.end) &
                               !is.na(x$newvar_mo)), , drop = FALSE])
  
-  identify_new_year <- function(x, episode_days) {
-    # I asked on StackOverflow:
-    # https://stackoverflow.com/questions/42122245/filter-one-row-every-year
-    if (length(x) == 1) {
-      return(TRUE)
-    }
-    indices <- integer(0)
-    start <- x[1]
-    ind <- 1
-    indices[ind] <- ind
-    for (i in 2:length(x)) {
-      if (isTRUE(as.numeric(x[i] - start) >= episode_days)) {
-        ind <- ind + 1
-        indices[ind] <- i
-        start <- x[i]
-      }
-    }
-    result <- rep(FALSE, length(x))
-    result[indices] <- TRUE
-    return(result)
-  }
-  
  # Analysis of first isolate ----
  x$other_pat_or_mo <- ifelse(x$newvar_patient_id == pm_lag(x$newvar_patient_id) &
                                x$newvar_genus_species == pm_lag(x$newvar_genus_species),
@@ -407,8 +349,8 @@ first_isolate <- function(x,
                                           function(g,
                                                    df = x,
                                                    days = episode_days) {
-                                             identify_new_year(x = df[which(df$episode_group == g), "newvar_date", drop = TRUE],
-                                                               episode_days = days)
+                                             is_new_episode(x = df[which(df$episode_group == g), ]$newvar_date,
+                                                            episode_days = days)
                                           }))
  
  weighted.notice <- ""
@@ -572,67 +514,5 @@ filter_first_weighted_isolate <- function(x,
  
  subset(x, first_isolate(x = y,
                          col_date = col_date,
-                          col_patient_id = col_patient_id,
-                          ...))
-}
-
-#' @rdname first_isolate
-#' @export
-is_new_episode <- function(.data,
-                           episode_days = 365,
-                           col_date = NULL,
-                           col_patient_id = NULL) {
-  if (missing(.data)) {
-    # look it up - this also supports grouping variables
-    cur_data <- import_fn("cur_data", "dplyr", error_on_fail = FALSE)
-    if (is.null(cur_data)) {
-      stop_("parameter '.data' not set.")
-    }
-    .data <- cur_data()
-  }
-  meet_criteria(.data, allow_class = "data.frame") # also checks dimensions to be >0
-  meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
-  meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
-  meet_criteria(episode_days, allow_class = c("numeric", "integer"), has_length = 1)
-  
-  # get i'th ID of group, so notices will only be thrown once
-  cur_group_id <- import_fn("cur_group_id", "dplyr", error_on_fail = FALSE)
-  first_group <- tryCatch(is.null(cur_group_id) || cur_group_id() == 1,
-                          error = function(e) TRUE)
-
-  # try to find columns based on type
-  # -- date
-  if (is.null(col_date)) {
-    col_date <- search_type_in_df(x = .data,
-                                  type = "date",
-                                  info = first_group)
-    stop_if(is.null(col_date), "`col_date` must be set")
-  }
-  
-  # -- patient id
-  if (is.null(col_patient_id)) {
-    if (all(c("First name", "Last name", "Sex") %in% colnames(.data))) {
-      # WHONET support
-      .data$patient_id <- paste(.data$`First name`, .data$`Last name`, .data$Sex)
-      col_patient_id <- "patient_id"
-      if (is.null(cur_group_id) || cur_group_id() == 1) {
-        message_("Using combined columns `", font_bold("First name"), "`, `", font_bold("Last name"), "` and `", font_bold("Sex"), "` as input for `col_patient_id`")
-      }
-    } else {
-      col_patient_id <- search_type_in_df(x = .data,
-                                          type = "patient_id",
-                                          info = first_group)
-    }
-    stop_if(is.null(col_patient_id), "`col_patient_id` must be set")
-  }
-  
-  # create any random mo, so first isolates can be calculated
-  .data$a94a8fe5 <- as.mo("Escherichia coli")
-  
-  first_isolate(.data,
-                col_date = col_date,
-                col_patient_id = col_patient_id,
-                episode_days = episode_days,
-                col_mo = "a94a8fe5",
-                info = FALSE)
+                          col_patient_id = col_patient_id))
 }
--- a/R/is_new_episode.R
+++ b/R/is_new_episode.R
@@ -0,0 +1,129 @@
+# ==================================================================== #
+# TITLE                                                                #
+# Antimicrobial Resistance (AMR) Analysis for R                        #
+#                                                                      #
+# SOURCE                                                               #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# LICENCE                                                              #
+# (c) 2018-2020 Berends MS, Luz CF et al.                              #
+# Developed at the University of Groningen, the Netherlands, in        #
+# collaboration with non-profit organisations Certe Medical            #
+# Diagnostics & Advice, and University Medical Center Groningen.       # 
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR analysis: https://msberends.github.io/AMR/        #
+# ==================================================================== #
+
+#' Determine (new) episodes for patients
+#' 
+#' This function determines which items in a vector can be considered (the start of) a new episode, based on the parameter `episode_days`. This can be used to determine clinical episodes for any epidemiological analysis.
+#' @inheritSection lifecycle Experimental lifecycle
+#' @param x vector of dates (class `Date` or `POSIXt`)
+#' @param episode_days length of the required episode in days, defaults to 365. Every element in the input will return `TRUE` after this number of days has passed since the last included date, independent of calendar years. Please see *Details*.
+#' @param ... arguments passed on to [as.Date()]
+#' @details 
+#' Dates are first sorted from old to new. The oldest date will mark the start of the first episode. After this date, the next date will be marked that is at least `episode_days` days later than the start of the first episode. From that second marked date on, the next date will be marked that is at least `episode_days` days later than the start of the second episode which will be the start of the third episode, and so on. Before the vector is being returned, the original order will be restored.
+#' 
+#' The `dplyr` package is not required for this function to work, but this function works conveniently inside `dplyr` verbs such as [filter()], [mutate()] and [summarise()].
+#' @return a [logical] vector
+#' @export
+#' @inheritSection AMR Read more on our website!
+#' @examples
+#' # `example_isolates` is a dataset available in the AMR package.
+#' # See ?example_isolates.
+#' 
+#' is_new_episode(example_isolates$date)
+#' is_new_episode(example_isolates$date, episode_days = 60)
+#' 
+#' if (require("dplyr")) {
+#'   # is_new_episode() can also be used in dplyr verbs to determine patient
+#'   # episodes based on any (combination of) grouping variables:
+#'   example_isolates %>%
+#'     mutate(condition = sample(x = c("A", "B", "C"), 
+#'                               size = 2000,
+#'                               replace = TRUE)) %>% 
+#'     group_by(condition) %>%
+#'     mutate(new_episode = is_new_episode(date))
+#'   
+#'   example_isolates %>%
+#'     group_by(hospital_id) %>% 
+#'     summarise(patients = n_distinct(patient_id),
+#'               n_episodes_365 = sum(is_new_episode(date, episode_days = 365)),
+#'               n_episodes_60  = sum(is_new_episode(date, episode_days = 60)),
+#'               n_episodes_30  = sum(is_new_episode(date, episode_days = 30)))
+#'     
+#'     
+#'   # grouping on patients and microorganisms leads to the same results
+#'   # as first_isolate():
+#'   x <- example_isolates %>%
+#'     filter(first_isolate(., include_unknown = TRUE))
+#'     
+#'   y <- example_isolates %>%
+#'     group_by(patient_id, mo) %>%
+#'     filter(is_new_episode(date))
+#'
+#'   identical(x$patient_id, y$patient_id)
+#'   
+#'   # but is_new_episode() has a lot more flexibility than first_isolate(),
+#'   # since you can now group on anything that seems relevant:
+#'   example_isolates %>%
+#'     group_by(patient_id, mo, hospital_id, ward_icu) %>%
+#'     mutate(flag_episode = is_new_episode(date))
+#' }
+is_new_episode <- function(x, episode_days = 365, ...) {
+  meet_criteria(x, allow_class = c("Date", "POSIXt"))
+  meet_criteria(episode_days, allow_class = c("numeric", "double", "integer"), has_length = 1)
+  
+  x <- as.double(as.Date(x, ...)) # as.Date() for POSIX classes
+  if (length(x) == 1) {
+    return(TRUE)
+  }
+  if (length(x) == 2 && max(x) - min(x) >= episode_days) {
+    return(rep(TRUE, 2))
+  }
+  
+  # I asked on StackOverflow:
+  # https://stackoverflow.com/questions/42122245/filter-one-row-every-year
+  exec <- function(x, episode_days) {
+    if (length(x) == 1) {
+      return(TRUE)
+    } else if (length(x) == 2) {
+      if (max(x) - min(x) >= episode_days) {
+        return(c(TRUE, TRUE))
+      } else {
+        return(c(TRUE, FALSE))
+      }
+    }
+    indices <- integer()
+    start <- x[1]
+    ind <- 1
+    indices[1] <- 1
+    for (i in 2:length(x)) {
+      if (isTRUE((x[i] - start) >= episode_days)) {
+        ind <- ind + 1
+        indices[ind] <- i
+        start <- x[i]
+      }
+    }
+    result <- rep(FALSE, length(x))
+    result[indices] <- TRUE
+    result
+  }
+  
+  df <- data.frame(x = x,
+                   y = seq_len(length(x))) %pm>%
+    pm_arrange(x)
+  df$new <- exec(df$x, episode_days)
+  df %pm>%
+    pm_arrange(y) %pm>%
+    pm_pull(new)
+}
--- a/R/like.R
+++ b/R/like.R
@@ -41,9 +41,7 @@
 #' * Checks if `pattern` is a regular expression and sets `fixed = TRUE` if not, to greatly improve speed
 #' * Tries again with `perl = TRUE` if regex fails
 #' 
-#' Using RStudio? This function can also be inserted in your code from the Addins menu and can have its own Keyboard Shortcut like `Ctrl+Shift+L` or `Cmd+Shift+L` (see `Tools` > `Modify Keyboard Shortcuts...`). This addin iterates over all 'like' variants. So if you have defined the keyboard shortcut Ctrl/Cmd + L to this addin, it will first insert ` %like% ` and by pressing it again it will be replaced with ` %not_like% `, then ` %like_case% `, then ` %not_like_case% ` and then back to ` %like% `.
-#' 
-#' The `"%not_like%"` and `"%not_like_case%"` functions are wrappers around `"%like%"` and `"%like_case%"`.
+#' Using RStudio? The text `%like%` can also be directly inserted in your code from the Addins menu and can have its own Keyboard Shortcut like `Ctrl+Shift+L` or `Cmd+Shift+L` (see `Tools` > `Modify Keyboard Shortcuts...`).
 #' @source Idea from the [`like` function from the `data.table` package](https://github.com/Rdatatable/data.table/blob/master/R/like.R)
 #' @seealso [grep()]
 #' @inheritSection AMR Read more on our website!
@@ -67,11 +65,6 @@
 #' if (require("dplyr")) {
 #'   example_isolates %>%
 #'     filter(mo_name(mo) %like% "^ent")
-#'  
-#'   example_isolates %>% 
-#'     mutate(group = case_when(hospital_id %like% "A|D"        ~ "Group 1",
-#'                              mo_name(mo) %not_like% "^Staph" ~ "Group 2a",
-#'                              TRUE                            ~ "Group 2b"))
 #' }
 #' }
 like <- function(x, pattern, ignore.case = TRUE) {
@@ -157,14 +150,6 @@ like <- function(x, pattern, ignore.case = TRUE) {
  like(x, pattern, ignore.case = TRUE)
 }

-#' @rdname like
-#' @export
-"%not_like%" <- function(x, pattern) {
-  meet_criteria(x, allow_NA = TRUE)
-  meet_criteria(pattern, allow_NA = FALSE)
-  !like(x, pattern, ignore.case = TRUE)
-}
-
 #' @rdname like
 #' @export
 "%like_case%" <- function(x, pattern) {
@@ -173,14 +158,6 @@ like <- function(x, pattern, ignore.case = TRUE) {
  like(x, pattern, ignore.case = FALSE)
 }

-#' @rdname like
-#' @export
-"%not_like_case%" <- function(x, pattern) {
-  meet_criteria(x, allow_NA = TRUE)
-  meet_criteria(pattern, allow_NA = FALSE)
-  !like(x, pattern, ignore.case = FALSE)
-}
-
 "%like_perl%" <- function(x, pattern) {
  meet_criteria(x, allow_NA = TRUE)
  meet_criteria(pattern, allow_NA = FALSE)
--- a/R/rsi.R
+++ b/R/rsi.R
@@ -614,13 +614,18 @@ as.rsi.data.frame <- function(x,
    } else if (types[i] == "rsi") {
      ab <- ab_cols[i]
      ab_coerced <- suppressWarnings(as.ab(ab))
-      message_("=> Cleaning values in column `", font_bold(ab), "` (",
-               ifelse(ab_coerced != ab, paste0(ab_coerced, ", "), ""),
-               ab_name(ab_coerced, tolower = TRUE), ")... ",
-               appendLF = FALSE,
-               as_note = FALSE)
-      x[, ab_cols[i]] <- as.rsi.default(x = x %pm>% pm_pull(ab_cols[i]))
-      message_(" OK.", add_fn = list(font_green, font_bold), as_note = FALSE)
+      if (!all(x[, ab_cols[i], drop = TRUE] %in% c("R", "S", "I"), na.rm = TRUE)) {
+        # only print message if values are not already clean
+        message_("=> Cleaning values in column `", font_bold(ab), "` (",
+                 ifelse(ab_coerced != ab, paste0(ab_coerced, ", "), ""),
+                 ab_name(ab_coerced, tolower = TRUE), ")... ",
+                 appendLF = FALSE,
+                 as_note = FALSE)
+      }
+      x[, ab_cols[i]] <- as.rsi.default(x = as.character(x[, ab_cols[i], drop = TRUE]))
+      if (!all(x[, ab_cols[i], drop = TRUE] %in% c("R", "S", "I"), na.rm = TRUE)) {
+        message_(" OK.", add_fn = list(font_green, font_bold), as_note = FALSE)
+      }
    }
  }