(v1.2.0.9023) ab_from_text() improvement

2025-08-25 13:42:10 +02:00 · 2020-07-02 21:12:52 +02:00
parent 298e67a45b
commit 152ac5bcad
29 changed files with 175 additions and 118 deletions
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -184,7 +184,7 @@ stop_ifnot_installed <- function(package) {
  # https://developer.r-project.org/Blog/public/2019/02/14/staged-install/index.html
  sapply(package, function(pkg)
    tryCatch(get(".packageName", envir = asNamespace(pkg)),
-             error = function(e) { 
+             error = function(e) {
               if (package == "rstudioapi") {
                 stop("This function only works in RStudio.", call. = FALSE)
               } else if (pkg != "base") {
--- a/R/ab_from_text.R
+++ b/R/ab_from_text.R
@@ -27,6 +27,7 @@
 #' @param type type of property to search for, either `"drug"`, `"dose"` or `"administration"`, see *Examples*
 #' @param collapse character to pass on to `paste(..., collapse = ...)` to only return one character per element of `text`, see *Examples*
 #' @param translate_ab if `type = "drug"`: a column name of the [antibiotics] data set to translate the antibiotic abbreviations to, using [ab_property()]. Defaults to `FALSE`. Using `TRUE` is equal to using "name".
+#' @param thorough_search logical to indicate whether the input must be extensively searched for misspelling and other faulty input values. Setting this to `TRUE` will take considerably more time than when using `FALSE`. At default, it will turn `TRUE` when all input elements contain a maximum of three words.
 #' @param ... parameters passed on to [as.ab()]
 #' @details This function is also internally used by [as.ab()], although it then only searches for the first drug name and will throw a note if more drug names could have been returned.
 #' 
@@ -85,6 +86,7 @@ ab_from_text <- function(text,
                         type = c("drug", "dose", "administration"),
                         collapse = NULL,
                         translate_ab = FALSE,
+                         thorough_search = NULL,
                         ...) {
  
  if (missing(type)) {
@@ -95,30 +97,54 @@ ab_from_text <- function(text,
  
  text <- tolower(as.character(text))
  text_split_all <- strsplit(text, "[ ;.,:\\|]")
+  progress <- progress_estimated(n = length(text_split_all), n_min = 5)
+  on.exit(close(progress))
  
  if (type %like% "(drug|ab|anti)") {
    
    translate_ab <- get_translate_ab(translate_ab)
    
-    abbr <- unlist(antibiotics$abbreviations)
-    abbr <- abbr[nchar(abbr) >= 4]
-    names_atc <- substr(c(antibiotics$name, antibiotics$atc), 1, 5)
-    synonyms <- unlist(antibiotics$synonyms)
-    synonyms <- synonyms[nchar(synonyms) >= 4]
-    to_regex <- function(x) {
-      paste0("^(",
-             paste0(unique(gsub("[^a-z0-9]", ".*", sort(tolower(x)))), collapse = "|"),
-             ").*")
+    if (isTRUE(thorough_search) | 
+        (isTRUE(is.null(thorough_search)) & max(sapply(text_split_all, length), na.rm = TRUE) <= 3)) {
+      text_split_all <- text_split_all[nchar(text_split_all) >= 4 & grepl("[a-z]+", text_split_all)]
+      result <- lapply(text_split_all, function(text_split) {
+        progress$tick()
+        suppressWarnings(
+          out <- as.ab(text_split, ...)
+        )
+      })
+      
+    } else {
+      # no thorough search
+      abbr <- unlist(antibiotics$abbreviations)
+      abbr <- abbr[nchar(abbr) >= 4]
+      names_atc <- substr(c(antibiotics$name, antibiotics$atc), 1, 5)
+      synonyms <- unlist(antibiotics$synonyms)
+      synonyms <- synonyms[nchar(synonyms) >= 4]
+      # regular expression must not be too long, so split synonyms in two:
+      synonyms_part1 <- synonyms[seq_len(0.5 * length(synonyms))]
+      synonyms_part2 <- synonyms[!synonyms %in% synonyms_part1]
+      to_regex <- function(x) {
+        paste0("^(",
+               paste0(unique(gsub("[^a-z0-9]+", "", sort(tolower(x)))), collapse = "|"),
+               ").*")
+      }
+      result <- lapply(text_split_all, function(text_split) {
+        progress$tick()
+        suppressWarnings(
+          out <- as.ab(unique(c(text_split[text_split %like_case% to_regex(abbr)],
+                                text_split[text_split %like_case% to_regex(names_atc)],
+                                text_split[text_split %like_case% to_regex(synonyms_part1)],
+                                text_split[text_split %like_case% to_regex(synonyms_part2)])
+                              ),
+                       ...)
+        )
+      })
    }
    
-    result <- lapply(text_split_all, function(text_split) {
-      suppressWarnings(
-        out <- as.ab(unique(c(text_split[grep(to_regex(abbr), text_split)],
-                              text_split[grep(to_regex(names_atc), text_split)],
-                              # regular expression must not be too long, so split synonyms in two:
-                              text_split[grep(to_regex(synonyms[c(1:0.5 * length(synonyms))]), text_split)],
-                              text_split[grep(to_regex(synonyms[c(0.5 * length(synonyms):length(synonyms))]), text_split)])),
-                     ...))
+    close(progress)
+    
+    result <- lapply(result, function(out) {
      out <- out[!is.na(out)]
      if (length(out) == 0) {
        as.ab(NA)
@@ -128,6 +154,7 @@ ab_from_text <- function(text,
        }
        out
      }
+      
    })
    
  } else if (type %like% "dos") {
@@ -167,7 +194,7 @@ ab_from_text <- function(text,
  # collapse text if needed
  if (!is.null(collapse)) {
    result <- sapply(result, function(x) {
-      if(length(x) == 1 & all(is.na(x))) {
+      if (length(x) == 1 & all(is.na(x))) {
        NA_character_
      } else {
        paste0(x, collapse = collapse)
--- a/R/bug_drug_combinations.R
+++ b/R/bug_drug_combinations.R
@@ -135,7 +135,7 @@ format.bug_drug_combinations <- function(x,
    format <- tolower(format)
    ab_txt <- rep(format, length(ab))
    for (i in seq_len(length(ab_txt))) {
-      ab_txt[i] <- gsub("ab", ab[i], ab_txt[i])
+      ab_txt[i] <- gsub("ab", as.character(as.ab(ab[i])), ab_txt[i])
      ab_txt[i] <- gsub("cid", ab_cid(ab[i]), ab_txt[i])
      ab_txt[i] <- gsub("group", ab_group(ab[i], language = language), ab_txt[i])
      ab_txt[i] <- gsub("atc_group1", ab_atc_group1(ab[i], language = language), ab_txt[i])
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -259,25 +259,25 @@ first_isolate <- function(x,
  
  # arrange data to the right sorting
  if (is.null(specimen_group)) {
-      x <- x[order(x$newvar_patient_id, 
-                   x$newvar_genus_species,
-                   x$newvar_date), ]
-      rownames(x) <- NULL
-      row.start <- 1
-      row.end <- nrow(x)
+    x <- x[order(x$newvar_patient_id, 
+                 x$newvar_genus_species,
+                 x$newvar_date), ]
+    rownames(x) <- NULL
+    row.start <- 1
+    row.end <- nrow(x)
  } else {
    # filtering on specimen and only analyse these rows to save time
-      x <- x[order(pull(x, col_specimen),
-                   x$newvar_patient_id, 
-                   x$newvar_genus_species,
-                   x$newvar_date), ]
-      rownames(x) <- NULL
-      suppressWarnings(
-        row.start <- which(x %>% pull(col_specimen) == specimen_group) %>% min(na.rm = TRUE)
-      )
-      suppressWarnings(
-        row.end <- which(x %>% pull(col_specimen) == specimen_group) %>% max(na.rm = TRUE)
-      )
+    x <- x[order(pull(x, col_specimen),
+                 x$newvar_patient_id, 
+                 x$newvar_genus_species,
+                 x$newvar_date), ]
+    rownames(x) <- NULL
+    suppressWarnings(
+      row.start <- which(x %>% pull(col_specimen) == specimen_group) %>% min(na.rm = TRUE)
+    )
+    suppressWarnings(
+      row.end <- which(x %>% pull(col_specimen) == specimen_group) %>% max(na.rm = TRUE)
+    )
  }
  
  # no isolates found
@@ -290,7 +290,7 @@ first_isolate <- function(x,
  
  # did find some isolates - add new index numbers of rows
  x$newvar_row_index_sorted <- seq_len(nrow(x))
-
+  
  scope.size <- nrow(x[which(x$newvar_row_index_sorted %in% c(row.start + 1:row.end) &
                               !is.na(x$newvar_mo)), , drop = FALSE])
  
@@ -318,17 +318,17 @@ first_isolate <- function(x,
  
  # Analysis of first isolate ----
  x$other_pat_or_mo <- if_else(x$newvar_patient_id == lag(x$newvar_patient_id) &
-                                         x$newvar_genus_species == lag(x$newvar_genus_species),
-                                       FALSE,
-                                       TRUE)
+                                 x$newvar_genus_species == lag(x$newvar_genus_species),
+                               FALSE,
+                               TRUE)
  x$episode_group <- paste(x$newvar_patient_id, x$newvar_genus_species)
-  x$more_than_episode_ago <- unname(unlist(lapply(unique(x$episode_group), 
-                                                          function(g, 
-                                                                   df = x,
-                                                                   days = episode_days) {
-                                                            identify_new_year(x = df[which(df$episode_group == g), "newvar_date"],
-                                                                              episode_days = days)
-                                                          })))
+  x$more_than_episode_ago <- unlist(lapply(unique(x$episode_group),
+                                           function(g,
+                                                    df = x,
+                                                    days = episode_days) {
+                                             identify_new_year(x = df[which(df$episode_group == g), "newvar_date", drop = TRUE],
+                                                               episode_days = days)
+                                           }))
  
  weighted.notice <- ""
  if (!is.null(col_keyantibiotics)) {
@@ -336,38 +336,38 @@ first_isolate <- function(x,
    if (info == TRUE) {
      if (type == "keyantibiotics") {
        message(font_black(paste0("[Criterion] Base inclusion on key antibiotics, ",
-                            ifelse(ignore_I == FALSE, "not ", ""),
-                            "ignoring I")))
+                                  ifelse(ignore_I == FALSE, "not ", ""),
+                                  "ignoring I")))
      }
      if (type == "points") {
        message(font_black(paste0("[Criterion] Base inclusion on key antibiotics, using points threshold of "
-                            , points_threshold)))
+                                  , points_threshold)))
      }
    }
    type_param <- type
    
    x$other_key_ab <- !key_antibiotics_equal(y = x$newvar_key_ab,
-                                                     z = lag(x$newvar_key_ab),
-                                                     type = type_param,
-                                                     ignore_I = ignore_I,
-                                                     points_threshold = points_threshold,
-                                                     info = info)
+                                             z = lag(x$newvar_key_ab),
+                                             type = type_param,
+                                             ignore_I = ignore_I,
+                                             points_threshold = points_threshold,
+                                             info = info)
    # with key antibiotics
    x$newvar_first_isolate <- if_else(x$newvar_row_index_sorted >= row.start &
-                                              x$newvar_row_index_sorted <= row.end &
-                                              x$newvar_genus_species != "" & 
-                                              (x$other_pat_or_mo | x$more_than_episode_ago | x$other_key_ab),
-                                            TRUE,
-                                            FALSE)
+                                        x$newvar_row_index_sorted <= row.end &
+                                        x$newvar_genus_species != "" & 
+                                        (x$other_pat_or_mo | x$more_than_episode_ago | x$other_key_ab),
+                                      TRUE,
+                                      FALSE)
    
  } else {
    # no key antibiotics
    x$newvar_first_isolate <- if_else(x$newvar_row_index_sorted >= row.start &
-                                              x$newvar_row_index_sorted <= row.end &
-                                              x$newvar_genus_species != "" & 
-                                              (x$other_pat_or_mo | x$more_than_episode_ago),
-                                            TRUE,
-                                            FALSE)
+                                        x$newvar_row_index_sorted <= row.end &
+                                        x$newvar_genus_species != "" & 
+                                        (x$other_pat_or_mo | x$more_than_episode_ago),
+                                      TRUE,
+                                      FALSE)
  }
  
  # first one as TRUE
@@ -391,17 +391,17 @@ first_isolate <- function(x,
  # handle empty microorganisms
  if (any(x$newvar_mo == "UNKNOWN", na.rm = TRUE) & info == TRUE) {
    message(font_blue(paste0("NOTE: ", ifelse(include_unknown == TRUE, "Included ", "Excluded "), 
-                        format(sum(x$newvar_mo == "UNKNOWN", na.rm = TRUE),
-                               decimal.mark = decimal.mark, big.mark = big.mark), 
-                        " isolates with a microbial ID 'UNKNOWN' (column `", font_bold(col_mo), "`)")))
+                             format(sum(x$newvar_mo == "UNKNOWN", na.rm = TRUE),
+                                    decimal.mark = decimal.mark, big.mark = big.mark), 
+                             " isolates with a microbial ID 'UNKNOWN' (column `", font_bold(col_mo), "`)")))
  }
  x[which(x$newvar_mo == "UNKNOWN"), "newvar_first_isolate"] <- include_unknown
  
  # exclude all NAs
  if (any(is.na(x$newvar_mo)) & info == TRUE) {
    message(font_blue(paste0("NOTE: Excluded ", format(sum(is.na(x$newvar_mo), na.rm = TRUE),
-                                                  decimal.mark = decimal.mark, big.mark = big.mark), 
-                        " isolates with a microbial ID 'NA' (column `", font_bold(col_mo), "`)")))
+                                                       decimal.mark = decimal.mark, big.mark = big.mark), 
+                             " isolates with a microbial ID 'NA' (column `", font_bold(col_mo), "`)")))
  }
  x[which(is.na(x$newvar_mo)), "newvar_first_isolate"] <- FALSE
  
@@ -465,7 +465,7 @@ filter_first_weighted_isolate <- function(x,
      col_keyantibiotics <- "keyab"
    }
  }
-
+  
  subset(x, first_isolate(x = y,
                          col_date = col_date,
                          col_patient_id = col_patient_id,
--- a/R/like.R
+++ b/R/like.R
@@ -102,14 +102,19 @@ like <- function(x, pattern, ignore.case = TRUE) {
    as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed)
  } else {
    tryCatch(base::grepl(pattern, x, ignore.case = FALSE, fixed = fixed),
-             error = function(e) ifelse(grepl("Invalid regexp", e$message),
-                                        # try with perl = TRUE:
-                                        return(base::grepl(pattern = pattern, x = x,
-                                                           ignore.case = FALSE, 
-                                                           fixed = fixed,
-                                                           perl = TRUE)),
-                                        # stop otherwise
-                                        stop(e$message)))
+             error = function(e) {
+               if (grepl("invalid reg(ular )?exp", e$message, ignore.case = TRUE)) {
+                 # try with perl = TRUE:
+                 return(base::grepl(pattern = pattern, 
+                                    x = x,
+                                    ignore.case = FALSE, 
+                                    fixed = fixed,
+                                    perl = TRUE))
+               } else {
+                 # stop otherwise
+                 stop(e$message)
+               }
+             })
  }
 }

--- a/R/rsi_calc.R
+++ b/R/rsi_calc.R
@@ -40,6 +40,11 @@ rsi_calc <- function(...,
  data_vars <- dots2vars(...)

  dots_df <- switch(1, ...)
+  if (is.data.frame(dots_df)) {
+    # make sure to remove all other classes like tibbles, data.tables, etc
+    dots_df <- as.data.frame(dots_df, stringsAsFactors = FALSE)
+  }
+  
  dots <- base::eval(base::substitute(base::alist(...)))
  stop_if(length(dots) == 0, "no variables selected", call = -2)

@@ -50,6 +55,7 @@ rsi_calc <- function(...,
  
  if (is.data.frame(dots_df)) {
    # data.frame passed with other columns, like: example_isolates %>% proportion_S(AMC, GEN)
+  
    dots <- as.character(dots)
    # remove first element, it's the data.frame
    if (length(dots) == 1) {
@@ -62,6 +68,10 @@ rsi_calc <- function(...,
      # and the old rsi function, which has "df" as name of the first parameter
      x <- dots_df
    } else {
+      # get dots that are in column names already, and the ones that will be once evaluated using dots_df or global env
+      # this is to support susceptibility(example_isolates, AMC, dplyr::all_of(some_vector_with_AB_names))
+      dots <- c(dots[dots %in% colnames(dots_df)],
+                eval(parse(text = dots[!dots %in% colnames(dots_df)]), envir = dots_df, enclos = globalenv()))
      dots_not_exist <- dots[!dots %in% colnames(dots_df)]
      stop_if(length(dots_not_exist) > 0, "column(s) not found: ", paste0("'", dots_not_exist, "'", collapse = ", "), call = -2)
      x <- dots_df[, dots, drop = FALSE]
@@ -72,10 +82,10 @@ rsi_calc <- function(...,
  } else {
    # multiple variables passed without pipe, like: proportion_S(example_isolates$AMC, example_isolates$GEN)
    x <- NULL
-    try(x <- as.data.frame(dots), silent = TRUE)
+    try(x <- as.data.frame(dots, stringsAsFactors = FALSE), silent = TRUE)
    if (is.null(x)) {
      # support for example_isolates %>% group_by(hospital_id) %>% summarise(amox = susceptibility(GEN, AMX))
-      x <- as.data.frame(list(...))
+      x <- as.data.frame(list(...), stringsAsFactors = FALSE)
    }
  }
  
@@ -92,9 +102,9 @@ rsi_calc <- function(...,
    rsi_integrity_check <- character(0)
    for (i in seq_len(ncol(x))) {
      # check integrity of columns: force rsi class
-      if (!is.rsi(x %>% pull(i))) {
-        rsi_integrity_check <- c(rsi_integrity_check, x %>% pull(i) %>% as.character())
-        x[, i] <- suppressWarnings(x %>% pull(i) %>% as.rsi()) # warning will be given later
+      if (!is.rsi(x[, i, drop = TRUE])) {
+        rsi_integrity_check <- c(rsi_integrity_check, as.character(x[, i, drop = TRUE]))
+        x[, i] <- suppressWarnings(as.rsi(x[, i, drop = TRUE])) # warning will be given later
        print_warning <- TRUE
      }
    }