(v1.5.0.9014) only_rsi_columns, is.rsi.eligible improvement

2026-02-09 13:52:54 +01:00 · 2021-02-02 23:57:35 +01:00
parent 20d638c193
commit 2eca8c3f01
246 changed files with 1171 additions and 965 deletions
--- a/R/guess_ab_col.R
+++ b/R/guess_ab_col.R
@@ -1,6 +1,6 @@
 # ==================================================================== #
 # TITLE                                                                #
-# Antimicrobial Resistance (AMR) Analysis for R                        #
+# Antimicrobial Resistance (AMR) Data Analysis for R                   #
 #                                                                      #
 # SOURCE                                                               #
 # https://github.com/msberends/AMR                                     #
@@ -20,7 +20,7 @@
 # useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
 #                                                                      #
 # Visit our website for the full manual and a complete tutorial about  #
-# how to conduct AMR analysis: https://msberends.github.io/AMR/        #
+# how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
 # ==================================================================== #

 #' Guess Antibiotic Column
@@ -30,6 +30,7 @@
 #' @param x a [data.frame]
 #' @param search_string a text to search `x` for, will be checked with [as.ab()] if this value is not a column in `x`
 #' @param verbose a logical to indicate whether additional info should be printed
+#' @param only_rsi_columns a logical to indicate whether only antibiotic columns must be detected that were [transformed to class `<rsi>`]([rsi]) on beforehand. Defaults to `TRUE` if any column of `x` is of class `<rsi>`.
 #' @details You can look for an antibiotic (trade) name or abbreviation and it will search `x` and the [antibiotics] data set for any column containing a name or code of that antibiotic. **Longer columns names take precedence over shorter column names.**
 #' @return A column name of `x`, or `NULL` when no result is found.
 #' @export
@@ -62,35 +63,20 @@
 #'                  AMP_ED20 = "S")
 #' guess_ab_col(df, "ampicillin")
 #' # [1] "AMP_ED20"
-guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE) {
+guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE, only_rsi_columns = any(is.rsi(x))) {
  meet_criteria(x, allow_class = "data.frame", allow_NULL = TRUE)
  meet_criteria(search_string, allow_class = "character", has_length = 1, allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = "logical", has_length = 1)
  
  if (is.null(x) & is.null(search_string)) {
    return(as.name("guess_ab_col"))
+  } else {
+    meet_criteria(search_string, allow_class = "character", has_length = 1, allow_NULL = FALSE)
  }
  
-  if (search_string %in% colnames(x)) {
-    ab_result <- search_string
-  } else {
-    search_string.ab <- suppressWarnings(as.ab(search_string))
-    if (search_string.ab %in% colnames(x)) {
-      ab_result <- colnames(x)[colnames(x) == search_string.ab][1L]
-      
-    } else if (any(tolower(colnames(x)) %in% tolower(unlist(ab_property(search_string.ab, "abbreviations", language = NULL))))) {
-      ab_result <- colnames(x)[tolower(colnames(x)) %in% tolower(unlist(ab_property(search_string.ab, "abbreviations", language = NULL)))][1L]
-      
-    } else {
-      # sort colnames on length - longest first
-      cols <- colnames(x[, x %pm>% colnames() %pm>% nchar() %pm>% order() %pm>% rev()])
-      df_trans <- data.frame(cols = cols,
-                             abs = suppressWarnings(as.ab(cols)),
-                             stringsAsFactors = FALSE)
-      ab_result <- df_trans[which(df_trans$abs == search_string.ab), "cols"]
-      ab_result <- ab_result[!is.na(ab_result)][1L]
-    }
-  }
+  all_found <- get_column_abx(x, info = verbose, only_rsi_columns = only_rsi_columns, verbose = verbose)
+  search_string.ab <- suppressWarnings(as.ab(search_string))
+  ab_result <- unname(all_found[names(all_found) == search_string.ab])
  
  if (length(ab_result) == 0) {
    if (verbose == TRUE) {
@@ -114,18 +100,24 @@ get_column_abx <- function(x,
                           hard_dependencies = NULL,
                           verbose = FALSE,
                           info = TRUE,
+                           only_rsi_columns = FALSE,
                           ...) {
  meet_criteria(x, allow_class = "data.frame")
  meet_criteria(soft_dependencies, allow_class = "character", allow_NULL = TRUE)
  meet_criteria(hard_dependencies, allow_class = "character", allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = "logical", has_length = 1)
  meet_criteria(info, allow_class = "logical", has_length = 1)
+  meet_criteria(only_rsi_columns, allow_class = "logical", has_length = 1)
  
  if (info == TRUE) {
    message_("Auto-guessing columns suitable for analysis", appendLF = FALSE, as_note = FALSE)
  }
  
  x <- as.data.frame(x, stringsAsFactors = FALSE)
+  if (only_rsi_columns == TRUE) {
+    x <- x[, which(is.rsi(x)), drop = FALSE]
+  }
+
  if (NROW(x) > 10000) {
    # only test maximum of 10,000 values per column
    if (info == TRUE) {
@@ -141,21 +133,23 @@ get_column_abx <- function(x,
  # only check columns that are a valid AB code, ATC code, name, abbreviation or synonym,
  # or already have the <rsi> class (as.rsi) 
  # and that they have no more than 50% invalid values
-  vectr_antibiotics <- unique(toupper(unlist(antibiotics[, c("ab", "atc", "name", "abbreviations", "synonyms")])))
+  vectr_antibiotics <- unlist(AB_lookup$generalised_all)
  vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]
-  x_columns <- vapply(FUN.VALUE = character(1), colnames(x), function(col, df = x) {
-    if (toupper(col) %in% vectr_antibiotics || 
-        is.rsi(x[, col, drop = TRUE]) ||
-        is.rsi.eligible(x[, col, drop = TRUE], threshold = 0.5)
-        ) {
-      return(col)
-    } else {
-      return(NA_character_)
-    }
-  })
+  x_columns <- vapply(FUN.VALUE = character(1), 
+                      colnames(x),
+                      function(col, df = x) {
+                        if (generalise_antibiotic_name(col) %in% vectr_antibiotics || 
+                            is.rsi(x[, col, drop = TRUE]) ||
+                            is.rsi.eligible(x[, col, drop = TRUE], threshold = 0.5)
+                        ) {
+                          return(col)
+                        } else {
+                          return(NA_character_)
+                        }
+                      })
  
  x_columns <- x_columns[!is.na(x_columns)]
-  x <- x[, x_columns, drop = FALSE] # without drop = TRUE, x will become a vector when x_columns is length 1
+  x <- x[, x_columns, drop = FALSE] # without drop = FALSE, x will become a vector when x_columns is length 1
  df_trans <- data.frame(colnames = colnames(x),
                         abcode = suppressWarnings(as.ab(colnames(x), info = FALSE)),
                         stringsAsFactors = FALSE)
@@ -164,7 +158,7 @@ get_column_abx <- function(x,
  names(x) <- df_trans$abcode
  
  # add from self-defined dots (...):
-  # such as get_column_abx(example_isolates %pm>% rename(thisone = AMX), amox = "thisone")
+  # such as get_column_abx(example_isolates %>% rename(thisone = AMX), amox = "thisone")
  dots <- list(...)
  if (length(dots) > 0) {
    newnames <- suppressWarnings(as.ab(names(dots), info = FALSE))