AI improvements

2026-02-26 18:38:38 +01:00 · 2018-12-07 12:04:55 +01:00
parent 87ad6da745
commit 8e8a9cd190
19 changed files with 199 additions and 140 deletions
--- a/R/eucast_rules.R
+++ b/R/eucast_rules.R
@@ -23,12 +23,13 @@
 #' @param info print progress
 #' @param rules a character vector that specifies which rules should be applied - one or more of \code{c("breakpoints", "expert", "other", "all")}
 #' @param verbose a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected
-#' @param amcl,amik,amox,ampi,azit,azlo,aztr,cefa,cfep,cfot,cfox,cfra,cfta,cftr,cfur,chlo,cipr,clar,clin,clox,coli,czol,dapt,doxy,erta,eryt,fosf,fusi,gent,imip,kana,levo,linc,line,mero,mezl,mino,moxi,nali,neom,neti,nitr,norf,novo,oflo,oxac,peni,pipe,pita,poly,pris,qida,rifa,roxi,siso,teic,tetr,tica,tige,tobr,trim,trsu,vanc column name of an antibiotic, see Details
+#' @param amcl,amik,amox,ampi,azit,azlo,aztr,cefa,cfep,cfot,cfox,cfra,cfta,cftr,cfur,chlo,cipr,clar,clin,clox,coli,czol,dapt,doxy,erta,eryt,fosf,fusi,gent,imip,kana,levo,linc,line,mero,mezl,mino,moxi,nali,neom,neti,nitr,norf,novo,oflo,oxac,peni,pipe,pita,poly,pris,qida,rifa,roxi,siso,teic,tetr,tica,tige,tobr,trim,trsu,vanc column name of an antibiotic, see Antibiotics
 #' @param col_bactid deprecated, use \code{col_mo} instead.
 #' @param ... parameters that are passed on to \code{eucast_rules}
 #' @inheritParams first_isolate
-#' @details To define antibiotics column names, input a text or use \code{NA} to skip a column (e.g. \code{tica = NA}). Non-existing columns will anyway be skipped with a warning. See the Antibiotics section for an explanation of the abbreviations.
 #' @section Antibiotics:
+#' To define antibiotics column names, input a text (case-insensitive) or use \code{NULL} to skip a column (e.g. \code{tica = NULL}). Non-existing columns will anyway be skipped with a warning.
+#'
 #' Abbrevations of the column containing antibiotics in the form: \strong{abbreviation}: generic name (\emph{ATC code})
 #'
 #'  \strong{amcl}: amoxicillin+clavulanic acid (\emph{J01CR02}),
--- a/R/mdro.R
+++ b/R/mdro.R
@@ -23,7 +23,7 @@
 #' @param country country code to determine guidelines. EUCAST rules will be used when left empty, see Details. Should be or a code from the \href{https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements}{list of ISO 3166-1 alpha-2 country codes}. Case-insensitive. Currently supported are \code{de} (Germany) and \code{nl} (the Netherlands).
 #' @param info print progress
 #' @inheritParams eucast_rules
-#' @param metr column name of an antibiotic. Use \code{NA} to skip a column, like \code{tica = NA}. Non-existing columns will anyway be skipped. See the Antibiotics section for an explanation of the abbreviations.
+#' @param metr column name of an antibiotic, see Antibiotics
 #' @param ... parameters that are passed on to methods
 #' @inheritSection eucast_rules Antibiotics
 #' @details When \code{country} will be left blank, guidelines will be taken from EUCAST Expert Rules Version 3.1 "Intrinsic Resistance and Exceptional Phenotypes Tables" (\url{http://www.eucast.org/fileadmin/src/media/PDFs/EUCAST_files/Expert_Rules/Expert_rules_intrinsic_exceptional_V3.1.pdf}).
--- a/R/mic.R
+++ b/R/mic.R
@@ -200,12 +200,12 @@ summary.mic <- function(object, ...) {
  n_total <- x %>% length()
  x <- x[!is.na(x)]
  n <- x %>% length()
-  lst <- c('mic',
-           n_total - n,
-           sort(x)[1] %>% as.character(),
-           sort(x)[n] %>% as.character())
-  names(lst) <- c("Mode", "<NA>", "Min.", "Max.")
-  lst
+  c(
+    "Class" = 'mic',
+    "<NA>" = n_total - n,
+    "Min." = sort(x)[1] %>% as.character(),
+    "Max." = sort(x)[n] %>% as.character()
+  )
 }

 #' @exportMethod plot.mic
--- a/R/misc.R
+++ b/R/misc.R
@@ -51,7 +51,7 @@ percent <- function(x, round = 1, force_zero = FALSE, ...) {

 check_available_columns <- function(tbl, col.list, info = TRUE) {
  # check columns
-  col.list <- col.list[!is.na(col.list)]
+  col.list <- col.list[!is.na(col.list) & !is.null(col.list)]
  names(col.list) <- col.list
  col.list.bak <- col.list
  # are they available as upper case or lower case then?
--- a/R/mo.R
+++ b/R/mo.R
@@ -26,7 +26,7 @@
 #' @param Lancefield a logical to indicate whether beta-haemolytic \emph{Streptococci} should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield [2]. These \emph{Streptococci} will be categorised in their first group, e.g. \emph{Streptococcus dysgalactiae} will be group C, although officially it was also categorised into groups G and L.
 #'
 #'   This excludes \emph{Enterococci} at default (who are in group D), use \code{Lancefield = "all"} to also categorise all \emph{Enterococci} as group D.
-#' @param allow_uncertain a logical to indicate whether empty results should be checked for only a part of the input string. When results are found, a warning will be given about the uncertainty and the result.
+#' @param allow_uncertain a logical to indicate whether the input should be checked for less possible results, see Details
 #' @param reference_df a \code{data.frame} to use for extra reference when translating \code{x} to a valid \code{mo}. The first column can be any microbial name, code or ID (used in your analysis or organisation), the second column must be a valid \code{mo} as found in the \code{\link{microorganisms}} data set.
 #' @rdname as.mo
 #' @aliases mo
@@ -34,11 +34,11 @@
 #' @details
 #' A microbial ID from this package (class: \code{mo}) typically looks like these examples:\cr
 #' \preformatted{
-#'   Code                Full name
-#'   ---------------     --------------------------------------
-#'   B_KLBSL             Klebsiella
-#'   B_KLBSL_PNE         Klebsiella pneumoniae
-#'   B_KLBSL_PNE_RHI     Klebsiella pneumoniae rhinoscleromatis
+#'   Code              Full name
+#'   ---------------   --------------------------------------
+#'   B_KLBSL           Klebsiella
+#'   B_KLBSL_PNE       Klebsiella pneumoniae
+#'   B_KLBSL_PNE_RHI   Klebsiella pneumoniae rhinoscleromatis
 #'   |   |    |   |
 #'   |   |    |   |
 #'   |   |    |    ----> subspecies, a 3-4 letter acronym
@@ -57,7 +57,7 @@
 #'   \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
 #' }
 #'
-#' A couple of effects because of these rules
+#' A couple of effects because of these rules:
 #' \itemize{
 #'   \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first}
 #'   \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae} for the same reason}
@@ -66,6 +66,13 @@
 #' }
 #' This means that looking up human pathogenic microorganisms takes less time than looking up human \strong{non}-pathogenic microorganisms.
 #'
+#' When using \code{allow_uncertain = TRUE} (which is the default setting), it will use additional rules if all previous AI rules failed to get valid results. Examples:
+#' \itemize{
+#'   \item{\code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPTC_GRB}) needs review.}
+#'   \item{\code{"S. aureus - please mind: MRSA"}. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result \emph{Staphylococcus aureus} (\code{B_STPHY_AUR}) needs review.}
+#'   \item{\code{"D. spartina"}. This is the abbreviation of an old taxonomic name: \emph{Didymosphaeria spartinae} (the last "e" was missing from the input). This fungus was renamed to \emph{Leptosphaeria obiones}, so a warning will be thrown that this result (\code{F_LPTSP_OBI}) needs review.}
+#' }
+#'
 #' \code{guess_mo} is an alias of \code{as.mo}.
 #' @section ITIS:
 #' \if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr}
@@ -94,6 +101,7 @@
 #' as.mo("S. aureus")
 #' as.mo("S aureus")
 #' as.mo("Staphylococcus aureus")
+#' as.mo("Staphylococcus aureus (MRSA)")
 #' as.mo("MRSA") # Methicillin Resistant S. aureus
 #' as.mo("VISA") # Vancomycin Intermediate S. aureus
 #' as.mo("VRSA") # Vancomycin Resistant S. aureus
@@ -136,7 +144,7 @@
 #' df <- df %>%
 #'   mutate(mo = as.mo(paste(genus, species)))
 #' }
-as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) {
+as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, reference_df = NULL) {
  mo <- mo_validate(x = x, property = "mo",
                    Becker = Becker, Lancefield = Lancefield,
                    allow_uncertain = allow_uncertain, reference_df = reference_df)
@@ -155,11 +163,11 @@ is.mo <- function(x) {
 #' @export
 guess_mo <- as.mo

-#' @importFrom dplyr %>% pull left_join n_distinct
+#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated
 #' @importFrom data.table data.table as.data.table setkey
 #' @importFrom crayon magenta red italic
 exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
-                       allow_uncertain = FALSE, reference_df = NULL,
+                       allow_uncertain = TRUE, reference_df = NULL,
                       property = "mo", clear_options = TRUE) {

  if (!"AMR" %in% base::.packages()) {
@@ -272,7 +280,12 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
    # cat(paste0('x_trimmed_species       "', x_trimmed_species, '"\n'))
    # cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n'))

+    progress <- progress_estimated(n = length(x), min_time = 3)
+
    for (i in 1:length(x)) {
+
+      progress$tick()$print()
+
      if (identical(x_trimmed[i], "")) {
        # empty values
        x[i] <- NA_character_
@@ -615,8 +628,8 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
          } else {
            x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
          }
-          warning(red(paste0("UNCERTAIN - '",
-                             x_backup[i], "' -> ", italic(found[1, name]))),
+          warning(red(paste0('UNCERTAIN - "',
+                             x_backup[i], '" -> ', italic(found[1, name]))),
                             call. = FALSE, immediate. = TRUE)
          renamed_note(name_old = found[1, name],
                       name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
@@ -627,13 +640,17 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
        }

        # (2) strip values between brackets ----
-        found <- microorganismsDT[fullname %like% gsub("( [(].*[)]) ", " ", x_withspaces[i])
-                                  | fullname %like% gsub("( [(].*[)]) ", " ", x_backup[i])
-                                  | fullname %like% gsub("( [(].*[)]) ", " ", x[i]),]
+        x_backup_stripped <- gsub("( [(].*[)])", "", x_backup[i])
+        x_backup_stripped <- trimws(gsub("  ", " ", x_backup_stripped, fixed = TRUE))
+        x_species_stripped <- gsub("( [(].*[)])", "", x_species[i])
+        x_species_stripped <- trimws(gsub("  ", " ", x_species_stripped, fixed = TRUE))
+
+        found <- microorganismsDT[fullname %like% x_backup_stripped
+                                  | fullname %like% x_species_stripped,]
        if (NROW(found) > 0 & nchar(x_trimmed[i]) >= 6) {
          x[i] <- found[1, ..property][[1]]
-          warning(red(paste0("UNCERTAIN - '",
-                             x_backup[i], "' -> ", italic(found[1, fullname][[1]]), " (", found[1, mo][[1]], ")")),
+          warning(red(paste0('UNCERTAIN - "',
+                             x_backup[i], '" -> ', italic(found[1, fullname][[1]]), " (", found[1, mo][[1]], ")")),
                             call. = FALSE, immediate. = TRUE)
          next
        }
@@ -647,8 +664,8 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
              found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, clear_options = FALSE)))
              if (!is.na(found)) {
                found <- microorganismsDT[mo == found, ..property][[1]]
-                warning(red(paste0("UNCERTAIN - '",
-                                   z, "' -> ", italic(microorganismsDT[mo == found[1L], fullname][[1]]), " (", found[1L], ")")),
+                warning(red(paste0('UNCERTAIN - "',
+                                   z, '" -> ', italic(microorganismsDT[mo == found[1L], fullname][[1]]), " (", found[1L], ")")),
                                   call. = FALSE, immediate. = TRUE)
                return(found[1L])
              }
@@ -795,6 +812,21 @@ print.mo <- function(x, ...) {
  print.default(x, quote = FALSE)
 }

+#' @exportMethod summary.mo
+#' @export
+#' @noRd
+summary.mo <- function(object, ...) {
+  # unique and top 1-3
+  x <- object
+  top_3 <- unname(top_freq(freq(x), 3))
+  c("Class" = "mo",
+    "<NA>" = length(x[is.na(x)]),
+    "Unique" = dplyr::n_distinct(x[!is.na(x)]),
+    "#1" = top_3[1],
+    "#2" = top_3[2],
+    "#3" = top_3[3])
+}
+
 #' @exportMethod as.data.frame.mo
 #' @export
 #' @noRd
--- a/R/rsi.R
+++ b/R/rsi.R
@@ -39,14 +39,20 @@
 #' barplot(rsi_data) # for frequencies
 #' freq(rsi_data)    # frequency table with informative header
 #'
-#' # fastest way to transform all columns with already valid AB results to class `rsi`:
+#' # using dplyr's mutate
 #' library(dplyr)
 #' septic_patients %>%
+#'   mutate_at(vars(peni:rifa), as.rsi)
+#'
+#' # fastest way to transform all columns with already valid AB results to class `rsi`:
+#' septic_patients %>%
 #'   mutate_if(is.rsi.eligible,
 #'             as.rsi)
 as.rsi <- function(x) {
  if (is.rsi(x)) {
    x
+  } else if (identical(levels(x), c("S", "I", "R"))) {
+    structure(x, class = c('rsi', 'ordered', 'factor'))
  } else {

    x <- x %>% unlist()
@@ -102,14 +108,15 @@ is.rsi.eligible <- function(x) {
      | is.numeric(x)
      | is.mo(x)
      | identical(class(x), "Date")
-      | identical(levels(x), c("S", "I", "R"))) {
+      | is.rsi(x)) {
    # no transformation needed
    FALSE
  } else {
    # check all but a-z
-    x <- unique(gsub("[^RSIrsi]+", "", unique(x)))
-    all(x %in% c("R", "I", "S", "", NA_character_)) &
-      !all(x %in% c("", NA_character_))
+    y <- unique(gsub("[^RSIrsi]+", "", unique(x)))
+    !all(y %in% c("", NA_character_)) &
+      all(y %in% c("R", "I", "S", "", NA_character_)) &
+      max(nchar(as.character(x)), na.rm = TRUE) < 8
  }
 }

@@ -128,7 +135,7 @@ print.rsi <- function(x, ...) {
 summary.rsi <- function(object, ...) {
  x <- object
  c(
-    "Mode" = 'rsi',
+    "Class" = 'rsi',
    "<NA>" = sum(is.na(x)),
    "Sum S" = sum(x == "S", na.rm = TRUE),
    "Sum IR" = sum(x %in% c("I", "R"), na.rm = TRUE),