(v1.2.0.9017) ab_from_text() improvement

2025-08-25 13:42:10 +02:00 · 2020-06-26 12:31:27 +02:00
parent b31003c0b6
commit 4f6f056077
19 changed files with 239 additions and 118 deletions
--- a/R/ab.R
+++ b/R/ab.R
@@ -24,6 +24,7 @@
 #' Use this function to determine the antibiotic code of one or more antibiotics. The data set [antibiotics] will be searched for abbreviations, official names and synonyms (brand names).
 #' @inheritSection lifecycle Maturing lifecycle
 #' @param x character vector to determine to antibiotic ID
+#' @param flag_multiple_results logical to indicate whether a note should be printed to the console that probably more than one antibiotic code or name can be retrieved from a single input value.
 #' @param ... arguments passed on to internal functions
 #' @rdname as.ab
 #' @inheritSection WHOCC WHOCC
@@ -67,7 +68,7 @@
 #' # they use as.ab() internally:
 #' ab_name("J01FA01")    # "Erythromycin"
 #' ab_name("eryt")       # "Erythromycin"
-as.ab <- function(x, ...) {
+as.ab <- function(x, flag_multiple_results = TRUE, ...) {
  
  check_dataset_integrity()
  
@@ -75,7 +76,7 @@ as.ab <- function(x, ...) {
    return(x)
  }

-  initial <- is.null(list(...)$initial)
+  initial_search <- is.null(list(...)$initial_search)
  already_regex <- isTRUE(list(...)$already_regex)

  if (all(toupper(x) %in% antibiotics$ab)) {
@@ -114,7 +115,24 @@ as.ab <- function(x, ...) {
  x_new <- rep(NA_character_, length(x))
  x_unknown <- character(0)
  
+  note_if_more_than_one_found <- function(found, index, from_text) {
+    if (initial_search == TRUE & isTRUE(length(from_text) > 1)) {
+      message(font_blue(paste0("NOTE: more than one result was found for item ", index, ": ",
+                               paste0(ab_name(from_text, tolower = TRUE, initial_search = FALSE), collapse = ", "))))
+    }
+    found[1L]
+  }
+  
+  if (initial_search == TRUE) {
+    progress <- progress_estimated(n = length(x), n_min = 25) # start if n >= 25
+    on.exit(close(progress))
+  }
+  
  for (i in seq_len(length(x))) {
+    if (initial_search == TRUE) {
+      progress$tick()
+    }
+    
    if (is.na(x[i]) | is.null(x[i])) {
      next
    }
@@ -127,31 +145,37 @@ as.ab <- function(x, ...) {
      next
    }
    
+    if (isTRUE(flag_multiple_results) & x[i] %like% "[ ]") {
+      from_text <- suppressWarnings(ab_from_text(x[i], initial_search = FALSE, translate_ab = FALSE))
+    } else {
+      from_text <- character(0)
+    }
+    
    # exact AB code
    found <- antibiotics[which(antibiotics$ab == x[i]), ]$ab
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }

    # exact ATC code
    found <- antibiotics[which(antibiotics$atc == x[i]), ]$ab
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }

    # exact CID code
    found <- antibiotics[which(antibiotics$cid == x[i]), ]$ab
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }

    # exact name
    found <- antibiotics[which(toupper(antibiotics$name) == x[i]), ]$ab
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
    
@@ -160,7 +184,7 @@ as.ab <- function(x, ...) {
                                   function(s) x[i] %in% s))
    found <- antibiotics$ab[loinc_found == TRUE]
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }

@@ -169,7 +193,7 @@ as.ab <- function(x, ...) {
                                   function(s) x[i] %in% toupper(s)))
    found <- antibiotics$ab[synonym_found == TRUE]
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }

@@ -178,7 +202,7 @@ as.ab <- function(x, ...) {
                                function(a) x[i] %in% toupper(a)))
    found <- antibiotics$ab[abbr_found == TRUE]
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }

@@ -213,13 +237,13 @@ as.ab <- function(x, ...) {
    # try if name starts with it
    found <- antibiotics[which(antibiotics$name %like% paste0("^", x_spelling)), ]$ab
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
    # try if name ends with it
    found <- antibiotics[which(antibiotics$name %like% paste0(x_spelling, "$")), ]$ab
    if (nchar(x[i]) >= 4 & length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
    # and try if any synonym starts with it
@@ -227,29 +251,29 @@ as.ab <- function(x, ...) {
                                   function(s) any(s %like% paste0("^", x_spelling))))
    found <- antibiotics$ab[synonym_found == TRUE]
    if (length(found) > 0) {
-      x_new[i] <- found[1L]
+      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
    
-    # INITIAL - More uncertain results ----
+    # INITIAL SEARCH - More uncertain results ----
    
-    if (initial == TRUE) {
+    if (initial_search == TRUE) {
      # only run on first try
      
      # try by removing all spaces
      if (x[i] %like% " ") {
-        found <- suppressWarnings(as.ab(gsub(" +", "", x[i]), initial = FALSE))
+        found <- suppressWarnings(as.ab(gsub(" +", "", x[i]), initial_search = FALSE))
        if (length(found) > 0 & !is.na(found)) {
-          x_new[i] <- found[1L]
+          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
          next
        }
      }
      
      # try by removing all spaces and numbers
      if (x[i] %like% " " | x[i] %like% "[0-9]") {
-        found <- suppressWarnings(as.ab(gsub("[ 0-9]", "", x[i]), initial = FALSE))
+        found <- suppressWarnings(as.ab(gsub("[ 0-9]", "", x[i]), initial_search = FALSE))
        if (length(found) > 0 & !is.na(found)) {
-          x_new[i] <- found[1L]
+          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
          next
        }
      }
@@ -266,7 +290,7 @@ as.ab <- function(x, ...) {
                                     y
                                   })[[1]],
                            collapse = "/")
-      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial = FALSE))
+      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial_search = FALSE))
      if (!is.na(x_translated_guess)) {
        x_new[i] <- x_translated_guess
        next
@@ -276,7 +300,7 @@ as.ab <- function(x, ...) {
      x_translated <- paste(lapply(strsplit(x_translated, "[^A-Z0-9 ]"),
                                   function(y) {
                                     for (i in seq_len(length(y))) {
-                                       y_name <- suppressWarnings(ab_name(y[i], language = NULL, initial = FALSE))
+                                       y_name <- suppressWarnings(ab_name(y[i], language = NULL, initial_search = FALSE))
                                       y[i] <- ifelse(!is.na(y_name),
                                                      y_name,
                                                      y[i])
@@ -284,7 +308,7 @@ as.ab <- function(x, ...) {
                                     y
                                   })[[1]],
                            collapse = "/")
-      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial = FALSE))
+      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial_search = FALSE))
      if (!is.na(x_translated_guess)) {
        x_new[i] <- x_translated_guess
        next
@@ -292,60 +316,65 @@ as.ab <- function(x, ...) {
      
      # try by removing all trailing capitals
      if (x[i] %like_case% "[a-z]+[A-Z]+$") {
-        found <- suppressWarnings(as.ab(gsub("[A-Z]+$", "", x[i]), initial = FALSE))
+        found <- suppressWarnings(as.ab(gsub("[A-Z]+$", "", x[i]), initial_search = FALSE))
        if (!is.na(found)) {
-          x_new[i] <- found[1L]
+          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
          next
        }
      }
      
      # keep only letters
-      found <- suppressWarnings(as.ab(gsub("[^A-Z]", "", x[i]), initial = FALSE))
+      found <- suppressWarnings(as.ab(gsub("[^A-Z]", "", x[i]), initial_search = FALSE))
      if (!is.na(found)) {
-        x_new[i] <- found[1L]
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
      
      # try from a bigger text, like from a health care record, see ?ab_from_text
-      found <- suppressWarnings(ab_from_text(x[i], initial = FALSE, translate_ab = FALSE)[1L])
+      # already calculated above if flag_multiple_results = TRUE
+      if (isTRUE(flag_multiple_results)) {
+        found <- from_text[1L]
+      } else {
+        found <- suppressWarnings(ab_from_text(x[i], initial_search = FALSE, translate_ab = FALSE)[1L])
+      }
      if (!is.na(found)) {
-        x_new[i] <- found[1L]
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
      
      # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!)
-      found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial = FALSE))
-      if (!is.na(found) && !ab_group(found, initial = FALSE) %like% "cephalosporins") {
-        x_new[i] <- found[1L]
+      found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial_search = FALSE))
+      if (!is.na(found) && !ab_group(found, initial_search = FALSE) %like% "cephalosporins") {
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
-      found <- suppressWarnings(as.ab(substr(x[i], 1, 7), initial = FALSE))
+      found <- suppressWarnings(as.ab(substr(x[i], 1, 7), initial_search = FALSE))
      if (!is.na(found)) {
-        x_new[i] <- found[1L]
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
      
      # make all consonants facultative
      search_str <- gsub("([BCDFGHJKLMNPQRSTVWXZ])", "\\1*", x[i])
-      found <- suppressWarnings(as.ab(search_str, initial = FALSE, already_regex = TRUE))
+      found <- suppressWarnings(as.ab(search_str, initial_search = FALSE, already_regex = TRUE))
      # keep at least 4 normal characters
      if (nchar(gsub(".\\*", "", search_str)) < 4) {
        found <- NA
      }
      if (!is.na(found)) {
-        x_new[i] <- found[1L]
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
      
      # make all vowels facultative
      search_str <- gsub("([AEIOUY])", "\\1*", x[i])
-      found <- suppressWarnings(as.ab(search_str, initial = FALSE, already_regex = TRUE))
+      found <- suppressWarnings(as.ab(search_str, initial_search = FALSE, already_regex = TRUE))
      # keep at least 5 normal characters
      if (nchar(gsub(".\\*", "", search_str)) < 5) {
        found <- NA
      }
      if (!is.na(found)) {
-        x_new[i] <- found[1L]
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
      
@@ -355,17 +384,21 @@ as.ab <- function(x, ...) {
      x_spelling <- gsub("I+", "[AEIOU]+", x_spelling, fixed = TRUE)
      x_spelling <- gsub("O+", "[AEIOU]+", x_spelling, fixed = TRUE)
      x_spelling <- gsub("U+", "[AEIOU]+", x_spelling, fixed = TRUE)
-      found <- suppressWarnings(as.ab(x_spelling, initial = FALSE, already_regex = TRUE))
+      found <- suppressWarnings(as.ab(x_spelling, initial_search = FALSE, already_regex = TRUE))
      if (!is.na(found)) {
-        x_new[i] <- found[1L]
+        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
        next
      }
      
-    } # end of initial = TRUE
+    } # end of initial_search = TRUE

    # not found
    x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
  }
+  
+  if (initial_search == TRUE) {
+    close(progress)
+  }

  # take failed ATC codes apart from rest
  x_unknown_ATCs <- x_unknown[x_unknown %like% "[A-Z][0-9][0-9][A-Z][A-Z][0-9][0-9]"]
--- a/R/ab_from_text.R
+++ b/R/ab_from_text.R
@@ -19,18 +19,24 @@
 # Visit our website for more info: https://msberends.gitlab.io/AMR.    #
 # ==================================================================== #

-#' Retrieve antimicrobial drugs from text
+#' Retrieve antimicrobial drugs from clinical text
 #' 
-#' Use this function on e.g. clinical texts from health care records. It returns a vector of antimicrobial drugs found in the texts.
+#' Use this function on e.g. clinical texts from health care records. It returns a [list] with all antimicrobial drugs found in the texts.
 #' @param text text to analyse
 #' @param collapse character to pass on to `paste(..., collapse = ...)` to only return one character per element of `text`, see Examples
-#' @param translate_ab a column name of the [antibiotics] data set to translate the antibiotic abbreviations to, using [ab_property()]. Defaults to "name", which is equal to using `TRUE`. Use a value `FALSE`, `NULL` or `NA` to prevent translation of the `<ab>` code.
+#' @param translate_ab a column name of the [antibiotics] data set to translate the antibiotic abbreviations to, using [ab_property()]. Defaults to `FALSE`. Using `TRUE` is equal to using "name".
 #' @param ... parameters passed on to [as.ab()]
-#' @details To use this for creating a new variable in a data set (e.g. with `mutate()`), it could be convenient to paste the outcome together with the `collapse` parameter so every value in your new variable will be a character of length 1:\cr
+#' @details Without using `collapse`, this function will return a [list]. This can be convenient to use e.g. inside a `mutate()`):\cr
+#' `df %>% mutate(abx = ab_from_text(clinical_text))` 
+#' 
+#' The returned AB codes can be transformed to official names, groups, etc. with all [ab_property()] functions like [ab_name()] and [ab_group()], or by using the `translate_ab` parameter.
+#' 
+#' With using `collapse`, this function will return a [character]:\cr
 #' `df %>% mutate(abx = ab_from_text(clinical_text, collapse = "|"))` 
 #' 
-#' This function is also internally used by [as.ab()], although it then only returns the first hit.
+#' This function is also internally used by [as.ab()], although it then only returns the first hit and will throw a note if more results could have been returned.
 #' @export
+#' @return A [list], or a [character] if `collapse` is not `NULL`
 #' @examples 
 #' # mind the bad spelling of amoxicillin in this line, 
 #' # straight from a true health care record:
@@ -41,10 +47,23 @@
 #' 
 #' # if you want to know which antibiotic groups were administered, check it:
 #' abx <- ab_from_text("administered amoxi/clav and cipro")
-#' ab_group(abx)
-ab_from_text <- function(text, collapse = NULL, translate_ab = "name", ...) {
+#' ab_group(abx[[1]])
+#' 
+#' if (require(dplyr)) {
+#'   tibble(clinical_text = c("given cipro and mero",
+#'                            "started on doxy today")) %>% 
+#'     mutate(abx = ab_from_text(clinical_text),
+#'            abx2 = ab_from_text(clinical_text,
+#'                                collapse = "|"),
+#'            abx3 = ab_from_text(clinical_text,
+#'                                collapse = "|",
+#'                                translate_ab = "name"))
+#' 
+#' }
+ab_from_text <- function(text, collapse = NULL, translate_ab = FALSE, ...) {
  
-  text <- tolower(text)
+  text <- tolower(as.character(text))
+  translate_ab <- get_translate_ab(translate_ab)
  
  abbr <- unlist(antibiotics$abbreviations)
  abbr <- abbr[nchar(abbr) >= 4]
@@ -57,24 +76,29 @@ ab_from_text <- function(text, collapse = NULL, translate_ab = "name", ...) {
           ").*")
  }
  
-  text_split <- unlist(strsplit(text, "[ ;.,:/\\|-]"))
-  result <- suppressWarnings(
-    as.ab(unique(c(text_split[grep(to_regex(abbr), text_split)],
-                   text_split[grep(to_regex(names), text_split)],
-                   # regular expression must not be too long, so split synonyms in two:
-                   text_split[grep(to_regex(synonyms[c(1:0.5 * length(synonyms))]), text_split)],
-                   text_split[grep(to_regex(synonyms[c(0.5 * length(synonyms):length(synonyms))]), text_split)])),
-          ...))
-  result <- result[!is.na(result)]
-  if (length(result) == 0) {
-    result <- as.ab(NA)
-  }
-  translate_ab <- get_translate_ab(translate_ab)
-  if (!isFALSE(translate_ab)) {
-    result <- ab_property(result, property = translate_ab)
-  }
+  text_split_all <- strsplit(text, "[ ;.,:/\\|-]")
+  result <- lapply(text_split_all, function(text_split) {
+    suppressWarnings(
+      out <- as.ab(unique(c(text_split[grep(to_regex(abbr), text_split)],
+                     text_split[grep(to_regex(names), text_split)],
+                     # regular expression must not be too long, so split synonyms in two:
+                     text_split[grep(to_regex(synonyms[c(1:0.5 * length(synonyms))]), text_split)],
+                     text_split[grep(to_regex(synonyms[c(0.5 * length(synonyms):length(synonyms))]), text_split)])),
+            ...))
+    out <- out[!is.na(out)]
+    if (length(out) == 0) {
+      as.ab(NA)
+    } else {
+      if (!isFALSE(translate_ab)) {
+        out <- ab_property(out, property = translate_ab, initial = FALSE)
+      }
+      out
+    }
+  })
+
  if (!is.null(collapse)) {
-    result <- paste0(result, collapse = collapse)
+    result <- sapply(result, function(x) paste0(x, collapse = collapse))
  }
+
  result
 }
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -291,7 +291,8 @@ first_isolate <- function(x,
  # did find some isolates - add new index numbers of rows
  x$newvar_row_index_sorted <- seq_len(nrow(x))

-  scope.size <- row.end - row.start + 1
+  scope.size <- nrow(x[which(x$newvar_row_index_sorted %in% c(row.start + 1:row.end) &
+                               !is.na(x$newvar_mo)), , drop = FALSE])
  
  identify_new_year <- function(x, episode_days) {
    # I asked on StackOverflow:
@@ -390,7 +391,7 @@ first_isolate <- function(x,
  # handle empty microorganisms
  if (any(x$newvar_mo == "UNKNOWN", na.rm = TRUE) & info == TRUE) {
    message(font_blue(paste0("NOTE: ", ifelse(include_unknown == TRUE, "Included ", "Excluded "), 
-                        format(sum(x$newvar_mo == "UNKNOWN"),
+                        format(sum(x$newvar_mo == "UNKNOWN", na.rm = TRUE),
                               decimal.mark = decimal.mark, big.mark = big.mark), 
                        " isolates with a microbial ID 'UNKNOWN' (column `", font_bold(col_mo), "`)")))
  }
@@ -398,7 +399,7 @@ first_isolate <- function(x,
  
  # exclude all NAs
  if (any(is.na(x$newvar_mo)) & info == TRUE) {
-    message(font_blue(paste0("NOTE: Excluded ", format(sum(is.na(x$newvar_mo)),
+    message(font_blue(paste0("NOTE: Excluded ", format(sum(is.na(x$newvar_mo), na.rm = TRUE),
                                                  decimal.mark = decimal.mark, big.mark = big.mark), 
                        " isolates with a microbial ID 'NA' (column `", font_bold(col_mo), "`)")))
  }
@@ -410,18 +411,18 @@ first_isolate <- function(x,
  
  if (info == TRUE) {
    n_found <- base::sum(x$newvar_first_isolate, na.rm = TRUE)
-    p_found_total <- percentage(n_found / nrow(x))
+    p_found_total <- percentage(n_found / nrow(x[which(!is.na(x$newvar_mo)), , drop = FALSE]))
    p_found_scope <- percentage(n_found / scope.size)
    # mark up number of found
    n_found <- base::format(n_found, big.mark = big.mark, decimal.mark = decimal.mark)
    if (p_found_total != p_found_scope) {
      msg_txt <- paste0("=> Found ",
                        font_bold(paste0(n_found, " first ", weighted.notice, "isolates")),
-                        " (", p_found_scope, " within scope and ", p_found_total, " of total)")
+                        " (", p_found_scope, " within scope and ", p_found_total, " of total where a microbial ID was available)")
    } else {
      msg_txt <- paste0("=> Found ",
                        font_bold(paste0(n_found, " first ", weighted.notice, "isolates")),
-                        " (", p_found_total, " of total)")
+                        " (", p_found_total, " of total where a microbial ID was available)")
    }
    message(font_black(msg_txt))
  }