(v1.2.0.9014) ab_from_text()

2025-07-08 07:51:57 +02:00 · 2020-06-25 17:34:50 +02:00
parent 8a66e475d7
commit 344bbd57f4
32 changed files with 749 additions and 191 deletions
--- a/R/ab.R
+++ b/R/ab.R
@ -30,6 +30,7 @@
 #' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
 #'
 #' Use the [ab_property()] functions to get properties based on the returned antibiotic ID, see Examples.
+#' 
 #' @section Source:
 #' World Health Organization (WHO) Collaborating Centre for Drug Statistics Methodology: \url{https://www.whocc.no/atc_ddd_index/}
 #'
@ -38,7 +39,9 @@
 #' European Commission Public Health PHARMACEUTICALS - COMMUNITY REGISTER: \url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}
 #' @aliases ab
 #' @return Character (vector) with class [`ab`]. Unknown values will return `NA`.
-#' @seealso [antibiotics] for the dataframe that is being used to determine ATCs.
+#' @seealso 
+#' * [antibiotics] for the dataframe that is being used to determine ATCs
+#' * [ab_from_text()] for a function to retrieve antimicrobial drugs from clinical text (from health care records)
 #' @inheritSection AMR Read more on our website!
 #' @export
 #' @examples
@ -72,6 +75,9 @@ as.ab <- function(x, ...) {
    return(x)
  }

+  initial <- is.null(list(...)$initial)
+  already_regex <- isTRUE(list(...)$already_regex)
+
  if (all(toupper(x) %in% antibiotics$ab)) {
    # valid AB code, but not yet right class
    return(structure(.Data = toupper(x),
@ -79,26 +85,30 @@ as.ab <- function(x, ...) {
  }

  x_bak <- x
+  x <- toupper(x)
  # remove diacritics
  x <- iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT")
  x <- gsub('"', "", x, fixed = TRUE)
-  # remove suffices
-  x_bak_clean <- gsub("_(mic|rsi|dis[ck])$", "", x, ignore.case = TRUE)
-  # remove disk concentrations, like LVX_NM -> LVX
-  x_bak_clean <- gsub("_[A-Z]{2}[0-9_.]{0,3}$", "", x_bak_clean, ignore.case = TRUE)
-  # remove part between brackets if that's followed by another string
-  x_bak_clean <- gsub("(.*)+ [(].*[)]", "\\1", x_bak_clean)
-  # keep only max 1 space
-  x_bak_clean <- trimws(gsub(" +", " ", x_bak_clean, ignore.case = TRUE))
-  # non-character, space or number should be a slash
-  x_bak_clean <- gsub("[^A-Za-z0-9 -]", "/", x_bak_clean)
-  # spaces around non-characters must be removed: amox + clav -> amox/clav
-  x_bak_clean <- gsub("(.*[a-zA-Z0-9]) ([^a-zA-Z0-9].*)", "\\1\\2", x_bak_clean)
-  x_bak_clean <- gsub("(.*[^a-zA-Z0-9]) ([a-zA-Z0-9].*)", "\\1\\2", x_bak_clean)
-  # remove hyphen after a starting "co"
-  x_bak_clean <- gsub("^co-", "co", x_bak_clean, ignore.case = TRUE)
-  # replace text 'and' with a slash
-  x_bak_clean <- gsub(" and ", "/", x_bak_clean, ignore.case = TRUE)
+  x_bak_clean <- x
+  if (already_regex == FALSE) {
+    # remove suffices
+    x_bak_clean <- gsub("_(MIC|RSI|DIS[CK])$", "", x_bak_clean)
+    # remove disk concentrations, like LVX_NM -> LVX
+    x_bak_clean <- gsub("_[A-Z]{2}[0-9_.]{0,3}$", "", x_bak_clean)
+    # remove part between brackets if that's followed by another string
+    x_bak_clean <- gsub("(.*)+ [(].*[)]", "\\1", x_bak_clean)
+    # keep only max 1 space
+    x_bak_clean <- trimws(gsub(" +", " ", x_bak_clean))
+    # non-character, space or number should be a slash
+    x_bak_clean <- gsub("[^A-Z0-9 -]", "/", x_bak_clean)
+    # spaces around non-characters must be removed: amox + clav -> amox/clav
+    x_bak_clean <- gsub("(.*[A-Z0-9]) ([^A-Z0-9].*)", "\\1\\2", x_bak_clean)
+    x_bak_clean <- gsub("(.*[^A-Z0-9]) ([A-Z0-9].*)", "\\1\\2", x_bak_clean)
+    # remove hyphen after a starting "co"
+    x_bak_clean <- gsub("^CO-", "CO", x_bak_clean)
+    # replace text 'and' with a slash
+    x_bak_clean <- gsub(" AND ", "/", x_bak_clean)
+  }

  x <- unique(x_bak_clean)
  x_new <- rep(NA_character_, length(x))
@ -118,14 +128,14 @@ as.ab <- function(x, ...) {
    }
    
    # exact AB code
-    found <- antibiotics[which(antibiotics$ab == toupper(x[i])), ]$ab
+    found <- antibiotics[which(antibiotics$ab == x[i]), ]$ab
    if (length(found) > 0) {
      x_new[i] <- found[1L]
      next
    }

    # exact ATC code
-    found <- antibiotics[which(antibiotics$atc == toupper(x[i])), ]$ab
+    found <- antibiotics[which(antibiotics$atc == x[i]), ]$ab
    if (length(found) > 0) {
      x_new[i] <- found[1L]
      next
@ -139,7 +149,7 @@ as.ab <- function(x, ...) {
    }

    # exact name
-    found <- antibiotics[which(toupper(antibiotics$name) == toupper(x[i])), ]$ab
+    found <- antibiotics[which(toupper(antibiotics$name) == x[i]), ]$ab
    if (length(found) > 0) {
      x_new[i] <- found[1L]
      next
@ -147,11 +157,7 @@ as.ab <- function(x, ...) {
    
    # exact LOINC code
    loinc_found <- unlist(lapply(antibiotics$loinc,
-                                   function(s) if (x[i] %in% s) {
-                                     TRUE
-                                   } else {
-                                     FALSE
-                                   }))
+                                   function(s) x[i] %in% s))
    found <- antibiotics$ab[loinc_found == TRUE]
    if (length(found) > 0) {
      x_new[i] <- found[1L]
@ -160,11 +166,7 @@ as.ab <- function(x, ...) {

    # exact synonym
    synonym_found <- unlist(lapply(antibiotics$synonyms,
-                                   function(s) if (toupper(x[i]) %in% toupper(s)) {
-                                     TRUE
-                                   } else {
-                                     FALSE
-                                   }))
+                                   function(s) x[i] %in% toupper(s)))
    found <- antibiotics$ab[synonym_found == TRUE]
    if (length(found) > 0) {
      x_new[i] <- found[1L]
@ -173,90 +175,87 @@ as.ab <- function(x, ...) {

    # exact abbreviation
    abbr_found <- unlist(lapply(antibiotics$abbreviations,
-                                function(a) if (toupper(x[i]) %in% toupper(a)) {
-                                  TRUE
-                                } else {
-                                  FALSE
-                                }))
+                                function(a) x[i] %in% toupper(a)))
    found <- antibiotics$ab[abbr_found == TRUE]
    if (length(found) > 0) {
      x_new[i] <- found[1L]
      next
    }

-    # first >=4 characters of name
-    if (nchar(x[i]) >= 4) {
-      found <- antibiotics[which(toupper(antibiotics$name) %like% paste0("^", x[i])), ]$ab
-      if (length(found) > 0) {
-        x_new[i] <- found[1L]
-        next
-      }
-    }
-
    # allow characters that resemble others, but only continue when having more than 3 characters
    if (nchar(x[i]) <= 3) {
      x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
      next
    }
-    x_spelling <- tolower(x[i])
-    x_spelling <- gsub("[iy]+", "[iy]+", x_spelling)
-    x_spelling <- gsub("(c|k|q|qu|s|z|x|ks)+", "(c|k|q|qu|s|z|x|ks)+", x_spelling)
-    x_spelling <- gsub("(ph|f|v)+", "(ph|f|v)+", x_spelling)
-    x_spelling <- gsub("(th|t)+", "(th|t)+", x_spelling)
-    x_spelling <- gsub("a+", "a+", x_spelling)
-    x_spelling <- gsub("e+", "e+", x_spelling)
-    x_spelling <- gsub("o+", "o+", x_spelling)
-    # allow any ending of -in/-ine and -im/-ime
-    x_spelling <- gsub("(\\[iy\\]\\+(n|m)|\\[iy\\]\\+(n|m)e\\+)$", "[iy]+(n|m)e*", x_spelling)
-    # allow any ending of -ol/-ole
-    x_spelling <- gsub("(o\\+l|o\\+le\\+)$", "o+le*", x_spelling)
-    # allow any ending of -on/-one
-    x_spelling <- gsub("(o\\+n|o\\+ne\\+)$", "o+ne*", x_spelling)
-    # replace multiple same characters to single one with '+', like "ll" -> "l+"
-    x_spelling <- gsub("(.)\\1+", "\\1+", x_spelling)
-    # replace spaces and slashes with a possibility on both
-    x_spelling <- gsub("[ /]", "( .*|.*/)", x_spelling)
-
+    x_spelling <- x[i]
+    if (already_regex == FALSE) {
+      x_spelling <- gsub("[IY]+", "[IY]+", x_spelling)
+      x_spelling <- gsub("(C|K|Q|QU|S|Z|X|KS)+", "(C|K|Q|QU|S|Z|X|KS)+", x_spelling)
+      x_spelling <- gsub("(PH|F|V)+", "(PH|F|V)+", x_spelling)
+      x_spelling <- gsub("(TH|T)+", "(TH|T)+", x_spelling)
+      x_spelling <- gsub("A+", "A+", x_spelling)
+      x_spelling <- gsub("E+", "E+", x_spelling)
+      x_spelling <- gsub("O+", "O+", x_spelling)
+      # allow any ending of -in/-ine and -im/-ime
+      x_spelling <- gsub("(\\[IY\\]\\+(N|M)|\\[IY\\]\\+(N|M)E\\+)$", "[IY]+(N|M)E*", x_spelling)
+      # allow any ending of -ol/-ole
+      x_spelling <- gsub("(O\\+L|O\\+LE\\+)$", "O+LE*", x_spelling)
+      # allow any ending of -on/-one
+      x_spelling <- gsub("(O\\+N|O\\+NE\\+)$", "O+NE*", x_spelling)
+      # replace multiple same characters to single one with '+', like "ll" -> "l+"
+      x_spelling <- gsub("(.)\\1+", "\\1+", x_spelling)
+      # replace spaces and slashes with a possibility on both
+      x_spelling <- gsub("[ /]", "( .*|.*/)", x_spelling)
+      # correct for digital reading text (OCR)
+      x_spelling <- gsub("[NRD]", "[NRD]", x_spelling)
+    }
+    
    # try if name starts with it
    found <- antibiotics[which(antibiotics$name %like% paste0("^", x_spelling)), ]$ab
    if (length(found) > 0) {
      x_new[i] <- found[1L]
      next
    }
+    # try if name ends with it
+    found <- antibiotics[which(antibiotics$name %like% paste0(x_spelling, "$")), ]$ab
+    if (nchar(x[i]) >= 4 & length(found) > 0) {
+      x_new[i] <- found[1L]
+      next
+    }
    # and try if any synonym starts with it
    synonym_found <- unlist(lapply(antibiotics$synonyms,
-                                   function(s) if (any(s %like% paste0("^", x_spelling))) {
-                                     TRUE
-                                   } else {
-                                     FALSE
-                                   }))
+                                   function(s) any(s %like% paste0("^", x_spelling))))
    found <- antibiotics$ab[synonym_found == TRUE]
    if (length(found) > 0) {
      x_new[i] <- found[1L]
      next
    }
-
-    # try by removing all spaces
-    if (x[i] %like% " ") {
-      found <- suppressWarnings(as.ab(gsub(" +", "", x[i])))
-      if (length(found) > 0 & !is.na(found)) {
-        x_new[i] <- found[1L]
-        next
+    
+    # INITIAL - More uncertain results ----
+    
+    if (initial == TRUE) {
+      # only run on first try
+      
+      # try by removing all spaces
+      if (x[i] %like% " ") {
+        found <- suppressWarnings(as.ab(gsub(" +", "", x[i]), initial = FALSE))
+        if (length(found) > 0 & !is.na(found)) {
+          x_new[i] <- found[1L]
+          next
+        }
      }
-    }
-
-    # try by removing all spaces and numbers
-    if (x[i] %like% " " | x[i] %like% "[0-9]") {
-      found <- suppressWarnings(as.ab(gsub("[ 0-9]", "", x[i])))
-      if (length(found) > 0 & !is.na(found)) {
-        x_new[i] <- found[1L]
-        next
+      
+      # try by removing all spaces and numbers
+      if (x[i] %like% " " | x[i] %like% "[0-9]") {
+        found <- suppressWarnings(as.ab(gsub("[ 0-9]", "", x[i]), initial = FALSE))
+        if (length(found) > 0 & !is.na(found)) {
+          x_new[i] <- found[1L]
+          next
+        }
      }
-    }
-
-    if (!isFALSE(list(...)$initial_search)) {
+      
      # transform back from other languages and try again
-      x_translated <- paste(lapply(strsplit(x[i], "[^a-zA-Z0-9 ]"),
+      x_translated <- paste(lapply(strsplit(x[i], "[^A-Z0-9 ]"),
                                   function(y) {
                                     for (i in seq_len(length(y))) {
                                       y[i] <- ifelse(tolower(y[i]) %in% tolower(translations_file$replacement),
@ -267,41 +266,102 @@ as.ab <- function(x, ...) {
                                     y
                                   })[[1]],
                            collapse = "/")
-      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial_search = FALSE))
+      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial = FALSE))
      if (!is.na(x_translated_guess)) {
        x_new[i] <- x_translated_guess
        next
      }
      
-      if (!isFALSE(list(...)$initial_search2)) {
-        # now also try to coerce brandname combinations like "Amoxy/clavulanic acid"
-        x_translated <- paste(lapply(strsplit(x_translated, "[^a-zA-Z0-9 ]"),
-                                     function(y) {
-                                       for (i in seq_len(length(y))) {
-                                         y_name <- suppressWarnings(ab_name(y[i], language = NULL, initial_search = FALSE, initial_search2 = FALSE))
-                                         y[i] <- ifelse(!is.na(y_name),
-                                                        y_name,
-                                                        y[i])
-                                       }
-                                       y
-                                     })[[1]],
-                              collapse = "/")
-        x_translated_guess <- suppressWarnings(as.ab(x_translated, initial_search = FALSE))
-        if (!is.na(x_translated_guess)) {
-          x_new[i] <- x_translated_guess
+      # now also try to coerce brandname combinations like "Amoxy/clavulanic acid"
+      x_translated <- paste(lapply(strsplit(x_translated, "[^A-Z0-9 ]"),
+                                   function(y) {
+                                     for (i in seq_len(length(y))) {
+                                       y_name <- suppressWarnings(ab_name(y[i], language = NULL, initial = FALSE))
+                                       y[i] <- ifelse(!is.na(y_name),
+                                                      y_name,
+                                                      y[i])
+                                     }
+                                     y
+                                   })[[1]],
+                            collapse = "/")
+      x_translated_guess <- suppressWarnings(as.ab(x_translated, initial = FALSE))
+      if (!is.na(x_translated_guess)) {
+        x_new[i] <- x_translated_guess
+        next
+      }
+      
+      # try by removing all trailing capitals
+      if (x[i] %like_case% "[a-z]+[A-Z]+$") {
+        found <- suppressWarnings(as.ab(gsub("[A-Z]+$", "", x[i]), initial = FALSE))
+        if (!is.na(found)) {
+          x_new[i] <- found[1L]
          next
        }
      }
-    }
-    
-    # try by removing all trailing capitals
-    if (x[i] %like_case% "[a-z]+[A-Z]+$") {
-      found <- suppressWarnings(as.ab(gsub("[A-Z]+$", "", x[i])))
-      if (length(found) > 0 & !is.na(found)) {
+      
+      # keep only letters
+      found <- suppressWarnings(as.ab(gsub("[^A-Z]", "", x[i]), initial = FALSE))
+      if (!is.na(found)) {
        x_new[i] <- found[1L]
        next
      }
-    }
+      
+      # try from a bigger text, like from a health care record, see ?ab_from_text
+      found <- suppressWarnings(ab_from_text(x[i], initial = FALSE, translate_ab = FALSE)[1L])
+      if (!is.na(found)) {
+        x_new[i] <- found[1L]
+        next
+      }
+      
+      # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!)
+      found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial = FALSE))
+      if (!is.na(found) && !ab_group(found, initial = FALSE) %like% "cephalosporins") {
+        x_new[i] <- found[1L]
+        next
+      }
+      found <- suppressWarnings(as.ab(substr(x[i], 1, 7), initial = FALSE))
+      if (!is.na(found)) {
+        x_new[i] <- found[1L]
+        next
+      }
+      
+      # make all consonants facultative
+      search_str <- gsub("([BCDFGHJKLMNPQRSTVWXZ])", "\\1*", x[i])
+      found <- suppressWarnings(as.ab(search_str, initial = FALSE, already_regex = TRUE))
+      # keep at least 4 normal characters
+      if (nchar(gsub(".\\*", "", search_str)) < 4) {
+        found <- NA
+      }
+      if (!is.na(found)) {
+        x_new[i] <- found[1L]
+        next
+      }
+      
+      # make all vowels facultative
+      search_str <- gsub("([AEIOUY])", "\\1*", x[i])
+      found <- suppressWarnings(as.ab(search_str, initial = FALSE, already_regex = TRUE))
+      # keep at least 5 normal characters
+      if (nchar(gsub(".\\*", "", search_str)) < 5) {
+        found <- NA
+      }
+      if (!is.na(found)) {
+        x_new[i] <- found[1L]
+        next
+      }
+      
+      # allow misspelling of vowels
+      x_spelling <- gsub("A+", "[AEIOU]+", x_spelling, fixed = TRUE)
+      x_spelling <- gsub("E+", "[AEIOU]+", x_spelling, fixed = TRUE)
+      x_spelling <- gsub("I+", "[AEIOU]+", x_spelling, fixed = TRUE)
+      x_spelling <- gsub("O+", "[AEIOU]+", x_spelling, fixed = TRUE)
+      x_spelling <- gsub("U+", "[AEIOU]+", x_spelling, fixed = TRUE)
+      found <- suppressWarnings(as.ab(x_spelling, initial = FALSE, already_regex = TRUE))
+      if (!is.na(found)) {
+        x_new[i] <- found[1L]
+        next
+      }
+      
+    } # end of initial = TRUE

    # not found
    x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
@ -316,7 +376,7 @@ as.ab <- function(x, ...) {
            ".",
            call. = FALSE)
  }
-
+  
  if (length(x_unknown) > 0) {
    warning("These values could not be coerced to a valid antimicrobial ID: ",
            paste('"', sort(unique(x_unknown)), '"', sep = "", collapse = ", "),