(v2.1.1.9159) new approach as.ab()

2026-02-24 03:09:10 +01:00 · 2025-02-26 19:23:54 +01:00
parent 122bca0f95
commit 0c3ea4b538
18 changed files with 130 additions and 284 deletions
--- a/.github/workflows/check-old-tinytest.yaml
+++ b/.github/workflows/check-old-tinytest.yaml
@@ -61,7 +61,7 @@ jobs:
      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true

    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2
        with:
--- a/.github/workflows/codecovr.yaml
+++ b/.github/workflows/codecovr.yaml
@@ -42,7 +42,7 @@ jobs:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      CODECOV_TOKEN: ${{secrets.CODECOV_TOKEN}}
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

--- a/.github/workflows/lintr.yaml
+++ b/.github/workflows/lintr.yaml
@@ -41,7 +41,7 @@ jobs:
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

@@ -59,13 +59,15 @@ jobs:
        run: |
          # old: lintr::lint_package(linters = lintr::with_defaults(line_length_linter = NULL, trailing_whitespace_linter = NULL, object_name_linter = NULL, cyclocomp_linter = NULL, object_length_linter = lintr::object_length_linter(length = 50L)), exclusions = list("R/aa_helper_pm_functions.R"))
          # now get ALL linters, not just default ones
-          linters <- ls(envir = asNamespace("lintr"), pattern = "_linter$")
+          linters <- getNamespaceExports(asNamespace("lintr"))
+          linters <- sort(linters[grepl("_linter$", linters)])
          # lose deprecated
-          linters <- linters[!grepl("^(closed_curly|open_curly|paren_brace|semicolon_terminator)_linter$", linters)]
+          linters <- linters[!grepl("^(closed_curly|open_curly|paren_brace|semicolon_terminator|consecutive_stopifnot|no_tab|single_quotes|unnecessary_nested_if|unneeded_concatenation)_linter$", linters)]
          # and the ones we find unnnecessary
          linters <- linters[!grepl("^(extraction_operator|implicit_integer|line_length|object_name|nonportable_path|is)_linter$", linters)]
          # put the functions in a list
-          linters <- lapply(linters, function(l) eval(parse(text = paste0("lintr::", l, "()")), envir = asNamespace("lintr")))
+          linters_list <- lapply(linters, function(l) eval(parse(text = paste0("lintr::", l, "()")), envir = asNamespace("lintr")))
+          names(linters_list) <- linters
          # run them all!
-          lintr::lint_package(linters = linters, exclusions = list("R/aa_helper_pm_functions.R"))
+          lintr::lint_package(linters = linters_list, exclusions = list("R/aa_helper_pm_functions.R"))
        shell: Rscript {0}
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -40,7 +40,7 @@ jobs:
    
    steps:
      - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      
      - name: Set up Python
        uses: actions/setup-python@v4
--- a/.github/workflows/website.yaml
+++ b/.github/workflows/website.yaml
@@ -43,7 +43,7 @@ jobs:
    continue-on-error: true
    steps:

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
        with:
          # this is to keep timestamps, the default fetch-depth: 1 gets the timestamps of the moment of cloning
          # we need this for the download page on our website - dates must be of the files, not of the latest git push
--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 Package: AMR
-Version: 2.1.1.9158
+Version: 2.1.1.9159
 Date: 2025-02-26
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# AMR 2.1.1.9158
+# AMR 2.1.1.9159

 *(this beta version will eventually become v3.0. We're happy to reach a new major milestone soon, which will be all about the new One Health support! Install this beta using [the instructions here](https://msberends.github.io/AMR/#latest-development-version).)*

--- a/PythonPackage/AMR/AMR.egg-info/PKG-INFO
+++ b/PythonPackage/AMR/AMR.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: AMR
-Version: 2.1.1.9158
+Version: 2.1.1.9159
 Summary: A Python wrapper for the AMR R package
 Home-page: https://github.com/msberends/AMR
 Author: Matthijs Berends
--- a/PythonPackage/AMR/dist/amr-2.1.1.9158.tar.gz
+++ b/PythonPackage/AMR/dist/amr-2.1.1.9158.tar.gz
--- a/PythonPackage/AMR/dist/amr-2.1.1.9159-py3-none-any.whl
+++ b/PythonPackage/AMR/dist/amr-2.1.1.9159-py3-none-any.whl
--- a/PythonPackage/AMR/dist/amr-2.1.1.9159.tar.gz
+++ b/PythonPackage/AMR/dist/amr-2.1.1.9159.tar.gz
--- a/PythonPackage/AMR/setup.py
+++ b/PythonPackage/AMR/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
    name='AMR',
-    version='2.1.1.9158',
+    version='2.1.1.9159',
    packages=find_packages(),
    install_requires=[
        'rpy2',
--- a/R/ab.R
+++ b/R/ab.R
@@ -32,6 +32,7 @@
 #' Use this function to determine the antibiotic drug code of one or more antibiotics. The data set [antibiotics] will be searched for abbreviations, official names and synonyms (brand names).
 #' @param x a [character] vector to determine to antibiotic ID
 #' @param flag_multiple_results a [logical] to indicate whether a note should be printed to the console that probably more than one antibiotic drug code or name can be retrieved from a single input value.
+#' @param language language to coerce input values from any of the `r length(LANGUAGES_SUPPORTED)` supported languages - default to the system language if supported (see [get_AMR_locale()])
 #' @param info a [logical] to indicate whether a progress bar should be printed - the default is `TRUE` only in interactive mode
 #' @param ... arguments passed on to internal functions
 #' @rdname as.ab
@@ -67,7 +68,6 @@
 #' as.ab("J 01 FA 01")
 #' as.ab("Erythromycin")
 #' as.ab("eryt")
-#' as.ab("   eryt 123")
 #' as.ab("ERYT")
 #' as.ab("ERY")
 #' as.ab("eritromicine") # spelled wrong, yet works
@@ -92,29 +92,30 @@
 #'     set_ab_names(where(is.sir), property = "atc")
 #' }
 #' }
-as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
+as.ab <- function(x, flag_multiple_results = TRUE, language = get_AMR_locale(), info = interactive(), ...) {
  meet_criteria(x, allow_class = c("character", "numeric", "integer", "factor"), allow_NA = TRUE)
  meet_criteria(flag_multiple_results, allow_class = "logical", has_length = 1)
+  language <- validate_language(language)
  meet_criteria(info, allow_class = "logical", has_length = 1)
-
+  
  if (is.ab(x) || all(x %in% c(AMR_env$AB_lookup$ab, NA))) {
    # all valid AB codes, but not yet right class or might have additional attributes as AMR selector
    attributes(x) <- NULL
    return(set_clean_class(x,
-      new_class = c("ab", "character")
+                           new_class = c("ab", "character")
    ))
  }
-
+  
  loop_time <- list(...)$loop_time
  if (is.null(loop_time)) {
    loop_time <- 1
  }
  already_regex <- isTRUE(list(...)$already_regex)
  fast_mode <- isTRUE(list(...)$fast_mode)
-
+  
  x_bak <- x
  x <- toupper(x)
-
+  
  # remove diacritics
  x <- iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT")
  x <- gsub('"', "", x, fixed = TRUE)
@@ -125,7 +126,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
  if (already_regex == FALSE) {
    x_bak_clean <- generalise_antibiotic_name(x_bak_clean)
  }
-
+  
  x <- unique(x_bak_clean) # this means that every x is in fact generalise_antibiotic_name(x)
  x_new <- rep(NA_character_, length(x))
  x_uncertain <- character(0)
@@ -152,17 +153,17 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
    }
    found[1L]
  }
-
+  
  # Fill in names, AB codes, CID codes and ATC codes directly (`x` is already clean and uppercase)
  known_names <- x %in% AMR_env$AB_lookup$generalised_name
  x_new[known_names] <- AMR_env$AB_lookup$ab[match(x[known_names], AMR_env$AB_lookup$generalised_name)]
  known_codes_ab <- x %in% AMR_env$AB_lookup$ab
-  known_codes_atc <- vapply(FUN.VALUE = logical(1), x, function(x_) x_ %in% unlist(AMR_env$AB_lookup$atc), USE.NAMES = FALSE)
+  known_codes_atc <- vapply(FUN.VALUE = logical(1), gsub(" ", "", x), function(x_) x_ %in% unlist(AMR_env$AB_lookup$atc), USE.NAMES = FALSE)
  known_codes_cid <- x %in% AMR_env$AB_lookup$cid
  x_new[known_codes_ab] <- AMR_env$AB_lookup$ab[match(x[known_codes_ab], AMR_env$AB_lookup$ab)]
  x_new[known_codes_atc] <- AMR_env$AB_lookup$ab[vapply(
    FUN.VALUE = integer(1),
-    x[known_codes_atc],
+    gsub(" ", "", x[known_codes_atc]),
    function(x_) {
      which(vapply(
        FUN.VALUE = logical(1),
@@ -182,29 +183,29 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      " for ", vector_and(prev), ". Run `ab_reset_session()` to reset this. This note will be shown once per session for this input."
    )
  }
-
+  
  already_known <- known_names | known_codes_ab | known_codes_atc | known_codes_cid | previously_coerced
-
+  
  # fix for NAs
  x_new[is.na(x)] <- NA
  already_known[is.na(x)] <- FALSE
-
+  
  if (loop_time == 1 && sum(already_known) < length(x)) {
    progress <- progress_ticker(n = sum(!already_known), n_min = 25, print = info) # start if n >= 25
    on.exit(close(progress))
  }
-
+  
  for (i in which(!already_known)) {
    if (loop_time == 1) {
      progress$tick()
    }
-
+    
    if (is.na(x[i]) || is.null(x[i])) {
      next
    }
    if (identical(x[i], "") ||
-      # prevent "bacteria" from coercing to TMP, since Bacterial is a brand name of it:
-      identical(tolower(x[i]), "bacteria")) {
+        # prevent "bacteria" from coercing to TMP, since Bacterial is a brand name of it:
+        identical(tolower(x[i]), "bacteria")) {
      x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
      next
    }
@@ -215,21 +216,21 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      x_new[i] <- NA_character_
      next
    }
-
+    
    if (fast_mode == FALSE && flag_multiple_results == TRUE && x[i] %like% "[ ]") {
      from_text <- tryCatch(suppressWarnings(ab_from_text(x[i], loop_time = loop_time + 1, translate_ab = FALSE)[[1]]),
-        error = function(e) character(0)
+                            error = function(e) character(0)
      )
    } else {
      from_text <- character(0)
    }
-
+    
    # old code for phenoxymethylpenicillin (Peni V)
    if (x[i] == "PNV") {
      x_new[i] <- "PHN"
      next
    }
-
+    
    # exact LOINC code
    loinc_found <- unlist(lapply(
      AMR_env$AB_lookup$generalised_loinc,
@@ -240,7 +241,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
-
+    
    # exact synonym
    synonym_found <- unlist(lapply(
      AMR_env$AB_lookup$generalised_synonyms,
@@ -251,7 +252,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
-
+    
    # exact abbreviation
    abbr_found <- unlist(lapply(
      AMR_env$AB_lookup$generalised_abbreviations,
@@ -263,7 +264,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
-
+    
    # length of input is quite long, and Levenshtein distance is only max 2
    if (nchar(x[i]) >= 10) {
      levenshtein <- as.double(utils::adist(x[i], AMR_env$AB_lookup$generalised_name))
@@ -273,7 +274,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
        next
      }
    }
-
+    
    # allow characters that resemble others, but only continue when having more than 3 characters
    if (nchar(x[i]) <= 3) {
      x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
@@ -303,20 +304,22 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      x_spelling <- gsub("(O|0)", "(O|0)+", x_spelling, perl = TRUE)
      x_spelling <- gsub("++", "+", x_spelling, fixed = TRUE)
    }
-
+    
    # try if name starts with it
    found <- AMR_env$AB_lookup[which(AMR_env$AB_lookup$generalised_name %like% paste0("^", x_spelling)), "ab", drop = TRUE]
    if (length(found) > 0) {
      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
+    
+    
    # try if name ends with it
    found <- AMR_env$AB_lookup[which(AMR_env$AB_lookup$generalised_name %like% paste0(x_spelling, "$")), "ab", drop = TRUE]
    if (nchar(x[i]) >= 4 && length(found) > 0) {
      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
-
+    
    # and try if any synonym starts with it
    synonym_found <- unlist(lapply(
      AMR_env$AB_lookup$generalised_synonyms,
@@ -327,244 +330,71 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      x_new[i] <- note_if_more_than_one_found(found, i, from_text)
      next
    }
-
+    
    # INITIAL SEARCH - More uncertain results ----
-    if (loop_time <= 2 && fast_mode == FALSE) {
+    if (loop_time == 1 && fast_mode == FALSE) {
      # only run on first and second try
      
-      # base on the Levensthein distance function if length >= 6
-      if (nchar(x[i]) >= 6) {
-        l_dist <- as.double(utils::adist(x[i], AMR_env$AB_lookup$generalised_name,
-                                         ignore.case = FALSE,
-                                         fixed = TRUE,
-                                         costs = c(insertions = 1, deletions = 2, substitutions = 2),
-                                         counts = FALSE))
-        x_new[i] <- AMR_env$AB_lookup$ab[order(l_dist)][1]
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-      
-      # try by removing all spaces
-      if (x[i] %like% " ") {
-        found <- suppressWarnings(as.ab(gsub(" +", "", x[i], perl = TRUE), loop_time = loop_time + 2))
-        if (length(found) > 0 && !is.na(found)) {
-          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-          x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-          next
-        }
-      }
-      
-      # try by removing all spaces and numbers
-      if (x[i] %like% " " || x[i] %like% "[0-9]") {
-        found <- suppressWarnings(as.ab(gsub("[ 0-9]", "", x[i], perl = TRUE), loop_time = loop_time + 2))
-        if (length(found) > 0 && !is.na(found)) {
-          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-          x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-          next
-        }
-      }
-      
-      # reverse a combination, e.g. clavulanic acid/amoxicillin
-      if (x[i] %like% " ") {
-        split <- strsplit(x[i], " ")[[1]]
-        permute <- function(x) {
-          if (length(x) == 1) return(list(x))
-          result <- vector("list", factorial(length(x)))
-          index <- 1
-          for (i in seq_along(x)) {
-            sub_perms <- permute(x[-i])  # Recursively get permutations of remaining elements
-            for (sub in sub_perms) {
-              result[[index]] <- c(x[i], sub)
-              index <- index + 1
-            }
-          }
-          return(result)
-        }
-        permutations <- permute(split)
-        found_perms <- character(length(permutations))
-        for (s in seq_len(length(permutations))) {
-          concat <- paste0(permutations[[s]], collapse = " ")
-          if (concat %in% AMR_env$AB_lookup$generalised_name) {
-            found_perms[s] <- AMR_env$AB_lookup[which(AMR_env$AB_lookup$generalised_name == concat), "ab", drop = TRUE]
-          } else {
-            found_perms[s] <- suppressWarnings(as.ab(concat, loop_time = loop_time + 2))
-          }
-        }
-        found_perms <- found_perms[!is.na(found_perms)]
-        if (length(found_perms) > 0) {
-          found <- found_perms[order(nchar(found_perms), decreasing = TRUE)][1]
-          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-          x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-          next
-        }
-      }
-
-      # transform back from other languages and try again
-      x_translated <- paste(
-        lapply(
-          strsplit(x[i], "[^A-Z0-9]"),
-          function(y) {
-            for (i in seq_len(length(y))) {
-              for (lang in LANGUAGES_SUPPORTED[LANGUAGES_SUPPORTED != "en"]) {
-                y[i] <- ifelse(tolower(y[i]) %in% tolower(TRANSLATIONS[, lang, drop = TRUE]),
-                  TRANSLATIONS[which(tolower(TRANSLATIONS[, lang, drop = TRUE]) == tolower(y[i]) &
-                    !isFALSE(TRANSLATIONS$fixed)), "pattern"],
-                  y[i]
-                )
-              }
-            }
-            generalise_antibiotic_name(y)
-          }
-        )[[1]],
-        collapse = "/"
-      )
-      x_translated_guess <- suppressWarnings(as.ab(x_translated, loop_time = loop_time + 2))
-      if (!is.na(x_translated_guess)) {
-        x_new[i] <- x_translated_guess
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-
-      # now also try to coerce brandname combinations like "Amoxy/clavulanic acid"
-      x_translated <- paste(
-        lapply(
-          strsplit(x_translated, "[^A-Z0-9 ]"),
-          function(y) {
-            for (i in seq_len(length(y))) {
-              y_name <- suppressWarnings(ab_name(y[i], language = NULL, loop_time = loop_time + 2))
-              y[i] <- ifelse(!is.na(y_name),
-                y_name,
-                y[i]
-              )
-            }
-            generalise_antibiotic_name(y)
-          }
-        )[[1]],
-        collapse = "/"
-      )
-      x_translated_guess <- suppressWarnings(as.ab(x_translated, loop_time = loop_time + 2))
-      if (!is.na(x_translated_guess)) {
-        x_new[i] <- x_translated_guess
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-
-      # try by removing all trailing capitals
-      if (x[i] %like_case% "[a-z]+[A-Z]+$") {
-        found <- suppressWarnings(as.ab(gsub("[A-Z]+$", "", x[i], perl = TRUE), loop_time = loop_time + 2))
-        if (!is.na(found)) {
-          x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-          x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-          next
-        }
-      }
-
-      # keep only letters
-      found <- suppressWarnings(as.ab(gsub("[^A-Z]", "", x[i], perl = TRUE), loop_time = loop_time + 2))
-      if (!is.na(found)) {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-
-      # try from a bigger text, like from a health care record, see ?ab_from_text
-      # already calculated above if flag_multiple_results = TRUE
-      if (flag_multiple_results == TRUE) {
-        found <- from_text[1L]
+      ab_df <- AMR_env$AB_lookup
+      ab_df$length_name <- nchar(ab_df$generalised_name)
+      # now retrieve Levensthein distance for name, synonyms, and translated names
+      ab_df$lev_name <- as.double(utils::adist(x[i], ab_df$generalised_name,
+                                               ignore.case = FALSE,
+                                               fixed = TRUE,
+                                               costs = c(insertions = 1, deletions = 1, substitutions = 2),
+                                               counts = FALSE))
+      ab_df$lev_syn <- vapply(FUN.VALUE = double(1),
+                              ab_df$generalised_synonyms, 
+                              function(y) ifelse(length(y[nchar(y) >= 5]) == 0,
+                                                 999,
+                                                 min(as.double(utils::adist(x[i], y[nchar(y) >= 5],  ignore.case = FALSE,
+                                                                            fixed = TRUE,
+                                                                            costs = c(insertions = 1, deletions = 1, substitutions = 2),
+                                                                            counts = FALSE)), na.rm = TRUE)),
+                              USE.NAMES = FALSE)
+      if (!is.null(language) && language != "en") {
+        ab_df$trans <- generalise_antibiotic_name(translate_AMR(ab_df$name, language = language))
+        ab_df$lev_trans <- as.double(utils::adist(x[i], ab_df$trans,
+                                                  ignore.case = FALSE,
+                                                  fixed = TRUE,
+                                                  costs = c(insertions = 1, deletions = 1, substitutions = 2),
+                                                  counts = FALSE))
      } else {
-        found <- tryCatch(suppressWarnings(ab_from_text(x[i], loop_time = loop_time + 2, translate_ab = FALSE)[[1]][1L]),
-          error = function(e) NA_character_
-        )
+        ab_df$lev_trans <- ab_df$lev_name
      }
-      if (!is.na(found)) {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
+      
+      if (any(ab_df$lev_name < 5, na.rm = TRUE)) {
+        x_new[i] <- ab_df$ab[order(ab_df$lev_name)][1]
        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
        next
-      }
-
-      # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!)
-      found <- suppressWarnings(as.ab(substr(x[i], 1, 5), loop_time = loop_time + 2))
-      if (!is.na(found) && ab_group(found, loop_time = loop_time + 1) %unlike% "cephalosporins") {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-        next
-      }
-      found <- suppressWarnings(as.ab(substr(x[i], 1, 7), loop_time = loop_time + 2))
-      if (!is.na(found)) {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
+      } else if (any(ab_df$lev_trans < 5, na.rm = TRUE)) {
+        x_new[i] <- ab_df$ab[order(ab_df$lev_trans)][1]
        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
        next
-      }
-
-      # make all consonants facultative
-      search_str <- gsub("([BCDFGHJKLMNPQRSTVWXZ])", "\\1*", x[i], perl = TRUE)
-      found <- suppressWarnings(as.ab(search_str, loop_time = loop_time + 2, already_regex = TRUE))
-      # keep at least 4 normal characters
-      if (nchar(gsub(".\\*", "", search_str, perl = TRUE)) < 4) {
-        found <- NA
-      }
-      if (!is.na(found)) {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
+      } else if (any(ab_df$lev_syn < 5, na.rm = TRUE)) {
+        x_new[i] <- ab_df$ab[order(ab_df$lev_syn)][1]
        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
        next
-      }
-
-      # make all vowels facultative
-      search_str <- gsub("([AEIOUY])", "\\1*", x[i], perl = TRUE)
-      found <- suppressWarnings(as.ab(search_str, loop_time = loop_time + 2, already_regex = TRUE))
-      # keep at least 5 normal characters
-      if (nchar(gsub(".\\*", "", search_str, perl = TRUE)) < 5) {
-        found <- NA
-      }
-      if (!is.na(found)) {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-
-      # allow misspelling of vowels
-      x_spelling <- gsub("A+", "[AEIOU]+", x_spelling, fixed = TRUE)
-      x_spelling <- gsub("E+", "[AEIOU]+", x_spelling, fixed = TRUE)
-      x_spelling <- gsub("I+", "[AEIOU]+", x_spelling, fixed = TRUE)
-      x_spelling <- gsub("O+", "[AEIOU]+", x_spelling, fixed = TRUE)
-      x_spelling <- gsub("U+", "[AEIOU]+", x_spelling, fixed = TRUE)
-      found <- suppressWarnings(as.ab(x_spelling, loop_time = loop_time + 2, already_regex = TRUE))
-      if (!is.na(found)) {
-        x_new[i] <- note_if_more_than_one_found(found, i, from_text)
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-
-      # try with switched character, like "mreopenem"
-      for (j in seq_len(nchar(x[i]))) {
-        x_switched <- paste0(
-          # beginning part:
-          substr(x[i], 1, j - 1),
-          # here is the switching of 2 characters:
-          substr(x[i], j + 1, j + 1),
-          substr(x[i], j, j),
-          # ending part:
-          substr(x[i], j + 2, nchar(x[i]))
-        )
-        found <- suppressWarnings(as.ab(x_switched, loop_time = loop_time + 1))
-        if (!is.na(found)) {
-          break
+      } else {
+        # then just take name if Levensthein is max 100% of length of name
+        ab_df$lev_len_ratio <- ab_df$lev_name / ab_df$length_name
+        if (any(ab_df$lev_len_ratio < 1)) {
+          ab_df <- ab_df[ab_df$lev_len_ratio < 1, , drop = FALSE]
+          x_new[i] <- ab_df$ab[order(ab_df$lev_name)][1]
+          x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
+          next
        }
      }
-      if (!is.na(found)) {
-        x_new[i] <- found[1L]
-        x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1])
-        next
-      }
-    } # end of loop_time <= 2
-
-    # not found
+    }
+    
+    # nothing found
    x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
  }
-
+  
  if (loop_time == 1 && sum(already_known) < length(x)) {
    close(progress)
  }
-
+  
  # save to package env to save time for next time
  if (loop_time == 1) {
    AMR_env$ab_previously_coerced <- AMR_env$ab_previously_coerced[which(!AMR_env$ab_previously_coerced$x %in% x), , drop = FALSE]
@@ -578,7 +408,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
      )
    ))
  }
-
+  
  # take failed ATC codes apart from rest
  if (length(x_unknown_ATCs) > 0 && fast_mode == FALSE) {
    warning_(
@@ -619,14 +449,14 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
               ". If required, use `add_custom_antimicrobials()` to add custom entries.")
    }
  }
-
+  
  x_result <- x_new[match(x_bak_clean, x)]
  if (length(x_result) == 0) {
    x_result <- NA_character_
  }
-
+  
  set_clean_class(x_result,
-    new_class = c("ab", "character")
+                  new_class = c("ab", "character")
  )
 }

@@ -767,16 +597,20 @@ generalise_antibiotic_name <- function(x) {
  x <- gsub("[^A-Z0-9 -)(]", "/", x, perl = TRUE)
  # correct for 'high level' antibiotics
  x <- trimws(gsub("([^A-Z0-9/ -]+)?(HIGH(.?LE?VE?L)?|[^A-Z0-9/]H[^A-Z0-9]?L)([^A-Z0-9 -]+)?", "-HIGH", x, perl = TRUE))
-  x <- trimws(gsub("^(-HIGH)(.*)", "\\2\\1", x))
+  x <- trimws(gsub("^(-HIGH)(.*)", "\\2\\1", x, perl = TRUE))
  # remove part between brackets if that's followed by another string
  x <- gsub("(.*)+ [(].*[)]", "\\1", x)
-  # spaces around non-characters must be removed: amox + clav -> amox/clav
+  # spaces around non-characters must be removed: amox + clav -> amox clav
  x <- gsub("(.*[A-Z0-9]) ([^A-Z0-9].*)", "\\1\\2", x, perl = TRUE)
  x <- gsub("(.*[^A-Z0-9]) ([A-Z0-9].*)", "\\1\\2", x, perl = TRUE)
  # remove hyphen after a starting "co"
  x <- gsub("^CO-", "CO", x, perl = TRUE)
  # replace operators with a space
  x <- gsub("(/| AND | WITH | W/|[+]|[-])+", " ", x, perl = TRUE)
+  # replace more than 1 space
+  x <- trimws(gsub(" +", " ", x, perl = TRUE))
+  # move HIGH to end
+  x <- trimws(gsub("(.*) HIGH(.*)", "\\1\\2 HIGH", x, perl = TRUE))
  x
 }

@@ -789,9 +623,9 @@ get_translate_ab <- function(translate_ab) {
  } else {
    translate_ab <- tolower(translate_ab)
    stop_ifnot(translate_ab %in% colnames(AMR::antibiotics),
-      "invalid value for 'translate_ab', this must be a column name of the antibiotics data set\n",
-      "or TRUE (equals 'name') or FALSE to not translate at all.",
-      call = FALSE
+               "invalid value for 'translate_ab', this must be a column name of the antibiotics data set\n",
+               "or TRUE (equals 'name') or FALSE to not translate at all.",
+               call = FALSE
    )
    translate_ab
  }
--- a/data-raw/gpt_training_text_v2.1.1.9159.txt
+++ b/data-raw/gpt_training_text_v2.1.1.9159.txt
@@ -1,6 +1,6 @@
 This knowledge base contains all context you must know about the AMR package for R. You are a GPT trained to be an assistant for the AMR package in R. You are an incredible R specialist, especially trained in this package and in the tidyverse.

-First and foremost, you are trained on version 2.1.1.9158. Remember this whenever someone asks which AMR package version you’re at.
+First and foremost, you are trained on version 2.1.1.9159. Remember this whenever someone asks which AMR package version you’re at.

 Below are the contents of the  file, the  file, and all the  files (documentation) in the package. Every file content is split using 100 hypens.
 ----------------------------------------------------------------------------------------------------
@@ -2577,7 +2577,8 @@ THE PART HEREAFTER CONTAINS CONTENTS FROM FILE 'man/as.ab.Rd':
 \alias{ab_reset_session}
 \title{Transform Input to an Antibiotic ID}
 \usage{
-as.ab(x, flag_multiple_results = TRUE, info = interactive(), ...)
+as.ab(x, flag_multiple_results = TRUE, language = get_AMR_locale(),
+  info = interactive(), ...)

 is.ab(x)

@@ -2588,6 +2589,8 @@ ab_reset_session()

 \item{flag_multiple_results}{a \link{logical} to indicate whether a note should be printed to the console that probably more than one antibiotic drug code or name can be retrieved from a single input value.}

+\item{language}{language to coerce input values from any of the 20 supported languages - default to the system language if supported (see \code{\link[=get_AMR_locale]{get_AMR_locale()}})}
+
 \item{info}{a \link{logical} to indicate whether a progress bar should be printed - the default is \code{TRUE} only in interactive mode}

 \item{...}{arguments passed on to internal functions}
@@ -2644,7 +2647,6 @@ as.ab("J01FA01")
 as.ab("J 01 FA 01")
 as.ab("Erythromycin")
 as.ab("eryt")
-as.ab("   eryt 123")
 as.ab("ERYT")
 as.ab("ERY")
 as.ab("eritromicine") # spelled wrong, yet works
--- a/man/as.ab.Rd
+++ b/man/as.ab.Rd
@@ -7,7 +7,8 @@
 \alias{ab_reset_session}
 \title{Transform Input to an Antibiotic ID}
 \usage{
-as.ab(x, flag_multiple_results = TRUE, info = interactive(), ...)
+as.ab(x, flag_multiple_results = TRUE, language = get_AMR_locale(),
+  info = interactive(), ...)

 is.ab(x)

@@ -18,6 +19,8 @@ ab_reset_session()

 \item{flag_multiple_results}{a \link{logical} to indicate whether a note should be printed to the console that probably more than one antibiotic drug code or name can be retrieved from a single input value.}

+\item{language}{language to coerce input values from any of the 20 supported languages - default to the system language if supported (see \code{\link[=get_AMR_locale]{get_AMR_locale()}})}
+
 \item{info}{a \link{logical} to indicate whether a progress bar should be printed - the default is \code{TRUE} only in interactive mode}

 \item{...}{arguments passed on to internal functions}
@@ -74,7 +77,6 @@ as.ab("J01FA01")
 as.ab("J 01 FA 01")
 as.ab("Erythromycin")
 as.ab("eryt")
-as.ab("   eryt 123")
 as.ab("ERYT")
 as.ab("ERY")
 as.ab("eritromicine") # spelled wrong, yet works
--- a/tests/testthat/test-ab.R
+++ b/tests/testthat/test-ab.R
@@ -27,20 +27,21 @@
 # how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
 # ==================================================================== #

+ab_reset_session()
+
 expect_equal(
  as.character(as.ab(c(
    "J01FA01",
    "J 01 FA 01",
    "Erythromycin",
    "eryt",
-    "   eryt 123",
    "ERYT",
    "ERY",
    "erytromicine",
    "Erythrocin",
    "Romycin"
  ))),
-  rep("ERY", 10)
+  rep("ERY", 9)
 )

 expect_identical(class(as.ab("amox")), c("ab", "character"))
@@ -49,7 +50,7 @@ expect_true(is.ab(as.ab("amox")))
 expect_stdout(print(as.ab("amox")))
 expect_stdout(print(data.frame(a = as.ab("amox"))))

-# expect_warning(as.ab("J00AA00")) # ATC not yet available in data set
+expect_warning(as.ab("J00AA00")) # ATC not yet available in data set
 # expect_warning(as.ab("UNKNOWN"))

 expect_stdout(print(as.ab("amox")))
--- a/tests/testthat/test-ab_from_text.R
+++ b/tests/testthat/test-ab_from_text.R
@@ -27,20 +27,22 @@
 # how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
 # ==================================================================== #

+ab_reset_session()
+
 expect_identical(
-  ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds")[[1]],
+  ab_from_text("28/03/2020 amoxicilliin 500mg po tds")[[1]],
  as.ab("Amoxicillin")
 )
 expect_identical(
-  ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", thorough_search = TRUE)[[1]],
+  ab_from_text("28/03/2020 amoxicilliin 500mg po tds", thorough_search = TRUE)[[1]],
  as.ab("Amoxicillin")
 )
 expect_identical(
-  ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", thorough_search = FALSE)[[1]],
+  ab_from_text("28/03/2020 amoxicilliin 500mg po tds", thorough_search = FALSE)[[1]],
  as.ab("Amoxicillin")
 )
 expect_identical(
-  ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]],
+  ab_from_text("28/03/2020 amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]],
  "Amoxicillin"
 )
 expect_identical(
@@ -49,10 +51,10 @@ expect_identical(
 )

 expect_identical(
-  ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "dose")[[1]],
+  ab_from_text("28/03/2020 amoxicilliin 500mg po tds", type = "dose")[[1]],
  500
 )
 expect_identical(
-  ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "admin")[[1]],
+  ab_from_text("28/03/2020 amoxicilliin 500mg po tds", type = "admin")[[1]],
  "oral"
 )
--- a/tests/testthat/test-ab_property.R
+++ b/tests/testthat/test-ab_property.R
@@ -27,6 +27,8 @@
 # how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
 # ==================================================================== #

+ab_reset_session()
+
 expect_identical(ab_name("AMX", language = NULL), "Amoxicillin")
 expect_identical(ab_atc("AMX"), "J01CA04")
 expect_identical(ab_cid("AMX"), as.integer(33613))
@@ -94,3 +96,4 @@ if (AMR:::pkg_is_available("dplyr", min_version = "1.0.0", also_load = TRUE)) {
      set_ab_names(NIT:VAN) %>%
      colnames())))
 }
+