diff --git a/.github/workflows/check-old-tinytest.yaml b/.github/workflows/check-old-tinytest.yaml index d4ebd6024..4f90f2606 100644 --- a/.github/workflows/check-old-tinytest.yaml +++ b/.github/workflows/check-old-tinytest.yaml @@ -61,7 +61,7 @@ jobs: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: diff --git a/.github/workflows/codecovr.yaml b/.github/workflows/codecovr.yaml index 52612db9e..927d867c1 100644 --- a/.github/workflows/codecovr.yaml +++ b/.github/workflows/codecovr.yaml @@ -42,7 +42,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} CODECOV_TOKEN: ${{secrets.CODECOV_TOKEN}} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 diff --git a/.github/workflows/lintr.yaml b/.github/workflows/lintr.yaml index f70b5c03d..982e789a7 100644 --- a/.github/workflows/lintr.yaml +++ b/.github/workflows/lintr.yaml @@ -41,7 +41,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -59,13 +59,15 @@ jobs: run: | # old: lintr::lint_package(linters = lintr::with_defaults(line_length_linter = NULL, trailing_whitespace_linter = NULL, object_name_linter = NULL, cyclocomp_linter = NULL, object_length_linter = lintr::object_length_linter(length = 50L)), exclusions = list("R/aa_helper_pm_functions.R")) # now get ALL linters, not just default ones - linters <- ls(envir = asNamespace("lintr"), pattern = "_linter$") + linters <- getNamespaceExports(asNamespace("lintr")) + linters <- sort(linters[grepl("_linter$", linters)]) # lose deprecated - linters <- linters[!grepl("^(closed_curly|open_curly|paren_brace|semicolon_terminator)_linter$", linters)] + linters <- linters[!grepl("^(closed_curly|open_curly|paren_brace|semicolon_terminator|consecutive_stopifnot|no_tab|single_quotes|unnecessary_nested_if|unneeded_concatenation)_linter$", linters)] # and the ones we find unnnecessary linters <- linters[!grepl("^(extraction_operator|implicit_integer|line_length|object_name|nonportable_path|is)_linter$", linters)] # put the functions in a list - linters <- lapply(linters, function(l) eval(parse(text = paste0("lintr::", l, "()")), envir = asNamespace("lintr"))) + linters_list <- lapply(linters, function(l) eval(parse(text = paste0("lintr::", l, "()")), envir = asNamespace("lintr"))) + names(linters_list) <- linters # run them all! - lintr::lint_package(linters = linters, exclusions = list("R/aa_helper_pm_functions.R")) + lintr::lint_package(linters = linters_list, exclusions = list("R/aa_helper_pm_functions.R")) shell: Rscript {0} diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 742ee59ef..ad12e8421 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -40,7 +40,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 diff --git a/.github/workflows/website.yaml b/.github/workflows/website.yaml index 9ea727bd7..0169301cf 100644 --- a/.github/workflows/website.yaml +++ b/.github/workflows/website.yaml @@ -43,7 +43,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # this is to keep timestamps, the default fetch-depth: 1 gets the timestamps of the moment of cloning # we need this for the download page on our website - dates must be of the files, not of the latest git push diff --git a/DESCRIPTION b/DESCRIPTION index 3cc5ae666..4f68d510f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: AMR -Version: 2.1.1.9158 +Version: 2.1.1.9159 Date: 2025-02-26 Title: Antimicrobial Resistance Data Analysis Description: Functions to simplify and standardise antimicrobial resistance (AMR) diff --git a/NEWS.md b/NEWS.md index 2bc8d05ce..0e8dbb204 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# AMR 2.1.1.9158 +# AMR 2.1.1.9159 *(this beta version will eventually become v3.0. We're happy to reach a new major milestone soon, which will be all about the new One Health support! Install this beta using [the instructions here](https://msberends.github.io/AMR/#latest-development-version).)* diff --git a/PythonPackage/AMR/AMR.egg-info/PKG-INFO b/PythonPackage/AMR/AMR.egg-info/PKG-INFO index fcb87067a..f7b2915db 100644 --- a/PythonPackage/AMR/AMR.egg-info/PKG-INFO +++ b/PythonPackage/AMR/AMR.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.2 Name: AMR -Version: 2.1.1.9158 +Version: 2.1.1.9159 Summary: A Python wrapper for the AMR R package Home-page: https://github.com/msberends/AMR Author: Matthijs Berends diff --git a/PythonPackage/AMR/dist/amr-2.1.1.9158.tar.gz b/PythonPackage/AMR/dist/amr-2.1.1.9158.tar.gz deleted file mode 100644 index 5a99784af..000000000 Binary files a/PythonPackage/AMR/dist/amr-2.1.1.9158.tar.gz and /dev/null differ diff --git a/PythonPackage/AMR/dist/amr-2.1.1.9158-py3-none-any.whl b/PythonPackage/AMR/dist/amr-2.1.1.9159-py3-none-any.whl similarity index 52% rename from PythonPackage/AMR/dist/amr-2.1.1.9158-py3-none-any.whl rename to PythonPackage/AMR/dist/amr-2.1.1.9159-py3-none-any.whl index 495c67175..3b32c5035 100644 Binary files a/PythonPackage/AMR/dist/amr-2.1.1.9158-py3-none-any.whl and b/PythonPackage/AMR/dist/amr-2.1.1.9159-py3-none-any.whl differ diff --git a/PythonPackage/AMR/dist/amr-2.1.1.9159.tar.gz b/PythonPackage/AMR/dist/amr-2.1.1.9159.tar.gz new file mode 100644 index 000000000..a9af2aaaf Binary files /dev/null and b/PythonPackage/AMR/dist/amr-2.1.1.9159.tar.gz differ diff --git a/PythonPackage/AMR/setup.py b/PythonPackage/AMR/setup.py index d798de27a..6b1854d9e 100644 --- a/PythonPackage/AMR/setup.py +++ b/PythonPackage/AMR/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup( name='AMR', - version='2.1.1.9158', + version='2.1.1.9159', packages=find_packages(), install_requires=[ 'rpy2', diff --git a/R/ab.R b/R/ab.R index ec03ccd16..c1505b79a 100755 --- a/R/ab.R +++ b/R/ab.R @@ -32,6 +32,7 @@ #' Use this function to determine the antibiotic drug code of one or more antibiotics. The data set [antibiotics] will be searched for abbreviations, official names and synonyms (brand names). #' @param x a [character] vector to determine to antibiotic ID #' @param flag_multiple_results a [logical] to indicate whether a note should be printed to the console that probably more than one antibiotic drug code or name can be retrieved from a single input value. +#' @param language language to coerce input values from any of the `r length(LANGUAGES_SUPPORTED)` supported languages - default to the system language if supported (see [get_AMR_locale()]) #' @param info a [logical] to indicate whether a progress bar should be printed - the default is `TRUE` only in interactive mode #' @param ... arguments passed on to internal functions #' @rdname as.ab @@ -67,7 +68,6 @@ #' as.ab("J 01 FA 01") #' as.ab("Erythromycin") #' as.ab("eryt") -#' as.ab(" eryt 123") #' as.ab("ERYT") #' as.ab("ERY") #' as.ab("eritromicine") # spelled wrong, yet works @@ -92,29 +92,30 @@ #' set_ab_names(where(is.sir), property = "atc") #' } #' } -as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { +as.ab <- function(x, flag_multiple_results = TRUE, language = get_AMR_locale(), info = interactive(), ...) { meet_criteria(x, allow_class = c("character", "numeric", "integer", "factor"), allow_NA = TRUE) meet_criteria(flag_multiple_results, allow_class = "logical", has_length = 1) + language <- validate_language(language) meet_criteria(info, allow_class = "logical", has_length = 1) - + if (is.ab(x) || all(x %in% c(AMR_env$AB_lookup$ab, NA))) { # all valid AB codes, but not yet right class or might have additional attributes as AMR selector attributes(x) <- NULL return(set_clean_class(x, - new_class = c("ab", "character") + new_class = c("ab", "character") )) } - + loop_time <- list(...)$loop_time if (is.null(loop_time)) { loop_time <- 1 } already_regex <- isTRUE(list(...)$already_regex) fast_mode <- isTRUE(list(...)$fast_mode) - + x_bak <- x x <- toupper(x) - + # remove diacritics x <- iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT") x <- gsub('"', "", x, fixed = TRUE) @@ -125,7 +126,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { if (already_regex == FALSE) { x_bak_clean <- generalise_antibiotic_name(x_bak_clean) } - + x <- unique(x_bak_clean) # this means that every x is in fact generalise_antibiotic_name(x) x_new <- rep(NA_character_, length(x)) x_uncertain <- character(0) @@ -152,17 +153,17 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { } found[1L] } - + # Fill in names, AB codes, CID codes and ATC codes directly (`x` is already clean and uppercase) known_names <- x %in% AMR_env$AB_lookup$generalised_name x_new[known_names] <- AMR_env$AB_lookup$ab[match(x[known_names], AMR_env$AB_lookup$generalised_name)] known_codes_ab <- x %in% AMR_env$AB_lookup$ab - known_codes_atc <- vapply(FUN.VALUE = logical(1), x, function(x_) x_ %in% unlist(AMR_env$AB_lookup$atc), USE.NAMES = FALSE) + known_codes_atc <- vapply(FUN.VALUE = logical(1), gsub(" ", "", x), function(x_) x_ %in% unlist(AMR_env$AB_lookup$atc), USE.NAMES = FALSE) known_codes_cid <- x %in% AMR_env$AB_lookup$cid x_new[known_codes_ab] <- AMR_env$AB_lookup$ab[match(x[known_codes_ab], AMR_env$AB_lookup$ab)] x_new[known_codes_atc] <- AMR_env$AB_lookup$ab[vapply( FUN.VALUE = integer(1), - x[known_codes_atc], + gsub(" ", "", x[known_codes_atc]), function(x_) { which(vapply( FUN.VALUE = logical(1), @@ -182,29 +183,29 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { " for ", vector_and(prev), ". Run `ab_reset_session()` to reset this. This note will be shown once per session for this input." ) } - + already_known <- known_names | known_codes_ab | known_codes_atc | known_codes_cid | previously_coerced - + # fix for NAs x_new[is.na(x)] <- NA already_known[is.na(x)] <- FALSE - + if (loop_time == 1 && sum(already_known) < length(x)) { progress <- progress_ticker(n = sum(!already_known), n_min = 25, print = info) # start if n >= 25 on.exit(close(progress)) } - + for (i in which(!already_known)) { if (loop_time == 1) { progress$tick() } - + if (is.na(x[i]) || is.null(x[i])) { next } if (identical(x[i], "") || - # prevent "bacteria" from coercing to TMP, since Bacterial is a brand name of it: - identical(tolower(x[i]), "bacteria")) { + # prevent "bacteria" from coercing to TMP, since Bacterial is a brand name of it: + identical(tolower(x[i]), "bacteria")) { x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1]) next } @@ -215,21 +216,21 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { x_new[i] <- NA_character_ next } - + if (fast_mode == FALSE && flag_multiple_results == TRUE && x[i] %like% "[ ]") { from_text <- tryCatch(suppressWarnings(ab_from_text(x[i], loop_time = loop_time + 1, translate_ab = FALSE)[[1]]), - error = function(e) character(0) + error = function(e) character(0) ) } else { from_text <- character(0) } - + # old code for phenoxymethylpenicillin (Peni V) if (x[i] == "PNV") { x_new[i] <- "PHN" next } - + # exact LOINC code loinc_found <- unlist(lapply( AMR_env$AB_lookup$generalised_loinc, @@ -240,7 +241,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # exact synonym synonym_found <- unlist(lapply( AMR_env$AB_lookup$generalised_synonyms, @@ -251,7 +252,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # exact abbreviation abbr_found <- unlist(lapply( AMR_env$AB_lookup$generalised_abbreviations, @@ -263,7 +264,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # length of input is quite long, and Levenshtein distance is only max 2 if (nchar(x[i]) >= 10) { levenshtein <- as.double(utils::adist(x[i], AMR_env$AB_lookup$generalised_name)) @@ -273,7 +274,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { next } } - + # allow characters that resemble others, but only continue when having more than 3 characters if (nchar(x[i]) <= 3) { x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1]) @@ -303,20 +304,22 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { x_spelling <- gsub("(O|0)", "(O|0)+", x_spelling, perl = TRUE) x_spelling <- gsub("++", "+", x_spelling, fixed = TRUE) } - + # try if name starts with it found <- AMR_env$AB_lookup[which(AMR_env$AB_lookup$generalised_name %like% paste0("^", x_spelling)), "ab", drop = TRUE] if (length(found) > 0) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } + + # try if name ends with it found <- AMR_env$AB_lookup[which(AMR_env$AB_lookup$generalised_name %like% paste0(x_spelling, "$")), "ab", drop = TRUE] if (nchar(x[i]) >= 4 && length(found) > 0) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # and try if any synonym starts with it synonym_found <- unlist(lapply( AMR_env$AB_lookup$generalised_synonyms, @@ -327,244 +330,71 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # INITIAL SEARCH - More uncertain results ---- - if (loop_time <= 2 && fast_mode == FALSE) { + if (loop_time == 1 && fast_mode == FALSE) { # only run on first and second try - # base on the Levensthein distance function if length >= 6 - if (nchar(x[i]) >= 6) { - l_dist <- as.double(utils::adist(x[i], AMR_env$AB_lookup$generalised_name, - ignore.case = FALSE, - fixed = TRUE, - costs = c(insertions = 1, deletions = 2, substitutions = 2), - counts = FALSE)) - x_new[i] <- AMR_env$AB_lookup$ab[order(l_dist)][1] - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - - # try by removing all spaces - if (x[i] %like% " ") { - found <- suppressWarnings(as.ab(gsub(" +", "", x[i], perl = TRUE), loop_time = loop_time + 2)) - if (length(found) > 0 && !is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - } - - # try by removing all spaces and numbers - if (x[i] %like% " " || x[i] %like% "[0-9]") { - found <- suppressWarnings(as.ab(gsub("[ 0-9]", "", x[i], perl = TRUE), loop_time = loop_time + 2)) - if (length(found) > 0 && !is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - } - - # reverse a combination, e.g. clavulanic acid/amoxicillin - if (x[i] %like% " ") { - split <- strsplit(x[i], " ")[[1]] - permute <- function(x) { - if (length(x) == 1) return(list(x)) - result <- vector("list", factorial(length(x))) - index <- 1 - for (i in seq_along(x)) { - sub_perms <- permute(x[-i]) # Recursively get permutations of remaining elements - for (sub in sub_perms) { - result[[index]] <- c(x[i], sub) - index <- index + 1 - } - } - return(result) - } - permutations <- permute(split) - found_perms <- character(length(permutations)) - for (s in seq_len(length(permutations))) { - concat <- paste0(permutations[[s]], collapse = " ") - if (concat %in% AMR_env$AB_lookup$generalised_name) { - found_perms[s] <- AMR_env$AB_lookup[which(AMR_env$AB_lookup$generalised_name == concat), "ab", drop = TRUE] - } else { - found_perms[s] <- suppressWarnings(as.ab(concat, loop_time = loop_time + 2)) - } - } - found_perms <- found_perms[!is.na(found_perms)] - if (length(found_perms) > 0) { - found <- found_perms[order(nchar(found_perms), decreasing = TRUE)][1] - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - } - - # transform back from other languages and try again - x_translated <- paste( - lapply( - strsplit(x[i], "[^A-Z0-9]"), - function(y) { - for (i in seq_len(length(y))) { - for (lang in LANGUAGES_SUPPORTED[LANGUAGES_SUPPORTED != "en"]) { - y[i] <- ifelse(tolower(y[i]) %in% tolower(TRANSLATIONS[, lang, drop = TRUE]), - TRANSLATIONS[which(tolower(TRANSLATIONS[, lang, drop = TRUE]) == tolower(y[i]) & - !isFALSE(TRANSLATIONS$fixed)), "pattern"], - y[i] - ) - } - } - generalise_antibiotic_name(y) - } - )[[1]], - collapse = "/" - ) - x_translated_guess <- suppressWarnings(as.ab(x_translated, loop_time = loop_time + 2)) - if (!is.na(x_translated_guess)) { - x_new[i] <- x_translated_guess - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - - # now also try to coerce brandname combinations like "Amoxy/clavulanic acid" - x_translated <- paste( - lapply( - strsplit(x_translated, "[^A-Z0-9 ]"), - function(y) { - for (i in seq_len(length(y))) { - y_name <- suppressWarnings(ab_name(y[i], language = NULL, loop_time = loop_time + 2)) - y[i] <- ifelse(!is.na(y_name), - y_name, - y[i] - ) - } - generalise_antibiotic_name(y) - } - )[[1]], - collapse = "/" - ) - x_translated_guess <- suppressWarnings(as.ab(x_translated, loop_time = loop_time + 2)) - if (!is.na(x_translated_guess)) { - x_new[i] <- x_translated_guess - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - - # try by removing all trailing capitals - if (x[i] %like_case% "[a-z]+[A-Z]+$") { - found <- suppressWarnings(as.ab(gsub("[A-Z]+$", "", x[i], perl = TRUE), loop_time = loop_time + 2)) - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - } - - # keep only letters - found <- suppressWarnings(as.ab(gsub("[^A-Z]", "", x[i], perl = TRUE), loop_time = loop_time + 2)) - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - - # try from a bigger text, like from a health care record, see ?ab_from_text - # already calculated above if flag_multiple_results = TRUE - if (flag_multiple_results == TRUE) { - found <- from_text[1L] + ab_df <- AMR_env$AB_lookup + ab_df$length_name <- nchar(ab_df$generalised_name) + # now retrieve Levensthein distance for name, synonyms, and translated names + ab_df$lev_name <- as.double(utils::adist(x[i], ab_df$generalised_name, + ignore.case = FALSE, + fixed = TRUE, + costs = c(insertions = 1, deletions = 1, substitutions = 2), + counts = FALSE)) + ab_df$lev_syn <- vapply(FUN.VALUE = double(1), + ab_df$generalised_synonyms, + function(y) ifelse(length(y[nchar(y) >= 5]) == 0, + 999, + min(as.double(utils::adist(x[i], y[nchar(y) >= 5], ignore.case = FALSE, + fixed = TRUE, + costs = c(insertions = 1, deletions = 1, substitutions = 2), + counts = FALSE)), na.rm = TRUE)), + USE.NAMES = FALSE) + if (!is.null(language) && language != "en") { + ab_df$trans <- generalise_antibiotic_name(translate_AMR(ab_df$name, language = language)) + ab_df$lev_trans <- as.double(utils::adist(x[i], ab_df$trans, + ignore.case = FALSE, + fixed = TRUE, + costs = c(insertions = 1, deletions = 1, substitutions = 2), + counts = FALSE)) } else { - found <- tryCatch(suppressWarnings(ab_from_text(x[i], loop_time = loop_time + 2, translate_ab = FALSE)[[1]][1L]), - error = function(e) NA_character_ - ) + ab_df$lev_trans <- ab_df$lev_name } - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) + + if (any(ab_df$lev_name < 5, na.rm = TRUE)) { + x_new[i] <- ab_df$ab[order(ab_df$lev_name)][1] x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) next - } - - # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!) - found <- suppressWarnings(as.ab(substr(x[i], 1, 5), loop_time = loop_time + 2)) - if (!is.na(found) && ab_group(found, loop_time = loop_time + 1) %unlike% "cephalosporins") { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - next - } - found <- suppressWarnings(as.ab(substr(x[i], 1, 7), loop_time = loop_time + 2)) - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) + } else if (any(ab_df$lev_trans < 5, na.rm = TRUE)) { + x_new[i] <- ab_df$ab[order(ab_df$lev_trans)][1] x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) next - } - - # make all consonants facultative - search_str <- gsub("([BCDFGHJKLMNPQRSTVWXZ])", "\\1*", x[i], perl = TRUE) - found <- suppressWarnings(as.ab(search_str, loop_time = loop_time + 2, already_regex = TRUE)) - # keep at least 4 normal characters - if (nchar(gsub(".\\*", "", search_str, perl = TRUE)) < 4) { - found <- NA - } - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) + } else if (any(ab_df$lev_syn < 5, na.rm = TRUE)) { + x_new[i] <- ab_df$ab[order(ab_df$lev_syn)][1] x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) next - } - - # make all vowels facultative - search_str <- gsub("([AEIOUY])", "\\1*", x[i], perl = TRUE) - found <- suppressWarnings(as.ab(search_str, loop_time = loop_time + 2, already_regex = TRUE)) - # keep at least 5 normal characters - if (nchar(gsub(".\\*", "", search_str, perl = TRUE)) < 5) { - found <- NA - } - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - - # allow misspelling of vowels - x_spelling <- gsub("A+", "[AEIOU]+", x_spelling, fixed = TRUE) - x_spelling <- gsub("E+", "[AEIOU]+", x_spelling, fixed = TRUE) - x_spelling <- gsub("I+", "[AEIOU]+", x_spelling, fixed = TRUE) - x_spelling <- gsub("O+", "[AEIOU]+", x_spelling, fixed = TRUE) - x_spelling <- gsub("U+", "[AEIOU]+", x_spelling, fixed = TRUE) - found <- suppressWarnings(as.ab(x_spelling, loop_time = loop_time + 2, already_regex = TRUE)) - if (!is.na(found)) { - x_new[i] <- note_if_more_than_one_found(found, i, from_text) - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - - # try with switched character, like "mreopenem" - for (j in seq_len(nchar(x[i]))) { - x_switched <- paste0( - # beginning part: - substr(x[i], 1, j - 1), - # here is the switching of 2 characters: - substr(x[i], j + 1, j + 1), - substr(x[i], j, j), - # ending part: - substr(x[i], j + 2, nchar(x[i])) - ) - found <- suppressWarnings(as.ab(x_switched, loop_time = loop_time + 1)) - if (!is.na(found)) { - break + } else { + # then just take name if Levensthein is max 100% of length of name + ab_df$lev_len_ratio <- ab_df$lev_name / ab_df$length_name + if (any(ab_df$lev_len_ratio < 1)) { + ab_df <- ab_df[ab_df$lev_len_ratio < 1, , drop = FALSE] + x_new[i] <- ab_df$ab[order(ab_df$lev_name)][1] + x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) + next } } - if (!is.na(found)) { - x_new[i] <- found[1L] - x_uncertain <- c(x_uncertain, x_bak[x[i] == x_bak_clean][1]) - next - } - } # end of loop_time <= 2 - - # not found + } + + # nothing found x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1]) } - + if (loop_time == 1 && sum(already_known) < length(x)) { close(progress) } - + # save to package env to save time for next time if (loop_time == 1) { AMR_env$ab_previously_coerced <- AMR_env$ab_previously_coerced[which(!AMR_env$ab_previously_coerced$x %in% x), , drop = FALSE] @@ -578,7 +408,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { ) )) } - + # take failed ATC codes apart from rest if (length(x_unknown_ATCs) > 0 && fast_mode == FALSE) { warning_( @@ -619,14 +449,14 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) { ". If required, use `add_custom_antimicrobials()` to add custom entries.") } } - + x_result <- x_new[match(x_bak_clean, x)] if (length(x_result) == 0) { x_result <- NA_character_ } - + set_clean_class(x_result, - new_class = c("ab", "character") + new_class = c("ab", "character") ) } @@ -767,16 +597,20 @@ generalise_antibiotic_name <- function(x) { x <- gsub("[^A-Z0-9 -)(]", "/", x, perl = TRUE) # correct for 'high level' antibiotics x <- trimws(gsub("([^A-Z0-9/ -]+)?(HIGH(.?LE?VE?L)?|[^A-Z0-9/]H[^A-Z0-9]?L)([^A-Z0-9 -]+)?", "-HIGH", x, perl = TRUE)) - x <- trimws(gsub("^(-HIGH)(.*)", "\\2\\1", x)) + x <- trimws(gsub("^(-HIGH)(.*)", "\\2\\1", x, perl = TRUE)) # remove part between brackets if that's followed by another string x <- gsub("(.*)+ [(].*[)]", "\\1", x) - # spaces around non-characters must be removed: amox + clav -> amox/clav + # spaces around non-characters must be removed: amox + clav -> amox clav x <- gsub("(.*[A-Z0-9]) ([^A-Z0-9].*)", "\\1\\2", x, perl = TRUE) x <- gsub("(.*[^A-Z0-9]) ([A-Z0-9].*)", "\\1\\2", x, perl = TRUE) # remove hyphen after a starting "co" x <- gsub("^CO-", "CO", x, perl = TRUE) # replace operators with a space x <- gsub("(/| AND | WITH | W/|[+]|[-])+", " ", x, perl = TRUE) + # replace more than 1 space + x <- trimws(gsub(" +", " ", x, perl = TRUE)) + # move HIGH to end + x <- trimws(gsub("(.*) HIGH(.*)", "\\1\\2 HIGH", x, perl = TRUE)) x } @@ -789,9 +623,9 @@ get_translate_ab <- function(translate_ab) { } else { translate_ab <- tolower(translate_ab) stop_ifnot(translate_ab %in% colnames(AMR::antibiotics), - "invalid value for 'translate_ab', this must be a column name of the antibiotics data set\n", - "or TRUE (equals 'name') or FALSE to not translate at all.", - call = FALSE + "invalid value for 'translate_ab', this must be a column name of the antibiotics data set\n", + "or TRUE (equals 'name') or FALSE to not translate at all.", + call = FALSE ) translate_ab } diff --git a/data-raw/gpt_training_text_v2.1.1.9158.txt b/data-raw/gpt_training_text_v2.1.1.9159.txt similarity index 99% rename from data-raw/gpt_training_text_v2.1.1.9158.txt rename to data-raw/gpt_training_text_v2.1.1.9159.txt index 0ebbcddfa..deffcfe6d 100644 --- a/data-raw/gpt_training_text_v2.1.1.9158.txt +++ b/data-raw/gpt_training_text_v2.1.1.9159.txt @@ -1,6 +1,6 @@ This knowledge base contains all context you must know about the AMR package for R. You are a GPT trained to be an assistant for the AMR package in R. You are an incredible R specialist, especially trained in this package and in the tidyverse. -First and foremost, you are trained on version 2.1.1.9158. Remember this whenever someone asks which AMR package version you’re at. +First and foremost, you are trained on version 2.1.1.9159. Remember this whenever someone asks which AMR package version you’re at. Below are the contents of the file, the file, and all the files (documentation) in the package. Every file content is split using 100 hypens. ---------------------------------------------------------------------------------------------------- @@ -2577,7 +2577,8 @@ THE PART HEREAFTER CONTAINS CONTENTS FROM FILE 'man/as.ab.Rd': \alias{ab_reset_session} \title{Transform Input to an Antibiotic ID} \usage{ -as.ab(x, flag_multiple_results = TRUE, info = interactive(), ...) +as.ab(x, flag_multiple_results = TRUE, language = get_AMR_locale(), + info = interactive(), ...) is.ab(x) @@ -2588,6 +2589,8 @@ ab_reset_session() \item{flag_multiple_results}{a \link{logical} to indicate whether a note should be printed to the console that probably more than one antibiotic drug code or name can be retrieved from a single input value.} +\item{language}{language to coerce input values from any of the 20 supported languages - default to the system language if supported (see \code{\link[=get_AMR_locale]{get_AMR_locale()}})} + \item{info}{a \link{logical} to indicate whether a progress bar should be printed - the default is \code{TRUE} only in interactive mode} \item{...}{arguments passed on to internal functions} @@ -2644,7 +2647,6 @@ as.ab("J01FA01") as.ab("J 01 FA 01") as.ab("Erythromycin") as.ab("eryt") -as.ab(" eryt 123") as.ab("ERYT") as.ab("ERY") as.ab("eritromicine") # spelled wrong, yet works diff --git a/man/as.ab.Rd b/man/as.ab.Rd index f29314f22..80ff9a700 100644 --- a/man/as.ab.Rd +++ b/man/as.ab.Rd @@ -7,7 +7,8 @@ \alias{ab_reset_session} \title{Transform Input to an Antibiotic ID} \usage{ -as.ab(x, flag_multiple_results = TRUE, info = interactive(), ...) +as.ab(x, flag_multiple_results = TRUE, language = get_AMR_locale(), + info = interactive(), ...) is.ab(x) @@ -18,6 +19,8 @@ ab_reset_session() \item{flag_multiple_results}{a \link{logical} to indicate whether a note should be printed to the console that probably more than one antibiotic drug code or name can be retrieved from a single input value.} +\item{language}{language to coerce input values from any of the 20 supported languages - default to the system language if supported (see \code{\link[=get_AMR_locale]{get_AMR_locale()}})} + \item{info}{a \link{logical} to indicate whether a progress bar should be printed - the default is \code{TRUE} only in interactive mode} \item{...}{arguments passed on to internal functions} @@ -74,7 +77,6 @@ as.ab("J01FA01") as.ab("J 01 FA 01") as.ab("Erythromycin") as.ab("eryt") -as.ab(" eryt 123") as.ab("ERYT") as.ab("ERY") as.ab("eritromicine") # spelled wrong, yet works diff --git a/tests/testthat/test-ab.R b/tests/testthat/test-ab.R index 0a3a40eb7..e4dbb58ff 100755 --- a/tests/testthat/test-ab.R +++ b/tests/testthat/test-ab.R @@ -27,20 +27,21 @@ # how to conduct AMR data analysis: https://msberends.github.io/AMR/ # # ==================================================================== # +ab_reset_session() + expect_equal( as.character(as.ab(c( "J01FA01", "J 01 FA 01", "Erythromycin", "eryt", - " eryt 123", "ERYT", "ERY", "erytromicine", "Erythrocin", "Romycin" ))), - rep("ERY", 10) + rep("ERY", 9) ) expect_identical(class(as.ab("amox")), c("ab", "character")) @@ -49,7 +50,7 @@ expect_true(is.ab(as.ab("amox"))) expect_stdout(print(as.ab("amox"))) expect_stdout(print(data.frame(a = as.ab("amox")))) -# expect_warning(as.ab("J00AA00")) # ATC not yet available in data set +expect_warning(as.ab("J00AA00")) # ATC not yet available in data set # expect_warning(as.ab("UNKNOWN")) expect_stdout(print(as.ab("amox"))) diff --git a/tests/testthat/test-ab_from_text.R b/tests/testthat/test-ab_from_text.R index 4321941cd..3ae1fc36e 100644 --- a/tests/testthat/test-ab_from_text.R +++ b/tests/testthat/test-ab_from_text.R @@ -27,20 +27,22 @@ # how to conduct AMR data analysis: https://msberends.github.io/AMR/ # # ==================================================================== # +ab_reset_session() + expect_identical( - ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds")[[1]], + ab_from_text("28/03/2020 amoxicilliin 500mg po tds")[[1]], as.ab("Amoxicillin") ) expect_identical( - ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", thorough_search = TRUE)[[1]], + ab_from_text("28/03/2020 amoxicilliin 500mg po tds", thorough_search = TRUE)[[1]], as.ab("Amoxicillin") ) expect_identical( - ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", thorough_search = FALSE)[[1]], + ab_from_text("28/03/2020 amoxicilliin 500mg po tds", thorough_search = FALSE)[[1]], as.ab("Amoxicillin") ) expect_identical( - ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]], + ab_from_text("28/03/2020 amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]], "Amoxicillin" ) expect_identical( @@ -49,10 +51,10 @@ expect_identical( ) expect_identical( - ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "dose")[[1]], + ab_from_text("28/03/2020 amoxicilliin 500mg po tds", type = "dose")[[1]], 500 ) expect_identical( - ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "admin")[[1]], + ab_from_text("28/03/2020 amoxicilliin 500mg po tds", type = "admin")[[1]], "oral" ) diff --git a/tests/testthat/test-ab_property.R b/tests/testthat/test-ab_property.R index a4ba1718e..0105af624 100644 --- a/tests/testthat/test-ab_property.R +++ b/tests/testthat/test-ab_property.R @@ -27,6 +27,8 @@ # how to conduct AMR data analysis: https://msberends.github.io/AMR/ # # ==================================================================== # +ab_reset_session() + expect_identical(ab_name("AMX", language = NULL), "Amoxicillin") expect_identical(ab_atc("AMX"), "J01CA04") expect_identical(ab_cid("AMX"), as.integer(33613)) @@ -94,3 +96,4 @@ if (AMR:::pkg_is_available("dplyr", min_version = "1.0.0", also_load = TRUE)) { set_ab_names(NIT:VAN) %>% colnames()))) } +