From cd178ee56968ccec7a8e04aa3f27a3bbebb55e74 Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Thu, 12 Sep 2019 15:08:53 +0200 Subject: [PATCH] (v0.7.1.9072) key_antibiotics() for foreign systems --- DESCRIPTION | 4 +- NEWS.md | 7 +- R/age.R | 4 + R/atc_online.R | 2 +- R/key_antibiotics.R | 25 +- R/like.R | 12 +- R/misc.R | 55 - R/mo.R | 52 +- R/mo2.R | 1489 +++++++++++++++++++++++++ R/sysdata.rda | Bin 7464 -> 7450 bytes R/{get_locale.R => translate.R} | 55 +- data-raw/eucast_rules.tsv | 8 +- data-raw/translations.tsv | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 13 +- docs/reference/AMR-deprecated.html | 2 +- docs/reference/age.html | 2 +- docs/reference/age_groups.html | 2 +- docs/reference/index.html | 2 +- docs/reference/p_symbol.html | 2 +- docs/reference/translate.html | 4 +- man/translate.Rd | 2 +- tests/testthat/test-data.R | 2 + tests/testthat/test-key_antibiotics.R | 3 + tests/testthat/test-like.R | 2 +- 28 files changed, 1640 insertions(+), 119 deletions(-) create mode 100644 R/mo2.R rename R/{get_locale.R => translate.R} (69%) diff --git a/DESCRIPTION b/DESCRIPTION index e5420975..55bf8cad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.7.1.9071 -Date: 2019-09-03 +Version: 0.7.1.9072 +Date: 2019-09-12 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 51260642..84de5238 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 0.7.1.9071 -Last updated: 03-Sep-2019 +# AMR 0.7.1.9072 +Last updated: 12-Sep-2019 ### Breaking * Determination of first isolates now **excludes** all 'unknown' microorganisms at default, i.e. microbial code `"UNKNOWN"`. They can be included with the new parameter `include_unknown`: @@ -99,6 +99,9 @@ * Function `availability()` now uses `portion_R()` instead of `portion_IR()`, to comply with EUCAST insights * Functions `age()` and `age_groups()` now have a `na.rm` parameter to remove empty values * Renamed function `p.symbol()` to `p_symbol()` (the former is now deprecated and will be removed in a future version) +* Using negative values for `x` in `age_groups()` will now introduce `NA`s and not return an error anymore +* Fix for determining the system's language +* Fix for `key_antibiotics()` on foreign systems #### Other * Added Prof Dr Casper Albers as doctoral advisor and Dr Bart Meijer, Dr Dennis Souverein and Annick Lenglet as contributors diff --git a/R/age.R b/R/age.R index 4a7f35d4..fb92d4c8 100755 --- a/R/age.R +++ b/R/age.R @@ -145,6 +145,10 @@ age_groups <- function(x, split_at = c(12, 25, 55, 75), na.rm = FALSE) { if (!is.numeric(x)) { stop("`x` and must be numeric, not a ", paste0(class(x), collapse = "/"), ".") } + if (any(x < 0, na.rm = TRUE)) { + x[x < 0] <- NA + warning("NAs introduced for ages below 0.") + } if (is.character(split_at)) { split_at <- split_at[1L] if (split_at %like% "^(child|kid|junior)") { diff --git a/R/atc_online.R b/R/atc_online.R index 469c28be..a7eddce6 100644 --- a/R/atc_online.R +++ b/R/atc_online.R @@ -81,7 +81,7 @@ atc_online_property <- function(atc_code, } if (!all(atc_code %in% AMR::antibiotics)) { - atc_code <- as.character(as.atc(atc_code)) + atc_code <- as.character(ab_atc(atc_code)) } if (!curl::has_internet()) { diff --git a/R/key_antibiotics.R b/R/key_antibiotics.R index 71ebb29c..81d12899 100755 --- a/R/key_antibiotics.R +++ b/R/key_antibiotics.R @@ -184,25 +184,28 @@ key_antibiotics <- function(x, # join to microorganisms data set x <- x %>% + as.data.frame(stringsAsFactors = FALSE) %>% mutate_at(vars(col_mo), as.mo) %>% left_join_microorganisms(by = col_mo) %>% mutate(key_ab = NA_character_, - gramstain = mo_gramstain(pull(., col_mo))) - + gramstain = mo_gramstain(pull(., col_mo), language = NULL)) + # Gram + x <- x %>% mutate(key_ab = if_else(gramstain == "Gram-positive", - apply(X = x[, gram_positive], - MARGIN = 1, - FUN = function(x) paste(x, collapse = "")), + tryCatch(apply(X = x[, gram_positive], + MARGIN = 1, + FUN = function(x) paste(x, collapse = "")), + error = function(e) paste0(rep(".", 12), collapse = "")), key_ab)) - + # Gram - x <- x %>% mutate(key_ab = if_else(gramstain == "Gram-negative", - apply(X = x[, gram_negative], - MARGIN = 1, - FUN = function(x) paste(x, collapse = "")), + tryCatch(apply(X = x[, gram_negative], + MARGIN = 1, + FUN = function(x) paste(x, collapse = "")), + error = function(e) paste0(rep(".", 12), collapse = "")), key_ab)) # format @@ -211,6 +214,10 @@ key_antibiotics <- function(x, gsub('(NA|NULL)', '.', .) %>% gsub('[^SIR]', '.', ., ignore.case = TRUE) %>% toupper() + + if (n_distinct(key_abs) == 1) { + warning("No distinct key antibiotics determined.", call. = FALSE) + } key_abs diff --git a/R/like.R b/R/like.R index f69d5fe8..6819e44c 100755 --- a/R/like.R +++ b/R/like.R @@ -56,8 +56,16 @@ like <- function(x, pattern) { if (length(pattern) > 1) { if (length(x) != length(pattern)) { - pattern <- pattern[1] - warning('only the first element of argument `pattern` used for `%like%`', call. = TRUE) + if (length(x) == 1) { + x <- rep(x, length(pattern)) + } + # return TRUE for every 'x' that matches any 'pattern', FALSE otherwise + res <- sapply(pattern, function(pttrn) x %like% pttrn) + res2 <- as.logical(rowSums(res)) + # get only first item of every hit in pattern + res2[duplicated(res)] <- FALSE + res2[rowSums(res) == 0] <- NA + return(res2) } else { # x and pattern are of same length, so items with each other res <- vector(length = length(pattern)) diff --git a/R/misc.R b/R/misc.R index a5b05ac4..ffe9ee5b 100755 --- a/R/misc.R +++ b/R/misc.R @@ -123,61 +123,6 @@ stopifnot_installed_package <- function(package) { return(invisible()) } -# translate strings based on inst/translations.tsv -#' @importFrom dplyr %>% filter -translate_AMR <- function(from, language = get_locale(), only_unknown = FALSE) { - # if (getOption("AMR_locale", "en") != language) { - # language <- getOption("AMR_locale", "en") - # } - - if (is.null(language)) { - return(from) - } - if (language %in% c("en", "")) { - return(from) - } - - df_trans <- translations_file # internal data file - - if (!language %in% df_trans$lang) { - stop("Unsupported language: '", language, "' - use one of: ", - paste0("'", sort(unique(df_trans$lang)), "'", collapse = ", "), - call. = FALSE) - } - - df_trans <- df_trans %>% filter(lang == language) - if (only_unknown == TRUE) { - df_trans <- df_trans %>% filter(pattern %like% "unknown") - } - - # default case sensitive if value if 'ignore.case' is missing: - df_trans$ignore.case[is.na(df_trans$ignore.case)] <- FALSE - # default not using regular expressions (fixed = TRUE) if 'fixed' is missing: - df_trans$fixed[is.na(df_trans$fixed)] <- TRUE - - # check if text to look for is in one of the patterns - any_form_in_patterns <- tryCatch(any(from %like% paste0("(", paste(df_trans$pattern, collapse = "|"), ")")), - error = function(e) { - warning("Translation not possible. Please open an issue on GitLab (https://gitlab.com/msberends/AMR/issues) or GitHub (https://github.com/msberends/AMR/issues).", call. = FALSE) - return(FALSE) - }) - if (NROW(df_trans) == 0 | !any_form_in_patterns) { - return(from) - } - - for (i in 1:nrow(df_trans)) { - from <- gsub(x = from, - pattern = df_trans$pattern[i], - replacement = df_trans$replacement[i], - fixed = df_trans$fixed[i], - ignore.case = df_trans$ignore.case[i]) - } - - # force UTF-8 for diacritics - base::enc2utf8(from) - -} - "%or%" <- function(x, y) { if (is.null(x) | is.null(y)) { if (is.null(x)) { diff --git a/R/mo.R b/R/mo.R index b68da3f6..520fc225 100755 --- a/R/mo.R +++ b/R/mo.R @@ -432,13 +432,13 @@ exec_as.mo <- function(x, } x <- y - } else if (all(x %in% read_mo_history(uncertainty_level, - force = force_mo_history)$x)) { - # previously found code - x <- microorganismsDT[data.table(mo = get_mo_history(x, - uncertainty_level, - force = force_mo_history)), - on = "mo", ..property][[1]] + # } else if (all(x %in% read_mo_history(uncertainty_level, + # force = force_mo_history)$x)) { + # # previously found code + # x <- microorganismsDT[data.table(mo = get_mo_history(x, + # uncertainty_level, + # force = force_mo_history)), + # on = "mo", ..property][[1]] } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { # we need special treatment for very prevalent full names, they are likely! @@ -561,17 +561,17 @@ exec_as.mo <- function(x, progress$tick()$print() - if (initial_search == TRUE) { - found <- microorganismsDT[mo == get_mo_history(x_backup[i], - uncertainty_level, - force = force_mo_history), - ..property][[1]] - # previously found result - if (length(found) > 0) { - x[i] <- found[1L] - next - } - } + # if (initial_search == TRUE) { + # found <- microorganismsDT[mo == get_mo_history(x_backup[i], + # uncertainty_level, + # force = force_mo_history), + # ..property][[1]] + # # previously found result + # if (length(found) > 0) { + # x[i] <- found[1L] + # next + # } + # } found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] # is a valid MO code @@ -826,6 +826,7 @@ exec_as.mo <- function(x, if (initial_search == TRUE) { set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) } + next } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup_without_spp[i], ignore.case = FALSE)) { # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] @@ -833,11 +834,11 @@ exec_as.mo <- function(x, set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) } uncertainties <- rbind(uncertainties, - data.frame(uncertainty_level = 1, - input = x_backup_without_spp[i], - result_mo = "B_SLMNL_ENT")) + format_uncertainty_as_df(uncertainty_level = 1, + input = x_backup_without_spp[i], + result_mo = "B_SLMNL_ENT")) + next } - next } # trivial names known to the field: @@ -1850,8 +1851,11 @@ mo_renamed <- function() { #' @export #' @noRd print.mo_renamed <- function(x, ...) { - items <- getOption("mo_renamed") - base::message(blue(paste("NOTE:", names(items), "was renamed", items, collapse = "\n"), collapse = "\n")) + items <- x #getOption("mo_renamed") + old <- names(x) + new <- x + + cat(blue(paste("NOTE:", italic(names(items)), "was renamed", italic(items), collapse = "\n"), collapse = "\n")) } nr2char <- function(x) { diff --git a/R/mo2.R b/R/mo2.R new file mode 100644 index 00000000..8ed1c821 --- /dev/null +++ b/R/mo2.R @@ -0,0 +1,1489 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# SOURCE # +# https://gitlab.com/msberends/AMR # +# # +# LICENCE # +# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # +# # +# This R package is free software; you can freely use and distribute # +# it for both personal and commercial purposes under the terms of the # +# GNU General Public License version 2.0 (GNU GPL-2), as published by # +# the Free Software Foundation. # +# # +# This R package was created for academic research and was publicly # +# released in the hope that it will be useful, but it comes WITHOUT # +# ANY WARRANTY OR LIABILITY. # +# Visit our website for more info: https://msberends.gitlab.io/AMR. # +# ==================================================================== # + +# THIS IS A TEST FUNCTION +as.mo2 <- function(x, + Becker = FALSE, + Lancefield = FALSE, + allow_uncertain = TRUE, + reference_df = get_mo_source(), + property = "mo", + initial_search = TRUE, + dyslexia_mode = FALSE, + force_mo_history = FALSE, + debug = FALSE) { + + if (!"AMR" %in% base::.packages()) { + require("AMR") + # check onLoad() in R/zzz.R: data tables are created there. + } + + # WHONET: xxx = no growth + x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_ + + if (initial_search == TRUE) { + options(mo_failures = NULL) + options(mo_uncertainties = NULL) + options(mo_renamed = NULL) + } + options(mo_renamed_last_run = NULL) + + if (NCOL(x) == 2) { + # support tidyverse selection like: df %>% select(colA, colB) + # paste these columns together + x_vector <- vector("character", NROW(x)) + for (i in 1:NROW(x)) { + x_vector[i] <- paste(pull(x[i,], 1), pull(x[i,], 2), sep = " ") + } + x <- x_vector + } else { + if (NCOL(x) > 2) { + stop('`x` can be 2 columns at most', call. = FALSE) + } + x[is.null(x)] <- NA + + # support tidyverse selection like: df %>% select(colA) + if (!is.vector(x) & !is.null(dim(x))) { + x <- pull(x, 1) + } + } + + notes <- character(0) + uncertainties <- data.frame(uncertainty = integer(0), + input = character(0), + fullname = character(0), + renamed_to = character(0), + mo = character(0), + stringsAsFactors = FALSE) + failures <- character(0) + uncertainty_level <- translate_allow_uncertain(allow_uncertain) + + + # x_input <- x + # already strip leading and trailing spaces + #x <- trimws(x, which = "both") + # only check the uniques, which is way faster + #x <- unique(x) + # remove empty values (to later fill them in again with NAs) + # ("xxx" is WHONET code for 'no growth') + # x <- x[!is.na(x) + # & !is.null(x) + # & !identical(x, "") + # & !identical(x, "xxx")] + + # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life) + if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { + leftpart <- gsub("^([BFP]_[A-Z]{3,7}).*", "\\1", x) + if (any(leftpart %in% names(mo_codes_v0.5.0))) { + rightpart <- gsub("^[BFP]_[A-Z]{3,7}(.*)", "\\1", x) + leftpart <- mo_codes_v0.5.0[leftpart] + x[!is.na(leftpart)] <- paste0(leftpart[!is.na(leftpart)], rightpart[!is.na(leftpart)]) + } + # now check if some are still old + still_old <- x[x %in% names(mo_codes_v0.5.0)] + if (length(still_old) > 0) { + x[x %in% names(mo_codes_v0.5.0)] <- data.frame(old = still_old, stringsAsFactors = FALSE) %>% + left_join(data.frame(old = names(mo_codes_v0.5.0), + new = mo_codes_v0.5.0, + stringsAsFactors = FALSE), by = "old") %>% + # if they couldn't be found, replace them with the old ones again, + # so they will throw a warning in the end + mutate(new = ifelse(is.na(new), old, new)) %>% + pull(new) + } + } + + # # defined df to check for + # if (!is.null(reference_df)) { + # if (!mo_source_isvalid(reference_df)) { + # stop("`reference_df` must contain a column `mo` with values from the 'microorganisms' data set.", call. = FALSE) + # } + # reference_df <- reference_df %>% filter(!is.na(mo)) + # # keep only first two columns, second must be mo + # if (colnames(reference_df)[1] == "mo") { + # reference_df <- reference_df[, c(2, 1)] + # } else { + # reference_df <- reference_df[, c(1, 2)] + # } + # colnames(reference_df)[1] <- "x" + # # remove factors, just keep characters + # suppressWarnings( + # reference_df[] <- lapply(reference_df, as.character) + # ) + # } + # + # # all empty + # if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) { + # if (property == "mo") { + # return(to_class_mo(rep(NA_character_, length(x_input)))) + # } else { + # return(rep(NA_character_, length(x_input))) + # } + # + # } else if (all(x %in% reference_df[, 1][[1]])) { + # # all in reference df + # colnames(reference_df)[1] <- "x" + # suppressWarnings( + # x <- data.frame(x = x, stringsAsFactors = FALSE) %>% + # left_join(reference_df, by = "x") %>% + # left_join(AMR::microorganisms, by = "mo") %>% + # pull(property) + # ) + # + # } else if (all(x %in% AMR::microorganisms$mo)) { + # # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL") + # y <- microorganismsDT[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]] + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(mo = x[is.na(y)]), + # on = "mo", + # ..property][[1]] + # } + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(mo = x[is.na(y)]), + # on = "mo", + # ..property][[1]] + # } + # x <- y + # + # # } else if (all(x %in% read_mo_history(uncertainty_level, + # # force = force_mo_history)$x)) { + # # # previously found code + # # x <- microorganismsDT[data.table(mo = get_mo_history(x, + # # uncertainty_level, + # # force = force_mo_history)), + # # on = "mo", ..property][[1]] + # + # } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { + # # we need special treatment for very prevalent full names, they are likely! + # # e.g. as.mo("Staphylococcus aureus") + # y <- microorganismsDT[prevalence == 1][data.table(fullname_lower = tolower(x)), on = "fullname_lower", ..property][[1]] + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(fullname_lower = tolower(x[is.na(y)])), + # on = "fullname_lower", + # ..property][[1]] + # } + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(fullname_lower = tolower(x[is.na(y)])), + # on = "fullname_lower", + # ..property][[1]] + # } + # x <- y + # + # } else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) { + # # commonly used MO codes + # y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ] + # # save them to history + # set_mo_history(x, y$mo, 0, force = force_mo_history) + # + # x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] + # + # } else if (!all(x %in% AMR::microorganisms[, property])) { + # + if (1 == 1) { + strip_whitespace <- function(x) { + # all whitespaces (tab, new lines, etc.) should be one space + # and spaces before and after should be omitted + trimws(gsub("[\\s]+", " ", x, perl = TRUE), which = "both") + } + + + x_new <- rep(NA_character_, length(x)) + + # keep only dots, letters, numbers, slashes, spaces and dashes + x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) + # remove spp and species + x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x, ignore.case = TRUE) + # remove 'genus' as first word + x <- gsub("^genus ", "", x, ignore.case = TRUE) + # remove 'uncertain'-like texts + x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) + x <- strip_whitespace(x) + x_backup <- x + + # remove spp and species + #x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE) + #x <- strip_whitespace(x) + + x_backup_without_spp <- x + x_species <- paste(x, "species") + # translate to English for supported languages of mo_property + x <- gsub("(gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) + x <- gsub("(vergroen)[a-z]*", "viridans", x, ignore.case = TRUE) + x <- gsub("(hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*", "yeast", x, ignore.case = TRUE) + x <- gsub("(schimmels?|mofo|molde|stampo|moisissure|fungi)[a-z]*", "fungus", x, ignore.case = TRUE) + x <- gsub("Fungus[ph|f]rya", "Fungiphrya", x, ignore.case = TRUE) + # remove non-text in case of "E. coli" except dots and spaces + # x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) + # replace minus by a space + x <- gsub("-+", " ", x) + # replace hemolytic by haemolytic + x <- gsub("ha?emoly", "haemoly", x, ignore.case = TRUE) + # place minus back in streptococci + x <- gsub("(alpha|beta|gamma).?ha?emoly", "\\1-haemoly", x, ignore.case = TRUE) + # remove genus as first word + # x <- gsub("^genus ", "", x, ignore.case = TRUE) + # remove 'uncertain' like texts + # x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) + # allow characters that resemble others = dyslexia_mode ---- + if (dyslexia_mode == TRUE) { + x <- tolower(x) + x <- gsub("[iy]+", "[iy]+", x) + x <- gsub("(c|k|q|qu|s|z|x|ks)+", "(c|k|q|qu|s|z|x|ks)+", x) + x <- gsub("(ph|hp|f|v)+", "(ph|hp|f|v)+", x) + x <- gsub("(th|ht|t)+", "(th|ht|t)+", x) + x <- gsub("a+", "a+", x) + x <- gsub("u+", "u+", x) + # allow any ending of -um, -us, -ium, -icum, -ius, -icus, -ica and -a (needs perl for the negative backward lookup): + x <- gsub("(u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z])", + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("(\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z])", + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("(\\[iy\\]\\+u\\+m)(?![a-z])", + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("e+", "e+", x, ignore.case = TRUE) + x <- gsub("o+", "o+", x, ignore.case = TRUE) + x <- gsub("(.)\\1+", "\\1+", x) + # allow ending in -en or -us + x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, ignore.case = TRUE, perl = TRUE) + # if the input is longer than 10 characters, allow any constant between all characters, as some might have forgotten a character + # this will allow "Pasteurella damatis" to be correctly read as "Pasteurella dagmatis". + constants <- paste(letters[!letters %in% c("a", "e", "i", "o", "u")], collapse = "") + #x[nchar(x_backup_without_spp) > 10] <- gsub("([a-z])([a-z])", paste0("\\1[", constants, "]?\\2"), x[nchar(x_backup_without_spp) > 10], ignore.case = TRUE) + x[nchar(x_backup_without_spp) > 10] <- gsub("[+]", paste0("+[", constants, "]?"), x[nchar(x_backup_without_spp) > 10]) + } + x <- strip_whitespace(x) + + x_trimmed <- x + x_trimmed_species <- paste(x_trimmed, "species") + x_trimmed_without_group <- gsub(" gro.u.p$", "", x_trimmed, ignore.case = TRUE) + # remove last part from "-" or "/" + x_trimmed_without_group <- gsub("(.*)[-/].*", "\\1", x_trimmed_without_group) + # replace space and dot by regex sign + x_withspaces <- gsub("[ .]+", ".* ", x) + x <- gsub("[ .]+", ".*", x) + # add start en stop regex + x <- paste0('^', x, '$') + x_withspaces_start_only <- paste0('^', x_withspaces) + x_withspaces_end_only <- paste0(x_withspaces, '$') + x_withspaces_start_end <- paste0('^', x_withspaces, '$') + + if (isTRUE(debug)) { + print(data.frame( + x_backup, + x, + x_species, + x_withspaces_start_only, + x_withspaces_end_only, + x_withspaces_start_end, + x_backup_without_spp, + x_trimmed, + x_trimmed_species, + x_trimmed_without_group), right = FALSE) + # cat(paste0('x "', x, '"\n')) + # cat(paste0('x_species "', x_species, '"\n')) + # cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n')) + # cat(paste0('x_withspaces_end_only "', x_withspaces_end_only, '"\n')) + # cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n')) + # cat(paste0('x_backup "', x_backup, '"\n')) + # cat(paste0('x_backup_without_spp "', x_backup_without_spp, '"\n')) + # cat(paste0('x_trimmed "', x_trimmed, '"\n')) + # cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) + # cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) + } + + #progress <- progress_estimated(n = length(x), min_time = 3) + + # THE NEW WAY ---- + nothing_more_to_do <- function() !any(is.na(x_new) & !is.na(x_backup)) + + lookup_regexes <- function(data, property, regex) { + prop <- regex %>% + sapply(function(pattern) pull(data, property) %like% pattern) %>% + as.data.frame() %>% + lapply(function(c) suppressWarnings(min(pull(data, property)[c]))) %>% + unlist() + if (is.null(prop)) { + return(rep(NA, length(regex))) + } + DT <- data.table(prop) + colnames(DT) <- property + microorganismsDT[DT, on = property, "mo"][[1]] + } + + # LATER: only unique if more than 500 values, of which max 85% distinct? + + x_backup_upper <- toupper(x_backup) + x_backup_lower <- tolower(x_backup) + + # exclude all viruses (there is no fullname containing 'virus' in the data set) + x_new[x_backup %like% "virus"] <- "UNKNOWN" + + # try available fields in the microorganisms data set + x_new[x_backup_upper %in% microorganisms$mo] <- microorganismsDT[data.table(mo = x_backup_upper[x_backup_upper %in% microorganisms$mo]), on = "mo", "mo"][[1]] + x_new[x_backup_lower %in% microorganismsDT$fullname_lower] <- microorganismsDT[data.table(fullname_lower = x_backup_lower[x_backup_lower %in% microorganismsDT$fullname_lower]), on = "fullname_lower", "mo"][[1]] + # x_new[x_backup %in% microorganisms$col_id] <- microorganismsDT[data.table(col_id = as.integer(x_backup[x_backup %in% microorganisms$col_id])), on = "col_id", ..property][[1]] + + # old names + old_names <- x_backup[x_backup_lower %in% microorganisms.oldDT$fullname_lower] + x_new[x_backup_lower %in% microorganisms.oldDT$fullname_lower] <- microorganismsDT[microorganisms.oldDT[data.table(fullname_lower = x_backup_lower[x_backup_lower %in% microorganisms.oldDT$fullname_lower]), on = "fullname_lower", "col_id_new"], on = c("col_id" = "col_id_new"), "mo"][[1]] + + if (nothing_more_to_do()) { + if (property != "mo") { + return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) + } else { + return(to_class_mo(x_new)) + } + } + + # codes from the microorganisms.codes data set + x_new[x_backup_upper %in% microorganisms.codes$code] <- as.data.table(microorganisms.codes)[data.table(code = x_backup_upper[x_backup_upper %in% microorganisms.codes$code]), on = "code", "mo"][[1]] + if (!is.null(reference_df)) { + colnames(reference_df)[1] <- "code" + x_new[x_backup_upper %in% reference_df$code] <- as.data.table(reference_df)[data.table(code = x_backup_upper[x_backup_upper %in% reference_df$code]), on = "code", mo][[1]] + if (!all(x_new %in% microorganisms$mo, na.rm = TRUE)) { + warning("Values ", paste(x_new[!x_new %in% c(NA, microorganisms$mo)], collapse = ", "), " found in reference_df, but these are not valid MO codes.", call. = FALSE) + x_new[!x_new %in% c(NA, microorganisms$mo)] <- "UNKNOWN" + } + } + + x_new[x_backup_upper %in% c("MRSA", "MSSA", "VISA", "VRSA")] <- "B_STPHY_AUR" + x_new[x_backup_upper %in% c("MRSE", "MSSE")] <- "B_STPHY_EPI" + x_new[x_backup_upper %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") + | x_backup_upper %like% "O?(26|103|104|111|121|145|157)"] <- "B_ESCHR_COL" + x_new[x_backup_upper %in% c("MRPA")] <- "B_PSDMN_AER" + x_new[x_backup_upper %in% c("CRSM")] <- "B_STNTR_MAL" + x_new[x_backup_upper %in% c("PISP", "PRSP", "VISP", "VRSP")] <- "B_STRPT_PNE" + x_new[x_backup_upper %in% c("VRE") + | x_backup %like% "(enterococci|enterokok|enterococo)[a-z]*?$"] <- "B_ENTRC" + + # start showing progress bar here + progress <- progress_estimated(n = 3, min_time = 0) + # most prevalent (1) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) + if (nothing_more_to_do()) { + if (property != "mo") { + return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) + } else { + return(to_class_mo(x_new)) + } + } + progress$tick()$print() + # less prevalent (2) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) + if (nothing_more_to_do()) { + if (property != "mo") { + return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) + } else { + return(to_class_mo(x_new)) + } + } + progress$tick()$print() + # least prevalent (3) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) + + # all others are UNKNOWN + x_new[!is.na(x_backup) & is.na(x_new)] <- "UNKNOWN" + progress$tick()$print() + + return(to_class_mo(x_new)) + #for (i in 1:length(x)) { + for (i in character(0)) { + + x[i] <- "UNKNOWN" + next + + # progress$tick()$print() + + # if (initial_search == TRUE) { + # found <- microorganismsDT[mo == get_mo_history(x_backup[i], + # uncertainty_level, + # force = force_mo_history), + # ..property][[1]] + # # previously found result + # if (length(found) > 0) { + # x[i] <- found[1L] + # next + # } + # } + + found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] + # is a valid MO code + if (length(found) > 0) { + x[i] <- found[1L] + next + } + + found <- microorganismsDT[fullname_lower %in% tolower(c(x_backup[i], x_backup_without_spp[i])), ..property][[1]] + # most probable: is exact match in fullname + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + found <- microorganismsDT[col_id == x_backup[i], ..property][[1]] + # is a valid Catalogue of Life ID + if (NROW(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + + # WHONET: xxx = no growth + if (tolower(as.character(paste0(x_backup_without_spp[i], ""))) %in% c("", "xxx", "na", "nan")) { + x[i] <- NA_character_ + next + } + + if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) { + # empty and nonsense values, ignore without warning + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # check for very small input, but ignore the O antigens of E. coli + if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3 + & !x_backup_without_spp[i] %like% "O?(26|103|104|104|111|121|145|157)") { + # check if search term was like "A. species", then return first genus found with ^A + # if (x_backup[i] %like% "[a-z]+ species" | x_backup[i] %like% "[a-z] spp[.]?") { + # # get mo code of first hit + # found <- microorganismsDT[fullname %like% x_withspaces_start_only[i], mo] + # if (length(found) > 0) { + # mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_") + # found <- microorganismsDT[mo == mo_code, ..property][[1]] + # # return first genus that begins with x_trimmed, e.g. when "E. spp." + # if (length(found) > 0) { + # x[i] <- found[1L] + # if (initial_search == TRUE) { + # set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + # } + # next + # } + # } + # } + # fewer than 3 chars and not looked for species, add as failure + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + if (x_backup_without_spp[i] %like% "virus") { + # there is no fullname like virus, so don't try to coerce it + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # translate known trivial abbreviations to genus + species ---- + if (!is.na(x_trimmed[i])) { + if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { + x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) { + x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) == "VRE" + | x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { + x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # support for: + # - AIEC (Adherent-Invasive E. coli) + # - ATEC (Atypical Entero-pathogenic E. coli) + # - DAEC (Diffusely Adhering E. coli) + # - EAEC (Entero-Aggresive E. coli) + # - EHEC (Entero-Haemorrhagic E. coli) + # - EIEC (Entero-Invasive E. coli) + # - EPEC (Entero-Pathogenic E. coli) + # - ETEC (Entero-Toxigenic E. coli) + # - NMEC (Neonatal Meningitis‐causing E. coli) + # - STEC (Shiga-toxin producing E. coli) + # - UPEC (Uropathogenic E. coli) + if (toupper(x_backup_without_spp[i]) %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") + # also support O-antigens of E. coli: O26, O103, O104, O111, O121, O145, O157 + | x_backup_without_spp[i] %like% "O?(26|103|104|111|121|145|157)") { + x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) == 'MRPA') { + # multi resistant P. aeruginosa + x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) == 'CRS' + | toupper(x_backup_without_spp[i]) == 'CRSM') { + # co-trim resistant S. maltophilia + x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) { + # peni I, peni R, vanco I, vanco R: S. pneumoniae + x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') { + # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB) + x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') { + # Streptococci in different languages, like "estreptococos grupo B" + x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { + # Streptococci in different languages, like "Group A Streptococci" + x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'haemoly.*strept') { + # Haemolytic streptococci in different languages + x[i] <- microorganismsDT[mo == 'B_STRPT_HAE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ---- + if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] negatie?[vf]' + | x_trimmed[i] %like% '[ck]oagulas[ea] negatie?[vf]' + | x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') { + # coerce S. coagulase negative + x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] positie?[vf]' + | x_trimmed[i] %like% '[ck]oagulas[ea] positie?[vf]' + | x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # streptococcal groups: milleri and viridans + if (x_trimmed[i] %like% 'strepto.* milleri' + | x_backup_without_spp[i] %like% 'strepto.* milleri' + | x_backup_without_spp[i] %like% 'mgs[^a-z]?$') { + # Milleri Group Streptococcus (MGS) + x[i] <- microorganismsDT[mo == 'B_STRPT_MIL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_trimmed[i] %like% 'strepto.* viridans' + | x_backup_without_spp[i] %like% 'strepto.* viridans' + | x_backup_without_spp[i] %like% 'vgs[^a-z]?$') { + # Viridans Group Streptococcus (VGS) + x[i] <- microorganismsDT[mo == 'B_STRPT_VIR', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'gram[ -]?neg.*' + | x_backup_without_spp[i] %like% 'negatie?[vf]' + | x_trimmed[i] %like% 'gram[ -]?neg.*') { + # coerce Gram negatives + x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'gram[ -]?pos.*' + | x_backup_without_spp[i] %like% 'positie?[vf]' + | x_trimmed[i] %like% 'gram[ -]?pos.*') { + # coerce Gram positives + x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% "salmonella [a-z]+ ?.*") { + if (x_backup_without_spp[i] %like% "Salmonella group") { + # Salmonella Group A to Z, just return S. species for now + x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup_without_spp[i], ignore.case = FALSE)) { + # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica + x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + uncertainties <- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = 1, + input = x_backup_without_spp[i], + result_mo = "B_SLMNL_ENT")) + } + next + } + + # trivial names known to the field: + if ("meningococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_NESSR_MEN', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if ("gonococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_NESSR_GON', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if ("pneumococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + } + + # FIRST TRY FULLNAMES AND CODES ---- + # if only genus is available, return only genus + if (all(!c(x[i], x_trimmed[i]) %like% " ")) { + found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (nchar(x_backup_without_spp[i]) >= 6) { + found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + } + # rest of genus only is in allow_uncertain part. + } + + # TRY OTHER SOURCES ---- + # WHONET and other common LIS codes + if (toupper(x_backup[i]) %in% AMR::microorganisms.codes[, 1]) { + mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L] + if (length(mo_found) > 0) { + x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + } + if (!is.null(reference_df)) { + # self-defined reference + if (x_backup[i] %in% reference_df[, 1]) { + ref_mo <- reference_df[reference_df[, 1] == x_backup[i], "mo"] + if (ref_mo %in% microorganismsDT[, mo]) { + x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] + next + } else { + warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) + } + } + } + + # allow no codes less than 4 characters long, was already checked for WHONET above + if (nchar(x_backup_without_spp[i]) < 4) { + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + check_per_prevalence <- function(data_to_check, + a.x_backup, + b.x_trimmed, + c.x_trimmed_without_group, + d.x_withspaces_start_end, + e.x_withspaces_start_only, + f.x_withspaces_end_only, + g.x_backup_without_spp) { + + # try probable: trimmed version of fullname ---- + found <- data_to_check[fullname_lower %in% tolower(g.x_backup_without_spp), ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + + # try any match keeping spaces ---- + found <- data_to_check[fullname %like% d.x_withspaces_start_end, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + # try any match keeping spaces, not ending with $ ---- + found <- data_to_check[fullname %like% paste0(trimws(e.x_withspaces_start_only), " "), ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + # try any match keeping spaces, not start with ^ ---- + found <- data_to_check[fullname %like% paste0(" ", trimws(f.x_withspaces_end_only)), ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + + # try a trimmed version + found <- data_to_check[fullname_lower %like% b.x_trimmed + | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + + # try splitting of characters in the middle and then find ID ---- + # only when text length is 6 or lower + # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus + if (nchar(g.x_backup_without_spp) <= 6) { + x_length <- nchar(g.x_backup_without_spp) + x_split <- paste0("^", + g.x_backup_without_spp %>% substr(1, x_length / 2), + '.* ', + g.x_backup_without_spp %>% substr((x_length / 2) + 1, x_length)) + found <- data_to_check[fullname %like% x_split, ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + } + + # try fullname without start and without nchar limit of >= 6 ---- + # like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH + found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + + # didn't found any + return(NA_character_) + } + + # FIRST TRY VERY PREVALENT IN HUMAN INFECTIONS ---- + x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 1], + a.x_backup = x_backup[i], + b.x_trimmed = x_trimmed[i], + c.x_trimmed_without_group = x_trimmed_without_group[i], + d.x_withspaces_start_end = x_withspaces_start_end[i], + e.x_withspaces_start_only = x_withspaces_start_only[i], + f.x_withspaces_end_only = x_withspaces_end_only[i], + g.x_backup_without_spp = x_backup_without_spp[i]) + if (!empty_result(x[i])) { + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # THEN TRY PREVALENT IN HUMAN INFECTIONS ---- + x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 2], + a.x_backup = x_backup[i], + b.x_trimmed = x_trimmed[i], + c.x_trimmed_without_group = x_trimmed_without_group[i], + d.x_withspaces_start_end = x_withspaces_start_end[i], + e.x_withspaces_start_only = x_withspaces_start_only[i], + f.x_withspaces_end_only = x_withspaces_end_only[i], + g.x_backup_without_spp = x_backup_without_spp[i]) + if (!empty_result(x[i])) { + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # THEN UNPREVALENT IN HUMAN INFECTIONS ---- + x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 3], + a.x_backup = x_backup[i], + b.x_trimmed = x_trimmed[i], + c.x_trimmed_without_group = x_trimmed_without_group[i], + d.x_withspaces_start_end = x_withspaces_start_end[i], + e.x_withspaces_start_only = x_withspaces_start_only[i], + f.x_withspaces_end_only = x_withspaces_end_only[i], + g.x_backup_without_spp = x_backup_without_spp[i]) + if (!empty_result(x[i])) { + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # MISCELLANEOUS ---- + + # look for old taxonomic names ---- + found <- microorganisms.oldDT[fullname_lower == tolower(x_backup[i]) + | fullname %like% x_withspaces_start_end[i],] + if (NROW(found) > 0) { + col_id_new <- found[1, col_id_new] + # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: + # mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning) + # mo_ref("Chlamydophila psittaci") = "Everett et al., 1999" + if (property == "ref") { + x[i] <- found[1, ref] + } else { + x[i] <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] + } + options(mo_renamed_last_run = found[1, fullname]) + was_renamed(name_old = found[1, fullname], + name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], + ref_old = found[1, ref], + ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], + mo = microorganismsDT[col_id == found[1, col_id_new], mo]) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # check for uncertain results ---- + uncertain_fn <- function(a.x_backup, + b.x_trimmed, + c.x_withspaces_start_end, + d.x_withspaces_start_only, + f.x_withspaces_end_only, + g.x_backup_without_spp) { + + if (uncertainty_level == 0) { + # do not allow uncertainties + return(NA_character_) + } + + # UNCERTAINTY LEVEL 1 ---- + if (uncertainty_level >= 1) { + now_checks_for_uncertainty_level <- 1 + + # (1) look again for old taxonomic names, now for G. species ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (1) look again for old taxonomic names, now for G. species\n") + } + if (isTRUE(debug)) { + message("Running '", c.x_withspaces_start_end, "' and '", d.x_withspaces_start_only, "'") + } + found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end + | fullname %like% d.x_withspaces_start_only] + if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + if (property == "ref") { + # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: + # mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning) + # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999" + x <- found[1, ref] + } else { + x <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] + } + was_renamed(name_old = found[1, fullname], + name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], + ref_old = found[1, ref], + ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], + mo = microorganismsDT[col_id == found[1, col_id_new], mo]) + options(mo_renamed_last_run = found[1, fullname]) + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = microorganismsDT[col_id == found[1, col_id_new], mo])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x, property), 1, force = force_mo_history) + } + return(x) + } + + # (2) Try with misspelled input ---- + # just rerun with dyslexia_mode = TRUE will used the extensive regex part above + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (2) Try with misspelled input\n") + } + if (isTRUE(debug)) { + message("Running '", a.x_backup, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 1, force = force_mo_history) + } + return(found[1L]) + } + } + + # UNCERTAINTY LEVEL 2 ---- + if (uncertainty_level >= 2) { + now_checks_for_uncertainty_level <- 2 + + # (3) look for genus only, part of name ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (3) look for genus only, part of name\n") + } + if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") { + if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { + if (isTRUE(debug)) { + message("Running '", paste(b.x_trimmed, "species"), "'") + } + # not when input is like Genustext, because then Neospora would lead to Actinokineospora + found <- microorganismsDT[fullname_lower %like% paste(b.x_trimmed, "species"), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x, property), 2, force = force_mo_history) + } + return(x) + } + } + } + + # (4) strip values between brackets ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (4) strip values between brackets\n") + } + a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) + a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) + if (isTRUE(debug)) { + message("Running '", a.x_backup_stripped, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + + # (5) inverse input ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (5) inverse input\n") + } + a.x_backup_inversed <- paste(rev(unlist(strsplit(a.x_backup, split = " "))), collapse = " ") + if (isTRUE(debug)) { + message("Running '", a.x_backup_inversed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + + # (6) try to strip off half an element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + lastword <- x_strip[length(x_strip) - i + 1] + lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2)) + # remove last half of the second term + x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") + if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + } + # (7) try to strip off one element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (7) try to strip off one element from end and check the remains\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (nchar(x_strip_collapsed) >= 6) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + } + # (8) check for unknown yeasts/fungi ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (8) check for unknown yeasts/fungi\n") + } + if (b.x_trimmed %like% "yeast") { + found <- "F_YEAST" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + if (b.x_trimmed %like% "(fungus|fungi)" & !b.x_trimmed %like% "Fungiphrya") { + found <- "F_FUNGUS" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + # (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { + for (i in 2:(length(x_strip))) { + x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found_result[1L], ..property][[1]] + # uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3) + if (x_strip_collapsed %like% " ") { + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + } + } + + # UNCERTAINTY LEVEL 3 ---- + if (uncertainty_level >= 3) { + now_checks_for_uncertainty_level <- 3 + + # (10) try to strip off one element from start and check the remains (any text size) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { + for (i in 2:(length(x_strip))) { + x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) + } + return(found[1L]) + } + } + } + # (11) try to strip off one element from end and check the remains (any text size) ---- + # (this is in fact 7 but without nchar limit of >=6) + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (11) try to strip off one element from end and check the remains (any text size)\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + + # (12) part of a name (very unlikely match) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (12) part of a name (very unlikely match)\n") + } + if (isTRUE(debug)) { + message("Running '", f.x_withspaces_end_only, "'") + } + found <- microorganismsDT[fullname %like% f.x_withspaces_end_only] + if (nrow(found) > 0) { + found_result <- found[["mo"]] + if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) { + found <- microorganismsDT[mo == found_result[1L], ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) + } + return(found[1L]) + } + } + } + + # didn't found in uncertain results too + return(NA_character_) + } + x[i] <- uncertain_fn(x_backup[i], + x_trimmed[i], + x_withspaces_start_end[i], + x_withspaces_start_only[i], + x_withspaces_end_only[i], + x_backup_without_spp[i]) + if (!empty_result(x[i])) { + # no set_mo_history here - it is already set in uncertain_fn() + next + } + + # no results found: make them UNKNOWN ---- + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + } + } + + # handling failures ---- + failures <- failures[!failures %in% c(NA, NULL, NaN)] + if (length(failures) > 0 & initial_search == TRUE) { + options(mo_failures = sort(unique(failures))) + plural <- c("value", "it", "was") + if (n_distinct(failures) > 1) { + plural <- c("values", "them", "were") + } + total_failures <- length(x_input[as.character(x_input) %in% as.character(failures) & !x_input %in% c(NA, NULL, NaN)]) + total_n <- length(x_input[!x_input %in% c(NA, NULL, NaN)]) + msg <- paste0(nr2char(n_distinct(failures)), " unique ", plural[1], + " (covering ", percent(total_failures / total_n, round = 1, force_zero = TRUE), + ") could not be coerced and ", plural[3], " considered 'unknown'") + if (n_distinct(failures) <= 10) { + msg <- paste0(msg, ": ", paste('"', unique(failures), '"', sep = "", collapse = ', ')) + } + msg <- paste0(msg, ". Use mo_failures() to review ", plural[2], ". Edit the `allow_uncertain` parameter if needed (see ?as.mo).") + warning(red(msg), + call. = FALSE, + immediate. = TRUE) # thus will always be shown, even if >= warnings + } + # handling uncertainties ---- + if (NROW(uncertainties) > 0 & initial_search == TRUE) { + options(mo_uncertainties = as.list(distinct(uncertainties, input, .keep_all = TRUE))) + + plural <- c("", "it") + if (NROW(uncertainties) > 1) { + plural <- c("s", "them") + } + msg <- paste0("\nResult", plural[1], " of ", nr2char(NROW(uncertainties)), " value", plural[1], + " was guessed with uncertainty. Use mo_uncertainties() to review ", plural[2], ".") + warning(red(msg), + call. = FALSE, + immediate. = TRUE) # thus will always be shown, even if >= warnings + } + + # Becker ---- + if (Becker == TRUE | Becker == "all") { + # See Source. It's this figure: + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/ + MOs_staph <- microorganismsDT[genus == "Staphylococcus"] + setkey(MOs_staph, species) + CoNS <- MOs_staph[species %in% c("arlettae", "auricularis", "capitis", + "caprae", "carnosus", "chromogenes", "cohnii", "condimenti", + "devriesei", "epidermidis", "equorum", "felis", + "fleurettii", "gallinarum", "haemolyticus", + "hominis", "jettensis", "kloosii", "lentus", + "lugdunensis", "massiliensis", "microti", + "muscae", "nepalensis", "pasteuri", "petrasii", + "pettenkoferi", "piscifermentans", "rostri", + "saccharolyticus", "saprophyticus", "sciuri", + "stepanovicii", "simulans", "succinus", + "vitulinus", "warneri", "xylosus") + | (species == "schleiferi" & subspecies %in% c("schleiferi", "")), ..property][[1]] + CoPS <- MOs_staph[species %in% c("simiae", "agnetis", + "delphini", "lutrae", + "hyicus", "intermedius", + "pseudintermedius", "pseudointermedius", + "schweitzeri", "argenteus") + | (species == "schleiferi" & subspecies == "coagulans"), ..property][[1]] + + # warn when species found that are not in Becker (2014, PMID 25278577) and Becker (2019, PMID 30872103) + post_Becker <- c("argensis", "caeli", "cornubiensis", "edaphicus") + if (any(x %in% MOs_staph[species %in% post_Becker, ..property][[1]])) { + + warning("Becker ", italic("et al."), " (2014, 2019) does not contain these species named after their publication: ", + italic(paste("S.", + sort(mo_species(unique(x[x %in% MOs_staph[species %in% post_Becker, ..property][[1]]]))), + collapse = ", ")), + ".", + call. = FALSE, + immediate. = TRUE) + } + + x[x %in% CoNS] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] + x[x %in% CoPS] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + if (Becker == "all") { + x[x %in% microorganismsDT[mo %like% '^B_STPHY_AUR', ..property][[1]]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + } + } + + # Lancefield ---- + if (Lancefield == TRUE | Lancefield == "all") { + # group A - S. pyogenes + x[x == microorganismsDT[mo == 'B_STRPT_PYO', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRA', ..property][[1]][1L] + # group B - S. agalactiae + x[x == microorganismsDT[mo == 'B_STRPT_AGA', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRB', ..property][[1]][1L] + # group C + S_groupC <- microorganismsDT %>% filter(genus == "Streptococcus", + species %in% c("equisimilis", "equi", + "zooepidemicus", "dysgalactiae")) %>% + pull(property) + x[x %in% S_groupC] <- microorganismsDT[mo == 'B_STRPT_GRC', ..property][[1]][1L] + if (Lancefield == "all") { + # all Enterococci + x[x %like% "^(Enterococcus|B_ENTRC)"] <- microorganismsDT[mo == 'B_STRPT_GRD', ..property][[1]][1L] + } + # group F - S. anginosus + x[x == microorganismsDT[mo == 'B_STRPT_ANG', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRF', ..property][[1]][1L] + # group H - S. sanguinis + x[x == microorganismsDT[mo == 'B_STRPT_SAN', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRH', ..property][[1]][1L] + # group K - S. salivarius + x[x == microorganismsDT[mo == 'B_STRPT_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRK', ..property][[1]][1L] + } + + # Wrap up ---------------------------------------------------------------- + + # comply to x, which is also unique and without empty values + x_input_unique_nonempty <- unique(x_input[!is.na(x_input) + & !is.null(x_input) + & !identical(x_input, "") + & !identical(x_input, "xxx")]) + + # left join the found results to the original input values (x_input) + df_found <- data.frame(input = as.character(x_input_unique_nonempty), + found = as.character(x), + stringsAsFactors = FALSE) + df_input <- data.frame(input = as.character(x_input), + stringsAsFactors = FALSE) + + suppressWarnings( + x <- df_input %>% + left_join(df_found, + by = "input") %>% + pull(found) + ) + + if (property == "mo") { + x <- to_class_mo(x) + } + + if (length(mo_renamed()) > 0) { + print(mo_renamed()) + } + + x +} diff --git a/R/sysdata.rda b/R/sysdata.rda index 186c9e9c9c4e123789b788a3d7c4f2644923e14e..9588a803de3be4c5d14da1684af1efb21c181f10 100644 GIT binary patch delta 7401 zcmVNP!2C>v8v4FCb4Xc_GykZ(uL^d-}ssY6ypp~AKb!AA3RZuDG`q+Xm zdp-h)J0CRyfXXMam3f}VFt*=%uA%^EfRYHv0Rj%#qNLJ=pX2iVSeFsRB8Z0BlYc@2 z2Ip|4K`UhTFQX3%#VF$uQ2_b7vTkAf^{}J3Rpg?KaJ{k+6ywAJ12hp6Yz6I{l?Fs4 zsMLg_F_aKQjHLjg0u)ps|3sq^WJsii=%kAXiO3wpJX#U(a&^ROz^Oo&=}NL9)I)?q zqJ^Lp1SCuV%v2jh0A+x2_x;Yyk81xX@OW!MeKd@W^7RX8)(q3>ptHM-#|P2 zu{nKSJaD>SiNVu+S+{IB9r+rP7OE>dc;4m{oE~<}&ADO04YlUZbEzLmw`f&@$GA_| z-SD&%&SZlGwBwl+-aMEfkm)k8*4!HPH8-6EgVHukd$yFLD=`u~3>?){N;IY+sb~d9U5P-_VWY49i1{ zp!2T7%S5QSoTjymRM42x_TY8nc_cZy2Hm5wIKSz2E7Lf7KoV)x4RFW9c^A~*A|0grjjvxwOlqgURQjQgk5vOhoBykHuyLsdE3uS znl1U9-gI;0!Wt>IB^P|%Jj-WZ2cKfClUz)k+u|to6aWB`5N!~O2&?=(?*~rc9&}Mq z*~kh65HAE(MDIZq`%ytbI6j=pi6cGs)NDB!8Km{a3 z0TM`~ASj0}AcwgWM0x=QRB$4ohg1&FjHM~;5ee+>^^lDfYythTS3e4qAO#tJ2Gpr$ z4VxK%d_*NcQbdsi{i6lTHu#8M>{yq;0oyGZn z%OvGzy#VL~5bcR0s2fk0Cm%S#UB|Mojw5% zE2UH#PmBy#mG%I^K-OCbz*q2DE2P zUd`p>P>rO__IX;imgW|XT}o4bNU&4Il1Eu0o!7b{;9lfO%D7!i6cn_3J-aGv2wW+p zNgtQxe#mP-AVCeqODSdKE<>bPCukVZ8hh^OdB|NriXFB6PZkZEj3Hr!3$(te>5$fZ zb+{jeVA54-KF()niYDX~hC=R%Lg4>8)8?s?BCA0MC4`KbL*#}R6ONaGdt%R!rJI~VVNzWuGO_q=*@y-9Ci zN(lD_fv|5V- zbi_xf@ajP6mUNGo2r*)$r%5XkHu!qI83Z7*#%HIOwHm<5O&n5=St+hxOz_^a83IJ4% z z*UPj?@<*=$%`94J)1KuR$r#0Q=H$3hGrElfQ_-L~#*jWR^UD4C-=w91!)A8-AmxmalT_D zBZfmJNeIuuT-utw4LWpiYu?qLV_c|DIbM>$`^(~g1Gt0Hk1Y!{NDPGMiO>>@K!QRk z>j-93tiHOZado>Zz8@%EcBZ4oY>(5^XT~{+JUmg{*Pe1+HEvYxWVria`TpzZDEbA! zapTw%v&*p!>3^yfR*)@KMXXwaT3D!MWPaF|E|D;9B`2RZhsb*1z@~sANI-;O`T2ZH z^bwGM6R7}jiLNWx0qpak86h^o^?ok)zlI&2AR2O0@}HRP`$fC)4%RmjTYM7ssHKD9 z8;|V5=3*I1jf`C>3N*NhW=B$~0wfkOY*17gGf0F-(?l2b9~`0(PF3kBNfSLo?{!Fe zjZrU?vd*5OS&m`+10+1u78uA>idUI@74kiQ^d?4xVxfq@5RpO!2y{ZljFjRGbvtLs z4y7%*dzRza0}K!@p;d-Y#JB2ypsBCS(>d5r#;9DNM|N z%@DB-BIz(Iaim}zoQF7i{M|lUwtKIT@rRm~i1e5s)MWK=I}MoV$0~#&2|(7E+5%h% zUWAHl6`*!OZrI zO$-`7r1#H!-Fi_;`iuD?IX_6mwgf|K9|~w{K;wbHSjm9-?jV z>qFmZ3sNW7xtJ?@2o{5rAP71DVJ&(CS(mIP;D{4$;B2@slmTGMmtd%! z&q9Fj@`z#rNTeVF>e6t+lr_Ae(g0y9X%b5d4j~FpP`xShg2AX=2szFI#qP#`bWoYr zYM}$H35S4l4Na4N4H|FAbLX|DN=pnVtXLex(Ch6Tiw2k`fk|+IH7zVdL^vNAC=hfF6H`H`3m}9AL<*jD?CVG@d5Uq!t`g^K2AXz%XC+w4766sW z5;re6^tso}g~}mfC|1fpU@t)L7ncFQmBj~U=1q^s-LI7TjsuhH^R8G4u!^(p;2qNot%^u z=BRNF!6c7Fd|k;o4w8?55Z<&Hrj1>Uc5>9V0Su@eer;wYz9<#2dt%k5BrUl1*zyf&&hVf8uOLF%bweB({9{s6|kuJ`8EU%q^!-xbJjFz+c(#Cr z2_j@l~q4HQXBv;$rzVCe~a14a<^MPkI`SuDd=-V(g z0eq1|gOV;+M(%fiOBGXUhdp1ck^(mz@B<}19R=bYodLG0Vm1+7-fL~t{da=C zXdfpYCg!KtVSi_QnvgT(3qFky{Jom-VD$$eLs%_^0aQkRYbbG$e2rYrhNFc?m^t@Q zETM4f#E}?*gN*(xQK(M_*v22Ut zMn8a#3D+opJ;Mj_WZ2^Cc1T4kE}%m$i$;x&<_8>gkIdlcz65fq;xup?MV6b;|$Vq@}Nk)kB3zp^dnge9X3{5Ip&h|GgZS#BJ zjK&v#8bo5bNwyVv)2E+CY=|!O=Hj8-5{f)hyzOE=hPIfUT(IB2e?lA6QsfI539j`9 zE>hWT@`2Gqz!UkR!=00JJ5Ri)tn8pP)6`sBfOSem?t=s3E3aX~)`YOx5`eKm7=?uc zvwpzu1QjBPY8p*qW)YpWm6DG{7$q~9#*}7%7@G;zqiG3c4F3o?+ZZT22uTJ)#sHy= z*zt}I#gYat>9^GdK7~R;RR)dBZw6s7WC+v)!U0%7A7TZxaU@B)+fq4+MDq;5@JVMz z;jVtMw)1Je{o0oaPg^v>Ap!&o183ec6bObovYk4>)+pr}7T;MW$1xjnQPe(-ts*gh z0t4mH^%??hd~IJ~5VgLXf(_@}p+#_S5;lsY%szI7tr(90F-2F zjAmqz$z8_bV)zVL!VBQ1LM!8OJ_j?bry{~A8fS_^2Okj0uXk6b*tD)jP!fg!c2PRQU(YYrw0Sv z2|$g|8iFCKK|_`r^9AU9VM?xq$Z<8`0qZwmT4{kCrBP1ghll}jFPL%AO7BQzLSzlw z+(>Y+Bw%7G@VScs;u2>G#|+|94SSNv|(oLt^gqf0vgcHxD?BO<5RoZ z-?pK=X8smT<4Oq=8s1IAfiUI6vESllTn>dp7^74OJBGzR3^|sd>r_>7EQcE!wOlV^ zXhm>h){qp0L;R+m;I?!aQTn=<6X~D%g_p>wwShQ=Yqt`ZL4*kz0ufN?Iidj`IUdrJ zbhtxWq*aOxRv?y?7AT^tLwO~C-e7;tvA~KzsIs*u7EDN?CMvMgSY@grHoy-ojF3WN z1hNStz#9ly8zE4YNjIc;4`X4u?)`j+mE`qeoI-D87WP6=nvj!1nXD28lr!X|4F!Q_ z+fryDhykRKtObl#K?xZEgc%WtaVX$+q5&cr=w*_b7^QT6omd70Tq;n1P}EqYPI{sM zZ=EAaD3a|-Fo?;3u4YIS8clAGrp47H7Doj_jR0FI4=q@#r@Y3$6~fv=R#33)0u}BQ zR~Ib-z%+90dxNbg$i!Ez>3!bS(`iifi!)Ndd}8+liQfVTr^U$=Nm6r2gCvF|fLLu{ zf>015oYego9H=+hTxENILJi?{7a)URz@F55+xCv~mf9k7Xw2_q zvZ$xAx0|6j#A)VV2lxWnJMZvrCyzX&>_?sWk zFdLg>=5YcL=rke>4Y=jxTaNDc3u&-wgw=#Alu1z_Fa)fKP%IgL0W3&FV>T@&rHfpj zt?>JM%<6nywtn&+0pzrtjYlcCP(T65D+PTLC=!AN-Zt)#)MNz5D->!QUe{BpNaii! zF;KvvEm$O1v;2MO%ol_nuotRVptC~Ba< zM}XT98W|{mfWZ==;ShqD7&X^UeDz+c7-r>M~HA_&aCXy87IR>Xw&Esry*HWyZ zL803iU9ENPJ%EF#lq2|>c~A)=6o!L1ofYBU`7#2X<3BLqYZ2?ggRa`0Zi;WYe$j;GB%sB`i4=w-N@Su6Rv30P zVi7n{x8RQff+T=&E`MJpdqZ<(;2o$H#?4#Ns|bW6XmyQ|P5lNLjXM#NFwNx~+4N(*VIjRHpm5#9xuaB+RK-LHr4IJ3Vs1kPpkFdl-%^CUvh1_S%Z{y8Rb~Rf zy|5@}M@VE0Is$@~nFy!AKup0$RJeu&j_oF}25lKjgrr3T)IxyGZOGQf*I^aKl-ELk z0q+D}^(ZV9V7`=gSQ`M`9Cc)fNP%ZdeP(H$JIIuHp{d11u)L36t)0WvQJyL+N)QUI z90mBDImeM3M36SUe~b4tdK^mmJqjI0Dh|WgU`HWH?#KHxr+!Q*yfG*{LKYoDfX8TV zjU@cvT@TFYeBSnhWHtkY4Z=W31pz^SK*WInm8y_n4Q_K{C~__MLnSQ@RQ{A+!t{L= zN3l;?v5S`sa*`;!216>W4Nx@ud1-E>XgVExJ4^oX?bhLgkqE z21k#-HrOZb7I0oB`CQQ?kbx$rUhNA8_~BY_dj+j)+~Su1?hZO^QLtDhY7k;FDHZ_+ z42nUHq?WQEBB(K7z1vsA@b=UiwsCE%rY;A4(^FB$!)CF`gPUk1fMfK=!kv!9 znmANG10i7p@;>4x%osuOG(%w!fEQAY$>jU1;B_9_sifc4K~TIvuP#+S z|Frz(&x!s&91nh_KE{i(ctpuD_)xmwyh5~0D+Tq0;XFuy2uemujFke5`(!T^xEL@|?GXrSK)J(Z1=H-YWCWM{n8@EtUsSF_kpcIB!hi+h4#YwD{}D(peitUG7Nkl delta 7415 zcmVF-oF3eRSccVdB-_Bsx|9Ws#4%F)pvj@v9&-t4^;p4zpXnkj`;4vx5k z7gSZ-bR>vCCMHt_GeAs@8m7vAlPTnl>P>>1Q_($7Q`85k>ODgsdV|zx+Ch*5L5dU9 zX@400w{tfY1N{1u8vHOp^q}(WV3#5NU+KMiGRYnq?1D)HaCqAF5~?8&TFB-(VF6T_8#4-PQ4$Cj=N0BJx;1Y`h#2VhZBW1~zj75>_gK+KeNloM!#aE z0$-gf$cs@95ekYHfK(8XFat4AZGR8}mI24b?{Aqy#Wu^~O-#@$3JATgyiCoCAD?X+ z72#*v=*{Q{e}*Sd)aA=R8_F4$hbKY%?!(ifR9sG(tz#85CN#ac9XxL&heNJS_|^|*>H}7VZ$+e!pQhv5 zwgT{xm$QwTBfapt3}GRNq=9&ML|bY(8xN3f zIxsYWA=*le-kaK?DYYBujEAlTqYd=US?T-SdbG!?dbtk6z4I>$l7I98)qXiVo_~Yi z)3=u%1GvJN$WJ7rPh$$vX3ZDXqNZewm(5Vs?X=MtoPipBQY_jE88cU0ATg+n90-xq zCCbKXYGfR4N+1jr_TH4Tm0dK6ks|(!I`2j~-)*ye&_QPDLDR;1vE-PCmLo?;D{{SB z+uR<1Xz@-rz#bB9segK^o<^(?Z4SWSx)GE5G(bTXQFmvYR^y%Kn94lxXvM}M93ymN zJYA&osiiAgQIuiFAwA4f3m7d)vf$0r)`*ITKA7Dmi(6e}r3cdETKJb^G~p-00aOqW zNgMqhjLA^?dbQIHfvlMqARf+9Tdf~q(W1VdE=Tbfdo>j;G9Yw}1&inaiLF;^a1 z8c{3;p@x9M6@Nm5NI^@8KwLm)njK8QvB|uU{kp%sN1olUGM7Ev_1t zFJ{b|T*-T+e&N4;>GeEg`|;a}t}E-j|8d$45>S5CRXkq8IwwISX78+h~+F zSQ?}#A<}mHbkQM)Uq00h;DZoDhN(FXuGaaeFM>ZdLxzu&+h5>2Z)GV?@Qd5%ivWnS z0Tw71Q{xj`;Gch5Go~+Q^6{uf(q?-sty@cT3r4Pjlz-AJ6!Wr0&`76|@(6lY#}aYw z7gEIqEgy@1%9;Wf3TaYD&E|ffXg?T14C!S`!e z5PyX!tcPw)@d`WkXe}6%KwAjL8i($a$S2iPgzl)&5?~HHO(^>ps;Uv_h(u0I!5@R1 zj;||kZcAHg!uQ>bGdy1>Fw<>I;`$R_Bs~63^MB|BuU}?og56i3{CM6XGSO{c%^Kr| z5OX?g7(U(=g_$nEHDUsgI2PGJFO4$UN=S5gItLD|i>rKbVtxa_Am@w>H(SD90=P%Q zmRyitmV`fU&e%x+&9=7|w3jc_oZzcO#msEjL_Xn5(-ug;IPm+Qocb_&{p0QUpBV`m zZGYp%40pGg=F(3pt0S2r*)$=TR%tH~9MfIfNj-_Gmlhgg<8n zjq^&m<5m>F@JFDeU}}u$#&#p*Q1aH-1Gs={Z^Ks?|W2E zXGs#GOhE{d0t0rG4}ju(-CbT4)r-}t^;W9g_|PY(8fm974f*(p}{(v ztN52aWhG_rrWWBg@11&&&SSeu+i!I};5`Du50ZI zR8$U|y2L1?G9Il14?T`BkQY1!Kz|QgVRjKR`~JS1eEte3W|dHdLq@cO7ytx}io8AK zwYsT$FRnB&kU6I6%g7u(u7+%wUwN)Qj2oK5Kqg6;2I8keAGf<7ZW25clIU#L8=Mbm z3LVlE0dhe#*jEU&T;eSzC7&^yG{mXs-A2aGb2e@dLr(UsElo0^J1X?n2Y=aKKsnHN zqsc~! z`}^5AGCs;&p5!Noa3?%?qGNsrmCu_{0JG;a&0VGlo0d;9OVM-d_Q0V|Ll{ASZg@&mLPWgIM&$qZaetvRu+>5bU=t46%r!Pm@MzO~hdz99;8EJujD2FVTD%^lA$k#bD zN8wTH(E{F128*F2B-RH41crrD9b9j>v9hHC4uLvq>NWvp5P-0WWDd>!iG^;WyJlDk zcv|tO(SMScP>O1Ryd^Q6uK8u}zttYlj+wx9vMtCC#17$ka2xlypz7SovGw-r-hD>_ z$@Y0yECkq2K0fCzuKyWgs!Hrr>aHFotC@4q4!vq>j(Rak5-Jzml55|8CvnzX z$+^Qlolr7)2(PQHl7gJ&4O7`9k=`Gjo=z*6qkoWX+6+^Vf7hO^JJ!GK-u!%~YxYGZW9bNFGx>ZTxyr@K8 z%8vl`6tCd@hS(7fmTAb}Z-qv*g!Kq{C9PNhPQjtPR2$OVy~eG3jEP64)W%4cXGTja z<$rME0R|%_tw23a$|Y`(q3`j05x6NVLBNp|F~~XKD@fQbI7cKh5|s?n;UHd;#z>g( zjhP~W$=R5JKEG$jyaZ}gnHe4dg3m7pfs;h@Dtmh#@D)`mXhNucRt*UNKQ7hwyy-)L zWG72V3pzL4uu$_x5pkpo#EIA(p=DF=B7ezZsx4Uuh-Ea52c}I4A!7N?bAFW##KENS zI26O7TIt@2AG8BMDfV$JpP8e9q>>Q1Hp-7AOCiMz_XYW#8oyyA1a3Sa28w;0OT<4X z5bjknHW5o-5`{?})mlqS2SUpViVpk)y~?I27$(FZwx>h${z~-0`{SS?s9O{TQGXe# zhiVVG-|P4L&yh#gJUr+YP`G^=>zqNmTD701GE&|F^et~?ufS2NWESm`G{M>Lus}Y9gu{qkRBYsvoS&JrmqUCYkqCA4-xqcHs zY?&d6rAt}f#^tTPZ+(TB!hhpPj8`c($$8VOpB&i`UDKP2hk_*(c_n$;#CuI`n4Nt% zZ@&Cc(KRkWv4EQIP-5jRmfr9j)I5Nn%@!RNP0Z~+B|S$~0j8dU;@ktJDHpa34{)xY zqlK*rVX`FwVuCRX3I|sGLEH#D2Lal8k14DZ#(7?y3nk!ac%1~$kbj1`=~+hI3Fi%; z5IMduRCbV(48`mLQyQ(baP40(WbK>%a8&D6Bqd;L;I;tFCJezV0DxRI0^w~`9kZUM zJju6>Lm{!BiWUWU>B~IF;x^uGH{Z8X;VJ2}OcD?vK(ID^Ooaj=p8Y4qST!i|G%fHf zlb@{{c~S2lveuCpfqwzn$mAIUa1Lu%(E$rvv4|kvZ&0GRH;Ev!Bg}*%5vC}f>)GY| z$Wk!E2$qrrEeOB44WT6Q-2A-P zyySp#>PQ$MUaTAsa3umaKxznvtpyHPYtReK^@S?B6CuRcfCs4Eg=wY)a+O6pk{$pB z#J*w2K`Xf-l?jkHZ*d{Qz>$H7r^4ne1CU9b6P6jtC1#Ttk#LFRICN5r8EC@J8(aWF z2n02u4#*VB@qeSw+omp>5z|^*Z$$(cT2!tQtZFtmYtPrYm2f&03}TH$BJLX$`51H- zpy{eAxRyhWjasf3u{0vMF>6Q)LLvTBPn0d41{8fdmlN>M{X)ywskMPPg=@r&Ai@NU zfe5H`98my|91mGZvRom`sTE>_6^JFJkwq02hUAxX1AqQ)js#K#MU|;JGGavuF;#|| z!!1z}wg7n~WP%eAC6Gw00kDOEvK0wblYvO-4zppo*6v~*AGgZV8-t`$O6mrIx{Go` z*RcdCpwBHRXeAIu!kQ9H=+hTxEK~4S(Tv7Z5?PU?*C|gbPIp1Xh4#hS3Sr zJRfH6QrkpMZ5f_6ODc-RsxJ^i=kAPHxzLOCfZnyO23zq-ERZ`CkS-Di0vX-0DMpP+ zWLz7+GtqO5AI8}IvjMrbPG=w?4km;_fwwHZCAjeMx-F)`suNZau2CgKfWQ*6AwaNX z1b?w15scWhnwBkcelKt1?lY3Go&RXlToR;Rr%SW>iF9^@irnz&g+?jheUFRuKqB(E3iHPlJh# zj*fAXFx&4Q&kuTY?o) z<(CUR9$a;5s;e*-1@J(joO**GVZ8>CqC?$ zQFvldb%ZQBg#nJR-5N>yKFS}2G(I042FPp&2pfcekO~5Vfq#ht0V`&ZU=3&IL{Q?} z?uJTQ8marSdW+HX)gIEGqhl8?8092Ub_|A9SQ?;d_j1zRNYHaSdb-R8rsk+})xOEV z^_<0r@N6mK zq*y)Juz5`Ph<{wO9?;0~^v$*h=b1J%!;KpZ8;{ zekC}D1Hz(45o8$<5AMbGsCoP7Klok-<5aMbSIO3Q& z>Cg?&fU=v1&jZw`#2|F^*)VE(!Sn}8i@-2JLr}K zSUa@sR@;NI7C=6Ul1E>Z{YK}G9#14r^@^RncxLxR$uam4yCA$`$|e=!_XOgAoKO}~ zX*Ua!EPoP9-ZTQZfEs~b9%y_g+k1xIHb;T^j*XAFarR(=5%khDDFzyOd3G-ixF%?Z zP!Y*OT1cu$)MJ|w74s)B+%^ar2EmYlP$)GGRI$h)dyg=oftKJlbexW}WK%R*0bBLH z#6ylSPnv{2@Izvt6t1*jvPM`A`T4)znbQl!OMd`{q;`agrrQWgMoNs80*m}|7m8dA z7%2F}LK=`RaZ|gs060oQ$dK?BevmJ1Roc0p#3Yb|X@g?n2SI(U5bWJ3HD|bo=s&J* zYq01*I5y-ixwNTiti7J}X)RYf4QCJZKy^P*A9tLIpr6 z4OkHN(DfmJz&WY}@+iQyvI`?<5+z%@qMo$mL;w$FApBn8NEh;NnF%rt(+`_Xw`(-9 poph$=-5AqeZJm+t7s!MzzQ@016}%K#x+uT=UC9*TLPG=l7*KMAF "Staphylococcus coagulase negativo (CoNS)" get_locale <- function() { - if (getOption("AMR_locale", "en") != "en") { + if (!is.null(getOption("AMR_locale", default = NULL))) { return(getOption("AMR_locale")) } @@ -73,6 +73,7 @@ get_locale <- function() { # Check the locale settings for a start with one of these languages: # grepl() with ignore.case = FALSE is faster than %like% + if (grepl("^(English|en_|EN_)", lang, ignore.case = FALSE)) { # as first option to optimise speed "en" @@ -93,3 +94,55 @@ get_locale <- function() { "en" } } + +# translate strings based on inst/translations.tsv +#' @importFrom dplyr %>% filter +translate_AMR <- function(from, language = get_locale(), only_unknown = FALSE) { + + if (is.null(language)) { + return(from) + } + if (language %in% c("en", "", NA)) { + return(from) + } + + df_trans <- translations_file # internal data file + + if (!language %in% df_trans$lang) { + stop("Unsupported language: '", language, "' - use one of: ", + paste0("'", sort(unique(df_trans$lang)), "'", collapse = ", "), + call. = FALSE) + } + + df_trans <- df_trans %>% filter(lang == language) + if (only_unknown == TRUE) { + df_trans <- df_trans %>% filter(pattern %like% "unknown") + } + + # default case sensitive if value if 'ignore.case' is missing: + df_trans$ignore.case[is.na(df_trans$ignore.case)] <- FALSE + # default not using regular expressions (fixed = TRUE) if 'fixed' is missing: + df_trans$fixed[is.na(df_trans$fixed)] <- TRUE + + # check if text to look for is in one of the patterns + any_form_in_patterns <- tryCatch(any(from %like% paste0("(", paste(df_trans$pattern, collapse = "|"), ")")), + error = function(e) { + warning("Translation not possible. Please open an issue on GitLab (https://gitlab.com/msberends/AMR/issues) or GitHub (https://github.com/msberends/AMR/issues).", call. = FALSE) + return(FALSE) + }) + if (NROW(df_trans) == 0 | !any_form_in_patterns) { + return(from) + } + + for (i in 1:nrow(df_trans)) { + from <- gsub(x = from, + pattern = df_trans$pattern[i], + replacement = df_trans$replacement[i], + fixed = df_trans$fixed[i], + ignore.case = df_trans$ignore.case[i]) + } + + # force UTF-8 for diacritics + base::enc2utf8(from) + +} diff --git a/data-raw/eucast_rules.tsv b/data-raw/eucast_rules.tsv index 664994a0..c80f122c 100644 --- a/data-raw/eucast_rules.tsv +++ b/data-raw/eucast_rules.tsv @@ -1,9 +1,9 @@ # --------------------------------------------------------------------------------------------------- # For editing this EUCAST reference file, these values can all be used for target antibiotics: -# all_betalactams, aminoglycosides, carbapenems, cephalosporins, cephalosporins_without_CAZ, fluoroquinolones, -# glycopeptides, macrolides, minopenicillins, polymyxins, streptogramins, tetracyclines, ureidopenicillins -# and all separate EARS-Net letter codes like AMC. They can be separated by comma: 'AMC, fluoroquinolones'. -# The if_mo_property column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". +# 'all_betalactams', 'aminoglycosides', 'aminopenicillins', 'carbapenems', 'cephalosporins', 'cephalosporins_without_CAZ', +# 'fluoroquinolones', 'glycopeptides', 'macrolides', 'polymyxins', 'streptogramins', 'tetracyclines', 'ureidopenicillins' +# and all separate EARS-Net letter codes like 'AMC'. They can be separated by comma: 'AMC, fluoroquinolones'. +# The 'if_mo_property' column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". # The like.is.one_of column must contain one of: like, is, one_of ('like' will read the first column as regular expression) # The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. All species in this group can be found in: LiPuma J, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180). # >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<< diff --git a/data-raw/translations.tsv b/data-raw/translations.tsv index b1f6e6dc..feba8456 100644 --- a/data-raw/translations.tsv +++ b/data-raw/translations.tsv @@ -354,7 +354,7 @@ nl Ticarcillin/beta-lactamase inhibitor Ticarcilline/enzymremmer nl Ticarcillin/clavulanic acid Ticarcilline/clavulaanzuur nl Tinidazole Tinidazol nl Tobramycin Tobramycine -nl Trimethoprim/sulfamethoxazole Trimethoprim/sulfamethoxazol +nl Trimethoprim/sulfamethoxazole Cotrimoxazol nl Troleandomycin Troleandomycine nl Trovafloxacin Trovafloxacine nl Vancomycin Vancomycine diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 2ba7fb61..64d7693f 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/articles/index.html b/docs/articles/index.html index 0579eeae..0b297047 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/authors.html b/docs/authors.html index 7ae2bb88..6bddc998 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/index.html b/docs/index.html index 5376e2af..98dc1422 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/news/index.html b/docs/news/index.html index f74cb984..e58ae6de 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 @@ -225,11 +225,11 @@ -
+

-AMR 0.7.1.9071 Unreleased +AMR 0.7.1.9072 Unreleased

-

Last updated: 03-Sep-2019

+

Last updated: 12-Sep-2019

Breaking

@@ -342,6 +342,9 @@ Since this is a major change, usage of the old also_single_tested w
  • Function availability() now uses portion_R() instead of portion_IR(), to comply with EUCAST insights
  • Functions age() and age_groups() now have a na.rm parameter to remove empty values
  • Renamed function p.symbol() to p_symbol() (the former is now deprecated and will be removed in a future version)
  • +
  • Using negative values for x in age_groups() will now introduce NAs and not return an error anymore
  • +
  • Fix for determining the system’s language
  • +
  • Fix for key_antibiotics() on foreign systems
  • @@ -1261,7 +1264,7 @@ Using as.mo(..., allow_uncertain = 3)

    Contents

    diff --git a/docs/reference/age.html b/docs/reference/age.html index e3af9941..43e57c9b 100644 --- a/docs/reference/age.html +++ b/docs/reference/age.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9070 + 0.7.1.9072
    diff --git a/docs/reference/age_groups.html b/docs/reference/age_groups.html index b7ab5a4e..4069613a 100644 --- a/docs/reference/age_groups.html +++ b/docs/reference/age_groups.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9070 + 0.7.1.9072
    diff --git a/docs/reference/index.html b/docs/reference/index.html index fd5b9a5f..e85c7ddc 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072
    diff --git a/docs/reference/p_symbol.html b/docs/reference/p_symbol.html index 10bc52e9..98401561 100644 --- a/docs/reference/p_symbol.html +++ b/docs/reference/p_symbol.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/reference/translate.html b/docs/reference/translate.html index e6f48c26..14176405 100644 --- a/docs/reference/translate.html +++ b/docs/reference/translate.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9055 + 0.7.1.9072 @@ -297,7 +297,7 @@