diff --git a/DESCRIPTION b/DESCRIPTION index e5420975..55bf8cad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.7.1.9071 -Date: 2019-09-03 +Version: 0.7.1.9072 +Date: 2019-09-12 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 51260642..84de5238 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 0.7.1.9071 -Last updated: 03-Sep-2019 +# AMR 0.7.1.9072 +Last updated: 12-Sep-2019 ### Breaking * Determination of first isolates now **excludes** all 'unknown' microorganisms at default, i.e. microbial code `"UNKNOWN"`. They can be included with the new parameter `include_unknown`: @@ -99,6 +99,9 @@ * Function `availability()` now uses `portion_R()` instead of `portion_IR()`, to comply with EUCAST insights * Functions `age()` and `age_groups()` now have a `na.rm` parameter to remove empty values * Renamed function `p.symbol()` to `p_symbol()` (the former is now deprecated and will be removed in a future version) +* Using negative values for `x` in `age_groups()` will now introduce `NA`s and not return an error anymore +* Fix for determining the system's language +* Fix for `key_antibiotics()` on foreign systems #### Other * Added Prof Dr Casper Albers as doctoral advisor and Dr Bart Meijer, Dr Dennis Souverein and Annick Lenglet as contributors diff --git a/R/age.R b/R/age.R index 4a7f35d4..fb92d4c8 100755 --- a/R/age.R +++ b/R/age.R @@ -145,6 +145,10 @@ age_groups <- function(x, split_at = c(12, 25, 55, 75), na.rm = FALSE) { if (!is.numeric(x)) { stop("`x` and must be numeric, not a ", paste0(class(x), collapse = "/"), ".") } + if (any(x < 0, na.rm = TRUE)) { + x[x < 0] <- NA + warning("NAs introduced for ages below 0.") + } if (is.character(split_at)) { split_at <- split_at[1L] if (split_at %like% "^(child|kid|junior)") { diff --git a/R/atc_online.R b/R/atc_online.R index 469c28be..a7eddce6 100644 --- a/R/atc_online.R +++ b/R/atc_online.R @@ -81,7 +81,7 @@ atc_online_property <- function(atc_code, } if (!all(atc_code %in% AMR::antibiotics)) { - atc_code <- as.character(as.atc(atc_code)) + atc_code <- as.character(ab_atc(atc_code)) } if (!curl::has_internet()) { diff --git a/R/key_antibiotics.R b/R/key_antibiotics.R index 71ebb29c..81d12899 100755 --- a/R/key_antibiotics.R +++ b/R/key_antibiotics.R @@ -184,25 +184,28 @@ key_antibiotics <- function(x, # join to microorganisms data set x <- x %>% + as.data.frame(stringsAsFactors = FALSE) %>% mutate_at(vars(col_mo), as.mo) %>% left_join_microorganisms(by = col_mo) %>% mutate(key_ab = NA_character_, - gramstain = mo_gramstain(pull(., col_mo))) - + gramstain = mo_gramstain(pull(., col_mo), language = NULL)) + # Gram + x <- x %>% mutate(key_ab = if_else(gramstain == "Gram-positive", - apply(X = x[, gram_positive], - MARGIN = 1, - FUN = function(x) paste(x, collapse = "")), + tryCatch(apply(X = x[, gram_positive], + MARGIN = 1, + FUN = function(x) paste(x, collapse = "")), + error = function(e) paste0(rep(".", 12), collapse = "")), key_ab)) - + # Gram - x <- x %>% mutate(key_ab = if_else(gramstain == "Gram-negative", - apply(X = x[, gram_negative], - MARGIN = 1, - FUN = function(x) paste(x, collapse = "")), + tryCatch(apply(X = x[, gram_negative], + MARGIN = 1, + FUN = function(x) paste(x, collapse = "")), + error = function(e) paste0(rep(".", 12), collapse = "")), key_ab)) # format @@ -211,6 +214,10 @@ key_antibiotics <- function(x, gsub('(NA|NULL)', '.', .) %>% gsub('[^SIR]', '.', ., ignore.case = TRUE) %>% toupper() + + if (n_distinct(key_abs) == 1) { + warning("No distinct key antibiotics determined.", call. = FALSE) + } key_abs diff --git a/R/like.R b/R/like.R index f69d5fe8..6819e44c 100755 --- a/R/like.R +++ b/R/like.R @@ -56,8 +56,16 @@ like <- function(x, pattern) { if (length(pattern) > 1) { if (length(x) != length(pattern)) { - pattern <- pattern[1] - warning('only the first element of argument `pattern` used for `%like%`', call. = TRUE) + if (length(x) == 1) { + x <- rep(x, length(pattern)) + } + # return TRUE for every 'x' that matches any 'pattern', FALSE otherwise + res <- sapply(pattern, function(pttrn) x %like% pttrn) + res2 <- as.logical(rowSums(res)) + # get only first item of every hit in pattern + res2[duplicated(res)] <- FALSE + res2[rowSums(res) == 0] <- NA + return(res2) } else { # x and pattern are of same length, so items with each other res <- vector(length = length(pattern)) diff --git a/R/misc.R b/R/misc.R index a5b05ac4..ffe9ee5b 100755 --- a/R/misc.R +++ b/R/misc.R @@ -123,61 +123,6 @@ stopifnot_installed_package <- function(package) { return(invisible()) } -# translate strings based on inst/translations.tsv -#' @importFrom dplyr %>% filter -translate_AMR <- function(from, language = get_locale(), only_unknown = FALSE) { - # if (getOption("AMR_locale", "en") != language) { - # language <- getOption("AMR_locale", "en") - # } - - if (is.null(language)) { - return(from) - } - if (language %in% c("en", "")) { - return(from) - } - - df_trans <- translations_file # internal data file - - if (!language %in% df_trans$lang) { - stop("Unsupported language: '", language, "' - use one of: ", - paste0("'", sort(unique(df_trans$lang)), "'", collapse = ", "), - call. = FALSE) - } - - df_trans <- df_trans %>% filter(lang == language) - if (only_unknown == TRUE) { - df_trans <- df_trans %>% filter(pattern %like% "unknown") - } - - # default case sensitive if value if 'ignore.case' is missing: - df_trans$ignore.case[is.na(df_trans$ignore.case)] <- FALSE - # default not using regular expressions (fixed = TRUE) if 'fixed' is missing: - df_trans$fixed[is.na(df_trans$fixed)] <- TRUE - - # check if text to look for is in one of the patterns - any_form_in_patterns <- tryCatch(any(from %like% paste0("(", paste(df_trans$pattern, collapse = "|"), ")")), - error = function(e) { - warning("Translation not possible. Please open an issue on GitLab (https://gitlab.com/msberends/AMR/issues) or GitHub (https://github.com/msberends/AMR/issues).", call. = FALSE) - return(FALSE) - }) - if (NROW(df_trans) == 0 | !any_form_in_patterns) { - return(from) - } - - for (i in 1:nrow(df_trans)) { - from <- gsub(x = from, - pattern = df_trans$pattern[i], - replacement = df_trans$replacement[i], - fixed = df_trans$fixed[i], - ignore.case = df_trans$ignore.case[i]) - } - - # force UTF-8 for diacritics - base::enc2utf8(from) - -} - "%or%" <- function(x, y) { if (is.null(x) | is.null(y)) { if (is.null(x)) { diff --git a/R/mo.R b/R/mo.R index b68da3f6..520fc225 100755 --- a/R/mo.R +++ b/R/mo.R @@ -432,13 +432,13 @@ exec_as.mo <- function(x, } x <- y - } else if (all(x %in% read_mo_history(uncertainty_level, - force = force_mo_history)$x)) { - # previously found code - x <- microorganismsDT[data.table(mo = get_mo_history(x, - uncertainty_level, - force = force_mo_history)), - on = "mo", ..property][[1]] + # } else if (all(x %in% read_mo_history(uncertainty_level, + # force = force_mo_history)$x)) { + # # previously found code + # x <- microorganismsDT[data.table(mo = get_mo_history(x, + # uncertainty_level, + # force = force_mo_history)), + # on = "mo", ..property][[1]] } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { # we need special treatment for very prevalent full names, they are likely! @@ -561,17 +561,17 @@ exec_as.mo <- function(x, progress$tick()$print() - if (initial_search == TRUE) { - found <- microorganismsDT[mo == get_mo_history(x_backup[i], - uncertainty_level, - force = force_mo_history), - ..property][[1]] - # previously found result - if (length(found) > 0) { - x[i] <- found[1L] - next - } - } + # if (initial_search == TRUE) { + # found <- microorganismsDT[mo == get_mo_history(x_backup[i], + # uncertainty_level, + # force = force_mo_history), + # ..property][[1]] + # # previously found result + # if (length(found) > 0) { + # x[i] <- found[1L] + # next + # } + # } found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] # is a valid MO code @@ -826,6 +826,7 @@ exec_as.mo <- function(x, if (initial_search == TRUE) { set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) } + next } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup_without_spp[i], ignore.case = FALSE)) { # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] @@ -833,11 +834,11 @@ exec_as.mo <- function(x, set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) } uncertainties <- rbind(uncertainties, - data.frame(uncertainty_level = 1, - input = x_backup_without_spp[i], - result_mo = "B_SLMNL_ENT")) + format_uncertainty_as_df(uncertainty_level = 1, + input = x_backup_without_spp[i], + result_mo = "B_SLMNL_ENT")) + next } - next } # trivial names known to the field: @@ -1850,8 +1851,11 @@ mo_renamed <- function() { #' @export #' @noRd print.mo_renamed <- function(x, ...) { - items <- getOption("mo_renamed") - base::message(blue(paste("NOTE:", names(items), "was renamed", items, collapse = "\n"), collapse = "\n")) + items <- x #getOption("mo_renamed") + old <- names(x) + new <- x + + cat(blue(paste("NOTE:", italic(names(items)), "was renamed", italic(items), collapse = "\n"), collapse = "\n")) } nr2char <- function(x) { diff --git a/R/mo2.R b/R/mo2.R new file mode 100644 index 00000000..8ed1c821 --- /dev/null +++ b/R/mo2.R @@ -0,0 +1,1489 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# SOURCE # +# https://gitlab.com/msberends/AMR # +# # +# LICENCE # +# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # +# # +# This R package is free software; you can freely use and distribute # +# it for both personal and commercial purposes under the terms of the # +# GNU General Public License version 2.0 (GNU GPL-2), as published by # +# the Free Software Foundation. # +# # +# This R package was created for academic research and was publicly # +# released in the hope that it will be useful, but it comes WITHOUT # +# ANY WARRANTY OR LIABILITY. # +# Visit our website for more info: https://msberends.gitlab.io/AMR. # +# ==================================================================== # + +# THIS IS A TEST FUNCTION +as.mo2 <- function(x, + Becker = FALSE, + Lancefield = FALSE, + allow_uncertain = TRUE, + reference_df = get_mo_source(), + property = "mo", + initial_search = TRUE, + dyslexia_mode = FALSE, + force_mo_history = FALSE, + debug = FALSE) { + + if (!"AMR" %in% base::.packages()) { + require("AMR") + # check onLoad() in R/zzz.R: data tables are created there. + } + + # WHONET: xxx = no growth + x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_ + + if (initial_search == TRUE) { + options(mo_failures = NULL) + options(mo_uncertainties = NULL) + options(mo_renamed = NULL) + } + options(mo_renamed_last_run = NULL) + + if (NCOL(x) == 2) { + # support tidyverse selection like: df %>% select(colA, colB) + # paste these columns together + x_vector <- vector("character", NROW(x)) + for (i in 1:NROW(x)) { + x_vector[i] <- paste(pull(x[i,], 1), pull(x[i,], 2), sep = " ") + } + x <- x_vector + } else { + if (NCOL(x) > 2) { + stop('`x` can be 2 columns at most', call. = FALSE) + } + x[is.null(x)] <- NA + + # support tidyverse selection like: df %>% select(colA) + if (!is.vector(x) & !is.null(dim(x))) { + x <- pull(x, 1) + } + } + + notes <- character(0) + uncertainties <- data.frame(uncertainty = integer(0), + input = character(0), + fullname = character(0), + renamed_to = character(0), + mo = character(0), + stringsAsFactors = FALSE) + failures <- character(0) + uncertainty_level <- translate_allow_uncertain(allow_uncertain) + + + # x_input <- x + # already strip leading and trailing spaces + #x <- trimws(x, which = "both") + # only check the uniques, which is way faster + #x <- unique(x) + # remove empty values (to later fill them in again with NAs) + # ("xxx" is WHONET code for 'no growth') + # x <- x[!is.na(x) + # & !is.null(x) + # & !identical(x, "") + # & !identical(x, "xxx")] + + # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life) + if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { + leftpart <- gsub("^([BFP]_[A-Z]{3,7}).*", "\\1", x) + if (any(leftpart %in% names(mo_codes_v0.5.0))) { + rightpart <- gsub("^[BFP]_[A-Z]{3,7}(.*)", "\\1", x) + leftpart <- mo_codes_v0.5.0[leftpart] + x[!is.na(leftpart)] <- paste0(leftpart[!is.na(leftpart)], rightpart[!is.na(leftpart)]) + } + # now check if some are still old + still_old <- x[x %in% names(mo_codes_v0.5.0)] + if (length(still_old) > 0) { + x[x %in% names(mo_codes_v0.5.0)] <- data.frame(old = still_old, stringsAsFactors = FALSE) %>% + left_join(data.frame(old = names(mo_codes_v0.5.0), + new = mo_codes_v0.5.0, + stringsAsFactors = FALSE), by = "old") %>% + # if they couldn't be found, replace them with the old ones again, + # so they will throw a warning in the end + mutate(new = ifelse(is.na(new), old, new)) %>% + pull(new) + } + } + + # # defined df to check for + # if (!is.null(reference_df)) { + # if (!mo_source_isvalid(reference_df)) { + # stop("`reference_df` must contain a column `mo` with values from the 'microorganisms' data set.", call. = FALSE) + # } + # reference_df <- reference_df %>% filter(!is.na(mo)) + # # keep only first two columns, second must be mo + # if (colnames(reference_df)[1] == "mo") { + # reference_df <- reference_df[, c(2, 1)] + # } else { + # reference_df <- reference_df[, c(1, 2)] + # } + # colnames(reference_df)[1] <- "x" + # # remove factors, just keep characters + # suppressWarnings( + # reference_df[] <- lapply(reference_df, as.character) + # ) + # } + # + # # all empty + # if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) { + # if (property == "mo") { + # return(to_class_mo(rep(NA_character_, length(x_input)))) + # } else { + # return(rep(NA_character_, length(x_input))) + # } + # + # } else if (all(x %in% reference_df[, 1][[1]])) { + # # all in reference df + # colnames(reference_df)[1] <- "x" + # suppressWarnings( + # x <- data.frame(x = x, stringsAsFactors = FALSE) %>% + # left_join(reference_df, by = "x") %>% + # left_join(AMR::microorganisms, by = "mo") %>% + # pull(property) + # ) + # + # } else if (all(x %in% AMR::microorganisms$mo)) { + # # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL") + # y <- microorganismsDT[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]] + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(mo = x[is.na(y)]), + # on = "mo", + # ..property][[1]] + # } + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(mo = x[is.na(y)]), + # on = "mo", + # ..property][[1]] + # } + # x <- y + # + # # } else if (all(x %in% read_mo_history(uncertainty_level, + # # force = force_mo_history)$x)) { + # # # previously found code + # # x <- microorganismsDT[data.table(mo = get_mo_history(x, + # # uncertainty_level, + # # force = force_mo_history)), + # # on = "mo", ..property][[1]] + # + # } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { + # # we need special treatment for very prevalent full names, they are likely! + # # e.g. as.mo("Staphylococcus aureus") + # y <- microorganismsDT[prevalence == 1][data.table(fullname_lower = tolower(x)), on = "fullname_lower", ..property][[1]] + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(fullname_lower = tolower(x[is.na(y)])), + # on = "fullname_lower", + # ..property][[1]] + # } + # if (any(is.na(y))) { + # y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(fullname_lower = tolower(x[is.na(y)])), + # on = "fullname_lower", + # ..property][[1]] + # } + # x <- y + # + # } else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) { + # # commonly used MO codes + # y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ] + # # save them to history + # set_mo_history(x, y$mo, 0, force = force_mo_history) + # + # x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] + # + # } else if (!all(x %in% AMR::microorganisms[, property])) { + # + if (1 == 1) { + strip_whitespace <- function(x) { + # all whitespaces (tab, new lines, etc.) should be one space + # and spaces before and after should be omitted + trimws(gsub("[\\s]+", " ", x, perl = TRUE), which = "both") + } + + + x_new <- rep(NA_character_, length(x)) + + # keep only dots, letters, numbers, slashes, spaces and dashes + x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) + # remove spp and species + x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x, ignore.case = TRUE) + # remove 'genus' as first word + x <- gsub("^genus ", "", x, ignore.case = TRUE) + # remove 'uncertain'-like texts + x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) + x <- strip_whitespace(x) + x_backup <- x + + # remove spp and species + #x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE) + #x <- strip_whitespace(x) + + x_backup_without_spp <- x + x_species <- paste(x, "species") + # translate to English for supported languages of mo_property + x <- gsub("(gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) + x <- gsub("(vergroen)[a-z]*", "viridans", x, ignore.case = TRUE) + x <- gsub("(hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*", "yeast", x, ignore.case = TRUE) + x <- gsub("(schimmels?|mofo|molde|stampo|moisissure|fungi)[a-z]*", "fungus", x, ignore.case = TRUE) + x <- gsub("Fungus[ph|f]rya", "Fungiphrya", x, ignore.case = TRUE) + # remove non-text in case of "E. coli" except dots and spaces + # x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) + # replace minus by a space + x <- gsub("-+", " ", x) + # replace hemolytic by haemolytic + x <- gsub("ha?emoly", "haemoly", x, ignore.case = TRUE) + # place minus back in streptococci + x <- gsub("(alpha|beta|gamma).?ha?emoly", "\\1-haemoly", x, ignore.case = TRUE) + # remove genus as first word + # x <- gsub("^genus ", "", x, ignore.case = TRUE) + # remove 'uncertain' like texts + # x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) + # allow characters that resemble others = dyslexia_mode ---- + if (dyslexia_mode == TRUE) { + x <- tolower(x) + x <- gsub("[iy]+", "[iy]+", x) + x <- gsub("(c|k|q|qu|s|z|x|ks)+", "(c|k|q|qu|s|z|x|ks)+", x) + x <- gsub("(ph|hp|f|v)+", "(ph|hp|f|v)+", x) + x <- gsub("(th|ht|t)+", "(th|ht|t)+", x) + x <- gsub("a+", "a+", x) + x <- gsub("u+", "u+", x) + # allow any ending of -um, -us, -ium, -icum, -ius, -icus, -ica and -a (needs perl for the negative backward lookup): + x <- gsub("(u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z])", + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("(\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z])", + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("(\\[iy\\]\\+u\\+m)(?![a-z])", + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("e+", "e+", x, ignore.case = TRUE) + x <- gsub("o+", "o+", x, ignore.case = TRUE) + x <- gsub("(.)\\1+", "\\1+", x) + # allow ending in -en or -us + x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, ignore.case = TRUE, perl = TRUE) + # if the input is longer than 10 characters, allow any constant between all characters, as some might have forgotten a character + # this will allow "Pasteurella damatis" to be correctly read as "Pasteurella dagmatis". + constants <- paste(letters[!letters %in% c("a", "e", "i", "o", "u")], collapse = "") + #x[nchar(x_backup_without_spp) > 10] <- gsub("([a-z])([a-z])", paste0("\\1[", constants, "]?\\2"), x[nchar(x_backup_without_spp) > 10], ignore.case = TRUE) + x[nchar(x_backup_without_spp) > 10] <- gsub("[+]", paste0("+[", constants, "]?"), x[nchar(x_backup_without_spp) > 10]) + } + x <- strip_whitespace(x) + + x_trimmed <- x + x_trimmed_species <- paste(x_trimmed, "species") + x_trimmed_without_group <- gsub(" gro.u.p$", "", x_trimmed, ignore.case = TRUE) + # remove last part from "-" or "/" + x_trimmed_without_group <- gsub("(.*)[-/].*", "\\1", x_trimmed_without_group) + # replace space and dot by regex sign + x_withspaces <- gsub("[ .]+", ".* ", x) + x <- gsub("[ .]+", ".*", x) + # add start en stop regex + x <- paste0('^', x, '$') + x_withspaces_start_only <- paste0('^', x_withspaces) + x_withspaces_end_only <- paste0(x_withspaces, '$') + x_withspaces_start_end <- paste0('^', x_withspaces, '$') + + if (isTRUE(debug)) { + print(data.frame( + x_backup, + x, + x_species, + x_withspaces_start_only, + x_withspaces_end_only, + x_withspaces_start_end, + x_backup_without_spp, + x_trimmed, + x_trimmed_species, + x_trimmed_without_group), right = FALSE) + # cat(paste0('x "', x, '"\n')) + # cat(paste0('x_species "', x_species, '"\n')) + # cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n')) + # cat(paste0('x_withspaces_end_only "', x_withspaces_end_only, '"\n')) + # cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n')) + # cat(paste0('x_backup "', x_backup, '"\n')) + # cat(paste0('x_backup_without_spp "', x_backup_without_spp, '"\n')) + # cat(paste0('x_trimmed "', x_trimmed, '"\n')) + # cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) + # cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) + } + + #progress <- progress_estimated(n = length(x), min_time = 3) + + # THE NEW WAY ---- + nothing_more_to_do <- function() !any(is.na(x_new) & !is.na(x_backup)) + + lookup_regexes <- function(data, property, regex) { + prop <- regex %>% + sapply(function(pattern) pull(data, property) %like% pattern) %>% + as.data.frame() %>% + lapply(function(c) suppressWarnings(min(pull(data, property)[c]))) %>% + unlist() + if (is.null(prop)) { + return(rep(NA, length(regex))) + } + DT <- data.table(prop) + colnames(DT) <- property + microorganismsDT[DT, on = property, "mo"][[1]] + } + + # LATER: only unique if more than 500 values, of which max 85% distinct? + + x_backup_upper <- toupper(x_backup) + x_backup_lower <- tolower(x_backup) + + # exclude all viruses (there is no fullname containing 'virus' in the data set) + x_new[x_backup %like% "virus"] <- "UNKNOWN" + + # try available fields in the microorganisms data set + x_new[x_backup_upper %in% microorganisms$mo] <- microorganismsDT[data.table(mo = x_backup_upper[x_backup_upper %in% microorganisms$mo]), on = "mo", "mo"][[1]] + x_new[x_backup_lower %in% microorganismsDT$fullname_lower] <- microorganismsDT[data.table(fullname_lower = x_backup_lower[x_backup_lower %in% microorganismsDT$fullname_lower]), on = "fullname_lower", "mo"][[1]] + # x_new[x_backup %in% microorganisms$col_id] <- microorganismsDT[data.table(col_id = as.integer(x_backup[x_backup %in% microorganisms$col_id])), on = "col_id", ..property][[1]] + + # old names + old_names <- x_backup[x_backup_lower %in% microorganisms.oldDT$fullname_lower] + x_new[x_backup_lower %in% microorganisms.oldDT$fullname_lower] <- microorganismsDT[microorganisms.oldDT[data.table(fullname_lower = x_backup_lower[x_backup_lower %in% microorganisms.oldDT$fullname_lower]), on = "fullname_lower", "col_id_new"], on = c("col_id" = "col_id_new"), "mo"][[1]] + + if (nothing_more_to_do()) { + if (property != "mo") { + return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) + } else { + return(to_class_mo(x_new)) + } + } + + # codes from the microorganisms.codes data set + x_new[x_backup_upper %in% microorganisms.codes$code] <- as.data.table(microorganisms.codes)[data.table(code = x_backup_upper[x_backup_upper %in% microorganisms.codes$code]), on = "code", "mo"][[1]] + if (!is.null(reference_df)) { + colnames(reference_df)[1] <- "code" + x_new[x_backup_upper %in% reference_df$code] <- as.data.table(reference_df)[data.table(code = x_backup_upper[x_backup_upper %in% reference_df$code]), on = "code", mo][[1]] + if (!all(x_new %in% microorganisms$mo, na.rm = TRUE)) { + warning("Values ", paste(x_new[!x_new %in% c(NA, microorganisms$mo)], collapse = ", "), " found in reference_df, but these are not valid MO codes.", call. = FALSE) + x_new[!x_new %in% c(NA, microorganisms$mo)] <- "UNKNOWN" + } + } + + x_new[x_backup_upper %in% c("MRSA", "MSSA", "VISA", "VRSA")] <- "B_STPHY_AUR" + x_new[x_backup_upper %in% c("MRSE", "MSSE")] <- "B_STPHY_EPI" + x_new[x_backup_upper %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") + | x_backup_upper %like% "O?(26|103|104|111|121|145|157)"] <- "B_ESCHR_COL" + x_new[x_backup_upper %in% c("MRPA")] <- "B_PSDMN_AER" + x_new[x_backup_upper %in% c("CRSM")] <- "B_STNTR_MAL" + x_new[x_backup_upper %in% c("PISP", "PRSP", "VISP", "VRSP")] <- "B_STRPT_PNE" + x_new[x_backup_upper %in% c("VRE") + | x_backup %like% "(enterococci|enterokok|enterococo)[a-z]*?$"] <- "B_ENTRC" + + # start showing progress bar here + progress <- progress_estimated(n = 3, min_time = 0) + # most prevalent (1) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) + if (nothing_more_to_do()) { + if (property != "mo") { + return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) + } else { + return(to_class_mo(x_new)) + } + } + progress$tick()$print() + # less prevalent (2) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) + if (nothing_more_to_do()) { + if (property != "mo") { + return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) + } else { + return(to_class_mo(x_new)) + } + } + progress$tick()$print() + # least prevalent (3) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) + x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) + + # all others are UNKNOWN + x_new[!is.na(x_backup) & is.na(x_new)] <- "UNKNOWN" + progress$tick()$print() + + return(to_class_mo(x_new)) + #for (i in 1:length(x)) { + for (i in character(0)) { + + x[i] <- "UNKNOWN" + next + + # progress$tick()$print() + + # if (initial_search == TRUE) { + # found <- microorganismsDT[mo == get_mo_history(x_backup[i], + # uncertainty_level, + # force = force_mo_history), + # ..property][[1]] + # # previously found result + # if (length(found) > 0) { + # x[i] <- found[1L] + # next + # } + # } + + found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] + # is a valid MO code + if (length(found) > 0) { + x[i] <- found[1L] + next + } + + found <- microorganismsDT[fullname_lower %in% tolower(c(x_backup[i], x_backup_without_spp[i])), ..property][[1]] + # most probable: is exact match in fullname + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + found <- microorganismsDT[col_id == x_backup[i], ..property][[1]] + # is a valid Catalogue of Life ID + if (NROW(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + + # WHONET: xxx = no growth + if (tolower(as.character(paste0(x_backup_without_spp[i], ""))) %in% c("", "xxx", "na", "nan")) { + x[i] <- NA_character_ + next + } + + if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) { + # empty and nonsense values, ignore without warning + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # check for very small input, but ignore the O antigens of E. coli + if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3 + & !x_backup_without_spp[i] %like% "O?(26|103|104|104|111|121|145|157)") { + # check if search term was like "A. species", then return first genus found with ^A + # if (x_backup[i] %like% "[a-z]+ species" | x_backup[i] %like% "[a-z] spp[.]?") { + # # get mo code of first hit + # found <- microorganismsDT[fullname %like% x_withspaces_start_only[i], mo] + # if (length(found) > 0) { + # mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_") + # found <- microorganismsDT[mo == mo_code, ..property][[1]] + # # return first genus that begins with x_trimmed, e.g. when "E. spp." + # if (length(found) > 0) { + # x[i] <- found[1L] + # if (initial_search == TRUE) { + # set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + # } + # next + # } + # } + # } + # fewer than 3 chars and not looked for species, add as failure + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + if (x_backup_without_spp[i] %like% "virus") { + # there is no fullname like virus, so don't try to coerce it + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # translate known trivial abbreviations to genus + species ---- + if (!is.na(x_trimmed[i])) { + if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { + x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) { + x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) == "VRE" + | x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { + x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # support for: + # - AIEC (Adherent-Invasive E. coli) + # - ATEC (Atypical Entero-pathogenic E. coli) + # - DAEC (Diffusely Adhering E. coli) + # - EAEC (Entero-Aggresive E. coli) + # - EHEC (Entero-Haemorrhagic E. coli) + # - EIEC (Entero-Invasive E. coli) + # - EPEC (Entero-Pathogenic E. coli) + # - ETEC (Entero-Toxigenic E. coli) + # - NMEC (Neonatal Meningitis‐causing E. coli) + # - STEC (Shiga-toxin producing E. coli) + # - UPEC (Uropathogenic E. coli) + if (toupper(x_backup_without_spp[i]) %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") + # also support O-antigens of E. coli: O26, O103, O104, O111, O121, O145, O157 + | x_backup_without_spp[i] %like% "O?(26|103|104|111|121|145|157)") { + x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) == 'MRPA') { + # multi resistant P. aeruginosa + x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) == 'CRS' + | toupper(x_backup_without_spp[i]) == 'CRSM') { + # co-trim resistant S. maltophilia + x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (toupper(x_backup_without_spp[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) { + # peni I, peni R, vanco I, vanco R: S. pneumoniae + x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') { + # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB) + x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') { + # Streptococci in different languages, like "estreptococos grupo B" + x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { + # Streptococci in different languages, like "Group A Streptococci" + x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'haemoly.*strept') { + # Haemolytic streptococci in different languages + x[i] <- microorganismsDT[mo == 'B_STRPT_HAE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ---- + if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] negatie?[vf]' + | x_trimmed[i] %like% '[ck]oagulas[ea] negatie?[vf]' + | x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') { + # coerce S. coagulase negative + x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] positie?[vf]' + | x_trimmed[i] %like% '[ck]oagulas[ea] positie?[vf]' + | x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # streptococcal groups: milleri and viridans + if (x_trimmed[i] %like% 'strepto.* milleri' + | x_backup_without_spp[i] %like% 'strepto.* milleri' + | x_backup_without_spp[i] %like% 'mgs[^a-z]?$') { + # Milleri Group Streptococcus (MGS) + x[i] <- microorganismsDT[mo == 'B_STRPT_MIL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_trimmed[i] %like% 'strepto.* viridans' + | x_backup_without_spp[i] %like% 'strepto.* viridans' + | x_backup_without_spp[i] %like% 'vgs[^a-z]?$') { + # Viridans Group Streptococcus (VGS) + x[i] <- microorganismsDT[mo == 'B_STRPT_VIR', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'gram[ -]?neg.*' + | x_backup_without_spp[i] %like% 'negatie?[vf]' + | x_trimmed[i] %like% 'gram[ -]?neg.*') { + # coerce Gram negatives + x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% 'gram[ -]?pos.*' + | x_backup_without_spp[i] %like% 'positie?[vf]' + | x_trimmed[i] %like% 'gram[ -]?pos.*') { + # coerce Gram positives + x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (x_backup_without_spp[i] %like% "salmonella [a-z]+ ?.*") { + if (x_backup_without_spp[i] %like% "Salmonella group") { + # Salmonella Group A to Z, just return S. species for now + x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup_without_spp[i], ignore.case = FALSE)) { + # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica + x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + uncertainties <- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = 1, + input = x_backup_without_spp[i], + result_mo = "B_SLMNL_ENT")) + } + next + } + + # trivial names known to the field: + if ("meningococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_NESSR_MEN', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if ("gonococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_NESSR_GON', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if ("pneumococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + } + + # FIRST TRY FULLNAMES AND CODES ---- + # if only genus is available, return only genus + if (all(!c(x[i], x_trimmed[i]) %like% " ")) { + found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if (nchar(x_backup_without_spp[i]) >= 6) { + found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + } + # rest of genus only is in allow_uncertain part. + } + + # TRY OTHER SOURCES ---- + # WHONET and other common LIS codes + if (toupper(x_backup[i]) %in% AMR::microorganisms.codes[, 1]) { + mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L] + if (length(mo_found) > 0) { + x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + } + if (!is.null(reference_df)) { + # self-defined reference + if (x_backup[i] %in% reference_df[, 1]) { + ref_mo <- reference_df[reference_df[, 1] == x_backup[i], "mo"] + if (ref_mo %in% microorganismsDT[, mo]) { + x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] + next + } else { + warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) + } + } + } + + # allow no codes less than 4 characters long, was already checked for WHONET above + if (nchar(x_backup_without_spp[i]) < 4) { + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + check_per_prevalence <- function(data_to_check, + a.x_backup, + b.x_trimmed, + c.x_trimmed_without_group, + d.x_withspaces_start_end, + e.x_withspaces_start_only, + f.x_withspaces_end_only, + g.x_backup_without_spp) { + + # try probable: trimmed version of fullname ---- + found <- data_to_check[fullname_lower %in% tolower(g.x_backup_without_spp), ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + + # try any match keeping spaces ---- + found <- data_to_check[fullname %like% d.x_withspaces_start_end, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + # try any match keeping spaces, not ending with $ ---- + found <- data_to_check[fullname %like% paste0(trimws(e.x_withspaces_start_only), " "), ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + # try any match keeping spaces, not start with ^ ---- + found <- data_to_check[fullname %like% paste0(" ", trimws(f.x_withspaces_end_only)), ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + + # try a trimmed version + found <- data_to_check[fullname_lower %like% b.x_trimmed + | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + + # try splitting of characters in the middle and then find ID ---- + # only when text length is 6 or lower + # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus + if (nchar(g.x_backup_without_spp) <= 6) { + x_length <- nchar(g.x_backup_without_spp) + x_split <- paste0("^", + g.x_backup_without_spp %>% substr(1, x_length / 2), + '.* ', + g.x_backup_without_spp %>% substr((x_length / 2) + 1, x_length)) + found <- data_to_check[fullname %like% x_split, ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + } + + # try fullname without start and without nchar limit of >= 6 ---- + # like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH + found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] + if (length(found) > 0) { + return(found[1L]) + } + + # didn't found any + return(NA_character_) + } + + # FIRST TRY VERY PREVALENT IN HUMAN INFECTIONS ---- + x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 1], + a.x_backup = x_backup[i], + b.x_trimmed = x_trimmed[i], + c.x_trimmed_without_group = x_trimmed_without_group[i], + d.x_withspaces_start_end = x_withspaces_start_end[i], + e.x_withspaces_start_only = x_withspaces_start_only[i], + f.x_withspaces_end_only = x_withspaces_end_only[i], + g.x_backup_without_spp = x_backup_without_spp[i]) + if (!empty_result(x[i])) { + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # THEN TRY PREVALENT IN HUMAN INFECTIONS ---- + x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 2], + a.x_backup = x_backup[i], + b.x_trimmed = x_trimmed[i], + c.x_trimmed_without_group = x_trimmed_without_group[i], + d.x_withspaces_start_end = x_withspaces_start_end[i], + e.x_withspaces_start_only = x_withspaces_start_only[i], + f.x_withspaces_end_only = x_withspaces_end_only[i], + g.x_backup_without_spp = x_backup_without_spp[i]) + if (!empty_result(x[i])) { + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + # THEN UNPREVALENT IN HUMAN INFECTIONS ---- + x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 3], + a.x_backup = x_backup[i], + b.x_trimmed = x_trimmed[i], + c.x_trimmed_without_group = x_trimmed_without_group[i], + d.x_withspaces_start_end = x_withspaces_start_end[i], + e.x_withspaces_start_only = x_withspaces_start_only[i], + f.x_withspaces_end_only = x_withspaces_end_only[i], + g.x_backup_without_spp = x_backup_without_spp[i]) + if (!empty_result(x[i])) { + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # MISCELLANEOUS ---- + + # look for old taxonomic names ---- + found <- microorganisms.oldDT[fullname_lower == tolower(x_backup[i]) + | fullname %like% x_withspaces_start_end[i],] + if (NROW(found) > 0) { + col_id_new <- found[1, col_id_new] + # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: + # mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning) + # mo_ref("Chlamydophila psittaci") = "Everett et al., 1999" + if (property == "ref") { + x[i] <- found[1, ref] + } else { + x[i] <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] + } + options(mo_renamed_last_run = found[1, fullname]) + was_renamed(name_old = found[1, fullname], + name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], + ref_old = found[1, ref], + ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], + mo = microorganismsDT[col_id == found[1, col_id_new], mo]) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + + # check for uncertain results ---- + uncertain_fn <- function(a.x_backup, + b.x_trimmed, + c.x_withspaces_start_end, + d.x_withspaces_start_only, + f.x_withspaces_end_only, + g.x_backup_without_spp) { + + if (uncertainty_level == 0) { + # do not allow uncertainties + return(NA_character_) + } + + # UNCERTAINTY LEVEL 1 ---- + if (uncertainty_level >= 1) { + now_checks_for_uncertainty_level <- 1 + + # (1) look again for old taxonomic names, now for G. species ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (1) look again for old taxonomic names, now for G. species\n") + } + if (isTRUE(debug)) { + message("Running '", c.x_withspaces_start_end, "' and '", d.x_withspaces_start_only, "'") + } + found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end + | fullname %like% d.x_withspaces_start_only] + if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + if (property == "ref") { + # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: + # mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning) + # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999" + x <- found[1, ref] + } else { + x <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] + } + was_renamed(name_old = found[1, fullname], + name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], + ref_old = found[1, ref], + ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], + mo = microorganismsDT[col_id == found[1, col_id_new], mo]) + options(mo_renamed_last_run = found[1, fullname]) + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = microorganismsDT[col_id == found[1, col_id_new], mo])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x, property), 1, force = force_mo_history) + } + return(x) + } + + # (2) Try with misspelled input ---- + # just rerun with dyslexia_mode = TRUE will used the extensive regex part above + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (2) Try with misspelled input\n") + } + if (isTRUE(debug)) { + message("Running '", a.x_backup, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 1, force = force_mo_history) + } + return(found[1L]) + } + } + + # UNCERTAINTY LEVEL 2 ---- + if (uncertainty_level >= 2) { + now_checks_for_uncertainty_level <- 2 + + # (3) look for genus only, part of name ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (3) look for genus only, part of name\n") + } + if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") { + if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { + if (isTRUE(debug)) { + message("Running '", paste(b.x_trimmed, "species"), "'") + } + # not when input is like Genustext, because then Neospora would lead to Actinokineospora + found <- microorganismsDT[fullname_lower %like% paste(b.x_trimmed, "species"), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x, property), 2, force = force_mo_history) + } + return(x) + } + } + } + + # (4) strip values between brackets ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (4) strip values between brackets\n") + } + a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) + a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) + if (isTRUE(debug)) { + message("Running '", a.x_backup_stripped, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + + # (5) inverse input ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (5) inverse input\n") + } + a.x_backup_inversed <- paste(rev(unlist(strsplit(a.x_backup, split = " "))), collapse = " ") + if (isTRUE(debug)) { + message("Running '", a.x_backup_inversed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + + # (6) try to strip off half an element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + lastword <- x_strip[length(x_strip) - i + 1] + lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2)) + # remove last half of the second term + x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") + if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + } + # (7) try to strip off one element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (7) try to strip off one element from end and check the remains\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (nchar(x_strip_collapsed) >= 6) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + } + # (8) check for unknown yeasts/fungi ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (8) check for unknown yeasts/fungi\n") + } + if (b.x_trimmed %like% "yeast") { + found <- "F_YEAST" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + if (b.x_trimmed %like% "(fungus|fungi)" & !b.x_trimmed %like% "Fungiphrya") { + found <- "F_FUNGUS" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + # (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { + for (i in 2:(length(x_strip))) { + x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found_result[1L], ..property][[1]] + # uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3) + if (x_strip_collapsed %like% " ") { + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + } + } + + # UNCERTAINTY LEVEL 3 ---- + if (uncertainty_level >= 3) { + now_checks_for_uncertainty_level <- 3 + + # (10) try to strip off one element from start and check the remains (any text size) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { + for (i in 2:(length(x_strip))) { + x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) + } + return(found[1L]) + } + } + } + # (11) try to strip off one element from end and check the remains (any text size) ---- + # (this is in fact 7 but without nchar limit of >=6) + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (11) try to strip off one element from end and check the remains (any text size)\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + + # (12) part of a name (very unlikely match) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (12) part of a name (very unlikely match)\n") + } + if (isTRUE(debug)) { + message("Running '", f.x_withspaces_end_only, "'") + } + found <- microorganismsDT[fullname %like% f.x_withspaces_end_only] + if (nrow(found) > 0) { + found_result <- found[["mo"]] + if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) { + found <- microorganismsDT[mo == found_result[1L], ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) + } + return(found[1L]) + } + } + } + + # didn't found in uncertain results too + return(NA_character_) + } + x[i] <- uncertain_fn(x_backup[i], + x_trimmed[i], + x_withspaces_start_end[i], + x_withspaces_start_only[i], + x_withspaces_end_only[i], + x_backup_without_spp[i]) + if (!empty_result(x[i])) { + # no set_mo_history here - it is already set in uncertain_fn() + next + } + + # no results found: make them UNKNOWN ---- + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + } + } + + # handling failures ---- + failures <- failures[!failures %in% c(NA, NULL, NaN)] + if (length(failures) > 0 & initial_search == TRUE) { + options(mo_failures = sort(unique(failures))) + plural <- c("value", "it", "was") + if (n_distinct(failures) > 1) { + plural <- c("values", "them", "were") + } + total_failures <- length(x_input[as.character(x_input) %in% as.character(failures) & !x_input %in% c(NA, NULL, NaN)]) + total_n <- length(x_input[!x_input %in% c(NA, NULL, NaN)]) + msg <- paste0(nr2char(n_distinct(failures)), " unique ", plural[1], + " (covering ", percent(total_failures / total_n, round = 1, force_zero = TRUE), + ") could not be coerced and ", plural[3], " considered 'unknown'") + if (n_distinct(failures) <= 10) { + msg <- paste0(msg, ": ", paste('"', unique(failures), '"', sep = "", collapse = ', ')) + } + msg <- paste0(msg, ". Use mo_failures() to review ", plural[2], ". Edit the `allow_uncertain` parameter if needed (see ?as.mo).") + warning(red(msg), + call. = FALSE, + immediate. = TRUE) # thus will always be shown, even if >= warnings + } + # handling uncertainties ---- + if (NROW(uncertainties) > 0 & initial_search == TRUE) { + options(mo_uncertainties = as.list(distinct(uncertainties, input, .keep_all = TRUE))) + + plural <- c("", "it") + if (NROW(uncertainties) > 1) { + plural <- c("s", "them") + } + msg <- paste0("\nResult", plural[1], " of ", nr2char(NROW(uncertainties)), " value", plural[1], + " was guessed with uncertainty. Use mo_uncertainties() to review ", plural[2], ".") + warning(red(msg), + call. = FALSE, + immediate. = TRUE) # thus will always be shown, even if >= warnings + } + + # Becker ---- + if (Becker == TRUE | Becker == "all") { + # See Source. It's this figure: + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/ + MOs_staph <- microorganismsDT[genus == "Staphylococcus"] + setkey(MOs_staph, species) + CoNS <- MOs_staph[species %in% c("arlettae", "auricularis", "capitis", + "caprae", "carnosus", "chromogenes", "cohnii", "condimenti", + "devriesei", "epidermidis", "equorum", "felis", + "fleurettii", "gallinarum", "haemolyticus", + "hominis", "jettensis", "kloosii", "lentus", + "lugdunensis", "massiliensis", "microti", + "muscae", "nepalensis", "pasteuri", "petrasii", + "pettenkoferi", "piscifermentans", "rostri", + "saccharolyticus", "saprophyticus", "sciuri", + "stepanovicii", "simulans", "succinus", + "vitulinus", "warneri", "xylosus") + | (species == "schleiferi" & subspecies %in% c("schleiferi", "")), ..property][[1]] + CoPS <- MOs_staph[species %in% c("simiae", "agnetis", + "delphini", "lutrae", + "hyicus", "intermedius", + "pseudintermedius", "pseudointermedius", + "schweitzeri", "argenteus") + | (species == "schleiferi" & subspecies == "coagulans"), ..property][[1]] + + # warn when species found that are not in Becker (2014, PMID 25278577) and Becker (2019, PMID 30872103) + post_Becker <- c("argensis", "caeli", "cornubiensis", "edaphicus") + if (any(x %in% MOs_staph[species %in% post_Becker, ..property][[1]])) { + + warning("Becker ", italic("et al."), " (2014, 2019) does not contain these species named after their publication: ", + italic(paste("S.", + sort(mo_species(unique(x[x %in% MOs_staph[species %in% post_Becker, ..property][[1]]]))), + collapse = ", ")), + ".", + call. = FALSE, + immediate. = TRUE) + } + + x[x %in% CoNS] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] + x[x %in% CoPS] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + if (Becker == "all") { + x[x %in% microorganismsDT[mo %like% '^B_STPHY_AUR', ..property][[1]]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + } + } + + # Lancefield ---- + if (Lancefield == TRUE | Lancefield == "all") { + # group A - S. pyogenes + x[x == microorganismsDT[mo == 'B_STRPT_PYO', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRA', ..property][[1]][1L] + # group B - S. agalactiae + x[x == microorganismsDT[mo == 'B_STRPT_AGA', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRB', ..property][[1]][1L] + # group C + S_groupC <- microorganismsDT %>% filter(genus == "Streptococcus", + species %in% c("equisimilis", "equi", + "zooepidemicus", "dysgalactiae")) %>% + pull(property) + x[x %in% S_groupC] <- microorganismsDT[mo == 'B_STRPT_GRC', ..property][[1]][1L] + if (Lancefield == "all") { + # all Enterococci + x[x %like% "^(Enterococcus|B_ENTRC)"] <- microorganismsDT[mo == 'B_STRPT_GRD', ..property][[1]][1L] + } + # group F - S. anginosus + x[x == microorganismsDT[mo == 'B_STRPT_ANG', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRF', ..property][[1]][1L] + # group H - S. sanguinis + x[x == microorganismsDT[mo == 'B_STRPT_SAN', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRH', ..property][[1]][1L] + # group K - S. salivarius + x[x == microorganismsDT[mo == 'B_STRPT_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRK', ..property][[1]][1L] + } + + # Wrap up ---------------------------------------------------------------- + + # comply to x, which is also unique and without empty values + x_input_unique_nonempty <- unique(x_input[!is.na(x_input) + & !is.null(x_input) + & !identical(x_input, "") + & !identical(x_input, "xxx")]) + + # left join the found results to the original input values (x_input) + df_found <- data.frame(input = as.character(x_input_unique_nonempty), + found = as.character(x), + stringsAsFactors = FALSE) + df_input <- data.frame(input = as.character(x_input), + stringsAsFactors = FALSE) + + suppressWarnings( + x <- df_input %>% + left_join(df_found, + by = "input") %>% + pull(found) + ) + + if (property == "mo") { + x <- to_class_mo(x) + } + + if (length(mo_renamed()) > 0) { + print(mo_renamed()) + } + + x +} diff --git a/R/sysdata.rda b/R/sysdata.rda index 186c9e9c..9588a803 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/get_locale.R b/R/translate.R similarity index 69% rename from R/get_locale.R rename to R/translate.R index a7248214..d0fe61fe 100755 --- a/R/get_locale.R +++ b/R/translate.R @@ -64,7 +64,7 @@ #' mo_name("CoNS", language = "pt") #' #> "Staphylococcus coagulase negativo (CoNS)" get_locale <- function() { - if (getOption("AMR_locale", "en") != "en") { + if (!is.null(getOption("AMR_locale", default = NULL))) { return(getOption("AMR_locale")) } @@ -73,6 +73,7 @@ get_locale <- function() { # Check the locale settings for a start with one of these languages: # grepl() with ignore.case = FALSE is faster than %like% + if (grepl("^(English|en_|EN_)", lang, ignore.case = FALSE)) { # as first option to optimise speed "en" @@ -93,3 +94,55 @@ get_locale <- function() { "en" } } + +# translate strings based on inst/translations.tsv +#' @importFrom dplyr %>% filter +translate_AMR <- function(from, language = get_locale(), only_unknown = FALSE) { + + if (is.null(language)) { + return(from) + } + if (language %in% c("en", "", NA)) { + return(from) + } + + df_trans <- translations_file # internal data file + + if (!language %in% df_trans$lang) { + stop("Unsupported language: '", language, "' - use one of: ", + paste0("'", sort(unique(df_trans$lang)), "'", collapse = ", "), + call. = FALSE) + } + + df_trans <- df_trans %>% filter(lang == language) + if (only_unknown == TRUE) { + df_trans <- df_trans %>% filter(pattern %like% "unknown") + } + + # default case sensitive if value if 'ignore.case' is missing: + df_trans$ignore.case[is.na(df_trans$ignore.case)] <- FALSE + # default not using regular expressions (fixed = TRUE) if 'fixed' is missing: + df_trans$fixed[is.na(df_trans$fixed)] <- TRUE + + # check if text to look for is in one of the patterns + any_form_in_patterns <- tryCatch(any(from %like% paste0("(", paste(df_trans$pattern, collapse = "|"), ")")), + error = function(e) { + warning("Translation not possible. Please open an issue on GitLab (https://gitlab.com/msberends/AMR/issues) or GitHub (https://github.com/msberends/AMR/issues).", call. = FALSE) + return(FALSE) + }) + if (NROW(df_trans) == 0 | !any_form_in_patterns) { + return(from) + } + + for (i in 1:nrow(df_trans)) { + from <- gsub(x = from, + pattern = df_trans$pattern[i], + replacement = df_trans$replacement[i], + fixed = df_trans$fixed[i], + ignore.case = df_trans$ignore.case[i]) + } + + # force UTF-8 for diacritics + base::enc2utf8(from) + +} diff --git a/data-raw/eucast_rules.tsv b/data-raw/eucast_rules.tsv index 664994a0..c80f122c 100644 --- a/data-raw/eucast_rules.tsv +++ b/data-raw/eucast_rules.tsv @@ -1,9 +1,9 @@ # --------------------------------------------------------------------------------------------------- # For editing this EUCAST reference file, these values can all be used for target antibiotics: -# all_betalactams, aminoglycosides, carbapenems, cephalosporins, cephalosporins_without_CAZ, fluoroquinolones, -# glycopeptides, macrolides, minopenicillins, polymyxins, streptogramins, tetracyclines, ureidopenicillins -# and all separate EARS-Net letter codes like AMC. They can be separated by comma: 'AMC, fluoroquinolones'. -# The if_mo_property column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". +# 'all_betalactams', 'aminoglycosides', 'aminopenicillins', 'carbapenems', 'cephalosporins', 'cephalosporins_without_CAZ', +# 'fluoroquinolones', 'glycopeptides', 'macrolides', 'polymyxins', 'streptogramins', 'tetracyclines', 'ureidopenicillins' +# and all separate EARS-Net letter codes like 'AMC'. They can be separated by comma: 'AMC, fluoroquinolones'. +# The 'if_mo_property' column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". # The like.is.one_of column must contain one of: like, is, one_of ('like' will read the first column as regular expression) # The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. All species in this group can be found in: LiPuma J, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180). # >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<< diff --git a/data-raw/translations.tsv b/data-raw/translations.tsv index b1f6e6dc..feba8456 100644 --- a/data-raw/translations.tsv +++ b/data-raw/translations.tsv @@ -354,7 +354,7 @@ nl Ticarcillin/beta-lactamase inhibitor Ticarcilline/enzymremmer nl Ticarcillin/clavulanic acid Ticarcilline/clavulaanzuur nl Tinidazole Tinidazol nl Tobramycin Tobramycine -nl Trimethoprim/sulfamethoxazole Trimethoprim/sulfamethoxazol +nl Trimethoprim/sulfamethoxazole Cotrimoxazol nl Troleandomycin Troleandomycine nl Trovafloxacin Trovafloxacine nl Vancomycin Vancomycine diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 2ba7fb61..64d7693f 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/articles/index.html b/docs/articles/index.html index 0579eeae..0b297047 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/authors.html b/docs/authors.html index 7ae2bb88..6bddc998 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/index.html b/docs/index.html index 5376e2af..98dc1422 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/news/index.html b/docs/news/index.html index f74cb984..e58ae6de 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 @@ -225,11 +225,11 @@ -
+

-AMR 0.7.1.9071 Unreleased +AMR 0.7.1.9072 Unreleased

-

Last updated: 03-Sep-2019

+

Last updated: 12-Sep-2019

Breaking

@@ -342,6 +342,9 @@ Since this is a major change, usage of the old also_single_tested w
  • Function availability() now uses portion_R() instead of portion_IR(), to comply with EUCAST insights
  • Functions age() and age_groups() now have a na.rm parameter to remove empty values
  • Renamed function p.symbol() to p_symbol() (the former is now deprecated and will be removed in a future version)
  • +
  • Using negative values for x in age_groups() will now introduce NAs and not return an error anymore
  • +
  • Fix for determining the system’s language
  • +
  • Fix for key_antibiotics() on foreign systems
  • @@ -1261,7 +1264,7 @@ Using as.mo(..., allow_uncertain = 3)

    Contents

    diff --git a/docs/reference/age.html b/docs/reference/age.html index e3af9941..43e57c9b 100644 --- a/docs/reference/age.html +++ b/docs/reference/age.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9070 + 0.7.1.9072
    diff --git a/docs/reference/age_groups.html b/docs/reference/age_groups.html index b7ab5a4e..4069613a 100644 --- a/docs/reference/age_groups.html +++ b/docs/reference/age_groups.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9070 + 0.7.1.9072
    diff --git a/docs/reference/index.html b/docs/reference/index.html index fd5b9a5f..e85c7ddc 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072
    diff --git a/docs/reference/p_symbol.html b/docs/reference/p_symbol.html index 10bc52e9..98401561 100644 --- a/docs/reference/p_symbol.html +++ b/docs/reference/p_symbol.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9071 + 0.7.1.9072 diff --git a/docs/reference/translate.html b/docs/reference/translate.html index e6f48c26..14176405 100644 --- a/docs/reference/translate.html +++ b/docs/reference/translate.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9055 + 0.7.1.9072 @@ -297,7 +297,7 @@