diff --git a/.Rbuildignore b/.Rbuildignore index 87240fb2..e2e1dfe2 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -21,3 +21,4 @@ ^pkgdown$ ^public$ ^data-raw$ +R/aa_test.R$ diff --git a/.gitignore b/.gitignore index 9a2c3cb6..3b12302a 100755 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ packrat/src/ cran-comments.md data-raw/taxon.tab data-raw/DSMZ_bactnames.xlsx +R/aa_test.R diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8e301830..bcb8cfd5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -118,7 +118,7 @@ coverage: # install missing and outdated packages - Rscript -e 'source(".gitlab-ci.R"); gl_update_pkg_all(repos = "https://cran.rstudio.com", quiet = TRUE, install_pkgdown = FALSE)' # codecov token is set in https://gitlab.com/msberends/AMR/settings/ci_cd - - Rscript -e "cc <- covr::package_coverage(line_exclusions = list('R/atc_online.R', 'R/mo_source.R')); covr::codecov(coverage = cc, token = '$codecov'); cat('Code coverage:', covr::percent_coverage(cc))" + - Rscript -e "cc <- covr::package_coverage(); covr::codecov(coverage = cc, token = '$codecov'); cat('Code coverage:', covr::percent_coverage(cc))" coverage: '/Code coverage: \d+\.\d+/' pages: diff --git a/DESCRIPTION b/DESCRIPTION index 55bf8cad..68ab9046 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.7.1.9072 -Date: 2019-09-12 +Version: 0.7.1.9073 +Date: 2019-09-15 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), @@ -17,6 +17,8 @@ Authors@R: c( family = "Glasner", given = "Corinna", email = "c.glasner@umcg.nl", comment = c(ORCID = "0000-0003-1241-1328")), person(role = "ctb", family = "Hassing", given = c("Erwin", "E.", "A."), email = "e.hassing@certe.nl"), + person(role = "ctb", + family = "Hazenberg", given = c("Eric", "H.", "L.", "C.", "M."), email = "e.hazenberg@jbz.nl"), person(role = "ctb", family = "Lenglet", given = "Annick", email = "annick.lenglet@amsterdam.msf.org"), person(role = "ctb", @@ -57,3 +59,4 @@ License: GPL-2 | file LICENSE Encoding: UTF-8 LazyData: true RoxygenNote: 6.1.1 +StagedInstall: false diff --git a/NAMESPACE b/NAMESPACE index 5d8cef10..f2181bc6 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,7 @@ S3method(type_sum,mic) S3method(type_sum,mo) S3method(type_sum,rsi) export("%like%") +export("%like_case%") export(ab_atc) export(ab_atc_group1) export(ab_atc_group2) @@ -85,6 +86,7 @@ export(availability) export(brmo) export(bug_drug_combinations) export(catalogue_of_life_version) +export(clear_mo_history) export(count_I) export(count_IR) export(count_R) diff --git a/NEWS.md b/NEWS.md index 84de5238..9b7331d8 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 0.7.1.9072 -Last updated: 12-Sep-2019 +# AMR 0.7.1.9073 +Last updated: 15-Sep-2019 ### Breaking * Determination of first isolates now **excludes** all 'unknown' microorganisms at default, i.e. microbial code `"UNKNOWN"`. They can be included with the new parameter `include_unknown`: @@ -72,6 +72,14 @@ ``` ### Changed +* Many algorithm improvements for `as.mo()` (of which some led to additions to the `microorganisms` data set): + * Self-learning algorithm - the function now gains experience from previously determined microorganism IDs and learns from it (yielding 80-95% speed improvement for any guess after the first try) + * Big improvement for misspelled input + * These new trivial names known to the field are now understood: meningococcus, gonococcus, pneumococcus + * Updated to the latest taxonomic data (updated to August 2019, from the International Journal of Systematic and Evolutionary Microbiology + * Added support for Viridans Group Streptococci (VGS) and Milleri Group Streptococci (MGS) + * Added support for 5,000 new fungi + * Added support for unknown yeasts and fungi * Renamed data set `septic_patients` to `example_isolates` * Function `eucast_rules()`: * Fixed a bug for *Yersinia pseudotuberculosis* @@ -83,13 +91,6 @@ * Removed class `atc` - using `as.atc()` is now deprecated in favour of `ab_atc()` and this will return a character, not the `atc` class anymore * Removed deprecated functions `abname()`, `ab_official()`, `atc_name()`, `atc_official()`, `atc_property()`, `atc_tradenames()`, `atc_trivial_nl()` * Fix and speed improvement for `mo_shortname()` -* Algorithm improvements for `as.mo()` (by which some additions were made to the `microorganisms` data set: - * Big improvement for misspelled input - * These new trivial names known to the field are now understood: meningococcus, gonococcus, pneumococcus - * Updated to the latest taxonomic data (updated to August 2019, from the International Journal of Systematic and Evolutionary Microbiology - * Added support for Viridans Group Streptococci (VGS) and Milleri Group Streptococci (MGS) - * Added support for 5,000 new fungi - * Added support for unknown yeasts and fungi * Fix for using `mo_*` functions where the coercion uncertainties and failures would not be available through `mo_uncertainties()` and `mo_failures()` anymore * Deprecated the `country` parameter of `mdro()` in favour of the already existing `guideline` parameter to support multiple guidelines within one country * The `name` of `RIF` is now Rifampicin instead of Rifampin diff --git a/R/like.R b/R/like.R index 6819e44c..eb4836c7 100755 --- a/R/like.R +++ b/R/like.R @@ -21,7 +21,7 @@ #' Pattern Matching #' -#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors. +#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive (use \code{a \%like_case\% b} for case-sensitive matching). Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors, or can both have the same length to iterate over all cases. #' @inheritParams base::grepl #' @return A \code{logical} vector #' @name like @@ -53,14 +53,14 @@ #' left_join_microorganisms() %>% #' filter(genus %like% '^ent') %>% #' freq(genus, species) -like <- function(x, pattern) { +like <- function(x, pattern, ignore.case = TRUE) { if (length(pattern) > 1) { if (length(x) != length(pattern)) { if (length(x) == 1) { x <- rep(x, length(pattern)) } # return TRUE for every 'x' that matches any 'pattern', FALSE otherwise - res <- sapply(pattern, function(pttrn) x %like% pttrn) + res <- sapply(pattern, function(pttrn) base::grepl(pttrn, x, ignore.case = ignore.case)) res2 <- as.logical(rowSums(res)) # get only first item of every hit in pattern res2[duplicated(res)] <- FALSE @@ -71,9 +71,9 @@ like <- function(x, pattern) { res <- vector(length = length(pattern)) for (i in 1:length(res)) { if (is.factor(x[i])) { - res[i] <- as.integer(x[i]) %in% base::grep(pattern[i], levels(x[i]), ignore.case = TRUE) + res[i] <- as.integer(x[i]) %in% base::grep(pattern[i], levels(x[i]), ignore.case = ignore.case) } else { - res[i] <- base::grepl(pattern[i], x[i], ignore.case = TRUE) + res[i] <- base::grepl(pattern[i], x[i], ignore.case = ignore.case) } } return(res) @@ -82,16 +82,24 @@ like <- function(x, pattern) { # the regular way how grepl works; just one pattern against one or more x if (is.factor(x)) { - as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = TRUE) + as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = ignore.case) } else { - tryCatch(base::grepl(pattern, x, ignore.case = TRUE), + tryCatch(base::grepl(pattern, x, ignore.case = ignore.case), error = function(e) ifelse(test = grepl("Invalid regexp", e$message), # try with perl = TRUE: - yes = return(base::grepl(pattern, x, ignore.case = TRUE, perl = TRUE)), + yes = return(base::grepl(pattern, x, ignore.case = ignore.case, perl = TRUE)), no = stop(e$message))) } } #' @rdname like #' @export -"%like%" <- like +"%like%" <- function(x, pattern) { + like(x, pattern, ignore.case = TRUE) +} + +#' @rdname like +#' @export +"%like_case%" <- function(x, pattern) { + like(x, pattern, ignore.case = FALSE) +} diff --git a/R/misc.R b/R/misc.R index ffe9ee5b..214521ec 100755 --- a/R/misc.R +++ b/R/misc.R @@ -41,8 +41,8 @@ percent <- function(x, round = 1, force_zero = FALSE, decimal.mark = getOption(" big.mark <- " " } } - x <- percent_clean(x = x, round = round, force_zero = force_zero, - decimal.mark = decimal.mark, big.mark = big.mark, ...) + percent_clean(x = x, round = round, force_zero = force_zero, + decimal.mark = decimal.mark, big.mark = big.mark, ...) } #' @importFrom crayon blue bold red diff --git a/R/mo.R b/R/mo.R index 520fc225..c22412a8 100755 --- a/R/mo.R +++ b/R/mo.R @@ -37,7 +37,7 @@ #' @keywords mo Becker becker Lancefield lancefield guess #' @details #' \strong{General info} \cr -#' A microbial ID from this package (class: \code{mo}) typically looks like these examples:\cr +#' A microorganism ID from this package (class: \code{mo}) typically looks like these examples:\cr #' \preformatted{ #' Code Full name #' --------------- -------------------------------------- @@ -57,16 +57,15 @@ #' #' Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples. #' -#' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}). +#' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{\link{microorganisms}}). +#' +#' \strong{Self-learning algoritm} \cr +#' The \code{as.mo()} function gains experience from previously determined microorganism IDs and learns from it. This drastically improves both speed and reliability. Use \code{clear_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. +#' +#' Usually, any guess after the first try runs 80-95\% faster than the first try. +#' +# \emph{For now, learning only works per session. If R is closed or terminated, the algorithms reset. This might be resolved in a future version.} #' -# /// THIS PART WAS DELETED FROM THE MAN PAGE -# \strong{Self-learning algoritm} \cr -# The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. -# -# Usually, any guess after the first try runs 80-95\% faster than the first try. -# -# For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a next version. -# //// #' \strong{Intelligent rules} \cr #' This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order: #' \itemize{ @@ -105,7 +104,7 @@ #' #' Use \code{mo_uncertainties()} to get a data.frame with all values that were coerced to a valid value, but with uncertainty. #' -#' Use \code{mo_renamed()} to get a vector with all values that could be coerced based on an old, previously accepted taxonomic name. +#' Use \code{mo_renamed()} to get a data.frame with all values that could be coerced based on an old, previously accepted taxonomic name. #' #' \strong{Microbial prevalence of pathogens in humans} \cr #' The intelligent rules take into account microbial prevalence of pathogens in humans. It uses three groups and all (sub)species are in only one group. These groups are: @@ -117,7 +116,7 @@ #' #' Group 1 contains all common Gram positives and Gram negatives, like all Enterobacteriaceae and e.g. \emph{Pseudomonas} and \emph{Legionella}. #' -#' Group 2 probably contains less microbial pathogens; all other members of phyla that were found in humans in the Northern Netherlands between 2001 and 2018. +#' Group 2 contains probably less pathogenic microorganisms; all other members of phyla that were found in humans in the Northern Netherlands between 2001 and 2018. #' @inheritSection catalogue_of_life Catalogue of Life # (source as a section here, so it can be inherited by other man pages:) #' @section Source: @@ -199,7 +198,10 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_ uncertainty_level <- translate_allow_uncertain(allow_uncertain) - # mo_hist <- get_mo_history(x, uncertainty_level, force = isTRUE(list(...)$force_mo_history)) + mo_hist <- get_mo_history(x, + uncertainty_level, + force = isTRUE(list(...)$force_mo_history), + disable = isTRUE(list(...)$disable_mo_history)) if (mo_source_isvalid(reference_df) & isFALSE(Becker) @@ -227,35 +229,27 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, pull("mo") ) - } else if (all(x %in% AMR::microorganisms$mo) + } else if (all(x %in% microorganismsDT$mo) & isFALSE(Becker) & isFALSE(Lancefield)) { y <- x - # } else if (!any(is.na(mo_hist)) - # & isFALSE(Becker) - # & isFALSE(Lancefield)) { - # # check previously found results - # y <- mo_hist + } else if (!any(is.na(mo_hist)) + & isFALSE(Becker) + & isFALSE(Lancefield)) { + # check previously found results + y <- mo_hist } else if (all(tolower(x) %in% microorganismsDT$fullname_lower) & isFALSE(Becker) & isFALSE(Lancefield)) { # we need special treatment for very prevalent full names, they are likely! (case insensitive) # e.g. as.mo("Staphylococcus aureus") - y <- microorganismsDT[prevalence == 1][data.table(fullname_lower = tolower(x)), - on = "fullname_lower", - "mo"][[1]] - if (any(is.na(y))) { - y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(fullname_lower = tolower(x[is.na(y)])), - on = "fullname_lower", - "mo"][[1]] - } - if (any(is.na(y))) { - y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(fullname_lower = tolower(x[is.na(y)])), - on = "fullname_lower", - "mo"][[1]] - } + y <- data.frame(fullname_lower = tolower(x), + stringsAsFactors = FALSE) %>% + left_join(microorganismsDT, by = "fullname_lower") %>% + pull(mo) + # save them to history set_mo_history(x, y, 0, force = isTRUE(list(...)$force_mo_history)) @@ -289,7 +283,9 @@ is.mo <- function(x) { # param initial_search logical - is FALSE when coming from uncertain tries, which uses exec_as.mo internally too # param dyslexia_mode logical - also check for characters that resemble others # param force_mo_history logical - whether found result must be saved with set_mo_history (default FALSE on non-interactive sessions) +# param disable_mo_history logical - whether set_mo_history and get_mo_history should be ignored # param debug logical - show different lookup texts while searching +# param uncertain_check_prevalence integer - the prevalence to check for when running for uncertain results, follows microorganisms$prevalence exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, @@ -299,7 +295,9 @@ exec_as.mo <- function(x, initial_search = TRUE, dyslexia_mode = FALSE, force_mo_history = FALSE, - debug = FALSE) { + disable_mo_history = FALSE, + debug = FALSE, + reference_data_to_use = microorganismsDT) { if (!"AMR" %in% base::.packages()) { require("AMR") @@ -336,7 +334,6 @@ exec_as.mo <- function(x, } } - notes <- character(0) uncertainties <- data.frame(uncertainty = integer(0), input = character(0), fullname = character(0), @@ -359,7 +356,7 @@ exec_as.mo <- function(x, & !identical(x, "xxx")] # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life) - if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { + if (any(x %like_case% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { leftpart <- gsub("^([BFP]_[A-Z]{3,7}).*", "\\1", x) if (any(leftpart %in% names(mo_codes_v0.5.0))) { rightpart <- gsub("^[BFP]_[A-Z]{3,7}(.*)", "\\1", x) @@ -417,40 +414,45 @@ exec_as.mo <- function(x, pull(property) ) - } else if (all(x %in% AMR::microorganisms$mo)) { + } else if (all(x %in% reference_data_to_use$mo)) { # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL") - y <- microorganismsDT[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]] + y <- reference_data_to_use[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]] if (any(is.na(y))) { - y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(mo = x[is.na(y)]), + y[is.na(y)] <- reference_data_to_use[prevalence == 2][data.table(mo = x[is.na(y)]), on = "mo", ..property][[1]] } if (any(is.na(y))) { - y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(mo = x[is.na(y)]), + y[is.na(y)] <- reference_data_to_use[prevalence == 3][data.table(mo = x[is.na(y)]), on = "mo", ..property][[1]] } x <- y - # } else if (all(x %in% read_mo_history(uncertainty_level, - # force = force_mo_history)$x)) { - # # previously found code - # x <- microorganismsDT[data.table(mo = get_mo_history(x, - # uncertainty_level, - # force = force_mo_history)), - # on = "mo", ..property][[1]] + } else if (all(toupper(x) %in% read_mo_history(uncertainty_level, + force = force_mo_history, + disable = disable_mo_history)$x)) { - } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { + # previously found code + x <- data.frame(mo = get_mo_history(x, + uncertainty_level, + force = force_mo_history, + disable = disable_mo_history), + stringsAsFactors = FALSE) %>% + left_join(AMR::microorganisms, by = "mo") %>% + pull(property) + + } else if (all(tolower(x) %in% reference_data_to_use$fullname_lower)) { # we need special treatment for very prevalent full names, they are likely! # e.g. as.mo("Staphylococcus aureus") - y <- microorganismsDT[prevalence == 1][data.table(fullname_lower = tolower(x)), on = "fullname_lower", ..property][[1]] + y <- reference_data_to_use[prevalence == 1][data.table(fullname_lower = tolower(x)), on = "fullname_lower", ..property][[1]] if (any(is.na(y))) { - y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(fullname_lower = tolower(x[is.na(y)])), + y[is.na(y)] <- reference_data_to_use[prevalence == 2][data.table(fullname_lower = tolower(x[is.na(y)])), on = "fullname_lower", ..property][[1]] } if (any(is.na(y))) { - y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(fullname_lower = tolower(x[is.na(y)])), + y[is.na(y)] <- reference_data_to_use[prevalence == 3][data.table(fullname_lower = tolower(x[is.na(y)])), on = "fullname_lower", ..property][[1]] } @@ -460,45 +462,59 @@ exec_as.mo <- function(x, # commonly used MO codes y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ] # save them to history - set_mo_history(x, y$mo, 0, force = force_mo_history) + set_mo_history(x, y$mo, 0, force = force_mo_history, disable = disable_mo_history) - x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] + x <- reference_data_to_use[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] } else if (!all(x %in% AMR::microorganisms[, property])) { - strip_whitespace <- function(x) { + strip_whitespace <- function(x, dyslexia_mode) { # all whitespaces (tab, new lines, etc.) should be one space # and spaces before and after should be omitted - trimws(gsub("[\\s]+", " ", x, perl = TRUE), which = "both") + trimmed <- trimws(gsub("[\\s]+", " ", x, perl = TRUE), which = "both") + # also, make sure the trailing and leading characters are a-z or 0-9 + # in case of non-regex + if (dyslexia_mode == FALSE) { + trimmed <- gsub("^[^a-zA-Z0-9)(]+", "", trimmed) + trimmed <- gsub("[^a-zA-Z0-9)(]+$", "", trimmed) + } + trimmed } - x <- strip_whitespace(x) + x <- strip_whitespace(x, dyslexia_mode) x_backup <- x + # from here on case-insensitive + x <- tolower(x) + # remove spp and species - x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE) - x <- strip_whitespace(x) + x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x, ignore.case = TRUE) + x <- gsub("(spp.?|ssp.?|subsp.?|subspecies|biovar|serovar|species)", "", x, ignore.case = TRUE) + x <- strip_whitespace(x, dyslexia_mode) x_backup_without_spp <- x x_species <- paste(x, "species") # translate to English for supported languages of mo_property - x <- gsub("(gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) - x <- gsub("(vergroen)[a-z]*", "viridans", x, ignore.case = TRUE) - x <- gsub("(hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*", "yeast", x, ignore.case = TRUE) - x <- gsub("(schimmels?|mofo|molde|stampo|moisissure|fungi)[a-z]*", "fungus", x, ignore.case = TRUE) - x <- gsub("Fungus[ph|f]rya", "Fungiphrya", x, ignore.case = TRUE) + x <- gsub("(gruppe|groep|grupo|gruppo|groupe)", "group", x) + # no groups and complexes as ending + x <- gsub("(complex|group)$", "", x) + x <- gsub("((an)?aero+b)[a-z]*", "", x) + x <- gsub("(vergroen)[a-z]*", "viridans", x) + x <- gsub("(hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*", "yeast", x) + x <- gsub("(schimmels?|mofo|molde|stampo|moisissure|fungi)[a-z]*", "fungus", x) + x <- gsub("fungus[ph|f]rya", "fungiphrya", x) # remove non-text in case of "E. coli" except dots and spaces x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) # replace minus by a space x <- gsub("-+", " ", x) # replace hemolytic by haemolytic - x <- gsub("ha?emoly", "haemoly", x, ignore.case = TRUE) + x <- gsub("ha?emoly", "haemoly", x) # place minus back in streptococci - x <- gsub("(alpha|beta|gamma).?ha?emoly", "\\1-haemoly", x, ignore.case = TRUE) + x <- gsub("(alpha|beta|gamma).?ha?emoly", "\\1-haemoly", x) # remove genus as first word - x <- gsub("^genus ", "", x, ignore.case = TRUE) + x <- gsub("^genus ", "", x) # remove 'uncertain' like texts - x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) + x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x)) # allow characters that resemble others = dyslexia_mode ---- if (dyslexia_mode == TRUE) { x <- tolower(x) @@ -510,27 +526,27 @@ exec_as.mo <- function(x, x <- gsub("u+", "u+", x) # allow any ending of -um, -us, -ium, -icum, -ius, -icus, -ica and -a (needs perl for the negative backward lookup): x <- gsub("(u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z])", - "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, perl = TRUE) x <- gsub("(\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z])", - "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, perl = TRUE) x <- gsub("(\\[iy\\]\\+u\\+m)(?![a-z])", - "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) - x <- gsub("e+", "e+", x, ignore.case = TRUE) - x <- gsub("o+", "o+", x, ignore.case = TRUE) + "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, perl = TRUE) + x <- gsub("e+", "e+", x) + x <- gsub("o+", "o+", x) x <- gsub("(.)\\1+", "\\1+", x) # allow ending in -en or -us - x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, ignore.case = TRUE, perl = TRUE) + x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, perl = TRUE) # if the input is longer than 10 characters, allow any constant between all characters, as some might have forgotten a character # this will allow "Pasteurella damatis" to be correctly read as "Pasteurella dagmatis". constants <- paste(letters[!letters %in% c("a", "e", "i", "o", "u")], collapse = "") - #x[nchar(x_backup_without_spp) > 10] <- gsub("([a-z])([a-z])", paste0("\\1[", constants, "]?\\2"), x[nchar(x_backup_without_spp) > 10], ignore.case = TRUE) + #x[nchar(x_backup_without_spp) > 10] <- gsub("([a-z])([a-z])", paste0("\\1[", constants, "]?\\2"), x[nchar(x_backup_without_spp) > 10]) x[nchar(x_backup_without_spp) > 10] <- gsub("[+]", paste0("+[", constants, "]?"), x[nchar(x_backup_without_spp) > 10]) } - x <- strip_whitespace(x) + x <- strip_whitespace(x, dyslexia_mode) x_trimmed <- x x_trimmed_species <- paste(x_trimmed, "species") - x_trimmed_without_group <- gsub(" gro.u.p$", "", x_trimmed, ignore.case = TRUE) + x_trimmed_without_group <- gsub(" gro.u.p$", "", x_trimmed) # remove last part from "-" or "/" x_trimmed_without_group <- gsub("(.*)[-/].*", "\\1", x_trimmed_without_group) # replace space and dot by regex sign @@ -538,6 +554,7 @@ exec_as.mo <- function(x, x <- gsub("[ .]+", ".*", x) # add start en stop regex x <- paste0('^', x, '$') + x_withspaces_start_only <- paste0('^', x_withspaces) x_withspaces_end_only <- paste0(x_withspaces, '$') x_withspaces_start_end <- paste0('^', x_withspaces, '$') @@ -561,41 +578,42 @@ exec_as.mo <- function(x, progress$tick()$print() - # if (initial_search == TRUE) { - # found <- microorganismsDT[mo == get_mo_history(x_backup[i], - # uncertainty_level, - # force = force_mo_history), - # ..property][[1]] - # # previously found result - # if (length(found) > 0) { - # x[i] <- found[1L] - # next - # } - # } + mo_hist <- get_mo_history(x, uncertainty_level, force = force_mo_history, disable = disable_mo_history) + if (initial_search == TRUE & !any(is.na(mo_hist))) { + # previously found code + found <- data.frame(mo = mo_hist, + stringsAsFactors = FALSE) %>% + left_join(reference_data_to_use, by = "mo") %>% + pull(property) + if (length(found) > 0) { + x[i] <- found[1L] + next + } + } - found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] + found <- reference_data_to_use[mo == toupper(x_backup[i]), ..property][[1]] # is a valid MO code if (length(found) > 0) { x[i] <- found[1L] next } - found <- microorganismsDT[fullname_lower %in% tolower(c(x_backup[i], x_backup_without_spp[i])), ..property][[1]] + found <- reference_data_to_use[fullname_lower %in% tolower(c(x_backup[i], x_backup_without_spp[i])), ..property][[1]] # most probable: is exact match in fullname if (length(found) > 0) { x[i] <- found[1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - found <- microorganismsDT[col_id == x_backup[i], ..property][[1]] + found <- reference_data_to_use[col_id == x_backup[i], ..property][[1]] # is a valid Catalogue of Life ID if (NROW(found) > 0) { x[i] <- found[1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } @@ -611,14 +629,14 @@ exec_as.mo <- function(x, # empty and nonsense values, ignore without warning x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } # check for very small input, but ignore the O antigens of E. coli if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3 - & !x_backup_without_spp[i] %like% "O?(26|103|104|104|111|121|145|157)") { + & !x_backup_without_spp[i] %like_case% "[Oo]?(26|103|104|104|111|121|145|157)") { # check if search term was like "A. species", then return first genus found with ^A # if (x_backup[i] %like% "[a-z]+ species" | x_backup[i] %like% "[a-z] spp[.]?") { # # get mo code of first hit @@ -630,7 +648,7 @@ exec_as.mo <- function(x, # if (length(found) > 0) { # x[i] <- found[1L] # if (initial_search == TRUE) { - # set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + # set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) # } # next # } @@ -640,17 +658,17 @@ exec_as.mo <- function(x, x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] if (initial_search == TRUE) { failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% "virus") { + if (x_backup_without_spp[i] %like_case% "virus") { # there is no fullname like virus, so don't try to coerce it x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] if (initial_search == TRUE) { failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } @@ -660,22 +678,22 @@ exec_as.mo <- function(x, if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) { x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } if (toupper(x_backup_without_spp[i]) == "VRE" - | x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { + | x_backup_without_spp[i] %like_case% '(enterococci|enterokok|enterococo)[a-z]*?$') { x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } @@ -693,10 +711,10 @@ exec_as.mo <- function(x, # - UPEC (Uropathogenic E. coli) if (toupper(x_backup_without_spp[i]) %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") # also support O-antigens of E. coli: O26, O103, O104, O111, O121, O145, O157 - | x_backup_without_spp[i] %like% "O?(26|103|104|111|121|145|157)") { + | x_backup_without_spp[i] %like_case% "o?(26|103|104|111|121|145|157)") { x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } @@ -704,7 +722,7 @@ exec_as.mo <- function(x, # multi resistant P. aeruginosa x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } @@ -713,7 +731,7 @@ exec_as.mo <- function(x, # co-trim resistant S. maltophilia x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } @@ -721,117 +739,117 @@ exec_as.mo <- function(x, # peni I, peni R, vanco I, vanco R: S. pneumoniae x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') { + if (x_backup_without_spp[i] %like_case% '^g[abcdfghk]s$') { # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB) - x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + x[i] <- microorganismsDT[mo == toupper(gsub("g([abcdfghk])s", "B_STRPT_GR\\1", x_backup_without_spp[i])), ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') { + if (x_backup_without_spp[i] %like_case% '(streptococ|streptokok).* [abcdfghk]$') { # Streptococci in different languages, like "estreptococos grupo B" - x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + x[i] <- microorganismsDT[mo == toupper(gsub(".*(streptococ|streptokok|estreptococ).* ([abcdfghk])$", "B_STRPT_GR\\2", x_backup_without_spp[i])), ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { + if (x_backup_without_spp[i] %like_case% 'group [abcdfghk] (streptococ|streptokok|estreptococ)') { # Streptococci in different languages, like "Group A Streptococci" - x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] + x[i] <- microorganismsDT[mo == toupper(gsub(".*group ([abcdfghk]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i])), ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% 'haemoly.*strept') { + if (x_backup_without_spp[i] %like_case% 'haemoly.*strept') { # Haemolytic streptococci in different languages x[i] <- microorganismsDT[mo == 'B_STRPT_HAE', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ---- - if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] negatie?[vf]' - | x_trimmed[i] %like% '[ck]oagulas[ea] negatie?[vf]' - | x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') { + if (x_backup_without_spp[i] %like_case% '[ck]oagulas[ea] negatie?[vf]' + | x_trimmed[i] %like_case% '[ck]oagulas[ea] negatie?[vf]' + | x_backup_without_spp[i] %like_case% '[ck]o?ns[^a-z]?$') { # coerce S. coagulase negative x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] positie?[vf]' - | x_trimmed[i] %like% '[ck]oagulas[ea] positie?[vf]' - | x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') { + if (x_backup_without_spp[i] %like_case% '[ck]oagulas[ea] positie?[vf]' + | x_trimmed[i] %like_case% '[ck]oagulas[ea] positie?[vf]' + | x_backup_without_spp[i] %like_case% '[ck]o?ps[^a-z]?$') { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } # streptococcal groups: milleri and viridans - if (x_trimmed[i] %like% 'strepto.* milleri' - | x_backup_without_spp[i] %like% 'strepto.* milleri' - | x_backup_without_spp[i] %like% 'mgs[^a-z]?$') { + if (x_trimmed[i] %like_case% 'strepto.* milleri' + | x_backup_without_spp[i] %like_case% 'strepto.* milleri' + | x_backup_without_spp[i] %like_case% 'mgs[^a-z]?$') { # Milleri Group Streptococcus (MGS) x[i] <- microorganismsDT[mo == 'B_STRPT_MIL', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_trimmed[i] %like% 'strepto.* viridans' - | x_backup_without_spp[i] %like% 'strepto.* viridans' - | x_backup_without_spp[i] %like% 'vgs[^a-z]?$') { + if (x_trimmed[i] %like_case% 'strepto.* viridans' + | x_backup_without_spp[i] %like_case% 'strepto.* viridans' + | x_backup_without_spp[i] %like_case% 'vgs[^a-z]?$') { # Viridans Group Streptococcus (VGS) x[i] <- microorganismsDT[mo == 'B_STRPT_VIR', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% 'gram[ -]?neg.*' - | x_backup_without_spp[i] %like% 'negatie?[vf]' - | x_trimmed[i] %like% 'gram[ -]?neg.*') { + if (x_backup_without_spp[i] %like_case% 'gram[ -]?neg.*' + | x_backup_without_spp[i] %like_case% 'negatie?[vf]' + | x_trimmed[i] %like_case% 'gram[ -]?neg.*') { # coerce Gram negatives x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% 'gram[ -]?pos.*' - | x_backup_without_spp[i] %like% 'positie?[vf]' - | x_trimmed[i] %like% 'gram[ -]?pos.*') { + if (x_backup_without_spp[i] %like_case% 'gram[ -]?pos.*' + | x_backup_without_spp[i] %like_case% 'positie?[vf]' + | x_trimmed[i] %like_case% 'gram[ -]?pos.*') { # coerce Gram positives x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if (x_backup_without_spp[i] %like% "salmonella [a-z]+ ?.*") { - if (x_backup_without_spp[i] %like% "Salmonella group") { + if (x_backup_without_spp[i] %like_case% "salmonella [a-z]+ ?.*") { + if (x_backup_without_spp[i] %like_case% "salmonella group") { # Salmonella Group A to Z, just return S. species for now x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next - } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup_without_spp[i], ignore.case = FALSE)) { + } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup[i], ignore.case = FALSE)) { # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } uncertainties <- rbind(uncertainties, format_uncertainty_as_df(uncertainty_level = 1, @@ -842,91 +860,33 @@ exec_as.mo <- function(x, } # trivial names known to the field: - if ("meningococcus" %like% x_trimmed[i]) { + if ("meningococcus" %like_case% x_trimmed[i]) { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_NESSR_MEN', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if ("gonococcus" %like% x_trimmed[i]) { + if ("gonococcus" %like_case% x_trimmed[i]) { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_NESSR_GON', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - if ("pneumococcus" %like% x_trimmed[i]) { + if ("pneumococcus" %like_case% x_trimmed[i]) { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } } - # FIRST TRY FULLNAMES AND CODES ---- - # if only genus is available, return only genus - if (all(!c(x[i], x_trimmed[i]) %like% " ")) { - found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]] - if (length(found) > 0) { - x[i] <- found[1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (nchar(x_backup_without_spp[i]) >= 6) { - found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]] - if (length(found) > 0) { - x[i] <- found[1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - } - # rest of genus only is in allow_uncertain part. - } - - # TRY OTHER SOURCES ---- - # WHONET and other common LIS codes - if (toupper(x_backup[i]) %in% AMR::microorganisms.codes[, 1]) { - mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L] - if (length(mo_found) > 0) { - x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - } - if (!is.null(reference_df)) { - # self-defined reference - if (x_backup[i] %in% reference_df[, 1]) { - ref_mo <- reference_df[reference_df[, 1] == x_backup[i], "mo"] - if (ref_mo %in% microorganismsDT[, mo]) { - x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] - next - } else { - warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) - } - } - } - - # allow no codes less than 4 characters long, was already checked for WHONET above - if (nchar(x_backup_without_spp[i]) < 4) { - x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (initial_search == TRUE) { - failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - + # NOW RUN THROUGH DIFFERENT PREVALENCE LEVELS check_per_prevalence <- function(data_to_check, a.x_backup, b.x_trimmed, @@ -934,7 +894,69 @@ exec_as.mo <- function(x, d.x_withspaces_start_end, e.x_withspaces_start_only, f.x_withspaces_end_only, - g.x_backup_without_spp) { + g.x_backup_without_spp, + h.x_species, + i.x_trimmed_species) { + + # FIRST TRY FULLNAMES AND CODES ---- + # if only genus is available, return only genus + + if (all(!c(x[i], b.x_trimmed) %like_case% " ")) { + found <- data_to_check[fullname_lower %in% c(h.x_species, i.x_trimmed_species), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) + } + return(x[i]) + } + if (nchar(g.x_backup_without_spp) >= 6) { + found <- data_to_check[fullname_lower %like_case% paste0("^", unregex(g.x_backup_without_spp), "[a-z]+"), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) + } + return(x[i]) + } + } + # rest of genus only is in allow_uncertain part. + } + + # TRY OTHER SOURCES ---- + # WHONET and other common LIS codes + if (toupper(a.x_backup) %in% AMR::microorganisms.codes[, 1]) { + mo_found <- AMR::microorganisms.codes[toupper(a.x_backup) == AMR::microorganisms.codes[, 1], "mo"][1L] + if (length(mo_found) > 0) { + x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) + } + return(x[i]) + } + } + if (!is.null(reference_df)) { + # self-defined reference + if (a.x_backup %in% reference_df[, 1]) { + ref_mo <- reference_df[reference_df[, 1] == a.x_backup, "mo"] + if (ref_mo %in% data_to_check[, mo]) { + x[i] <- data_to_check[mo == ref_mo, ..property][[1]][1L] + return(x[i]) + } else { + warning("Value '", a.x_backup, "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) + } + } + } + + # allow no codes less than 4 characters long, was already checked for WHONET above + if (nchar(g.x_backup_without_spp) < 4) { + x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] + if (initial_search == TRUE) { + failures <- c(failures, a.x_backup) + set_mo_history(a.x_backup, get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) + } + return(x[i]) + } # try probable: trimmed version of fullname ---- found <- data_to_check[fullname_lower %in% tolower(g.x_backup_without_spp), ..property][[1]] @@ -943,30 +965,30 @@ exec_as.mo <- function(x, } # try any match keeping spaces ---- - found <- data_to_check[fullname %like% d.x_withspaces_start_end, ..property][[1]] + found <- data_to_check[fullname_lower %like_case% d.x_withspaces_start_end, ..property][[1]] if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { return(found[1L]) } # try any match keeping spaces, not ending with $ ---- - found <- data_to_check[fullname %like% paste0(trimws(e.x_withspaces_start_only), " "), ..property][[1]] + found <- data_to_check[fullname_lower %like_case% paste0(trimws(e.x_withspaces_start_only), " "), ..property][[1]] if (length(found) > 0) { return(found[1L]) } - found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] + found <- data_to_check[fullname_lower %like_case% e.x_withspaces_start_only, ..property][[1]] if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { return(found[1L]) } # try any match keeping spaces, not start with ^ ---- - found <- data_to_check[fullname %like% paste0(" ", trimws(f.x_withspaces_end_only)), ..property][[1]] + found <- data_to_check[fullname_lower %like_case% paste0(" ", trimws(f.x_withspaces_end_only)), ..property][[1]] if (length(found) > 0) { return(found[1L]) } # try a trimmed version - found <- data_to_check[fullname_lower %like% b.x_trimmed - | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] + found <- data_to_check[fullname_lower %like_case% b.x_trimmed + | fullname_lower %like_case% c.x_trimmed_without_group, ..property][[1]] if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { return(found[1L]) } @@ -981,7 +1003,7 @@ exec_as.mo <- function(x, g.x_backup_without_spp %>% substr(1, x_length / 2), '.* ', g.x_backup_without_spp %>% substr((x_length / 2) + 1, x_length)) - found <- data_to_check[fullname %like% x_split, ..property][[1]] + found <- data_to_check[fullname_lower %like_case% x_split, ..property][[1]] if (length(found) > 0) { return(found[1L]) } @@ -989,495 +1011,523 @@ exec_as.mo <- function(x, # try fullname without start and without nchar limit of >= 6 ---- # like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH - found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] + found <- data_to_check[fullname_lower %like_case% e.x_withspaces_start_only, ..property][[1]] if (length(found) > 0) { return(found[1L]) } + # MISCELLANEOUS ---- + + # look for old taxonomic names ---- + found <- microorganisms.oldDT[fullname_lower == tolower(a.x_backup) + | fullname_lower %like_case% d.x_withspaces_start_end,] + if (NROW(found) > 0) { + col_id_new <- found[1, col_id_new] + # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: + # mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning) + # mo_ref("Chlamydophila psittaci") = "Everett et al., 1999" + if (property == "ref") { + x[i] <- found[1, ref] + } else { + x[i] <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] + } + options(mo_renamed_last_run = found[1, fullname]) + was_renamed(name_old = found[1, fullname], + name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], + ref_old = found[1, ref], + ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], + mo = microorganismsDT[col_id == found[1, col_id_new], mo]) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) + } + return(x[i]) + } + + # check for uncertain results ---- + uncertain_fn <- function(a.x_backup, + b.x_trimmed, + d.x_withspaces_start_end, + e.x_withspaces_start_only, + f.x_withspaces_end_only, + g.x_backup_without_spp, + uncertain.reference_data_to_use) { + + if (uncertainty_level == 0) { + # do not allow uncertainties + return(NA_character_) + } + + # UNCERTAINTY LEVEL 1 ---- + if (uncertainty_level >= 1) { + now_checks_for_uncertainty_level <- 1 + + # (1) look again for old taxonomic names, now for G. species ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (1) look again for old taxonomic names, now for G. species\n") + } + if (isTRUE(debug)) { + message("Running '", d.x_withspaces_start_end, "' and '", e.x_withspaces_start_only, "'") + } + found <- microorganisms.oldDT[fullname_lower %like_case% d.x_withspaces_start_end + | fullname_lower %like_case% e.x_withspaces_start_only] + if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + if (property == "ref") { + # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: + # mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning) + # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999" + x <- found[1, ref] + } else { + x <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] + } + was_renamed(name_old = found[1, fullname], + name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], + ref_old = found[1, ref], + ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], + mo = microorganismsDT[col_id == found[1, col_id_new], mo]) + options(mo_renamed_last_run = found[1, fullname]) + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = microorganismsDT[col_id == found[1, col_id_new], mo])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x, property), 1, force = force_mo_history, disable = disable_mo_history) + } + return(x) + } + + # (2) Try with misspelled input ---- + # just rerun with dyslexia_mode = TRUE will used the extensive regex part above + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (2) Try with misspelled input\n") + } + if (isTRUE(debug)) { + message("Running '", a.x_backup, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found)) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 1, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + + # UNCERTAINTY LEVEL 2 ---- + if (uncertainty_level >= 2) { + now_checks_for_uncertainty_level <- 2 + + # (3) look for genus only, part of name ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (3) look for genus only, part of name\n") + } + if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like_case% " ") { + if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { + if (isTRUE(debug)) { + message("Running '", paste(b.x_trimmed, "species"), "'") + } + # not when input is like Genustext, because then Neospora would lead to Actinokineospora + found <- uncertain.reference_data_to_use[fullname_lower %like_case% paste(b.x_trimmed, "species"), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(x, property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(x) + } + } + } + + # (4) strip values between brackets ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (4) strip values between brackets\n") + } + a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) + a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) + if (isTRUE(debug)) { + message("Running '", a.x_backup_stripped, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + + # (5) inverse input ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (5) inverse input\n") + } + a.x_backup_inversed <- paste(rev(unlist(strsplit(a.x_backup, split = " "))), collapse = " ") + if (isTRUE(debug)) { + message("Running '", a.x_backup_inversed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + + # (6) try to strip off half an element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + lastword <- x_strip[length(x_strip) - i + 1] + lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2)) + # remove last half of the second term + x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") + if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found)) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + } + } + # (7) try to strip off one element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (7) try to strip off one element from end and check the remains\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (nchar(x_strip_collapsed) >= 6) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found)) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + } + } + # (8) check for unknown yeasts/fungi ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (8) check for unknown yeasts/fungi\n") + } + if (b.x_trimmed %like_case% "yeast") { + found <- "F_YEAST" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + if (b.x_trimmed %like_case% "(fungus|fungi)" & !b.x_trimmed %like_case% "fungiphrya") { + found <- "F_FUNGUS" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + # (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { + for (i in 2:(length(x_strip))) { + x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found)) { + found_result <- found + found <- reference_data_to_use[mo == found_result[1L], ..property][[1]] + # uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3) + if (x_strip_collapsed %like_case% " ") { + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + } + } + } + + # UNCERTAINTY LEVEL 3 ---- + if (uncertainty_level >= 3) { + now_checks_for_uncertainty_level <- 3 + + # (10) try to strip off one element from start and check the remains (any text size) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n") + } + x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { + for (i in 2:(length(x_strip))) { + x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found)) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + } + # (11) try to strip off one element from end and check the remains (any text size) ---- + # (this is in fact 7 but without nchar limit of >=6) + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (11) try to strip off one element from end and check the remains (any text size)\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug, reference_data_to_use = uncertain.reference_data_to_use))) + } + if (!empty_result(found)) { + found_result <- found + found <- reference_data_to_use[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + } + + # (12) part of a name (very unlikely match) ---- + if (isTRUE(debug)) { + cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (12) part of a name (very unlikely match)\n") + } + if (isTRUE(debug)) { + message("Running '", f.x_withspaces_end_only, "'") + } + found <- reference_data_to_use[fullname_lower %like_case% f.x_withspaces_end_only] + if (nrow(found) > 0) { + found_result <- found[["mo"]] + if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) { + found <- reference_data_to_use[mo == found_result[1L], ..property][[1]] + uncertainties <<- rbind(uncertainties, + format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, + input = a.x_backup, + result_mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history, disable = disable_mo_history) + } + return(found[1L]) + } + } + } + + # didn't found in uncertain results too + return(NA_character_) + } + + # uncertain results + # wait until prevalence == 2 to run the uncertain results on both prevalence == 1 and prevalence == 2 + if (nrow(data_to_check) == nrow(microorganismsDT[prevalence == 2])) { + x[i] <- uncertain_fn(a.x_backup = a.x_backup, + b.x_trimmed = b.x_trimmed, + d.x_withspaces_start_end = d.x_withspaces_start_end, + e.x_withspaces_start_only = e.x_withspaces_start_only, + f.x_withspaces_end_only = f.x_withspaces_end_only, + g.x_backup_without_spp = g.x_backup_without_spp, + uncertain.reference_data_to_use = microorganismsDT[prevalence %in% c(1, 2)]) + if (!empty_result(x[i])) { + # no set_mo_history here - it is already set in uncertain_fn() + return(x[i]) + } + } else if (nrow(data_to_check) == nrow(microorganismsDT[prevalence == 3])) { + x[i] <- uncertain_fn(a.x_backup = a.x_backup, + b.x_trimmed = b.x_trimmed, + d.x_withspaces_start_end = d.x_withspaces_start_end, + e.x_withspaces_start_only = e.x_withspaces_start_only, + f.x_withspaces_end_only = f.x_withspaces_end_only, + g.x_backup_without_spp = g.x_backup_without_spp, + uncertain.reference_data_to_use = microorganismsDT[prevalence == 3]) + if (!empty_result(x[i])) { + # no set_mo_history here - it is already set in uncertain_fn() + return(x[i]) + } + } + # didn't found any return(NA_character_) } # FIRST TRY VERY PREVALENT IN HUMAN INFECTIONS ---- - x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 1], + x[i] <- check_per_prevalence(data_to_check = reference_data_to_use[prevalence == 1], a.x_backup = x_backup[i], b.x_trimmed = x_trimmed[i], c.x_trimmed_without_group = x_trimmed_without_group[i], d.x_withspaces_start_end = x_withspaces_start_end[i], e.x_withspaces_start_only = x_withspaces_start_only[i], f.x_withspaces_end_only = x_withspaces_end_only[i], - g.x_backup_without_spp = x_backup_without_spp[i]) + g.x_backup_without_spp = x_backup_without_spp[i], + h.x_species = x_species[i], + i.x_trimmed_species = x_trimmed_species[i]) if (!empty_result(x[i])) { if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } + # THEN TRY PREVALENT IN HUMAN INFECTIONS ---- - x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 2], + x[i] <- check_per_prevalence(data_to_check = reference_data_to_use[prevalence == 2], a.x_backup = x_backup[i], b.x_trimmed = x_trimmed[i], c.x_trimmed_without_group = x_trimmed_without_group[i], d.x_withspaces_start_end = x_withspaces_start_end[i], e.x_withspaces_start_only = x_withspaces_start_only[i], f.x_withspaces_end_only = x_withspaces_end_only[i], - g.x_backup_without_spp = x_backup_without_spp[i]) + g.x_backup_without_spp = x_backup_without_spp[i], + h.x_species = x_species[i], + i.x_trimmed_species = x_trimmed_species[i]) if (!empty_result(x[i])) { if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } + # THEN UNPREVALENT IN HUMAN INFECTIONS ---- - x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 3], + x[i] <- check_per_prevalence(data_to_check = reference_data_to_use[prevalence == 3], a.x_backup = x_backup[i], b.x_trimmed = x_trimmed[i], c.x_trimmed_without_group = x_trimmed_without_group[i], d.x_withspaces_start_end = x_withspaces_start_end[i], e.x_withspaces_start_only = x_withspaces_start_only[i], f.x_withspaces_end_only = x_withspaces_end_only[i], - g.x_backup_without_spp = x_backup_without_spp[i]) + g.x_backup_without_spp = x_backup_without_spp[i], + h.x_species = x_species[i], + i.x_trimmed_species = x_trimmed_species[i]) if (!empty_result(x[i])) { if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } next } - # MISCELLANEOUS ---- - - # look for old taxonomic names ---- - found <- microorganisms.oldDT[fullname_lower == tolower(x_backup[i]) - | fullname %like% x_withspaces_start_end[i],] - if (NROW(found) > 0) { - col_id_new <- found[1, col_id_new] - # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: - # mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning) - # mo_ref("Chlamydophila psittaci") = "Everett et al., 1999" - if (property == "ref") { - x[i] <- found[1, ref] - } else { - x[i] <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] - } - options(mo_renamed_last_run = found[1, fullname]) - was_renamed(name_old = found[1, fullname], - name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], - ref_old = found[1, ref], - ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], - mo = microorganismsDT[col_id == found[1, col_id_new], mo]) - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - # check for uncertain results ---- - uncertain_fn <- function(a.x_backup, - b.x_trimmed, - c.x_withspaces_start_end, - d.x_withspaces_start_only, - f.x_withspaces_end_only, - g.x_backup_without_spp) { - - if (uncertainty_level == 0) { - # do not allow uncertainties - return(NA_character_) - } - - # UNCERTAINTY LEVEL 1 ---- - if (uncertainty_level >= 1) { - now_checks_for_uncertainty_level <- 1 - - # (1) look again for old taxonomic names, now for G. species ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (1) look again for old taxonomic names, now for G. species\n") - } - if (isTRUE(debug)) { - message("Running '", c.x_withspaces_start_end, "' and '", d.x_withspaces_start_only, "'") - } - found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end - | fullname %like% d.x_withspaces_start_only] - if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { - if (property == "ref") { - # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: - # mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning) - # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999" - x <- found[1, ref] - } else { - x <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] - } - was_renamed(name_old = found[1, fullname], - name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], - ref_old = found[1, ref], - ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], - mo = microorganismsDT[col_id == found[1, col_id_new], mo]) - options(mo_renamed_last_run = found[1, fullname]) - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = microorganismsDT[col_id == found[1, col_id_new], mo])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(x, property), 1, force = force_mo_history) - } - return(x) - } - - # (2) Try with misspelled input ---- - # just rerun with dyslexia_mode = TRUE will used the extensive regex part above - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (2) Try with misspelled input\n") - } - if (isTRUE(debug)) { - message("Running '", a.x_backup, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 1, force = force_mo_history) - } - return(found[1L]) - } - } - - # UNCERTAINTY LEVEL 2 ---- - if (uncertainty_level >= 2) { - now_checks_for_uncertainty_level <- 2 - - # (3) look for genus only, part of name ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (3) look for genus only, part of name\n") - } - if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") { - if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { - if (isTRUE(debug)) { - message("Running '", paste(b.x_trimmed, "species"), "'") - } - # not when input is like Genustext, because then Neospora would lead to Actinokineospora - found <- microorganismsDT[fullname_lower %like% paste(b.x_trimmed, "species"), ..property][[1]] - if (length(found) > 0) { - x[i] <- found[1L] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(x, property), 2, force = force_mo_history) - } - return(x) - } - } - } - - # (4) strip values between brackets ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (4) strip values between brackets\n") - } - a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) - a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) - if (isTRUE(debug)) { - message("Running '", a.x_backup_stripped, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - - # (5) inverse input ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (5) inverse input\n") - } - a.x_backup_inversed <- paste(rev(unlist(strsplit(a.x_backup, split = " "))), collapse = " ") - if (isTRUE(debug)) { - message("Running '", a.x_backup_inversed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - - # (6) try to strip off half an element from end and check the remains ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n") - } - x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() - if (length(x_strip) > 1) { - for (i in 1:(length(x_strip) - 1)) { - lastword <- x_strip[length(x_strip) - i + 1] - lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2)) - # remove last half of the second term - x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") - if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) { - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - } - # (7) try to strip off one element from end and check the remains ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (7) try to strip off one element from end and check the remains\n") - } - if (length(x_strip) > 1) { - for (i in 1:(length(x_strip) - 1)) { - x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") - if (nchar(x_strip_collapsed) >= 6) { - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - } - # (8) check for unknown yeasts/fungi ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (8) check for unknown yeasts/fungi\n") - } - if (b.x_trimmed %like% "yeast") { - found <- "F_YEAST" - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - if (b.x_trimmed %like% "(fungus|fungi)" & !b.x_trimmed %like% "Fungiphrya") { - found <- "F_FUNGUS" - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - # (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n") - } - x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() - if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { - for (i in 2:(length(x_strip))) { - x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found_result[1L], ..property][[1]] - # uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3) - if (x_strip_collapsed %like% " ") { - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - } - } - - # UNCERTAINTY LEVEL 3 ---- - if (uncertainty_level >= 3) { - now_checks_for_uncertainty_level <- 3 - - # (10) try to strip off one element from start and check the remains (any text size) ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n") - } - x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() - if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { - for (i in 2:(length(x_strip))) { - x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) - } - return(found[1L]) - } - } - } - # (11) try to strip off one element from end and check the remains (any text size) ---- - # (this is in fact 7 but without nchar limit of >=6) - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (11) try to strip off one element from end and check the remains (any text size)\n") - } - if (length(x_strip) > 1) { - for (i in 1:(length(x_strip) - 1)) { - x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - - # (12) part of a name (very unlikely match) ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (12) part of a name (very unlikely match)\n") - } - if (isTRUE(debug)) { - message("Running '", f.x_withspaces_end_only, "'") - } - found <- microorganismsDT[fullname %like% f.x_withspaces_end_only] - if (nrow(found) > 0) { - found_result <- found[["mo"]] - if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) { - found <- microorganismsDT[mo == found_result[1L], ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) - } - return(found[1L]) - } - } - } - - # didn't found in uncertain results too - return(NA_character_) - } - x[i] <- uncertain_fn(x_backup[i], - x_trimmed[i], - x_withspaces_start_end[i], - x_withspaces_start_only[i], - x_withspaces_end_only[i], - x_backup_without_spp[i]) - if (!empty_result(x[i])) { - # no set_mo_history here - it is already set in uncertain_fn() - next - } # no results found: make them UNKNOWN ---- x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] if (initial_search == TRUE) { failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history, disable = disable_mo_history) } } } @@ -1559,7 +1609,7 @@ exec_as.mo <- function(x, x[x %in% CoNS] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] x[x %in% CoPS] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] if (Becker == "all") { - x[x %in% microorganismsDT[mo %like% '^B_STPHY_AUR', ..property][[1]]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] + x[x %in% microorganismsDT[mo %like_case% '^B_STPHY_AUR', ..property][[1]]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] } } @@ -1626,29 +1676,17 @@ empty_result <- function(x) { #' @importFrom crayon italic was_renamed <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") { - if (!is.na(ref_old)) { - ref_old <- paste0(" (", ref_old, ")") + newly_set <- data.frame(old_name = name_old, + new_name = name_new, + new_ref = ref_new, + mo = mo, + stringsAsFactors = FALSE) + already_set <- getOption("mo_renamed") + if (!is.null(already_set)) { + options(mo_renamed = rbind(already_set, newly_set)) } else { - ref_old <- "" + options(mo_renamed = newly_set) } - if (!is.na(ref_new)) { - ref_new <- paste0(" (", ref_new, ")") - } else { - ref_new <- "" - } - if (!is.na(mo)) { - mo <- paste0(" (", mo, ")") - } else { - mo <- "" - } - old_values <- paste0(italic(name_old), ref_old) - old_values <- gsub("et al.", italic("et al."), old_values) - new_values <- paste0(italic(name_new), ref_new, mo) - new_values <- gsub("et al.", italic("et al."), new_values) - - names(new_values) <- old_values - total <- c(getOption("mo_renamed"), new_values) - options(mo_renamed = total[order(names(total))]) } format_uncertainty_as_df <- function(uncertainty_level, @@ -1832,30 +1870,33 @@ print.mo_uncertainties <- function(x, ...) { } #' @rdname as.mo -#' @importFrom crayon strip_style +#' @importFrom dplyr distinct #' @export mo_renamed <- function() { items <- getOption("mo_renamed") if (is.null(items)) { - return(NULL) + items <- data.frame() + } else { + items <- distinct(items, old_name, .keep_all = TRUE) } - - items <- strip_style(items) - names(items) <- strip_style(names(items)) structure(.Data = items, - class = c("mo_renamed", "character")) + class = c("mo_renamed", "data.frame")) } #' @exportMethod print.mo_renamed -#' @importFrom crayon blue +#' @importFrom crayon blue italic #' @export #' @noRd print.mo_renamed <- function(x, ...) { - items <- x #getOption("mo_renamed") - old <- names(x) - new <- x - - cat(blue(paste("NOTE:", italic(names(items)), "was renamed", italic(items), collapse = "\n"), collapse = "\n")) + if (NROW(x) == 0) { + return(invisible()) + } + for (i in 1:nrow(x)) { + message(blue(paste0("NOTE: ", + italic(x$old_name[i]), " was renamed ", italic(x$new_name[i]), + " (", gsub("et al.", italic("et al."), x$new_ref[i]), ")", + " (", x$mo[i], ")"))) + } } nr2char <- function(x) { @@ -1874,12 +1915,14 @@ unregex <- function(x) { get_mo_code <- function(x, property) { # don't use right now - return(NULL) + # return(NULL) if (property == "mo") { unique(x) } else { - AMR::microorganisms[base::which(AMR::microorganisms[, property] %in% x),]$mo + microorganismsDT[get(property) == x, "mo"][[1]] + # which is ~2.5 times faster than: + # AMR::microorganisms[base::which(AMR::microorganisms[, property] %in% x),]$mo } } diff --git a/R/mo2.R b/R/mo2.R deleted file mode 100644 index 8ed1c821..00000000 --- a/R/mo2.R +++ /dev/null @@ -1,1489 +0,0 @@ -# ==================================================================== # -# TITLE # -# Antimicrobial Resistance (AMR) Analysis # -# # -# SOURCE # -# https://gitlab.com/msberends/AMR # -# # -# LICENCE # -# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # -# # -# This R package is free software; you can freely use and distribute # -# it for both personal and commercial purposes under the terms of the # -# GNU General Public License version 2.0 (GNU GPL-2), as published by # -# the Free Software Foundation. # -# # -# This R package was created for academic research and was publicly # -# released in the hope that it will be useful, but it comes WITHOUT # -# ANY WARRANTY OR LIABILITY. # -# Visit our website for more info: https://msberends.gitlab.io/AMR. # -# ==================================================================== # - -# THIS IS A TEST FUNCTION -as.mo2 <- function(x, - Becker = FALSE, - Lancefield = FALSE, - allow_uncertain = TRUE, - reference_df = get_mo_source(), - property = "mo", - initial_search = TRUE, - dyslexia_mode = FALSE, - force_mo_history = FALSE, - debug = FALSE) { - - if (!"AMR" %in% base::.packages()) { - require("AMR") - # check onLoad() in R/zzz.R: data tables are created there. - } - - # WHONET: xxx = no growth - x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_ - - if (initial_search == TRUE) { - options(mo_failures = NULL) - options(mo_uncertainties = NULL) - options(mo_renamed = NULL) - } - options(mo_renamed_last_run = NULL) - - if (NCOL(x) == 2) { - # support tidyverse selection like: df %>% select(colA, colB) - # paste these columns together - x_vector <- vector("character", NROW(x)) - for (i in 1:NROW(x)) { - x_vector[i] <- paste(pull(x[i,], 1), pull(x[i,], 2), sep = " ") - } - x <- x_vector - } else { - if (NCOL(x) > 2) { - stop('`x` can be 2 columns at most', call. = FALSE) - } - x[is.null(x)] <- NA - - # support tidyverse selection like: df %>% select(colA) - if (!is.vector(x) & !is.null(dim(x))) { - x <- pull(x, 1) - } - } - - notes <- character(0) - uncertainties <- data.frame(uncertainty = integer(0), - input = character(0), - fullname = character(0), - renamed_to = character(0), - mo = character(0), - stringsAsFactors = FALSE) - failures <- character(0) - uncertainty_level <- translate_allow_uncertain(allow_uncertain) - - - # x_input <- x - # already strip leading and trailing spaces - #x <- trimws(x, which = "both") - # only check the uniques, which is way faster - #x <- unique(x) - # remove empty values (to later fill them in again with NAs) - # ("xxx" is WHONET code for 'no growth') - # x <- x[!is.na(x) - # & !is.null(x) - # & !identical(x, "") - # & !identical(x, "xxx")] - - # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life) - if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { - leftpart <- gsub("^([BFP]_[A-Z]{3,7}).*", "\\1", x) - if (any(leftpart %in% names(mo_codes_v0.5.0))) { - rightpart <- gsub("^[BFP]_[A-Z]{3,7}(.*)", "\\1", x) - leftpart <- mo_codes_v0.5.0[leftpart] - x[!is.na(leftpart)] <- paste0(leftpart[!is.na(leftpart)], rightpart[!is.na(leftpart)]) - } - # now check if some are still old - still_old <- x[x %in% names(mo_codes_v0.5.0)] - if (length(still_old) > 0) { - x[x %in% names(mo_codes_v0.5.0)] <- data.frame(old = still_old, stringsAsFactors = FALSE) %>% - left_join(data.frame(old = names(mo_codes_v0.5.0), - new = mo_codes_v0.5.0, - stringsAsFactors = FALSE), by = "old") %>% - # if they couldn't be found, replace them with the old ones again, - # so they will throw a warning in the end - mutate(new = ifelse(is.na(new), old, new)) %>% - pull(new) - } - } - - # # defined df to check for - # if (!is.null(reference_df)) { - # if (!mo_source_isvalid(reference_df)) { - # stop("`reference_df` must contain a column `mo` with values from the 'microorganisms' data set.", call. = FALSE) - # } - # reference_df <- reference_df %>% filter(!is.na(mo)) - # # keep only first two columns, second must be mo - # if (colnames(reference_df)[1] == "mo") { - # reference_df <- reference_df[, c(2, 1)] - # } else { - # reference_df <- reference_df[, c(1, 2)] - # } - # colnames(reference_df)[1] <- "x" - # # remove factors, just keep characters - # suppressWarnings( - # reference_df[] <- lapply(reference_df, as.character) - # ) - # } - # - # # all empty - # if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) { - # if (property == "mo") { - # return(to_class_mo(rep(NA_character_, length(x_input)))) - # } else { - # return(rep(NA_character_, length(x_input))) - # } - # - # } else if (all(x %in% reference_df[, 1][[1]])) { - # # all in reference df - # colnames(reference_df)[1] <- "x" - # suppressWarnings( - # x <- data.frame(x = x, stringsAsFactors = FALSE) %>% - # left_join(reference_df, by = "x") %>% - # left_join(AMR::microorganisms, by = "mo") %>% - # pull(property) - # ) - # - # } else if (all(x %in% AMR::microorganisms$mo)) { - # # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL") - # y <- microorganismsDT[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]] - # if (any(is.na(y))) { - # y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(mo = x[is.na(y)]), - # on = "mo", - # ..property][[1]] - # } - # if (any(is.na(y))) { - # y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(mo = x[is.na(y)]), - # on = "mo", - # ..property][[1]] - # } - # x <- y - # - # # } else if (all(x %in% read_mo_history(uncertainty_level, - # # force = force_mo_history)$x)) { - # # # previously found code - # # x <- microorganismsDT[data.table(mo = get_mo_history(x, - # # uncertainty_level, - # # force = force_mo_history)), - # # on = "mo", ..property][[1]] - # - # } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { - # # we need special treatment for very prevalent full names, they are likely! - # # e.g. as.mo("Staphylococcus aureus") - # y <- microorganismsDT[prevalence == 1][data.table(fullname_lower = tolower(x)), on = "fullname_lower", ..property][[1]] - # if (any(is.na(y))) { - # y[is.na(y)] <- microorganismsDT[prevalence == 2][data.table(fullname_lower = tolower(x[is.na(y)])), - # on = "fullname_lower", - # ..property][[1]] - # } - # if (any(is.na(y))) { - # y[is.na(y)] <- microorganismsDT[prevalence == 3][data.table(fullname_lower = tolower(x[is.na(y)])), - # on = "fullname_lower", - # ..property][[1]] - # } - # x <- y - # - # } else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) { - # # commonly used MO codes - # y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ] - # # save them to history - # set_mo_history(x, y$mo, 0, force = force_mo_history) - # - # x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] - # - # } else if (!all(x %in% AMR::microorganisms[, property])) { - # - if (1 == 1) { - strip_whitespace <- function(x) { - # all whitespaces (tab, new lines, etc.) should be one space - # and spaces before and after should be omitted - trimws(gsub("[\\s]+", " ", x, perl = TRUE), which = "both") - } - - - x_new <- rep(NA_character_, length(x)) - - # keep only dots, letters, numbers, slashes, spaces and dashes - x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) - # remove spp and species - x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x, ignore.case = TRUE) - # remove 'genus' as first word - x <- gsub("^genus ", "", x, ignore.case = TRUE) - # remove 'uncertain'-like texts - x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) - x <- strip_whitespace(x) - x_backup <- x - - # remove spp and species - #x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE) - #x <- strip_whitespace(x) - - x_backup_without_spp <- x - x_species <- paste(x, "species") - # translate to English for supported languages of mo_property - x <- gsub("(gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) - x <- gsub("(vergroen)[a-z]*", "viridans", x, ignore.case = TRUE) - x <- gsub("(hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*", "yeast", x, ignore.case = TRUE) - x <- gsub("(schimmels?|mofo|molde|stampo|moisissure|fungi)[a-z]*", "fungus", x, ignore.case = TRUE) - x <- gsub("Fungus[ph|f]rya", "Fungiphrya", x, ignore.case = TRUE) - # remove non-text in case of "E. coli" except dots and spaces - # x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) - # replace minus by a space - x <- gsub("-+", " ", x) - # replace hemolytic by haemolytic - x <- gsub("ha?emoly", "haemoly", x, ignore.case = TRUE) - # place minus back in streptococci - x <- gsub("(alpha|beta|gamma).?ha?emoly", "\\1-haemoly", x, ignore.case = TRUE) - # remove genus as first word - # x <- gsub("^genus ", "", x, ignore.case = TRUE) - # remove 'uncertain' like texts - # x <- trimws(gsub("(uncertain|susp[ie]c[a-z]+|verdacht)", "", x, ignore.case = TRUE)) - # allow characters that resemble others = dyslexia_mode ---- - if (dyslexia_mode == TRUE) { - x <- tolower(x) - x <- gsub("[iy]+", "[iy]+", x) - x <- gsub("(c|k|q|qu|s|z|x|ks)+", "(c|k|q|qu|s|z|x|ks)+", x) - x <- gsub("(ph|hp|f|v)+", "(ph|hp|f|v)+", x) - x <- gsub("(th|ht|t)+", "(th|ht|t)+", x) - x <- gsub("a+", "a+", x) - x <- gsub("u+", "u+", x) - # allow any ending of -um, -us, -ium, -icum, -ius, -icus, -ica and -a (needs perl for the negative backward lookup): - x <- gsub("(u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z])", - "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) - x <- gsub("(\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z])", - "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) - x <- gsub("(\\[iy\\]\\+u\\+m)(?![a-z])", - "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) - x <- gsub("e+", "e+", x, ignore.case = TRUE) - x <- gsub("o+", "o+", x, ignore.case = TRUE) - x <- gsub("(.)\\1+", "\\1+", x) - # allow ending in -en or -us - x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, ignore.case = TRUE, perl = TRUE) - # if the input is longer than 10 characters, allow any constant between all characters, as some might have forgotten a character - # this will allow "Pasteurella damatis" to be correctly read as "Pasteurella dagmatis". - constants <- paste(letters[!letters %in% c("a", "e", "i", "o", "u")], collapse = "") - #x[nchar(x_backup_without_spp) > 10] <- gsub("([a-z])([a-z])", paste0("\\1[", constants, "]?\\2"), x[nchar(x_backup_without_spp) > 10], ignore.case = TRUE) - x[nchar(x_backup_without_spp) > 10] <- gsub("[+]", paste0("+[", constants, "]?"), x[nchar(x_backup_without_spp) > 10]) - } - x <- strip_whitespace(x) - - x_trimmed <- x - x_trimmed_species <- paste(x_trimmed, "species") - x_trimmed_without_group <- gsub(" gro.u.p$", "", x_trimmed, ignore.case = TRUE) - # remove last part from "-" or "/" - x_trimmed_without_group <- gsub("(.*)[-/].*", "\\1", x_trimmed_without_group) - # replace space and dot by regex sign - x_withspaces <- gsub("[ .]+", ".* ", x) - x <- gsub("[ .]+", ".*", x) - # add start en stop regex - x <- paste0('^', x, '$') - x_withspaces_start_only <- paste0('^', x_withspaces) - x_withspaces_end_only <- paste0(x_withspaces, '$') - x_withspaces_start_end <- paste0('^', x_withspaces, '$') - - if (isTRUE(debug)) { - print(data.frame( - x_backup, - x, - x_species, - x_withspaces_start_only, - x_withspaces_end_only, - x_withspaces_start_end, - x_backup_without_spp, - x_trimmed, - x_trimmed_species, - x_trimmed_without_group), right = FALSE) - # cat(paste0('x "', x, '"\n')) - # cat(paste0('x_species "', x_species, '"\n')) - # cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n')) - # cat(paste0('x_withspaces_end_only "', x_withspaces_end_only, '"\n')) - # cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n')) - # cat(paste0('x_backup "', x_backup, '"\n')) - # cat(paste0('x_backup_without_spp "', x_backup_without_spp, '"\n')) - # cat(paste0('x_trimmed "', x_trimmed, '"\n')) - # cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) - # cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) - } - - #progress <- progress_estimated(n = length(x), min_time = 3) - - # THE NEW WAY ---- - nothing_more_to_do <- function() !any(is.na(x_new) & !is.na(x_backup)) - - lookup_regexes <- function(data, property, regex) { - prop <- regex %>% - sapply(function(pattern) pull(data, property) %like% pattern) %>% - as.data.frame() %>% - lapply(function(c) suppressWarnings(min(pull(data, property)[c]))) %>% - unlist() - if (is.null(prop)) { - return(rep(NA, length(regex))) - } - DT <- data.table(prop) - colnames(DT) <- property - microorganismsDT[DT, on = property, "mo"][[1]] - } - - # LATER: only unique if more than 500 values, of which max 85% distinct? - - x_backup_upper <- toupper(x_backup) - x_backup_lower <- tolower(x_backup) - - # exclude all viruses (there is no fullname containing 'virus' in the data set) - x_new[x_backup %like% "virus"] <- "UNKNOWN" - - # try available fields in the microorganisms data set - x_new[x_backup_upper %in% microorganisms$mo] <- microorganismsDT[data.table(mo = x_backup_upper[x_backup_upper %in% microorganisms$mo]), on = "mo", "mo"][[1]] - x_new[x_backup_lower %in% microorganismsDT$fullname_lower] <- microorganismsDT[data.table(fullname_lower = x_backup_lower[x_backup_lower %in% microorganismsDT$fullname_lower]), on = "fullname_lower", "mo"][[1]] - # x_new[x_backup %in% microorganisms$col_id] <- microorganismsDT[data.table(col_id = as.integer(x_backup[x_backup %in% microorganisms$col_id])), on = "col_id", ..property][[1]] - - # old names - old_names <- x_backup[x_backup_lower %in% microorganisms.oldDT$fullname_lower] - x_new[x_backup_lower %in% microorganisms.oldDT$fullname_lower] <- microorganismsDT[microorganisms.oldDT[data.table(fullname_lower = x_backup_lower[x_backup_lower %in% microorganisms.oldDT$fullname_lower]), on = "fullname_lower", "col_id_new"], on = c("col_id" = "col_id_new"), "mo"][[1]] - - if (nothing_more_to_do()) { - if (property != "mo") { - return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) - } else { - return(to_class_mo(x_new)) - } - } - - # codes from the microorganisms.codes data set - x_new[x_backup_upper %in% microorganisms.codes$code] <- as.data.table(microorganisms.codes)[data.table(code = x_backup_upper[x_backup_upper %in% microorganisms.codes$code]), on = "code", "mo"][[1]] - if (!is.null(reference_df)) { - colnames(reference_df)[1] <- "code" - x_new[x_backup_upper %in% reference_df$code] <- as.data.table(reference_df)[data.table(code = x_backup_upper[x_backup_upper %in% reference_df$code]), on = "code", mo][[1]] - if (!all(x_new %in% microorganisms$mo, na.rm = TRUE)) { - warning("Values ", paste(x_new[!x_new %in% c(NA, microorganisms$mo)], collapse = ", "), " found in reference_df, but these are not valid MO codes.", call. = FALSE) - x_new[!x_new %in% c(NA, microorganisms$mo)] <- "UNKNOWN" - } - } - - x_new[x_backup_upper %in% c("MRSA", "MSSA", "VISA", "VRSA")] <- "B_STPHY_AUR" - x_new[x_backup_upper %in% c("MRSE", "MSSE")] <- "B_STPHY_EPI" - x_new[x_backup_upper %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") - | x_backup_upper %like% "O?(26|103|104|111|121|145|157)"] <- "B_ESCHR_COL" - x_new[x_backup_upper %in% c("MRPA")] <- "B_PSDMN_AER" - x_new[x_backup_upper %in% c("CRSM")] <- "B_STNTR_MAL" - x_new[x_backup_upper %in% c("PISP", "PRSP", "VISP", "VRSP")] <- "B_STRPT_PNE" - x_new[x_backup_upper %in% c("VRE") - | x_backup %like% "(enterococci|enterokok|enterococo)[a-z]*?$"] <- "B_ENTRC" - - # start showing progress bar here - progress <- progress_estimated(n = 3, min_time = 0) - # most prevalent (1) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 1] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) - if (nothing_more_to_do()) { - if (property != "mo") { - return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) - } else { - return(to_class_mo(x_new)) - } - } - progress$tick()$print() - # less prevalent (2) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 2] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) - if (nothing_more_to_do()) { - if (property != "mo") { - return(microorganismsDT[data.table(mo = x_new), on = "mo", ..property][[1]]) - } else { - return(to_class_mo(x_new)) - } - } - progress$tick()$print() - # least prevalent (3) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_end[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", paste0(trimws(x_withspaces_start_only), " ")[!is.na(x_backup) & is.na(x_new)]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", paste0(" ", trimws(x_withspaces_end_only))[!is.na(x_backup) & is.na(x_new)]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_trimmed[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_trimmed_without_group[!is.na(x_backup) & is.na(x_new) & nchar(x_backup) >= 6]) - x_new[!is.na(x_backup) & is.na(x_new)] <- microorganismsDT[prevalence == 3] %>% lookup_regexes("fullname", x_withspaces_start_only[!is.na(x_backup) & is.na(x_new)]) - - # all others are UNKNOWN - x_new[!is.na(x_backup) & is.na(x_new)] <- "UNKNOWN" - progress$tick()$print() - - return(to_class_mo(x_new)) - #for (i in 1:length(x)) { - for (i in character(0)) { - - x[i] <- "UNKNOWN" - next - - # progress$tick()$print() - - # if (initial_search == TRUE) { - # found <- microorganismsDT[mo == get_mo_history(x_backup[i], - # uncertainty_level, - # force = force_mo_history), - # ..property][[1]] - # # previously found result - # if (length(found) > 0) { - # x[i] <- found[1L] - # next - # } - # } - - found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] - # is a valid MO code - if (length(found) > 0) { - x[i] <- found[1L] - next - } - - found <- microorganismsDT[fullname_lower %in% tolower(c(x_backup[i], x_backup_without_spp[i])), ..property][[1]] - # most probable: is exact match in fullname - if (length(found) > 0) { - x[i] <- found[1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - found <- microorganismsDT[col_id == x_backup[i], ..property][[1]] - # is a valid Catalogue of Life ID - if (NROW(found) > 0) { - x[i] <- found[1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - - # WHONET: xxx = no growth - if (tolower(as.character(paste0(x_backup_without_spp[i], ""))) %in% c("", "xxx", "na", "nan")) { - x[i] <- NA_character_ - next - } - - if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) { - # empty and nonsense values, ignore without warning - x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - # check for very small input, but ignore the O antigens of E. coli - if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3 - & !x_backup_without_spp[i] %like% "O?(26|103|104|104|111|121|145|157)") { - # check if search term was like "A. species", then return first genus found with ^A - # if (x_backup[i] %like% "[a-z]+ species" | x_backup[i] %like% "[a-z] spp[.]?") { - # # get mo code of first hit - # found <- microorganismsDT[fullname %like% x_withspaces_start_only[i], mo] - # if (length(found) > 0) { - # mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_") - # found <- microorganismsDT[mo == mo_code, ..property][[1]] - # # return first genus that begins with x_trimmed, e.g. when "E. spp." - # if (length(found) > 0) { - # x[i] <- found[1L] - # if (initial_search == TRUE) { - # set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - # } - # next - # } - # } - # } - # fewer than 3 chars and not looked for species, add as failure - x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (initial_search == TRUE) { - failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - if (x_backup_without_spp[i] %like% "virus") { - # there is no fullname like virus, so don't try to coerce it - x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (initial_search == TRUE) { - failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - # translate known trivial abbreviations to genus + species ---- - if (!is.na(x_trimmed[i])) { - if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { - x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) { - x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (toupper(x_backup_without_spp[i]) == "VRE" - | x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { - x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - # support for: - # - AIEC (Adherent-Invasive E. coli) - # - ATEC (Atypical Entero-pathogenic E. coli) - # - DAEC (Diffusely Adhering E. coli) - # - EAEC (Entero-Aggresive E. coli) - # - EHEC (Entero-Haemorrhagic E. coli) - # - EIEC (Entero-Invasive E. coli) - # - EPEC (Entero-Pathogenic E. coli) - # - ETEC (Entero-Toxigenic E. coli) - # - NMEC (Neonatal Meningitis‐causing E. coli) - # - STEC (Shiga-toxin producing E. coli) - # - UPEC (Uropathogenic E. coli) - if (toupper(x_backup_without_spp[i]) %in% c("AIEC", "ATEC", "DAEC", "EAEC", "EHEC", "EIEC", "EPEC", "ETEC", "NMEC", "STEC", "UPEC") - # also support O-antigens of E. coli: O26, O103, O104, O111, O121, O145, O157 - | x_backup_without_spp[i] %like% "O?(26|103|104|111|121|145|157)") { - x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (toupper(x_backup_without_spp[i]) == 'MRPA') { - # multi resistant P. aeruginosa - x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (toupper(x_backup_without_spp[i]) == 'CRS' - | toupper(x_backup_without_spp[i]) == 'CRSM') { - # co-trim resistant S. maltophilia - x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (toupper(x_backup_without_spp[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) { - # peni I, peni R, vanco I, vanco R: S. pneumoniae - x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') { - # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB) - x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') { - # Streptococci in different languages, like "estreptococos grupo B" - x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { - # Streptococci in different languages, like "Group A Streptococci" - x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% 'haemoly.*strept') { - # Haemolytic streptococci in different languages - x[i] <- microorganismsDT[mo == 'B_STRPT_HAE', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ---- - if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] negatie?[vf]' - | x_trimmed[i] %like% '[ck]oagulas[ea] negatie?[vf]' - | x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') { - # coerce S. coagulase negative - x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] positie?[vf]' - | x_trimmed[i] %like% '[ck]oagulas[ea] positie?[vf]' - | x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') { - # coerce S. coagulase positive - x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - # streptococcal groups: milleri and viridans - if (x_trimmed[i] %like% 'strepto.* milleri' - | x_backup_without_spp[i] %like% 'strepto.* milleri' - | x_backup_without_spp[i] %like% 'mgs[^a-z]?$') { - # Milleri Group Streptococcus (MGS) - x[i] <- microorganismsDT[mo == 'B_STRPT_MIL', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_trimmed[i] %like% 'strepto.* viridans' - | x_backup_without_spp[i] %like% 'strepto.* viridans' - | x_backup_without_spp[i] %like% 'vgs[^a-z]?$') { - # Viridans Group Streptococcus (VGS) - x[i] <- microorganismsDT[mo == 'B_STRPT_VIR', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% 'gram[ -]?neg.*' - | x_backup_without_spp[i] %like% 'negatie?[vf]' - | x_trimmed[i] %like% 'gram[ -]?neg.*') { - # coerce Gram negatives - x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% 'gram[ -]?pos.*' - | x_backup_without_spp[i] %like% 'positie?[vf]' - | x_trimmed[i] %like% 'gram[ -]?pos.*') { - # coerce Gram positives - x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (x_backup_without_spp[i] %like% "salmonella [a-z]+ ?.*") { - if (x_backup_without_spp[i] %like% "Salmonella group") { - # Salmonella Group A to Z, just return S. species for now - x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - } else if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_backup_without_spp[i], ignore.case = FALSE)) { - # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica - x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - uncertainties <- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = 1, - input = x_backup_without_spp[i], - result_mo = "B_SLMNL_ENT")) - } - next - } - - # trivial names known to the field: - if ("meningococcus" %like% x_trimmed[i]) { - # coerce S. coagulase positive - x[i] <- microorganismsDT[mo == 'B_NESSR_MEN', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if ("gonococcus" %like% x_trimmed[i]) { - # coerce S. coagulase positive - x[i] <- microorganismsDT[mo == 'B_NESSR_GON', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if ("pneumococcus" %like% x_trimmed[i]) { - # coerce S. coagulase positive - x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - } - - # FIRST TRY FULLNAMES AND CODES ---- - # if only genus is available, return only genus - if (all(!c(x[i], x_trimmed[i]) %like% " ")) { - found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]] - if (length(found) > 0) { - x[i] <- found[1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - if (nchar(x_backup_without_spp[i]) >= 6) { - found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]] - if (length(found) > 0) { - x[i] <- found[1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - } - # rest of genus only is in allow_uncertain part. - } - - # TRY OTHER SOURCES ---- - # WHONET and other common LIS codes - if (toupper(x_backup[i]) %in% AMR::microorganisms.codes[, 1]) { - mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L] - if (length(mo_found) > 0) { - x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L] - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - } - if (!is.null(reference_df)) { - # self-defined reference - if (x_backup[i] %in% reference_df[, 1]) { - ref_mo <- reference_df[reference_df[, 1] == x_backup[i], "mo"] - if (ref_mo %in% microorganismsDT[, mo]) { - x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] - next - } else { - warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) - } - } - } - - # allow no codes less than 4 characters long, was already checked for WHONET above - if (nchar(x_backup_without_spp[i]) < 4) { - x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (initial_search == TRUE) { - failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - check_per_prevalence <- function(data_to_check, - a.x_backup, - b.x_trimmed, - c.x_trimmed_without_group, - d.x_withspaces_start_end, - e.x_withspaces_start_only, - f.x_withspaces_end_only, - g.x_backup_without_spp) { - - # try probable: trimmed version of fullname ---- - found <- data_to_check[fullname_lower %in% tolower(g.x_backup_without_spp), ..property][[1]] - if (length(found) > 0) { - return(found[1L]) - } - - # try any match keeping spaces ---- - found <- data_to_check[fullname %like% d.x_withspaces_start_end, ..property][[1]] - if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { - return(found[1L]) - } - - # try any match keeping spaces, not ending with $ ---- - found <- data_to_check[fullname %like% paste0(trimws(e.x_withspaces_start_only), " "), ..property][[1]] - if (length(found) > 0) { - return(found[1L]) - } - found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] - if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { - return(found[1L]) - } - - # try any match keeping spaces, not start with ^ ---- - found <- data_to_check[fullname %like% paste0(" ", trimws(f.x_withspaces_end_only)), ..property][[1]] - if (length(found) > 0) { - return(found[1L]) - } - - # try a trimmed version - found <- data_to_check[fullname_lower %like% b.x_trimmed - | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] - if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { - return(found[1L]) - } - - - # try splitting of characters in the middle and then find ID ---- - # only when text length is 6 or lower - # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus - if (nchar(g.x_backup_without_spp) <= 6) { - x_length <- nchar(g.x_backup_without_spp) - x_split <- paste0("^", - g.x_backup_without_spp %>% substr(1, x_length / 2), - '.* ', - g.x_backup_without_spp %>% substr((x_length / 2) + 1, x_length)) - found <- data_to_check[fullname %like% x_split, ..property][[1]] - if (length(found) > 0) { - return(found[1L]) - } - } - - # try fullname without start and without nchar limit of >= 6 ---- - # like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH - found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] - if (length(found) > 0) { - return(found[1L]) - } - - # didn't found any - return(NA_character_) - } - - # FIRST TRY VERY PREVALENT IN HUMAN INFECTIONS ---- - x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 1], - a.x_backup = x_backup[i], - b.x_trimmed = x_trimmed[i], - c.x_trimmed_without_group = x_trimmed_without_group[i], - d.x_withspaces_start_end = x_withspaces_start_end[i], - e.x_withspaces_start_only = x_withspaces_start_only[i], - f.x_withspaces_end_only = x_withspaces_end_only[i], - g.x_backup_without_spp = x_backup_without_spp[i]) - if (!empty_result(x[i])) { - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - # THEN TRY PREVALENT IN HUMAN INFECTIONS ---- - x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 2], - a.x_backup = x_backup[i], - b.x_trimmed = x_trimmed[i], - c.x_trimmed_without_group = x_trimmed_without_group[i], - d.x_withspaces_start_end = x_withspaces_start_end[i], - e.x_withspaces_start_only = x_withspaces_start_only[i], - f.x_withspaces_end_only = x_withspaces_end_only[i], - g.x_backup_without_spp = x_backup_without_spp[i]) - if (!empty_result(x[i])) { - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - # THEN UNPREVALENT IN HUMAN INFECTIONS ---- - x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 3], - a.x_backup = x_backup[i], - b.x_trimmed = x_trimmed[i], - c.x_trimmed_without_group = x_trimmed_without_group[i], - d.x_withspaces_start_end = x_withspaces_start_end[i], - e.x_withspaces_start_only = x_withspaces_start_only[i], - f.x_withspaces_end_only = x_withspaces_end_only[i], - g.x_backup_without_spp = x_backup_without_spp[i]) - if (!empty_result(x[i])) { - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - # MISCELLANEOUS ---- - - # look for old taxonomic names ---- - found <- microorganisms.oldDT[fullname_lower == tolower(x_backup[i]) - | fullname %like% x_withspaces_start_end[i],] - if (NROW(found) > 0) { - col_id_new <- found[1, col_id_new] - # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: - # mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning) - # mo_ref("Chlamydophila psittaci") = "Everett et al., 1999" - if (property == "ref") { - x[i] <- found[1, ref] - } else { - x[i] <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] - } - options(mo_renamed_last_run = found[1, fullname]) - was_renamed(name_old = found[1, fullname], - name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], - ref_old = found[1, ref], - ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], - mo = microorganismsDT[col_id == found[1, col_id_new], mo]) - if (initial_search == TRUE) { - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - next - } - - # check for uncertain results ---- - uncertain_fn <- function(a.x_backup, - b.x_trimmed, - c.x_withspaces_start_end, - d.x_withspaces_start_only, - f.x_withspaces_end_only, - g.x_backup_without_spp) { - - if (uncertainty_level == 0) { - # do not allow uncertainties - return(NA_character_) - } - - # UNCERTAINTY LEVEL 1 ---- - if (uncertainty_level >= 1) { - now_checks_for_uncertainty_level <- 1 - - # (1) look again for old taxonomic names, now for G. species ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (1) look again for old taxonomic names, now for G. species\n") - } - if (isTRUE(debug)) { - message("Running '", c.x_withspaces_start_end, "' and '", d.x_withspaces_start_only, "'") - } - found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end - | fullname %like% d.x_withspaces_start_only] - if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { - if (property == "ref") { - # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so: - # mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning) - # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999" - x <- found[1, ref] - } else { - x <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]] - } - was_renamed(name_old = found[1, fullname], - name_new = microorganismsDT[col_id == found[1, col_id_new], fullname], - ref_old = found[1, ref], - ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], - mo = microorganismsDT[col_id == found[1, col_id_new], mo]) - options(mo_renamed_last_run = found[1, fullname]) - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = microorganismsDT[col_id == found[1, col_id_new], mo])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(x, property), 1, force = force_mo_history) - } - return(x) - } - - # (2) Try with misspelled input ---- - # just rerun with dyslexia_mode = TRUE will used the extensive regex part above - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (2) Try with misspelled input\n") - } - if (isTRUE(debug)) { - message("Running '", a.x_backup, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 1, force = force_mo_history) - } - return(found[1L]) - } - } - - # UNCERTAINTY LEVEL 2 ---- - if (uncertainty_level >= 2) { - now_checks_for_uncertainty_level <- 2 - - # (3) look for genus only, part of name ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (3) look for genus only, part of name\n") - } - if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") { - if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { - if (isTRUE(debug)) { - message("Running '", paste(b.x_trimmed, "species"), "'") - } - # not when input is like Genustext, because then Neospora would lead to Actinokineospora - found <- microorganismsDT[fullname_lower %like% paste(b.x_trimmed, "species"), ..property][[1]] - if (length(found) > 0) { - x[i] <- found[1L] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(x, property), 2, force = force_mo_history) - } - return(x) - } - } - } - - # (4) strip values between brackets ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (4) strip values between brackets\n") - } - a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) - a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) - if (isTRUE(debug)) { - message("Running '", a.x_backup_stripped, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - - # (5) inverse input ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (5) inverse input\n") - } - a.x_backup_inversed <- paste(rev(unlist(strsplit(a.x_backup, split = " "))), collapse = " ") - if (isTRUE(debug)) { - message("Running '", a.x_backup_inversed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_inversed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - - # (6) try to strip off half an element from end and check the remains ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n") - } - x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() - if (length(x_strip) > 1) { - for (i in 1:(length(x_strip) - 1)) { - lastword <- x_strip[length(x_strip) - i + 1] - lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2)) - # remove last half of the second term - x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") - if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) { - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - } - # (7) try to strip off one element from end and check the remains ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (7) try to strip off one element from end and check the remains\n") - } - if (length(x_strip) > 1) { - for (i in 1:(length(x_strip) - 1)) { - x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") - if (nchar(x_strip_collapsed) >= 6) { - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - } - # (8) check for unknown yeasts/fungi ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (8) check for unknown yeasts/fungi\n") - } - if (b.x_trimmed %like% "yeast") { - found <- "F_YEAST" - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - if (b.x_trimmed %like% "(fungus|fungi)" & !b.x_trimmed %like% "Fungiphrya") { - found <- "F_FUNGUS" - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - # (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n") - } - x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() - if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { - for (i in 2:(length(x_strip))) { - x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found_result[1L], ..property][[1]] - # uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3) - if (x_strip_collapsed %like% " ") { - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - } - } - - # UNCERTAINTY LEVEL 3 ---- - if (uncertainty_level >= 3) { - now_checks_for_uncertainty_level <- 3 - - # (10) try to strip off one element from start and check the remains (any text size) ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n") - } - x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() - if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { - for (i in 2:(length(x_strip))) { - x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) - } - return(found[1L]) - } - } - } - # (11) try to strip off one element from end and check the remains (any text size) ---- - # (this is in fact 7 but without nchar limit of >=6) - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (11) try to strip off one element from end and check the remains (any text size)\n") - } - if (length(x_strip) > 1) { - for (i in 1:(length(x_strip) - 1)) { - x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") - if (isTRUE(debug)) { - message("Running '", x_strip_collapsed, "'") - } - # first try without dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) - if (empty_result(found)) { - # then with dyslexia mode - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) - } - if (!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) - } - return(found[1L]) - } - } - } - - # (12) part of a name (very unlikely match) ---- - if (isTRUE(debug)) { - cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (12) part of a name (very unlikely match)\n") - } - if (isTRUE(debug)) { - message("Running '", f.x_withspaces_end_only, "'") - } - found <- microorganismsDT[fullname %like% f.x_withspaces_end_only] - if (nrow(found) > 0) { - found_result <- found[["mo"]] - if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) { - found <- microorganismsDT[mo == found_result[1L], ..property][[1]] - uncertainties <<- rbind(uncertainties, - format_uncertainty_as_df(uncertainty_level = now_checks_for_uncertainty_level, - input = a.x_backup, - result_mo = found_result[1L])) - if (initial_search == TRUE) { - set_mo_history(a.x_backup, get_mo_code(found[1L], property), 3, force = force_mo_history) - } - return(found[1L]) - } - } - } - - # didn't found in uncertain results too - return(NA_character_) - } - x[i] <- uncertain_fn(x_backup[i], - x_trimmed[i], - x_withspaces_start_end[i], - x_withspaces_start_only[i], - x_withspaces_end_only[i], - x_backup_without_spp[i]) - if (!empty_result(x[i])) { - # no set_mo_history here - it is already set in uncertain_fn() - next - } - - # no results found: make them UNKNOWN ---- - x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (initial_search == TRUE) { - failures <- c(failures, x_backup[i]) - set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) - } - } - } - - # handling failures ---- - failures <- failures[!failures %in% c(NA, NULL, NaN)] - if (length(failures) > 0 & initial_search == TRUE) { - options(mo_failures = sort(unique(failures))) - plural <- c("value", "it", "was") - if (n_distinct(failures) > 1) { - plural <- c("values", "them", "were") - } - total_failures <- length(x_input[as.character(x_input) %in% as.character(failures) & !x_input %in% c(NA, NULL, NaN)]) - total_n <- length(x_input[!x_input %in% c(NA, NULL, NaN)]) - msg <- paste0(nr2char(n_distinct(failures)), " unique ", plural[1], - " (covering ", percent(total_failures / total_n, round = 1, force_zero = TRUE), - ") could not be coerced and ", plural[3], " considered 'unknown'") - if (n_distinct(failures) <= 10) { - msg <- paste0(msg, ": ", paste('"', unique(failures), '"', sep = "", collapse = ', ')) - } - msg <- paste0(msg, ". Use mo_failures() to review ", plural[2], ". Edit the `allow_uncertain` parameter if needed (see ?as.mo).") - warning(red(msg), - call. = FALSE, - immediate. = TRUE) # thus will always be shown, even if >= warnings - } - # handling uncertainties ---- - if (NROW(uncertainties) > 0 & initial_search == TRUE) { - options(mo_uncertainties = as.list(distinct(uncertainties, input, .keep_all = TRUE))) - - plural <- c("", "it") - if (NROW(uncertainties) > 1) { - plural <- c("s", "them") - } - msg <- paste0("\nResult", plural[1], " of ", nr2char(NROW(uncertainties)), " value", plural[1], - " was guessed with uncertainty. Use mo_uncertainties() to review ", plural[2], ".") - warning(red(msg), - call. = FALSE, - immediate. = TRUE) # thus will always be shown, even if >= warnings - } - - # Becker ---- - if (Becker == TRUE | Becker == "all") { - # See Source. It's this figure: - # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/ - MOs_staph <- microorganismsDT[genus == "Staphylococcus"] - setkey(MOs_staph, species) - CoNS <- MOs_staph[species %in% c("arlettae", "auricularis", "capitis", - "caprae", "carnosus", "chromogenes", "cohnii", "condimenti", - "devriesei", "epidermidis", "equorum", "felis", - "fleurettii", "gallinarum", "haemolyticus", - "hominis", "jettensis", "kloosii", "lentus", - "lugdunensis", "massiliensis", "microti", - "muscae", "nepalensis", "pasteuri", "petrasii", - "pettenkoferi", "piscifermentans", "rostri", - "saccharolyticus", "saprophyticus", "sciuri", - "stepanovicii", "simulans", "succinus", - "vitulinus", "warneri", "xylosus") - | (species == "schleiferi" & subspecies %in% c("schleiferi", "")), ..property][[1]] - CoPS <- MOs_staph[species %in% c("simiae", "agnetis", - "delphini", "lutrae", - "hyicus", "intermedius", - "pseudintermedius", "pseudointermedius", - "schweitzeri", "argenteus") - | (species == "schleiferi" & subspecies == "coagulans"), ..property][[1]] - - # warn when species found that are not in Becker (2014, PMID 25278577) and Becker (2019, PMID 30872103) - post_Becker <- c("argensis", "caeli", "cornubiensis", "edaphicus") - if (any(x %in% MOs_staph[species %in% post_Becker, ..property][[1]])) { - - warning("Becker ", italic("et al."), " (2014, 2019) does not contain these species named after their publication: ", - italic(paste("S.", - sort(mo_species(unique(x[x %in% MOs_staph[species %in% post_Becker, ..property][[1]]]))), - collapse = ", ")), - ".", - call. = FALSE, - immediate. = TRUE) - } - - x[x %in% CoNS] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] - x[x %in% CoPS] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] - if (Becker == "all") { - x[x %in% microorganismsDT[mo %like% '^B_STPHY_AUR', ..property][[1]]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] - } - } - - # Lancefield ---- - if (Lancefield == TRUE | Lancefield == "all") { - # group A - S. pyogenes - x[x == microorganismsDT[mo == 'B_STRPT_PYO', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRA', ..property][[1]][1L] - # group B - S. agalactiae - x[x == microorganismsDT[mo == 'B_STRPT_AGA', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRB', ..property][[1]][1L] - # group C - S_groupC <- microorganismsDT %>% filter(genus == "Streptococcus", - species %in% c("equisimilis", "equi", - "zooepidemicus", "dysgalactiae")) %>% - pull(property) - x[x %in% S_groupC] <- microorganismsDT[mo == 'B_STRPT_GRC', ..property][[1]][1L] - if (Lancefield == "all") { - # all Enterococci - x[x %like% "^(Enterococcus|B_ENTRC)"] <- microorganismsDT[mo == 'B_STRPT_GRD', ..property][[1]][1L] - } - # group F - S. anginosus - x[x == microorganismsDT[mo == 'B_STRPT_ANG', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRF', ..property][[1]][1L] - # group H - S. sanguinis - x[x == microorganismsDT[mo == 'B_STRPT_SAN', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRH', ..property][[1]][1L] - # group K - S. salivarius - x[x == microorganismsDT[mo == 'B_STRPT_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRK', ..property][[1]][1L] - } - - # Wrap up ---------------------------------------------------------------- - - # comply to x, which is also unique and without empty values - x_input_unique_nonempty <- unique(x_input[!is.na(x_input) - & !is.null(x_input) - & !identical(x_input, "") - & !identical(x_input, "xxx")]) - - # left join the found results to the original input values (x_input) - df_found <- data.frame(input = as.character(x_input_unique_nonempty), - found = as.character(x), - stringsAsFactors = FALSE) - df_input <- data.frame(input = as.character(x_input), - stringsAsFactors = FALSE) - - suppressWarnings( - x <- df_input %>% - left_join(df_found, - by = "input") %>% - pull(found) - ) - - if (property == "mo") { - x <- to_class_mo(x) - } - - if (length(mo_renamed()) > 0) { - print(mo_renamed()) - } - - x -} diff --git a/R/mo_history.R b/R/mo_history.R index 18ac330e..6cd3c48a 100644 --- a/R/mo_history.R +++ b/R/mo_history.R @@ -19,116 +19,152 @@ # Visit our website for more info: https://msberends.gitlab.io/AMR. # # ==================================================================== # -# print successful as.mo coercions to AMR environment -#' @importFrom dplyr %>% distinct filter -set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) { - # disable function for now - return(base::invisible()) +mo_history_file <- file.path(file.path(system.file(package = "AMR"), "mo_history"), "mo_history.csv") - # if (base::interactive() | force == TRUE) { - # mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force) - # df <- data.frame(x, mo, stringsAsFactors = FALSE) %>% - # distinct(x, .keep_all = TRUE) %>% - # filter(!is.na(x) & !is.na(mo)) - # if (nrow(df) == 0) { - # return(base::invisible()) - # } - # x <- toupper(df$x) - # mo <- df$mo - # for (i in 1:length(x)) { - # # save package version too, as both the as.mo() algorithm and the reference data set may change - # if (NROW(mo_hist[base::which(mo_hist$x == x[i] & - # mo_hist$uncertainty_level >= uncertainty_level & - # mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) { - # tryCatch( - # assign(x = "mo_history", - # value = rbind(mo_hist, - # data.frame( - # x = x[i], - # mo = mo[i], - # uncertainty_level = uncertainty_level, - # package_v = base::as.character(utils::packageVersion("AMR")), - # stringsAsFactors = FALSE)), - # envir = asNamespace("AMR")), - # error = function(e) invisible()) - # } - # } - # } - # return(base::invisible()) +# print successful as.mo coercions to a options entry +#' @importFrom dplyr %>% distinct filter +set_mo_history <- function(x, mo, uncertainty_level, force = FALSE, disable = FALSE) { + if (isTRUE(disable)) { + return(base::invisible()) + } + + if (base::interactive() | force == TRUE) { + mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force) + df <- data.frame(x, mo, stringsAsFactors = FALSE) %>% + distinct(x, .keep_all = TRUE) %>% + filter(!is.na(x) & !is.na(mo)) + if (nrow(df) == 0) { + return(base::invisible()) + } + x <- toupper(df$x) + mo <- df$mo + for (i in 1:length(x)) { + # save package version too, as both the as.mo() algorithm and the reference data set may change + if (NROW(mo_hist[base::which(mo_hist$x == x[i] & + mo_hist$uncertainty_level >= uncertainty_level & + mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) { + # # Not using the file system: + # tryCatch(options(mo_remembered_results = rbind(mo_hist, + # data.frame( + # x = x[i], + # mo = mo[i], + # uncertainty_level = uncertainty_level, + # package_v = base::as.character(utils::packageVersion("AMR")), + # stringsAsFactors = FALSE))), + # error = function(e) base::invisible()) + # # don't remember more than 1,000 different input values + # if (tryCatch(nrow(getOption("mo_remembered_results")), error = function(e) 1001) > 1000) { + # return(base::invisible()) + # } + if (is.null(mo_hist)) { + message(blue(paste0("NOTE: results are saved to ", mo_history_file, "."))) + } + tryCatch(write.csv(rbind(mo_hist, + data.frame( + x = x[i], + mo = mo[i], + uncertainty_level = uncertainty_level, + package_v = base::as.character(utils::packageVersion("AMR")), + stringsAsFactors = FALSE)), + file = mo_history_file, row.names = FALSE), + error = function(e) base::invisible()) + } + } + } + return(base::invisible()) } -get_mo_history <- function(x, uncertainty_level, force = FALSE) { - # disable function for now - return(NA) +get_mo_history <- function(x, uncertainty_level, force = FALSE, disable = FALSE) { + if (isTRUE(disable)) { + return(to_class_mo(NA)) + } - # history <- read_mo_history(uncertainty_level = uncertainty_level, force = force) - # if (base::is.null(history)) { - # NA - # } else { - # data.frame(x = toupper(x), stringsAsFactors = FALSE) %>% - # left_join(history, by = "x") %>% - # pull(mo) - # } + history <- read_mo_history(uncertainty_level = uncertainty_level, force = force) + if (base::is.null(history)) { + result <- NA + } else { + result <- data.frame(x = toupper(x), stringsAsFactors = FALSE) %>% + left_join(history, by = "x") %>% + pull(mo) + } + to_class_mo(result) } #' @importFrom dplyr %>% filter distinct -read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE) { - # disable function for now - return(NULL) +read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE, disable = FALSE) { + if (isTRUE(disable)) { + return(NULL) + } - # if ((!base::interactive() & force == FALSE)) { - # return(NULL) - # } - # uncertainty_level_param <- uncertainty_level - # - # history <- tryCatch(get("mo_history", envir = asNamespace("AMR")), - # error = function(e) NULL) - # if (is.null(history)) { - # return(NULL) - # } - # # Below: filter on current package version. - # # Even current fullnames may be replaced by new taxonomic names, so new versions of - # # the Catalogue of Life must not lead to data corruption. - # - # if (unfiltered == FALSE) { - # history <- history %>% - # filter(package_v == as.character(utils::packageVersion("AMR")), - # # only take unknowns if uncertainty_level_param is higher - # ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) | - # (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>% - # arrange(desc(uncertainty_level)) %>% - # distinct(x, mo, .keep_all = TRUE) - # } - # - # if (nrow(history) == 0) { - # NULL - # } else { - # history - # } + if ((!base::interactive() & force == FALSE)) { + return(NULL) + } + uncertainty_level_param <- uncertainty_level + + # # Not using the file system: + # history <- tryCatch(getOption("mo_remembered_results"), + # error = function(e) NULL) + history <- tryCatch(read.csv(mo_history_file, stringsAsFactors = FALSE), + warning = function(w) invisible(), + error = function(e) NULL) + if (is.null(history)) { + return(NULL) + } + # Below: filter on current package version. + # Even current fullnames may be replaced by new taxonomic names, so new versions of + # the Catalogue of Life must not lead to data corruption. + + if (unfiltered == FALSE) { + history <- history %>% + filter(package_v == as.character(utils::packageVersion("AMR")), + # only take unknowns if uncertainty_level_param is higher + ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) | + (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>% + arrange(desc(uncertainty_level)) %>% + distinct(x, mo, .keep_all = TRUE) + } + + if (nrow(history) == 0) { + NULL + } else { + history + } } -# @rdname as.mo -# @importFrom crayon red -# @importFrom utils menu -# @export -clean_mo_history <- function(...) { - # if (!is.null(read_mo_history())) { - # if (interactive() & !isTRUE(list(...)$force)) { - # q <- menu(title = paste("This will remove all", - # format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","), - # "microbial IDs determined previously in this session. Are you sure?"), - # choices = c("Yes", "No"), - # graphics = FALSE) - # if (q != 1) { - # return(invisible()) - # } - # } - # tryCatch( - # assign(x = "mo_history", - # value = NULL, - # envir = asNamespace("AMR")), - # error = function(e) invisible()) - # cat(red("History removed.")) - # } +#' @rdname as.mo +#' @importFrom crayon red +#' @importFrom utils menu +#' @export +clear_mo_history <- function(...) { + if (!is.null(read_mo_history())) { + if (interactive() & !isTRUE(list(...)$force)) { + q <- menu(title = paste("This will clear all", + format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","), + "previously determined microbial IDs. Are you sure?"), + choices = c("Yes", "No"), + graphics = FALSE) + if (q != 1) { + return(invisible()) + } + } + # # Not using the file system: + # success <- tryCatch(options(mo_remembered_results = NULL), + # error = function(e) FALSE) + success <- create_blank_mo_history() + if (!isFALSE(success)) { + cat(red(paste("File", mo_history_file, "cleared."))) + } + } } +create_blank_mo_history <- function() { + tryCatch( + write.csv(x = data.frame(x = character(0), + mo = character(0), + uncertainty_level = integer(0), + package_v = character(0), + stringsAsFactors = FALSE), + file = mo_history_file), + warning = function(w) invisible(), + error = function(e) TRUE) +} diff --git a/R/mo_property.R b/R/mo_property.R index ed116d9b..d8650eaa 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -408,7 +408,7 @@ mo_validate <- function(x, property, ...) { if (!"AMR" %in% base::.packages()) { require("AMR") - # check onLoad() in R/zzz.R: data tables are created there. + # check .onLoad() in R/zzz.R: data tables are created there. } # try to catch an error when inputting an invalid parameter diff --git a/R/zzz.R b/R/zzz.R index a6b39452..0599f71d 100755 --- a/R/zzz.R +++ b/R/zzz.R @@ -26,7 +26,8 @@ # register data microorganisms.oldDT <- as.data.table(AMR::microorganisms.old) - microorganisms.oldDT$fullname_lower <- tolower(microorganisms.oldDT$fullname) + # for fullname_lower: keep only dots, letters, numbers, slashes, spaces and dashes + microorganisms.oldDT$fullname_lower <- gsub("[^.a-z0-9/ \\-]+", "", tolower(microorganisms.oldDT$fullname)) setkey(microorganisms.oldDT, col_id, fullname) assign(x = "microorganismsDT", @@ -81,7 +82,8 @@ #' @importFrom data.table as.data.table setkey make_DT <- function() { microorganismsDT <- as.data.table(AMR::microorganisms) - microorganismsDT$fullname_lower <- tolower(microorganismsDT$fullname) + # for fullname_lower: keep only dots, letters, numbers, slashes, spaces and dashes + microorganismsDT$fullname_lower <- gsub("[^.a-z0-9/ \\-]+", "", tolower(microorganismsDT$fullname)) setkey(microorganismsDT, prevalence, kingdom, diff --git a/_pkgdown.yml b/_pkgdown.yml index a4d8ac2f..2e0b6ed2 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -118,7 +118,7 @@ reference: - "`ab_property`" - "`age`" - "`age_groups`" - - "`p.symbol`" + - "`p_symbol`" - "`join`" - "`atc_online_property`" - title: "Analysing your data" diff --git a/codecov.yml b/codecov.yml index 13bb0025..fea6aac8 100644 --- a/codecov.yml +++ b/codecov.yml @@ -19,3 +19,4 @@ ignore: - "R/atc_online.R" - "R/mo_source.R" - "R/resistance_predict.R" + - "R/aa_test.R" diff --git a/data-raw/internals.R b/data-raw/internals.R index 1be61db0..f5fc6e72 100644 --- a/data-raw/internals.R +++ b/data-raw/internals.R @@ -19,7 +19,7 @@ eucast_rules_file <- dplyr::arrange( reference.rule_group, reference.rule) -# Translations ----- +# Translations ---- translations_file <- utils::read.delim(file = "data-raw/translations.tsv", sep = "\t", stringsAsFactors = FALSE, @@ -42,3 +42,16 @@ usethis::use_data(eucast_rules_file, translations_file, # Remove from global environment ---- rm(eucast_rules_file) rm(translations_file) + +# Clean mo history ---- +mo_history_file <- file.path(file.path(system.file(package = "AMR"), "mo_history"), "mo_history.csv") +usethis::ui_done(paste0("Resetting {usethis::ui_value('", mo_history_file, "')}")) +tryCatch( + write.csv(x = data.frame(x = character(0), + mo = character(0), + uncertainty_level = integer(0), + package_v = character(0), + stringsAsFactors = FALSE), + file = mo_history_file), + warning = function(w) invisible(), + error = function(e) TRUE) diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 64d7693f..615dfbb7 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9072 + 0.7.1.9073 diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png index 8c266414..a79adfcb 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png index b30d5299..aa4c77e8 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index 0b297047..2c086086 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9072 + 0.7.1.9073 diff --git a/docs/authors.html b/docs/authors.html index 6bddc998..6b47c3f8 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9072 + 0.7.1.9073 @@ -253,6 +253,10 @@

Erwin E. A. Hassing. Contributor.

+
  • +

    Eric H. L. C. M. Hazenberg. Contributor. +

    +
  • Annick Lenglet. Contributor.

    diff --git a/docs/index.html b/docs/index.html index 98dc1422..46363878 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.7.1.9072 + 0.7.1.9073 diff --git a/docs/reference/as.mo.html b/docs/reference/as.mo.html index e11b9f21..8cbbb4a4 100644 --- a/docs/reference/as.mo.html +++ b/docs/reference/as.mo.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9055 + 0.7.1.9073 @@ -243,7 +243,9 @@ mo_uncertainties() -mo_renamed() +mo_renamed() + +clear_mo_history(...)

    Arguments

    @@ -283,7 +285,7 @@

    Details

    General info
    -A microbial ID from this package (class: mo) typically looks like these examples:

    +A microorganism ID from this package (class: mo) typically looks like these examples:

       Code              Full name
       ---------------   --------------------------------------
       B_KLBSL           Klebsiella
    @@ -299,8 +301,11 @@ A microbial ID from this package (class: mo) typically looks like t
     

    Values that cannot be coered will be considered 'unknown' and will get the MO code UNKNOWN.

    Use the mo_property_* functions to get properties based on the returned code, see Examples.

    -

    The algorithm uses data from the Catalogue of Life (see below) and from one other source (see ?microorganisms).

    -

    Intelligent rules
    +

    The algorithm uses data from the Catalogue of Life (see below) and from one other source (see microorganisms).

    +

    Self-learning algoritm
    +The as.mo() function gains experience from previously determined microorganism IDs and learns from it. This drastically improves both speed and reliability. Use clear_mo_history() to reset the algorithms. Only experience from your current AMR package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge.

    +

    Usually, any guess after the first try runs 80-95% faster than the first try.

    +

    Intelligent rules
    This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:

    Use mo_failures() to get a vector with all values that could not be coerced to a valid value.

    Use mo_uncertainties() to get a data.frame with all values that were coerced to a valid value, but with uncertainty.

    -

    Use mo_renamed() to get a vector with all values that could be coerced based on an old, previously accepted taxonomic name.

    +

    Use mo_renamed() to get a data.frame with all values that could be coerced based on an old, previously accepted taxonomic name.

    Microbial prevalence of pathogens in humans
    The intelligent rules take into account microbial prevalence of pathogens in humans. It uses three groups and all (sub)species are in only one group. These groups are:

    Group 1 contains all common Gram positives and Gram negatives, like all Enterobacteriaceae and e.g. Pseudomonas and Legionella.

    -

    Group 2 probably contains less microbial pathogens; all other members of phyla that were found in humans in the Northern Netherlands between 2001 and 2018.

    +

    Group 2 contains probably less pathogenic microorganisms; all other members of phyla that were found in humans in the Northern Netherlands between 2001 and 2018.

    Source

    diff --git a/docs/reference/index.html b/docs/reference/index.html index e85c7ddc..3fa080be 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9072 + 0.7.1.9073 @@ -292,7 +292,7 @@
    @@ -384,9 +384,9 @@ - + diff --git a/docs/reference/like.html b/docs/reference/like.html index d313e9d8..743e75de 100644 --- a/docs/reference/like.html +++ b/docs/reference/like.html @@ -47,7 +47,7 @@ - + @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9067 + 0.7.1.9073 @@ -230,13 +230,15 @@
    -

    Convenient wrapper around grep to match a pattern: a %like% b. It always returns a logical vector and is always case-insensitive. Also, pattern (b) can be as long as x (a) to compare items of each index in both vectors.

    +

    Convenient wrapper around grep to match a pattern: a %like% b. It always returns a logical vector and is always case-insensitive (use a %like_case% b for case-sensitive matching). Also, pattern (b) can be as long as x (a) to compare items of each index in both vectors, or can both have the same length to iterate over all cases.

    -
    like(x, pattern)
    +    
    like(x, pattern, ignore.case = TRUE)
     
    -x %like% pattern
    +x %like% pattern + +x %like_case% pattern

    Arguments

    -

    as.mo() is.mo() mo_failures() mo_uncertainties() mo_renamed()

    +

    as.mo() is.mo() mo_failures() mo_uncertainties() mo_renamed() clear_mo_history()

    Transform to microorganism ID

    -

    as.atc() p.symbol()

    +

    p_symbol()

    Deprecated functions

    Symbol of a p value

    @@ -528,7 +528,7 @@
    -

    like() `%like%`

    +

    like() `%like%` `%like_case%`

    Pattern Matching

    @@ -257,6 +259,11 @@ is used with a warning. Missing values are allowed except for regexpr and gregexpr.

    + + + +
    ignore.case

    if FALSE, the pattern matching is case + sensitive and if TRUE, case is ignored during matching.

    Source

    diff --git a/docs/reference/translate.html b/docs/reference/translate.html index 14176405..252a1822 100644 --- a/docs/reference/translate.html +++ b/docs/reference/translate.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9072 + 0.7.1.9073 diff --git a/inst/mo_history/mo_history.csv b/inst/mo_history/mo_history.csv new file mode 100644 index 00000000..db967b2c --- /dev/null +++ b/inst/mo_history/mo_history.csv @@ -0,0 +1 @@ +"","x","mo","uncertainty_level","package_v" diff --git a/man/as.mo.Rd b/man/as.mo.Rd index f28bf0ae..62f40090 100644 --- a/man/as.mo.Rd +++ b/man/as.mo.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mo.R +% Please edit documentation in R/mo.R, R/mo_history.R \name{as.mo} \alias{as.mo} \alias{mo} @@ -7,6 +7,7 @@ \alias{mo_failures} \alias{mo_uncertainties} \alias{mo_renamed} +\alias{clear_mo_history} \title{Transform to microorganism ID} \usage{ as.mo(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, @@ -19,6 +20,8 @@ mo_failures() mo_uncertainties() mo_renamed() + +clear_mo_history(...) } \arguments{ \item{x}{a character vector or a \code{data.frame} with one or two columns} @@ -45,7 +48,7 @@ Use this function to determine a valid microorganism ID (\code{mo}). Determinati } \details{ \strong{General info} \cr -A microbial ID from this package (class: \code{mo}) typically looks like these examples:\cr +A microorganism ID from this package (class: \code{mo}) typically looks like these examples:\cr \preformatted{ Code Full name --------------- -------------------------------------- @@ -65,7 +68,13 @@ Values that cannot be coered will be considered 'unknown' and will get the MO co Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples. -The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}). +The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{\link{microorganisms}}). + +\strong{Self-learning algoritm} \cr +The \code{as.mo()} function gains experience from previously determined microorganism IDs and learns from it. This drastically improves both speed and reliability. Use \code{clear_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. + +Usually, any guess after the first try runs 80-95\% faster than the first try. + \strong{Intelligent rules} \cr This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order: @@ -105,7 +114,7 @@ Use \code{mo_failures()} to get a vector with all values that could not be coerc Use \code{mo_uncertainties()} to get a data.frame with all values that were coerced to a valid value, but with uncertainty. -Use \code{mo_renamed()} to get a vector with all values that could be coerced based on an old, previously accepted taxonomic name. +Use \code{mo_renamed()} to get a data.frame with all values that could be coerced based on an old, previously accepted taxonomic name. \strong{Microbial prevalence of pathogens in humans} \cr The intelligent rules take into account microbial prevalence of pathogens in humans. It uses three groups and all (sub)species are in only one group. These groups are: @@ -117,7 +126,7 @@ The intelligent rules take into account microbial prevalence of pathogens in hum Group 1 contains all common Gram positives and Gram negatives, like all Enterobacteriaceae and e.g. \emph{Pseudomonas} and \emph{Legionella}. -Group 2 probably contains less microbial pathogens; all other members of phyla that were found in humans in the Northern Netherlands between 2001 and 2018. +Group 2 contains probably less pathogenic microorganisms; all other members of phyla that were found in humans in the Northern Netherlands between 2001 and 2018. } \section{Source}{ diff --git a/man/like.Rd b/man/like.Rd index 88ec5ae4..4d4169ff 100755 --- a/man/like.Rd +++ b/man/like.Rd @@ -3,14 +3,17 @@ \name{like} \alias{like} \alias{\%like\%} +\alias{\%like_case\%} \title{Pattern Matching} \source{ Idea from the \href{https://github.com/Rdatatable/data.table/blob/master/R/like.R}{\code{like} function from the \code{data.table} package}, but made it case insensitive at default and let it support multiple patterns. Also, if the regex fails the first time, it tries again with \code{perl = TRUE}. } \usage{ -like(x, pattern) +like(x, pattern, ignore.case = TRUE) x \%like\% pattern + +x \%like_case\% pattern } \arguments{ \item{x}{a character vector where matches are sought, or an @@ -24,12 +27,15 @@ x \%like\% pattern character vector of length 2 or more is supplied, the first element is used with a warning. Missing values are allowed except for \code{regexpr} and \code{gregexpr}.} + +\item{ignore.case}{if \code{FALSE}, the pattern matching is \emph{case + sensitive} and if \code{TRUE}, case is ignored during matching.} } \value{ A \code{logical} vector } \description{ -Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors. +Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive (use \code{a \%like_case\% b} for case-sensitive matching). Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors, or can both have the same length to iterate over all cases. } \details{ Using RStudio? This function can also be inserted from the Addins menu and can have its own Keyboard Shortcut like Ctrl+Shift+L or Cmd+Shift+L (see Tools > Modify Keyboard Shortcuts...). diff --git a/tests/testthat/test-mo.R b/tests/testthat/test-mo.R index c3646ea1..9fbaa3a6 100644 --- a/tests/testthat/test-mo.R +++ b/tests/testthat/test-mo.R @@ -22,6 +22,8 @@ context("mo.R") test_that("as.mo works", { + + clear_mo_history(force = TRUE) library(dplyr) MOs <- AMR::microorganisms %>% filter(!is.na(mo), nchar(mo) > 3) diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd index 8aaf8d10..a5928899 100755 --- a/vignettes/benchmarks.Rmd +++ b/vignettes/benchmarks.Rmd @@ -20,7 +20,7 @@ knitr::opts_chunk$set( comment = "#", fig.width = 7.5, fig.height = 4.5, - dpi = 150 + dpi = 75 ) ``` @@ -110,26 +110,40 @@ That takes `r round(mean(T.islandicus$time, na.rm = TRUE) / mean(S.aureus$time, In the figure below, we compare *Escherichia coli* (which is very common) with *Prevotella brevis* (which is moderately common) and with *Thermus islandicus* (which is uncommon): ```{r, echo = FALSE} -ggplot.bm( - microbenchmark(as.mo("Escherichia coli"), - as.mo("E. coli"), - times = 10), title = "Very common") +# ggplot.bm( +# microbenchmark(as.mo("Escherichia coli"), +# as.mo("E. coli"), +# times = 10), title = "Very common") +# +# ggplot.bm( +# microbenchmark(as.mo("Prevotella brevis"), +# as.mo("P. brevis"), +# times = 10), title = "Moderately common") +# +# ggplot.bm( +# microbenchmark(as.mo("Thermus islandicus"), +# as.mo("T. islandicus"), +# times = 10), title = "Uncommon") -ggplot.bm( - microbenchmark(as.mo("Prevotella brevis"), - as.mo("P. brevis"), - times = 10), title = "Moderately common") - -ggplot.bm( - microbenchmark(as.mo("Thermus islandicus"), - as.mo("T. islandicus"), - times = 10), title = "Uncommon") +par(mar = c(5, 16, 4, 2)) +boxplot(microbenchmark( + 'as.mo("Thermus islandicus")' = as.mo("Thermus islandicus"), + 'as.mo("Prevotella brevis")' = as.mo("Prevotella brevis"), + 'as.mo("Escherichia coli")' = as.mo("Escherichia coli"), + 'as.mo("T. islandicus")' = as.mo("T. islandicus"), + 'as.mo("P. brevis")' = as.mo("P. brevis"), + 'as.mo("E. coli")' = as.mo("E. coli"), + times = 10), + horizontal = TRUE, las = 1, unit = "s", log = FALSE, + xlab = "", ylab = "Time in seconds", ylim = c(0, 0.5), + main = "Benchmarks per prevalence") ``` -```{r, echo = FALSE, eval = FALSE} -# In reality, the `as.mo()` functions **learns from its own output to speed up determinations for next times**. In above figure, this effect was disabled to show the difference with the boxplot below - when you would use `as.mo()` yourself: +In reality, the `as.mo()` functions **learns from its own output to speed up determinations for next times**. In above figure, this effect was disabled to show the difference with the boxplot below - when you would use `as.mo()` yourself: -clean_mo_history() +```{r, echo = FALSE} + +clear_mo_history() par(mar = c(5, 16, 4, 2)) boxplot(microbenchmark( 'as.mo("Thermus islandicus")' = as.mo("Thermus islandicus", force_mo_history = TRUE), @@ -142,10 +156,10 @@ boxplot(microbenchmark( horizontal = TRUE, las = 1, unit = "s", log = FALSE, xlab = "", ylab = "Time in seconds", ylim = c(0, 0.5), main = "Benchmarks per prevalence") - -# The highest outliers are the first times. All next determinations were done in only thousands of seconds. For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a next version. ``` +The highest outliers are the first times. All next determinations were done in only thousands of seconds. + Uncommon microorganisms take a lot more time than common microorganisms. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetitive results** and **already precalculated results**. ### Repetitive results