as.mo improvement

This commit is contained in:
dr. M.S. (Matthijs) Berends 2019-03-15 17:36:42 +01:00
parent 4091f62828
commit e18a1f1f17
5 changed files with 91 additions and 58 deletions

View File

@ -86,7 +86,10 @@ coverage:
# codecov token is set in https://gitlab.com/msberends/AMR/settings/ci_cd # codecov token is set in https://gitlab.com/msberends/AMR/settings/ci_cd
- Rscript -e 'print("${codecov_token}")' - Rscript -e 'print("${codecov_token}")'
- Rscript -e 'print("$codecov_token")' - Rscript -e 'print("$codecov_token")'
- Rscript -e 'cc <- covr::package_coverage(); covr::codecov(coverage = cc, token = "$codecov_token"); cat("Code coverage:", covr::percent_coverage(cc))' - Rscript -e 'print("$${codecov_token}")'
- Rscript -e 'print("$$codecov_token")'
- echo $codecov_token
- Rscript -e 'cc <- covr::package_coverage(); covr::codecov(coverage = cc, token = "$$codecov_token"); cat("Code coverage:", covr::percent_coverage(cc))'
coverage: '/Code coverage: \d+\.\d+/' coverage: '/Code coverage: \d+\.\d+/'
pages: pages:

111
R/mo.R
View File

@ -57,7 +57,8 @@
#' #'
#' Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples. #' Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples.
#' #'
#' All IDs that are found with zero uncertainty are saved to a local file (\code{"~/.Rhistory_mo"}) to improve speed for every next time. Use \code{clean_mo_history()} to delete this file, which resets the algorithms. Only previous results will be used from this version of the \code{AMR} package, since the taxonomic tree may change in the future for any organism. #' \strong{Self-learning algoritm} \cr
#' The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 90-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}.
#' #'
#' \strong{Intelligent rules} \cr #' \strong{Intelligent rules} \cr
#' This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order: #' This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:
@ -218,7 +219,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE,
y <- x y <- x
} else if (sum(is.na(mo_hist)) == 0 } else if (!any(is.na(mo_hist))
& isFALSE(Becker) & isFALSE(Becker)
& isFALSE(Lancefield)) { & isFALSE(Lancefield)) {
# check previously found results # check previously found results
@ -242,13 +243,8 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE,
on = "fullname_lower", on = "fullname_lower",
"mo"][[1]] "mo"][[1]]
} }
# save them too # save them to history
mo_hist <- read_mo_history(force = isTRUE(list(...)$force_mo_history)) set_mo_history(x, y, force = isTRUE(list(...)$force_mo_history))
if (any(!x %in% mo_hist$x)) {
for (i in 1:length(y)) {
set_mo_history(x[i], y[i], force = isTRUE(list(...)$force_mo_history))
}
}
} else { } else {
# will be checked for mo class in validation and uses exec_as.mo internally if necessary # will be checked for mo class in validation and uses exec_as.mo internally if necessary
@ -271,13 +267,16 @@ is.mo <- function(x) {
#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated filter distinct #' @importFrom dplyr %>% pull left_join n_distinct progress_estimated filter distinct
#' @importFrom data.table data.table as.data.table setkey #' @importFrom data.table data.table as.data.table setkey
#' @importFrom crayon magenta red blue silver italic has_color #' @importFrom crayon magenta red blue silver italic has_color
# param property a column name of AMR::microorganisms
# param initial_search logical - is FALSE when coming from uncertain tries, which uses exec_as.mo internally too
# param force_mo_history logical - whether found result must be saved with set_mo_history (default FALSE on non-interactive sessions)
exec_as.mo <- function(x, exec_as.mo <- function(x,
Becker = FALSE, Becker = FALSE,
Lancefield = FALSE, Lancefield = FALSE,
allow_uncertain = TRUE, allow_uncertain = TRUE,
reference_df = get_mo_source(), reference_df = get_mo_source(),
property = "mo", property = "mo",
clear_options = TRUE, initial_search = TRUE,
force_mo_history = FALSE) { force_mo_history = FALSE) {
if (!"AMR" %in% base::.packages()) { if (!"AMR" %in% base::.packages()) {
@ -285,7 +284,7 @@ exec_as.mo <- function(x,
# check onLoad() in R/zzz.R: data tables are created there. # check onLoad() in R/zzz.R: data tables are created there.
} }
if (clear_options == TRUE) { if (initial_search == TRUE) {
options(mo_failures = NULL) options(mo_failures = NULL)
options(mo_uncertainties = NULL) options(mo_uncertainties = NULL)
options(mo_renamed = NULL) options(mo_renamed = NULL)
@ -401,6 +400,10 @@ exec_as.mo <- function(x,
} }
x <- y x <- y
} else if (all(x %in% read_mo_history(force = force_mo_history)$x)) {
# previously found code
x <- microorganismsDT[data.table(mo = get_mo_history(x, force = force_mo_history)), on = "mo", ..property][[1]]
} else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) {
# we need special treatment for very prevalent full names, they are likely! # we need special treatment for very prevalent full names, they are likely!
# e.g. as.mo("Staphylococcus aureus") # e.g. as.mo("Staphylococcus aureus")
@ -420,6 +423,9 @@ exec_as.mo <- function(x,
} else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) { } else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) {
# commonly used MO codes # commonly used MO codes
y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ] y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ]
# save them to history
set_mo_history(x, y$mo, force = force_mo_history)
x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]]
} else if (!all(x %in% AMR::microorganisms[, property])) { } else if (!all(x %in% AMR::microorganisms[, property])) {
@ -503,7 +509,7 @@ exec_as.mo <- function(x,
# most probable: is exact match in fullname # most probable: is exact match in fullname
if (length(found) > 0) { if (length(found) > 0) {
x[i] <- found[1L] x[i] <- found[1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -517,6 +523,9 @@ exec_as.mo <- function(x,
if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) { if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) {
# empty and nonsense values, ignore without warning # empty and nonsense values, ignore without warning
x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]]
if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history)
}
next next
} }
@ -531,7 +540,7 @@ exec_as.mo <- function(x,
# return first genus that begins with x_trimmed, e.g. when "E. spp." # return first genus that begins with x_trimmed, e.g. when "E. spp."
if (length(found) > 0) { if (length(found) > 0) {
x[i] <- found[1L] x[i] <- found[1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -541,6 +550,9 @@ exec_as.mo <- function(x,
# fewer than 3 chars and not looked for species, add as failure # fewer than 3 chars and not looked for species, add as failure
x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]]
failures <- c(failures, x_backup[i]) failures <- c(failures, x_backup[i])
if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history)
}
next next
} }
@ -548,6 +560,9 @@ exec_as.mo <- function(x,
# there is no fullname like virus, so don't try to coerce it # there is no fullname like virus, so don't try to coerce it
x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]]
failures <- c(failures, x_backup[i]) failures <- c(failures, x_backup[i])
if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history)
}
next next
} }
@ -555,14 +570,14 @@ exec_as.mo <- function(x,
if (!is.na(x_trimmed[i])) { if (!is.na(x_trimmed[i])) {
if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) {
x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
} }
if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) { if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) {
x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -570,14 +585,14 @@ exec_as.mo <- function(x,
if (toupper(x_backup_without_spp[i]) == "VRE" if (toupper(x_backup_without_spp[i]) == "VRE"
| x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { | x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') {
x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
} }
if (toupper(x_backup_without_spp[i]) %in% c("EHEC", "EPEC", "EIEC", "STEC", "ATEC")) { if (toupper(x_backup_without_spp[i]) %in% c("EHEC", "EPEC", "EIEC", "STEC", "ATEC")) {
x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -585,7 +600,7 @@ exec_as.mo <- function(x,
if (toupper(x_backup_without_spp[i]) == 'MRPA') { if (toupper(x_backup_without_spp[i]) == 'MRPA') {
# multi resistant P. aeruginosa # multi resistant P. aeruginosa
x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -594,7 +609,7 @@ exec_as.mo <- function(x,
| toupper(x_backup_without_spp[i]) == 'CRSM') { | toupper(x_backup_without_spp[i]) == 'CRSM') {
# co-trim resistant S. maltophilia # co-trim resistant S. maltophilia
x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -602,7 +617,7 @@ exec_as.mo <- function(x,
if (toupper(x_backup_without_spp[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) { if (toupper(x_backup_without_spp[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) {
# peni I, peni R, vanco I, vanco R: S. pneumoniae # peni I, peni R, vanco I, vanco R: S. pneumoniae
x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -610,7 +625,7 @@ exec_as.mo <- function(x,
if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') { if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') {
# Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB) # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB)
x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -618,7 +633,7 @@ exec_as.mo <- function(x,
if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') { if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') {
# Streptococci in different languages, like "estreptococos grupo B" # Streptococci in different languages, like "estreptococos grupo B"
x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -626,7 +641,7 @@ exec_as.mo <- function(x,
if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') {
# Streptococci in different languages, like "Group A Streptococci" # Streptococci in different languages, like "Group A Streptococci"
x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -637,7 +652,7 @@ exec_as.mo <- function(x,
| x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') { | x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') {
# coerce S. coagulase negative # coerce S. coagulase negative
x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -647,7 +662,7 @@ exec_as.mo <- function(x,
| x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') { | x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') {
# coerce S. coagulase positive # coerce S. coagulase positive
x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -657,7 +672,7 @@ exec_as.mo <- function(x,
| x_trimmed[i] %like% 'gram[ -]?neg.*') { | x_trimmed[i] %like% 'gram[ -]?neg.*') {
# coerce Gram negatives # coerce Gram negatives
x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -667,7 +682,7 @@ exec_as.mo <- function(x,
| x_trimmed[i] %like% 'gram[ -]?pos.*') { | x_trimmed[i] %like% 'gram[ -]?pos.*') {
# coerce Gram positives # coerce Gram positives
x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -676,7 +691,7 @@ exec_as.mo <- function(x,
if (x_backup_without_spp[i] %like% "Salmonella group") { if (x_backup_without_spp[i] %like% "Salmonella group") {
# Salmonella Group A to Z, just return S. species for now # Salmonella Group A to Z, just return S. species for now
x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
options(mo_renamed = c(getOption("mo_renamed"), options(mo_renamed = c(getOption("mo_renamed"),
@ -688,7 +703,7 @@ exec_as.mo <- function(x,
} else { } else {
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
options(mo_renamed = c(getOption("mo_renamed"), options(mo_renamed = c(getOption("mo_renamed"),
@ -708,7 +723,7 @@ exec_as.mo <- function(x,
found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]] found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]]
if (length(found) > 0) { if (length(found) > 0) {
x[i] <- found[1L] x[i] <- found[1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -717,7 +732,7 @@ exec_as.mo <- function(x,
found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]] found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]]
if (length(found) > 0) { if (length(found) > 0) {
x[i] <- found[1L] x[i] <- found[1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -732,7 +747,7 @@ exec_as.mo <- function(x,
mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L] mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L]
if (length(mo_found) > 0) { if (length(mo_found) > 0) {
x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L] x[i] <- microorganismsDT[mo == mo_found, ..property][[1]][1L]
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -755,6 +770,9 @@ exec_as.mo <- function(x,
if (nchar(x_backup_without_spp[i]) < 4) { if (nchar(x_backup_without_spp[i]) < 4) {
x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]]
failures <- c(failures, x_backup[i]) failures <- c(failures, x_backup[i])
if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history)
}
next next
} }
@ -836,7 +854,7 @@ exec_as.mo <- function(x,
f.x_withspaces_end_only = x_withspaces_end_only[i], f.x_withspaces_end_only = x_withspaces_end_only[i],
g.x_backup_without_spp = x_backup_without_spp[i]) g.x_backup_without_spp = x_backup_without_spp[i])
if (!empty_result(x[i])) { if (!empty_result(x[i])) {
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -851,7 +869,7 @@ exec_as.mo <- function(x,
f.x_withspaces_end_only = x_withspaces_end_only[i], f.x_withspaces_end_only = x_withspaces_end_only[i],
g.x_backup_without_spp = x_backup_without_spp[i]) g.x_backup_without_spp = x_backup_without_spp[i])
if (!empty_result(x[i])) { if (!empty_result(x[i])) {
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -866,7 +884,7 @@ exec_as.mo <- function(x,
f.x_withspaces_end_only = x_withspaces_end_only[i], f.x_withspaces_end_only = x_withspaces_end_only[i],
g.x_backup_without_spp = x_backup_without_spp[i]) g.x_backup_without_spp = x_backup_without_spp[i])
if (!empty_result(x[i])) { if (!empty_result(x[i])) {
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -892,7 +910,7 @@ exec_as.mo <- function(x,
ref_old = found[1, ref], ref_old = found[1, ref],
ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], ref_new = microorganismsDT[col_id == found[1, col_id_new], ref],
mo = microorganismsDT[col_id == found[1, col_id_new], mo]) mo = microorganismsDT[col_id == found[1, col_id_new], mo])
if (property == "mo") { if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history) set_mo_history(x_backup[i], x[i], force = force_mo_history)
} }
next next
@ -938,7 +956,7 @@ exec_as.mo <- function(x,
} }
# (2) not yet implemented taxonomic changes in Catalogue of Life ---- # (2) not yet implemented taxonomic changes in Catalogue of Life ----
found <- suppressMessages(suppressWarnings(exec_as.mo(TEMPORARY_TAXONOMY(b.x_trimmed), clear_options = FALSE, allow_uncertain = FALSE))) found <- suppressMessages(suppressWarnings(exec_as.mo(TEMPORARY_TAXONOMY(b.x_trimmed), initial_search = FALSE, allow_uncertain = FALSE)))
if (!empty_result(found)) { if (!empty_result(found)) {
found_result <- found found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]] found <- microorganismsDT[mo == found, ..property][[1]]
@ -972,7 +990,7 @@ exec_as.mo <- function(x,
# (4) strip values between brackets ---- # (4) strip values between brackets ----
a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup)
a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped))
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, clear_options = FALSE, allow_uncertain = FALSE))) found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, allow_uncertain = FALSE)))
if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) {
found_result <- found found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]] found <- microorganismsDT[mo == found, ..property][[1]]
@ -990,7 +1008,7 @@ exec_as.mo <- function(x,
for (i in 1:(length(x_strip) - 1)) { for (i in 1:(length(x_strip) - 1)) {
x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ")
if (nchar(x_strip_collapsed) >= 4) { if (nchar(x_strip_collapsed) >= 4) {
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, clear_options = FALSE, allow_uncertain = FALSE))) found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE)))
if (!empty_result(found)) { if (!empty_result(found)) {
found_result <- found found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]] found <- microorganismsDT[mo == found, ..property][[1]]
@ -1012,7 +1030,7 @@ exec_as.mo <- function(x,
if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
for (i in 2:(length(x_strip))) { for (i in 2:(length(x_strip))) {
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, clear_options = FALSE, allow_uncertain = FALSE))) found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE)))
if (!empty_result(found)) { if (!empty_result(found)) {
found_result <- found found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]] found <- microorganismsDT[mo == found, ..property][[1]]
@ -1059,12 +1077,15 @@ exec_as.mo <- function(x,
# not found ---- # not found ----
x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]]
failures <- c(failures, x_backup[i]) failures <- c(failures, x_backup[i])
if (property == "mo" & initial_search == TRUE) {
set_mo_history(x_backup[i], x[i], force = force_mo_history)
}
} }
} }
# failures # handling failures ----
failures <- failures[!failures %in% c(NA, NULL, NaN)] failures <- x_input[x == "UNKNOWN"] # failures[!failures %in% c(NA, NULL, NaN)]
if (length(failures) > 0 & clear_options == TRUE) { if (length(failures) > 0 & initial_search == TRUE) {
options(mo_failures = sort(unique(failures))) options(mo_failures = sort(unique(failures)))
plural <- c("value", "it", "was") plural <- c("value", "it", "was")
if (n_distinct(failures) > 1) { if (n_distinct(failures) > 1) {
@ -1083,8 +1104,8 @@ exec_as.mo <- function(x,
call. = FALSE, call. = FALSE,
immediate. = TRUE) # thus will always be shown, even if >= warnings immediate. = TRUE) # thus will always be shown, even if >= warnings
} }
# uncertainties # handling uncertainties ----
if (NROW(uncertainties) > 0 & clear_options == TRUE) { if (NROW(uncertainties) > 0 & initial_search == TRUE) {
options(mo_uncertainties = as.list(distinct(uncertainties, input, .keep_all = TRUE))) options(mo_uncertainties = as.list(distinct(uncertainties, input, .keep_all = TRUE)))
plural <- c("value", "it") plural <- c("value", "it")

View File

@ -20,17 +20,22 @@
# ==================================================================== # # ==================================================================== #
# print successful as.mo coercions to file, not uncertain ones # print successful as.mo coercions to file, not uncertain ones
#' @importFrom dplyr %>% filter #' @importFrom dplyr distinct
set_mo_history <- function(x, mo, force = FALSE) { set_mo_history <- function(x, mo, force = FALSE) {
file_location <- base::path.expand('~/.Rhistory_mo') file_location <- base::path.expand('~/.Rhistory_mo')
if ((base::interactive() & mo != "UNKNOWN") | force == TRUE) { if (base::interactive() | force == TRUE) {
mo_hist <- read_mo_history(force = force) mo_hist <- read_mo_history(force = force)
if (NROW(mo_hist[base::which(mo_hist$x == x & mo_hist$package_version == utils::packageVersion("AMR")),]) == 0) { df <- distinct(data.frame(x, mo, stringsAsFactors = FALSE), x, .keep_all = TRUE)
base::write(x = c(x, mo, base::as.character(utils::packageVersion("AMR"))), x <- df$x
file = file_location, mo <- df$mo
ncolumns = 3, for (i in 1:length(x)) {
append = TRUE, if (NROW(mo_hist[base::which(mo_hist$x == x[i] & mo_hist$package_version == utils::packageVersion("AMR")),]) == 0) {
sep = "\t") base::write(x = c(x[i], mo[i], base::as.character(utils::packageVersion("AMR"))),
file = file_location,
ncolumns = 3,
append = TRUE,
sep = "\t")
}
} }
} }
return(base::invisible()) return(base::invisible())
@ -47,6 +52,7 @@ get_mo_history <- function(x, force = FALSE) {
} }
} }
#' @importFrom dplyr %>% filter distinct
read_mo_history <- function(force = FALSE) { read_mo_history <- function(force = FALSE) {
file_location <- base::path.expand('~/.Rhistory_mo') file_location <- base::path.expand('~/.Rhistory_mo')
if (!base::file.exists(file_location) | (!base::interactive() & force == FALSE)) { if (!base::file.exists(file_location) | (!base::interactive() & force == FALSE)) {
@ -58,9 +64,11 @@ read_mo_history <- function(force = FALSE) {
col.names = c("x", "mo", "package_version"), col.names = c("x", "mo", "package_version"),
stringsAsFactors = FALSE) stringsAsFactors = FALSE)
# Below: filter on current package version. # Below: filter on current package version.
# Future fullnames may even be replaced by new taxonomic names, so new versions of # Even current fullnames may be replaced by new taxonomic names, so new versions of
# the Catalogue of Life must not lead to data corruption. # the Catalogue of Life must not lead to data corruption.
file_read[base::which(file_read$package_version == utils::packageVersion("AMR")), c("x", "mo")] file_read %>%
filter(package_version == utils::packageVersion("AMR")) %>%
distinct(x, mo, .keep_all = TRUE)
} }
#' @rdname as.mo #' @rdname as.mo

View File

@ -68,7 +68,8 @@ Values that cannot be coered will be considered 'unknown' and have an MO code \c
Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples. Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples.
All IDs that are found with zero uncertainty are saved to a local file (\code{"~/.Rhistory_mo"}) to improve speed for every next time. Use \code{clean_mo_history()} to delete this file, which resets the algorithms. Only previous results will be used from this version of the \code{AMR} package, since the taxonomic tree may change in the future for any organism. \strong{Self-learning algoritm} \cr
The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 90-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}.
\strong{Intelligent rules} \cr \strong{Intelligent rules} \cr
This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order: This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:

View File

@ -30,5 +30,5 @@ test_that("mo_history works", {
expect_equal(get_mo_history("testsubject", force = TRUE), expect_equal(get_mo_history("testsubject", force = TRUE),
"B_ESCHR_COL") "B_ESCHR_COL")
expect_equal(colnames(read_mo_history(force = TRUE)), expect_equal(colnames(read_mo_history(force = TRUE)),
c("x", "mo")) c("x", "mo", "package_version"))
}) })