diff --git a/DESCRIPTION b/DESCRIPTION index 19cae831..8c93fd6d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: AMR -Version: 0.7.1.9037 +Version: 0.7.1.9038 Date: 2019-08-12 Title: Antimicrobial Resistance Analysis Authors@R: c( diff --git a/NEWS.md b/NEWS.md index d50f23da..0eaf4c39 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# AMR 0.7.1.9037 +# AMR 0.7.1.9038 ### Breaking * Function `freq()` has moved to a new package, [`clean`](https://github.com/msberends/clean) ([CRAN link](https://cran.r-project.org/package=clean)). Creating frequency tables is actually not the scope of this package (never was) and this function has matured a lot over the last two years. Therefore, a new package was created for data cleaning and checking and it perfectly fits the `freq()` function. The [`clean`](https://github.com/msberends/clean) package is available on CRAN and will be installed automatically when updating the `AMR` package, that now imports it. In a later stage, the `skewness()` and `kurtosis()` functions will be moved to the `clean` package too. diff --git a/R/mo.R b/R/mo.R index 657de9cc..db4445f9 100755 --- a/R/mo.R +++ b/R/mo.R @@ -193,19 +193,19 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, require("AMR") # check onLoad() in R/zzz.R: data tables are created there. } - + # WHONET: xxx = no growth x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_ - + uncertainty_level <- translate_allow_uncertain(allow_uncertain) # mo_hist <- get_mo_history(x, uncertainty_level, force = isTRUE(list(...)$force_mo_history)) - + if (mo_source_isvalid(reference_df) & isFALSE(Becker) & isFALSE(Lancefield) & !is.null(reference_df) & all(x %in% reference_df[,1][[1]])) { - + # has valid own reference_df # (data.table not faster here) reference_df <- reference_df %>% filter(!is.na(mo)) @@ -225,18 +225,18 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, left_join(reference_df, by = "x") %>% pull("mo") ) - + } else if (all(x %in% AMR::microorganisms$mo) & isFALSE(Becker) & isFALSE(Lancefield)) { y <- x - + # } else if (!any(is.na(mo_hist)) # & isFALSE(Becker) # & isFALSE(Lancefield)) { # # check previously found results # y <- mo_hist - + } else if (all(tolower(x) %in% microorganismsDT$fullname_lower) & isFALSE(Becker) & isFALSE(Lancefield)) { @@ -257,7 +257,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, } # save them to history set_mo_history(x, y, 0, force = isTRUE(list(...)$force_mo_history)) - + } else { # will be checked for mo class in validation and uses exec_as.mo internally if necessary y <- mo_validate(x = x, property = "mo", @@ -266,8 +266,8 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, force_mo_history = isTRUE(list(...)$force_mo_history), ...) } - - + + to_class_mo(y) } @@ -286,6 +286,7 @@ is.mo <- function(x) { #' @importFrom crayon magenta red blue silver italic # param property a column name of AMR::microorganisms # param initial_search logical - is FALSE when coming from uncertain tries, which uses exec_as.mo internally too +# param dyslexia_mode logical - also check for characters that resemble others # param force_mo_history logical - whether found result must be saved with set_mo_history (default FALSE on non-interactive sessions) # param debug logical - show different lookup texts while searching exec_as.mo <- function(x, @@ -295,23 +296,24 @@ exec_as.mo <- function(x, reference_df = get_mo_source(), property = "mo", initial_search = TRUE, + dyslexia_mode = FALSE, force_mo_history = FALSE, debug = FALSE) { - + if (!"AMR" %in% base::.packages()) { require("AMR") # check onLoad() in R/zzz.R: data tables are created there. } - + # WHONET: xxx = no growth x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_ - + if (initial_search == TRUE) { options(mo_failures = NULL) options(mo_uncertainties = NULL) options(mo_renamed = NULL) } - + if (NCOL(x) == 2) { # support tidyverse selection like: df %>% select(colA, colB) # paste these columns together @@ -325,20 +327,20 @@ exec_as.mo <- function(x, stop('`x` can be 2 columns at most', call. = FALSE) } x[is.null(x)] <- NA - + # support tidyverse selection like: df %>% select(colA) if (!is.vector(x) & !is.null(dim(x))) { x <- pull(x, 1) } } - + notes <- character(0) uncertainties <- data.frame(input = character(0), fullname = character(0), mo = character(0)) failures <- character(0) uncertainty_level <- translate_allow_uncertain(allow_uncertain) - + x_input <- x # already strip leading and trailing spaces x <- trimws(x, which = "both") @@ -350,7 +352,7 @@ exec_as.mo <- function(x, & !is.null(x) & !identical(x, "") & !identical(x, "xxx")] - + # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life) if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { leftpart <- gsub("^([BFP]_[A-Z]{3,7}).*", "\\1", x) @@ -372,7 +374,7 @@ exec_as.mo <- function(x, pull(new) } } - + # defined df to check for if (!is.null(reference_df)) { if (!mo_source_isvalid(reference_df)) { @@ -391,7 +393,7 @@ exec_as.mo <- function(x, reference_df[] <- lapply(reference_df, as.character) ) } - + # all empty if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) { if (property == "mo") { @@ -399,7 +401,7 @@ exec_as.mo <- function(x, } else { return(rep(NA_character_, length(x_input))) } - + } else if (all(x %in% reference_df[, 1][[1]])) { # all in reference df colnames(reference_df)[1] <- "x" @@ -409,7 +411,7 @@ exec_as.mo <- function(x, left_join(AMR::microorganisms, by = "mo") %>% pull(property) ) - + } else if (all(x %in% AMR::microorganisms$mo)) { # existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL") y <- microorganismsDT[prevalence == 1][data.table(mo = x), on = "mo", ..property][[1]] @@ -424,7 +426,7 @@ exec_as.mo <- function(x, ..property][[1]] } x <- y - + } else if (all(x %in% read_mo_history(uncertainty_level, force = force_mo_history)$x)) { # previously found code @@ -432,7 +434,7 @@ exec_as.mo <- function(x, uncertainty_level, force = force_mo_history)), on = "mo", ..property][[1]] - + } else if (all(tolower(x) %in% microorganismsDT$fullname_lower)) { # we need special treatment for very prevalent full names, they are likely! # e.g. as.mo("Staphylococcus aureus") @@ -448,30 +450,30 @@ exec_as.mo <- function(x, ..property][[1]] } x <- y - + } else if (all(toupper(x) %in% AMR::microorganisms.codes$code)) { # commonly used MO codes y <- as.data.table(AMR::microorganisms.codes)[data.table(code = toupper(x)), on = "code", ] # save them to history set_mo_history(x, y$mo, 0, force = force_mo_history) - + x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] - + } else if (!all(x %in% AMR::microorganisms[, property])) { - + strip_whitespace <- function(x) { # all whitespaces (tab, new lines, etc.) should be one space # and spaces before and after should be omitted trimws(gsub("[\\s]+", " ", x, perl = TRUE), which = "both") } - + x <- strip_whitespace(x) x_backup <- x - + # remove spp and species x <- gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE) x <- strip_whitespace(x) - + x_backup_without_spp <- x x_species <- paste(x, "species") # translate to English for supported languages of mo_property @@ -490,7 +492,7 @@ exec_as.mo <- function(x, # remove genus as first word x <- gsub("^Genus ", "", x) # allow characters that resemble others ---- - if (initial_search == FALSE) { + if (dyslexia_mode == TRUE) { x <- tolower(x) x <- gsub("[iy]+", "[iy]+", x) x <- gsub("(c|k|q|qu|s|z|x|ks)+", "(c|k|q|qu|s|z|x|ks)+", x) @@ -512,7 +514,7 @@ exec_as.mo <- function(x, x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, ignore.case = TRUE, perl = TRUE) } x <- strip_whitespace(x) - + x_trimmed <- x x_trimmed_species <- paste(x_trimmed, "species") x_trimmed_without_group <- gsub(" gro.u.p$", "", x_trimmed, ignore.case = TRUE) @@ -526,7 +528,7 @@ exec_as.mo <- function(x, x_withspaces_start_only <- paste0('^', x_withspaces) x_withspaces_end_only <- paste0(x_withspaces, '$') x_withspaces_start_end <- paste0('^', x_withspaces, '$') - + if (isTRUE(debug)) { cat(paste0('x "', x, '"\n')) cat(paste0('x_species "', x_species, '"\n')) @@ -539,13 +541,13 @@ exec_as.mo <- function(x, cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) } - + progress <- progress_estimated(n = length(x), min_time = 3) - + for (i in 1:length(x)) { - + progress$tick()$print() - + if (initial_search == TRUE) { found <- microorganismsDT[mo == get_mo_history(x_backup[i], uncertainty_level, @@ -557,14 +559,14 @@ exec_as.mo <- function(x, next } } - + found <- microorganismsDT[mo == toupper(x_backup[i]), ..property][[1]] # is a valid MO code if (length(found) > 0) { x[i] <- found[1L] next } - + found <- microorganismsDT[fullname_lower %in% tolower(c(x_backup[i], x_backup_without_spp[i])), ..property][[1]] # most probable: is exact match in fullname if (length(found) > 0) { @@ -574,7 +576,7 @@ exec_as.mo <- function(x, } next } - + found <- microorganismsDT[col_id == x_backup[i], ..property][[1]] # is a valid Catalogue of Life ID if (NROW(found) > 0) { @@ -584,14 +586,14 @@ exec_as.mo <- function(x, } next } - - + + # WHONET: xxx = no growth if (tolower(as.character(paste0(x_backup_without_spp[i], ""))) %in% c("", "xxx", "na", "nan")) { x[i] <- NA_character_ next } - + if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) { # empty and nonsense values, ignore without warning x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] @@ -600,7 +602,7 @@ exec_as.mo <- function(x, } next } - + # check for very small input, but ignore the O antigens of E. coli if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3 & !x_backup_without_spp[i] %like% "O?(26|103|104|104|111|121|145|157)") { @@ -629,7 +631,7 @@ exec_as.mo <- function(x, } next } - + if (x_backup_without_spp[i] %like% "virus") { # there is no fullname like virus, so don't try to coerce it x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] @@ -639,7 +641,7 @@ exec_as.mo <- function(x, } next } - + # translate known trivial abbreviations to genus + species ---- if (!is.na(x_trimmed[i])) { if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { @@ -830,7 +832,7 @@ exec_as.mo <- function(x, next } } - + # FIRST TRY FULLNAMES AND CODES ---- # if only genus is available, return only genus if (all(!c(x[i], x_trimmed[i]) %like% " ")) { @@ -854,7 +856,7 @@ exec_as.mo <- function(x, } # rest of genus only is in allow_uncertain part. } - + # TRY OTHER SOURCES ---- # WHONET and other common LIS codes if (toupper(x_backup[i]) %in% AMR::microorganisms.codes[, 1]) { @@ -879,7 +881,7 @@ exec_as.mo <- function(x, } } } - + # allow no codes less than 4 characters long, was already checked for WHONET above if (nchar(x_backup_without_spp[i]) < 4) { x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] @@ -889,7 +891,7 @@ exec_as.mo <- function(x, } next } - + check_per_prevalence <- function(data_to_check, a.x_backup, b.x_trimmed, @@ -898,19 +900,19 @@ exec_as.mo <- function(x, e.x_withspaces_start_only, f.x_withspaces_end_only, g.x_backup_without_spp) { - + # try probable: trimmed version of fullname ---- found <- data_to_check[fullname_lower %in% tolower(g.x_backup_without_spp), ..property][[1]] if (length(found) > 0) { return(found[1L]) } - + # try any match keeping spaces ---- found <- data_to_check[fullname %like% d.x_withspaces_start_end, ..property][[1]] if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { return(found[1L]) } - + # try any match keeping spaces, not ending with $ ---- found <- data_to_check[fullname %like% paste0(trimws(e.x_withspaces_start_only), " "), ..property][[1]] if (length(found) > 0) { @@ -920,21 +922,21 @@ exec_as.mo <- function(x, if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { return(found[1L]) } - + # try any match keeping spaces, not start with ^ ---- found <- data_to_check[fullname %like% paste0(" ", trimws(f.x_withspaces_end_only)), ..property][[1]] if (length(found) > 0) { return(found[1L]) } - + # try a trimmed version found <- data_to_check[fullname_lower %like% b.x_trimmed | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { return(found[1L]) } - - + + # try splitting of characters in the middle and then find ID ---- # only when text length is 6 or lower # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus @@ -949,18 +951,18 @@ exec_as.mo <- function(x, return(found[1L]) } } - + # try fullname without start and without nchar limit of >= 6 ---- # like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH found <- data_to_check[fullname %like% e.x_withspaces_start_only, ..property][[1]] if (length(found) > 0) { return(found[1L]) } - + # didn't found any return(NA_character_) } - + # FIRST TRY VERY PREVALENT IN HUMAN INFECTIONS ---- x[i] <- check_per_prevalence(data_to_check = microorganismsDT[prevalence == 1], a.x_backup = x_backup[i], @@ -1006,9 +1008,9 @@ exec_as.mo <- function(x, } next } - + # MISCELLANEOUS ---- - + # look for old taxonomic names ---- found <- microorganisms.oldDT[fullname_lower == tolower(x_backup[i]) | fullname %like% x_withspaces_start_end[i],] @@ -1032,7 +1034,7 @@ exec_as.mo <- function(x, } next } - + # check for uncertain results ---- uncertain_fn <- function(a.x_backup, b.x_trimmed, @@ -1040,17 +1042,22 @@ exec_as.mo <- function(x, d.x_withspaces_start_only, f.x_withspaces_end_only, g.x_backup_without_spp) { - + if (uncertainty_level == 0) { # do not allow uncertainties return(NA_character_) } - + if (uncertainty_level >= 1) { + now_checks_for_uncertainty_level <- 1 + # (1) look again for old taxonomic names, now for G. species ---- if (isTRUE(debug)) { cat("\n[UNCERTAINLY LEVEL 1] (1) look again for old taxonomic names, now for G. species\n") } + if (isTRUE(debug)) { + message("Running '", c.x_withspaces_start_end, "' and '", d.x_withspaces_start_only, "'") + } found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end | fullname %like% d.x_withspaces_start_only] if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { @@ -1068,7 +1075,7 @@ exec_as.mo <- function(x, ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], mo = microorganismsDT[col_id == found[1, col_id_new], mo]) uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 1, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = found[1, fullname], mo = paste("CoL", found[1, col_id]))) @@ -1077,18 +1084,26 @@ exec_as.mo <- function(x, } return(x) } - + # (2) Try with misspelled input ---- - # just rerun with initial_search = FALSE will used the extensive regex part above + # just rerun with dyslexia_mode = TRUE will used the extensive regex part above if (isTRUE(debug)) { cat("\n[UNCERTAINLY LEVEL 1] (2) Try with misspelled input\n") } - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) + if (isTRUE(debug)) { + message("Running '", a.x_backup, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 1, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1098,21 +1113,25 @@ exec_as.mo <- function(x, return(found[1L]) } } - + if (uncertainty_level >= 2) { - + now_checks_for_uncertainty_level <- 2 + # (3) look for genus only, part of name ---- if (isTRUE(debug)) { cat("\n[UNCERTAINLY LEVEL 2] (3) look for genus only, part of name\n") } if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") { if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { + if (isTRUE(debug)) { + message("Running '", paste(b.x_trimmed, "species"), "'") + } # not when input is like Genustext, because then Neospora would lead to Actinokineospora found <- microorganismsDT[fullname_lower %like% paste(b.x_trimmed, "species"), ..property][[1]] if (length(found) > 0) { x[i] <- found[1L] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found[1L], fullname][[1]], mo = found[1L])) @@ -1123,19 +1142,27 @@ exec_as.mo <- function(x, } } } - + # (4) strip values between brackets ---- if (isTRUE(debug)) { cat("\n[UNCERTAINLY LEVEL 2] (4) strip values between brackets\n") } a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) + if (isTRUE(debug)) { + message("Running '", a.x_backup_stripped, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1144,7 +1171,7 @@ exec_as.mo <- function(x, } return(found[1L]) } - + # (5a) try to strip off half an element from end and check the remains ---- if (isTRUE(debug)) { cat("\n[UNCERTAINLY LEVEL 2] (5a) try to strip off half an element from end and check the remains\n") @@ -1156,13 +1183,21 @@ exec_as.mo <- function(x, lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2)) # remove last half of the second term x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") - if (nchar(x_strip_collapsed) >= 4) { - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) + if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1181,13 +1216,21 @@ exec_as.mo <- function(x, if (length(x_strip) > 1) { for (i in 1:(length(x_strip) - 1)) { x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") - if (nchar(x_strip_collapsed) >= 4) { - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) + if (nchar(x_strip_collapsed) >= 6) { + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1208,7 +1251,7 @@ exec_as.mo <- function(x, found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1222,7 +1265,7 @@ exec_as.mo <- function(x, found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1239,14 +1282,22 @@ exec_as.mo <- function(x, if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { for (i in 2:(length(x_strip))) { x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found_result[1L], ..property][[1]] # uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3) if (x_strip_collapsed %like% " ") { uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 2, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1259,22 +1310,32 @@ exec_as.mo <- function(x, } } } - + if (uncertainty_level >= 3) { - # (7) try to strip off one element from start and check the remains ---- + now_checks_for_uncertainty_level <- 3 + + # (7a) try to strip off one element from start and check the remains (any text size) ---- if (isTRUE(debug)) { - cat("\n[UNCERTAINLY LEVEL 3] (7) try to strip off one element from start and check the remains\n") + cat("\n[UNCERTAINLY LEVEL 3] (7a) try to strip off one element from start and check the remains (any text size)\n") } x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { for (i in 2:(length(x_strip))) { x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 3, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1285,18 +1346,53 @@ exec_as.mo <- function(x, } } } - + # (7b) try to strip off one element from end and check the remains (any text size) ---- + # (this is in fact 5b but without nchar limit of >=6) + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 3] (7b) try to strip off one element from end and check the remains (any text size)\n") + } + if (length(x_strip) > 1) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + if (isTRUE(debug)) { + message("Running '", x_strip_collapsed, "'") + } + # first try without dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug))) + if (empty_result(found)) { + # then with dyslexia mode + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug))) + } + if (!empty_result(found)) { + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + data.frame(uncertainty = now_checks_for_uncertainty_level, + input = a.x_backup, + fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], + mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + } + } + # (8) part of a name (very unlikely match) ---- if (isTRUE(debug)) { cat("\n[UNCERTAINLY LEVEL 3] (8) part of a name (very unlikely match)\n") } + if (isTRUE(debug)) { + message("Running '", f.x_withspaces_end_only, "'") + } found <- microorganismsDT[fullname %like% f.x_withspaces_end_only] if (nrow(found) > 0) { found_result <- found[["mo"]] if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) { found <- microorganismsDT[mo == found_result[1L], ..property][[1]] uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 3, + data.frame(uncertainty = now_checks_for_uncertainty_level, input = a.x_backup, fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], mo = found_result[1L])) @@ -1307,7 +1403,7 @@ exec_as.mo <- function(x, } } } - + # didn't found in uncertain results too return(NA_character_) } @@ -1321,7 +1417,7 @@ exec_as.mo <- function(x, # no set_mo_history here - it is already set in uncertain_fn() next } - + # no results found: make them UNKNOWN ---- x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] if (initial_search == TRUE) { @@ -1330,7 +1426,7 @@ exec_as.mo <- function(x, } } } - + # handling failures ---- failures <- failures[!failures %in% c(NA, NULL, NaN)] if (length(failures) > 0 & initial_search == TRUE) { @@ -1355,7 +1451,7 @@ exec_as.mo <- function(x, # handling uncertainties ---- if (NROW(uncertainties) > 0 & initial_search == TRUE) { options(mo_uncertainties = as.list(distinct(uncertainties, input, .keep_all = TRUE))) - + plural <- c("", "it") if (NROW(uncertainties) > 1) { plural <- c("s", "them") @@ -1366,7 +1462,7 @@ exec_as.mo <- function(x, call. = FALSE, immediate. = TRUE) # thus will always be shown, even if >= warnings } - + # Becker ---- if (Becker == TRUE | Becker == "all") { # See Source. It's this figure: @@ -1391,11 +1487,11 @@ exec_as.mo <- function(x, "pseudintermedius", "pseudointermedius", "schweitzeri", "argenteus") | (species == "schleiferi" & subspecies == "coagulans"), ..property][[1]] - + # warn when species found that are not in Becker (2014, PMID 25278577) and Becker (2019, PMID 30872103) post_Becker <- c("argensis", "caeli", "cornubiensis", "edaphicus") if (any(x %in% MOs_staph[species %in% post_Becker, ..property][[1]])) { - + warning("Becker ", italic("et al."), " (2014, 2019) does not contain these species named after their publication: ", italic(paste("S.", sort(mo_species(unique(x[x %in% MOs_staph[species %in% post_Becker, ..property][[1]]]))), @@ -1404,14 +1500,14 @@ exec_as.mo <- function(x, call. = FALSE, immediate. = TRUE) } - + x[x %in% CoNS] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] x[x %in% CoPS] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] if (Becker == "all") { x[x %in% microorganismsDT[mo %like% '^B_STPHY_AUR', ..property][[1]]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] } } - + # Lancefield ---- if (Lancefield == TRUE | Lancefield == "all") { # group A - S. pyogenes @@ -1435,37 +1531,37 @@ exec_as.mo <- function(x, # group K - S. salivarius x[x == microorganismsDT[mo == 'B_STRPT_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRK', ..property][[1]][1L] } - + # Wrap up ---------------------------------------------------------------- - + # comply to x, which is also unique and without empty values x_input_unique_nonempty <- unique(x_input[!is.na(x_input) & !is.null(x_input) & !identical(x_input, "") & !identical(x_input, "xxx")]) - + # left join the found results to the original input values (x_input) df_found <- data.frame(input = as.character(x_input_unique_nonempty), found = as.character(x), stringsAsFactors = FALSE) df_input <- data.frame(input = as.character(x_input), stringsAsFactors = FALSE) - + suppressWarnings( x <- df_input %>% left_join(df_found, by = "input") %>% pull(found) ) - + if (property == "mo") { x <- to_class_mo(x) } - + if (length(mo_renamed()) > 0) { print(mo_renamed()) } - + x } @@ -1494,7 +1590,7 @@ was_renamed <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") old_values <- gsub("et al.", italic("et al."), old_values) new_values <- paste0(italic(name_new), ref_new, mo) new_values <- gsub("et al.", italic("et al."), new_values) - + names(new_values) <- old_values total <- c(getOption("mo_renamed"), new_values) options(mo_renamed = total[order(names(total))]) @@ -1604,7 +1700,7 @@ print.mo_uncertainties <- function(x, ...) { "\n(1 = ", green("renamed/misspelled"), ", 2 = ", yellow("uncertain"), ", 3 = ", red("very uncertain"), ")\n")) - + msg <- "" for (i in 1:nrow(x)) { if (x[i, "uncertainty"] == 1) { @@ -1633,11 +1729,11 @@ mo_renamed <- function() { if (is.null(items)) { return(NULL) } - + items <- strip_style(items) names(items) <- strip_style(names(items)) structure(.Data = items, - class = c("mo_renamed", "character")) + class = c("mo_renamed", "character")) } #' @exportMethod print.mo_renamed @@ -1666,7 +1762,7 @@ unregex <- function(x) { get_mo_code <- function(x, property) { # don't use right now return(NULL) - + if (property == "mo") { unique(x) } else { diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index fa7afb69..a73981e8 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@
diff --git a/docs/articles/index.html b/docs/articles/index.html index 823a7563..1263320c 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ diff --git a/docs/authors.html b/docs/authors.html index 1c7d8097..61214a41 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ diff --git a/docs/index.html b/docs/index.html index 5b68dca2..16701f2d 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ diff --git a/docs/news/index.html b/docs/news/index.html index 8f2d95eb..6ccffcb3 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ @@ -225,9 +225,9 @@ -as.mo(..., allow_uncertain = 3)
Contents