diff --git a/DESCRIPTION b/DESCRIPTION index f25023b4..dc3b576f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 1.8.1.9067 -Date: 2022-09-23 +Version: 1.8.1.9069 +Date: 2022-09-27 Title: Antimicrobial Resistance Data Analysis Description: Functions to simplify and standardise antimicrobial resistance (AMR) data analysis and to work with microbial and antimicrobial properties by diff --git a/NEWS.md b/NEWS.md index 878c2b02..f080b4c4 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# AMR 1.8.1.9067 +# AMR 1.8.1.9069 This version will eventually become v2.0! We're happy to reach a new major milestone soon! diff --git a/R/mo.R b/R/mo.R index b6bec184..320b1901 100755 --- a/R/mo.R +++ b/R/mo.R @@ -232,6 +232,7 @@ as.mo <- function(x, if (any(is.na(out) & !is.na(x))) { # reset uncertainties pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, ] + pkg_env$mo_failures <- NULL # Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc. x[trimws2(x) %like% translate_into_language("no .*growth", language = language)] <- NA_character_ @@ -245,16 +246,25 @@ as.mo <- function(x, on.exit(close(progress)) # run it - x_coerced <- lapply(x_unique, function(x_search) { + x_coerced <- vapply(FUN.VALUE = character(1), x_unique, function(x_search) { progress$tick() - x_out <- trimws(tolower(x_search)) + # some required cleaning steps + x_out <- trimws(x_search) + x_out <- gsub("[^A-Za-z-]+", " ", x_out, perl = TRUE) + x_out <- gsub(" +", " ", x_out, perl = TRUE) + x_out <- gsub("(^| )(e?spp|e?ssp|e?ss|e?sp|e?subsp|sube?species|biovar|biotype|serovar|e?species)( |$)", "", x_out, ignore.case = TRUE, perl = TRUE) + x_search_cleaned <- x_out + x_out <- tolower(x_out) + + # take out the parts, split by space x_parts <- strsplit(gsub("-", " ", x_out, fixed = TRUE), " ", fixed = TRUE)[[1]] - + # do a pre-match on first character (and if it contains a space, first chars of first two terms) - if (length(x_parts) == 2) { + if (length(x_parts) %in% c(2, 3)) { + # for genus + species + subspecies filtr <- which(MO_lookup$full_first == substr(x_parts[1], 1, 1) & MO_lookup$species_first == substr(x_parts[2], 1, 1)) - } else if (length(x_parts) > 2) { + } else if (length(x_parts) > 3) { first_chars <- paste0("(^| )", "[", paste(substr(x_parts, 1, 1), collapse = ""), "]") filtr <- which(MO_lookup$full_first %like_case% first_chars) } else if (nchar(x_out) == 4) { @@ -281,9 +291,9 @@ as.mo <- function(x, } pkg_env$mo_to_search <- mo_to_search # determine the matching score on the original search value - m <- mo_matching_score(x = x_search, n = mo_to_search) + m <- mo_matching_score(x = x_search_cleaned, n = mo_to_search) if (is.null(minimum_matching_score)) { - minimum_matching_score_current <- min(0.7, min(10, nchar(x_search)) * 0.08) + minimum_matching_score_current <- min(0.7, min(10, nchar(x_search_cleaned)) * 0.08) # correct back for prevalence minimum_matching_score_current <- minimum_matching_score_current / MO_lookup$prevalence[match(mo_to_search, MO_lookup$fullname)] # correct back for kingdom @@ -291,17 +301,20 @@ as.mo <- function(x, } else { minimum_matching_score_current <- minimum_matching_score } + m[m < minimum_matching_score_current] <- NA_real_ + top_hits <- mo_to_search[order(m, decreasing = TRUE, na.last = NA)] # na.last = NA will remove the NAs if (length(top_hits) == 0) { - warning_("No hits found for \"", x_search, "\" with minimum_matching_score = ", ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score), ". Try setting this value lower or even to 0.") + warning_("No hits found for \"", x_search, "\" with minimum_matching_score = ", ifelse(is.null(minimum_matching_score), paste0("NULL (=", round(min(minimum_matching_score_current, na.rm = TRUE), 3), ")"), minimum_matching_score), ". Try setting this value lower or even to 0.") result_mo <- NA_character_ } else { result_mo <- MO_lookup$mo[match(top_hits[1], MO_lookup$fullname)] pkg_env$mo_uncertainties <- rbind(pkg_env$mo_uncertainties, data.frame( minimum_matching_score = ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score), - input = x_search, + original_input = x_search, + input = x_search_cleaned, fullname = top_hits[1], mo = result_mo, candidates = ifelse(length(top_hits) > 1, paste(top_hits[2:min(26, length(top_hits))], collapse = ", "), ""), @@ -312,7 +325,7 @@ as.mo <- function(x, # save to package env to save time for next time pkg_env$mo_previously_coerced <- unique(rbind(pkg_env$mo_previously_coerced, data.frame( - x = x_search, + x = paste(x_search, min(minimum_matching_score_current, na.rm = TRUE)), mo = result_mo, stringsAsFactors = FALSE ), @@ -320,30 +333,30 @@ as.mo <- function(x, )) } # the actual result: - result_mo + as.character(result_mo) }) # remove progress bar from console close(progress) # expand from unique again - out[is.na(out)] <- unlist(x_coerced)[match(x[is.na(out)], x_unique)] + out[is.na(out)] <- x_coerced[match(x[is.na(out)], x_unique)] # Throw note about uncertainties ---- if (isTRUE(info) && NROW(pkg_env$mo_uncertainties) > 0) { - if (message_not_thrown_before("as.mo", "uncertainties", pkg_env$mo_uncertainties$input)) { + if (message_not_thrown_before("as.mo", "uncertainties", pkg_env$mo_uncertainties$original_input)) { plural <- c("", "this") - if (length(pkg_env$mo_uncertainties$input) > 1) { + if (length(pkg_env$mo_uncertainties$original_input) > 1) { plural <- c("s", "these uncertainties") } - if (length(pkg_env$mo_uncertainties$input) <= 3) { + if (length(pkg_env$mo_uncertainties$original_input) <= 3) { examples <- vector_and(paste0( - '"', pkg_env$mo_uncertainties$input, + '"', pkg_env$mo_uncertainties$original_input, '" (assumed ', font_italic(pkg_env$mo_uncertainties$fullname, collapse = NULL), ")" ), quotes = FALSE ) } else { - examples <- paste0(nr2char(length(pkg_env$mo_uncertainties$input)), " microorganism", plural[1]) + examples <- paste0(nr2char(length(pkg_env$mo_uncertainties$original_input)), " microorganism", plural[1]) } msg <- paste0( "Microorganism translation was uncertain for ", examples, @@ -449,6 +462,10 @@ as.mo <- function(x, out[out %like_case% "^B_STRPT_SLVR(_|$)"] <- "B_STRPT_GRPK" # group L - only S. dysgalactiae which is also group C, so ignore it here } + + # All unknowns ---- + out[is.na(out) & !is.na(x)] <- "UNKNOWN" + pkg_env$mo_failures <- unique(x[out == "UNKNOWN" & x != "UNKNOWN" & !is.na(x)]) # Return class ---- set_clean_class(out, @@ -771,26 +788,21 @@ print.mo_uncertainties <- function(x, ...) { ) score_formatted <- trimws(formatC(round(score, 3), format = "f", digits = 3)) txt <- paste(txt, - paste0( - strwrap( - paste0( - '"', x[i, ]$input, '"', - " -> ", - paste0( - font_bold(font_italic(x[i, ]$fullname)), - ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""), - " (", x[i, ]$mo, - ", ", score_set_colour(score_formatted, score), - ") " - ) - ), - width = 0.98 * getOption("width"), - exdent = nchar(x[i, ]$input) + 6 - ), - collapse = "\n" - ), - candidates, - sep = "\n" + paste0( + paste0( + '"', x[i, ]$original_input, '"', + " -> ", + paste0( + font_bold(font_italic(x[i, ]$fullname)), + ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""), + " (", x[i, ]$mo, ", ", score_set_colour(score_formatted, score), ")" + ) + ), + collapse = "\n" + ), + ifelse(x[i, ]$original_input != x[i, ]$input, paste0(strrep(" ", nchar(x[i, ]$original_input) + 6), "Based on input \"", x[i, ]$input, "\""), ""), + candidates, + sep = "\n" ) txt <- paste0(gsub("\n\n", "\n", txt), "\n\n") } @@ -803,6 +815,7 @@ mo_reset_session <- function() { if (NROW(pkg_env$mo_previously_coerced) > 0) { message_("Reset ", NROW(pkg_env$mo_previously_coerced), " previously matched input values.") pkg_env$mo_previously_coerced <- pkg_env$mo_previously_coerced[0, , drop = FALSE] + pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, , drop = FALSE] } else { message_("No previously matched input values to reset.") } @@ -847,13 +860,14 @@ load_mo_uncertainties <- function(metadata) { pkg_env$mo_uncertainties <- metadata$uncertainties } -trimws2 <- function(x) { - trimws(gsub("[\\s]+", " ", x, perl = TRUE)) +trimws2 <- function(..., whitespace = "[\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D\u2028\u2029\u202F\u205F\u2060\u3000\uFEFF]") { + # this is even faster than trimws() itself which sets " \t\n\r". + trimws(..., whitespace = whitespace) } parse_and_convert <- function(x) { if (tryCatch(is.character(x) && all(Encoding(x) == "unknown", na.rm = TRUE), error = function(e) FALSE)) { - return(x) + return(trimws2(x)) } tryCatch( { @@ -880,7 +894,7 @@ parse_and_convert <- function(x) { }, error = function(e) stop(e$message, call. = FALSE) ) # this will also be thrown when running `as.mo(no_existing_object)` - parsed + trimws2(parsed) } replace_old_mo_codes <- function(x, property) { @@ -1032,7 +1046,7 @@ convert_colloquial_input <- function(x) { out[x %like_case% "pneumo[ck]o[ck]"] <- "B_STRPT_PNMN" # unexisting names (xxx and con are WHONET codes) - out[x %in% c("xxx", "con", "other", "none", "unknown") | x %like_case% "virus"] <- "UNKNOWN" + out[x %in% c("con", "other", "none", "unknown") | x %like_case% "virus"] <- "UNKNOWN" out } diff --git a/R/mo_property.R b/R/mo_property.R index 0ea6bb28..4bc0e7bd 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -362,6 +362,20 @@ mo_type <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("A translate_into_language(out, language = language, only_unknown = FALSE) } +#' @rdname mo_property +#' @export +mo_status <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", TRUE), ...) { + if (missing(x)) { + # this tries to find the data and an column + x <- find_mo_col(fn = "mo_status") + } + meet_criteria(x, allow_NA = TRUE) + language <- validate_language(language) + meet_criteria(keep_synonyms, allow_class = "logical", has_length = 1) + + translate_into_language(mo_validate(x = x, property = "status", language = language, keep_synonyms = keep_synonyms, ...), language = language, only_unknown = TRUE) +} + #' @rdname mo_property #' @export mo_gramstain <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", TRUE), ...) { @@ -649,7 +663,7 @@ mo_synonyms <- function(x, language = get_AMR_locale(), keep_synonyms = getOptio syns <- lapply(x.mo, function(y) { gbif <- AMR::microorganisms$gbif[match(y, AMR::microorganisms$mo)] lpsn <- AMR::microorganisms$lpsn[match(y, AMR::microorganisms$mo)] - out <- AMR::microorganisms[which(AMR::microorganisms$lpsn_renamed_to %in% lpsn | AMR::microorganisms$gbif_renamed_to %in% gbif), "fullname", drop = TRUE] + out <- AMR::microorganisms[which(AMR::microorganisms$lpsn_renamed_to == lpsn | AMR::microorganisms$gbif_renamed_to == gbif), "fullname", drop = TRUE] if (length(out) == 0) { NULL } else { @@ -686,6 +700,7 @@ mo_info <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("A c( mo_taxonomy(y, language = language, keep_synonyms = keep_synonyms), list( + status = mo_status(y, language = language, keep_synonyms = keep_synonyms), synonyms = mo_synonyms(y, keep_synonyms = keep_synonyms), gramstain = mo_gramstain(y, language = language, keep_synonyms = keep_synonyms), url = unname(mo_url(y, open = FALSE, keep_synonyms = keep_synonyms)), diff --git a/R/zzz.R b/R/zzz.R index 1e03f862..ef4fd1d3 100755 --- a/R/zzz.R +++ b/R/zzz.R @@ -27,6 +27,7 @@ pkg_env <- new.env(hash = FALSE) pkg_env$mo_uncertainties <- data.frame( uncertainty = integer(0), + original_input = character(0), input = character(0), fullname = character(0), mo = character(0), diff --git a/inst/tinytest/test-mo.R b/inst/tinytest/test-mo.R index bf155894..f336e293 100644 --- a/inst/tinytest/test-mo.R +++ b/inst/tinytest/test-mo.R @@ -27,7 +27,7 @@ MOs <- subset(microorganisms, !is.na(mo) & nchar(mo) > 3) expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo))) expect_identical( - as.character(as.mo(c("E. coli", "H. influenzae"))), + as.character(as.mo(c("E. coli", "H. influenzae"), keep_synonyms = FALSE)), c("B_ESCHR_COLI", "B_HMPHL_INFL") ) @@ -41,7 +41,7 @@ expect_equal(as.character(as.mo(" B_ESCHR_COLI ")), "B_ESCHR_COLI") expect_equal(as.character(as.mo("e coli")), "B_ESCHR_COLI") # not Campylobacter expect_equal(as.character(as.mo("klpn")), "B_KLBSL_PNMN") expect_equal(as.character(as.mo("Klebsiella")), "B_KLBSL") -expect_equal(as.character(as.mo("K. pneu rhino")), "B_KLBSL_PNMN_RHNS") # K. pneumoniae subspp. rhinoscleromatis +expect_equal(as.character(as.mo("K. pneumo rhino")), "B_KLBSL_PNMN_RHNS") # K. pneumoniae subspp. rhinoscleromatis expect_equal(as.character(as.mo("Bartonella")), "B_BRTNL") expect_equal(as.character(as.mo("C. difficile")), "B_CRDDS_DFFC") expect_equal(as.character(as.mo("L. pneumophila")), "B_LGNLL_PNMP") @@ -284,12 +284,12 @@ expect_stdout(print(mo_uncertainties())) # Salmonella (City) are all actually Salmonella enterica spp (City) expect_equal( - suppressMessages(as.mo(c("Salmonella Goettingen", "Salmonella Typhimurium", "Salmonella Group A"))), + suppressMessages(as.mo(c("Salmonella Goettingen", "Salmonella Typhimurium", "Salmonella Group A"), keep_synonyms = FALSE)), as.mo(c("Salmonella enterica", "Salmonella enterica", "Salmonella")) ) # no viruses -expect_equal(as.character(as.mo("Virus")), NA_character_) +expect_equal(as.mo("Virus"), as.mo("UNKNOWN")) # summary expect_equal(length(summary(example_isolates$mo)), 6) diff --git a/inst/tinytest/test-mo_property.R b/inst/tinytest/test-mo_property.R index 438bea66..62b5a507 100644 --- a/inst/tinytest/test-mo_property.R +++ b/inst/tinytest/test-mo_property.R @@ -75,13 +75,10 @@ expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en")) expect_equal(mo_type("Escherichia coli", language = "de"), "Bakterien") expect_equal(mo_gramstain("Escherichia coli", language = "nl"), "Gram-negatief") -expect_stdout(print(mo_gramstain("Escherichia coli", language = "en"))) -expect_stdout(print(mo_gramstain("Escherichia coli", language = "de"))) -expect_stdout(print(mo_gramstain("Escherichia coli", language = "nl"))) -expect_stdout(print(mo_gramstain("Escherichia coli", language = "es"))) -expect_stdout(print(mo_gramstain("Escherichia coli", language = "pt"))) -expect_stdout(print(mo_gramstain("Escherichia coli", language = "it"))) -expect_stdout(print(mo_gramstain("Escherichia coli", language = "fr"))) +for (l in LANGUAGES_SUPPORTED) { + expect_stdout(print(mo_gramstain("Escherichia coli", language = l))) + print(mo_gramstain("Escherichia coli", language = l)) +} expect_error(mo_gramstain("Escherichia coli", language = "UNKNOWN")) dutch <- mo_name(microorganisms$fullname[which(microorganisms$fullname %unlike% "unknown|coagulase")], language = "nl") # should be transformable to English again diff --git a/man/mo_property.Rd b/man/mo_property.Rd index 9cd09bcd..8ca6d7c8 100644 --- a/man/mo_property.Rd +++ b/man/mo_property.Rd @@ -15,6 +15,7 @@ \alias{mo_kingdom} \alias{mo_domain} \alias{mo_type} +\alias{mo_status} \alias{mo_gramstain} \alias{mo_is_gram_negative} \alias{mo_is_gram_positive} @@ -124,6 +125,13 @@ mo_type( ... ) +mo_status( + x, + language = get_AMR_locale(), + keep_synonyms = getOption("AMR_keep_synonyms", TRUE), + ... +) + mo_gramstain( x, language = get_AMR_locale(),