This commit is contained in:
dr. M.S. (Matthijs) Berends 2022-09-27 12:16:39 +02:00
parent aa66632934
commit 626757646b
8 changed files with 92 additions and 57 deletions

View File

@ -1,6 +1,6 @@
Package: AMR Package: AMR
Version: 1.8.1.9067 Version: 1.8.1.9069
Date: 2022-09-23 Date: 2022-09-27
Title: Antimicrobial Resistance Data Analysis Title: Antimicrobial Resistance Data Analysis
Description: Functions to simplify and standardise antimicrobial resistance (AMR) Description: Functions to simplify and standardise antimicrobial resistance (AMR)
data analysis and to work with microbial and antimicrobial properties by data analysis and to work with microbial and antimicrobial properties by

View File

@ -1,4 +1,4 @@
# AMR 1.8.1.9067 # AMR 1.8.1.9069
This version will eventually become v2.0! We're happy to reach a new major milestone soon! This version will eventually become v2.0! We're happy to reach a new major milestone soon!

98
R/mo.R
View File

@ -232,6 +232,7 @@ as.mo <- function(x,
if (any(is.na(out) & !is.na(x))) { if (any(is.na(out) & !is.na(x))) {
# reset uncertainties # reset uncertainties
pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, ] pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, ]
pkg_env$mo_failures <- NULL
# Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc. # Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc.
x[trimws2(x) %like% translate_into_language("no .*growth", language = language)] <- NA_character_ x[trimws2(x) %like% translate_into_language("no .*growth", language = language)] <- NA_character_
@ -245,16 +246,25 @@ as.mo <- function(x,
on.exit(close(progress)) on.exit(close(progress))
# run it # run it
x_coerced <- lapply(x_unique, function(x_search) { x_coerced <- vapply(FUN.VALUE = character(1), x_unique, function(x_search) {
progress$tick() progress$tick()
x_out <- trimws(tolower(x_search)) # some required cleaning steps
x_out <- trimws(x_search)
x_out <- gsub("[^A-Za-z-]+", " ", x_out, perl = TRUE)
x_out <- gsub(" +", " ", x_out, perl = TRUE)
x_out <- gsub("(^| )(e?spp|e?ssp|e?ss|e?sp|e?subsp|sube?species|biovar|biotype|serovar|e?species)( |$)", "", x_out, ignore.case = TRUE, perl = TRUE)
x_search_cleaned <- x_out
x_out <- tolower(x_out)
# take out the parts, split by space
x_parts <- strsplit(gsub("-", " ", x_out, fixed = TRUE), " ", fixed = TRUE)[[1]] x_parts <- strsplit(gsub("-", " ", x_out, fixed = TRUE), " ", fixed = TRUE)[[1]]
# do a pre-match on first character (and if it contains a space, first chars of first two terms) # do a pre-match on first character (and if it contains a space, first chars of first two terms)
if (length(x_parts) == 2) { if (length(x_parts) %in% c(2, 3)) {
# for genus + species + subspecies
filtr <- which(MO_lookup$full_first == substr(x_parts[1], 1, 1) & MO_lookup$species_first == substr(x_parts[2], 1, 1)) filtr <- which(MO_lookup$full_first == substr(x_parts[1], 1, 1) & MO_lookup$species_first == substr(x_parts[2], 1, 1))
} else if (length(x_parts) > 2) { } else if (length(x_parts) > 3) {
first_chars <- paste0("(^| )", "[", paste(substr(x_parts, 1, 1), collapse = ""), "]") first_chars <- paste0("(^| )", "[", paste(substr(x_parts, 1, 1), collapse = ""), "]")
filtr <- which(MO_lookup$full_first %like_case% first_chars) filtr <- which(MO_lookup$full_first %like_case% first_chars)
} else if (nchar(x_out) == 4) { } else if (nchar(x_out) == 4) {
@ -281,9 +291,9 @@ as.mo <- function(x,
} }
pkg_env$mo_to_search <- mo_to_search pkg_env$mo_to_search <- mo_to_search
# determine the matching score on the original search value # determine the matching score on the original search value
m <- mo_matching_score(x = x_search, n = mo_to_search) m <- mo_matching_score(x = x_search_cleaned, n = mo_to_search)
if (is.null(minimum_matching_score)) { if (is.null(minimum_matching_score)) {
minimum_matching_score_current <- min(0.7, min(10, nchar(x_search)) * 0.08) minimum_matching_score_current <- min(0.7, min(10, nchar(x_search_cleaned)) * 0.08)
# correct back for prevalence # correct back for prevalence
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup$prevalence[match(mo_to_search, MO_lookup$fullname)] minimum_matching_score_current <- minimum_matching_score_current / MO_lookup$prevalence[match(mo_to_search, MO_lookup$fullname)]
# correct back for kingdom # correct back for kingdom
@ -291,17 +301,20 @@ as.mo <- function(x,
} else { } else {
minimum_matching_score_current <- minimum_matching_score minimum_matching_score_current <- minimum_matching_score
} }
m[m < minimum_matching_score_current] <- NA_real_ m[m < minimum_matching_score_current] <- NA_real_
top_hits <- mo_to_search[order(m, decreasing = TRUE, na.last = NA)] # na.last = NA will remove the NAs top_hits <- mo_to_search[order(m, decreasing = TRUE, na.last = NA)] # na.last = NA will remove the NAs
if (length(top_hits) == 0) { if (length(top_hits) == 0) {
warning_("No hits found for \"", x_search, "\" with minimum_matching_score = ", ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score), ". Try setting this value lower or even to 0.") warning_("No hits found for \"", x_search, "\" with minimum_matching_score = ", ifelse(is.null(minimum_matching_score), paste0("NULL (=", round(min(minimum_matching_score_current, na.rm = TRUE), 3), ")"), minimum_matching_score), ". Try setting this value lower or even to 0.")
result_mo <- NA_character_ result_mo <- NA_character_
} else { } else {
result_mo <- MO_lookup$mo[match(top_hits[1], MO_lookup$fullname)] result_mo <- MO_lookup$mo[match(top_hits[1], MO_lookup$fullname)]
pkg_env$mo_uncertainties <- rbind(pkg_env$mo_uncertainties, pkg_env$mo_uncertainties <- rbind(pkg_env$mo_uncertainties,
data.frame( data.frame(
minimum_matching_score = ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score), minimum_matching_score = ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score),
input = x_search, original_input = x_search,
input = x_search_cleaned,
fullname = top_hits[1], fullname = top_hits[1],
mo = result_mo, mo = result_mo,
candidates = ifelse(length(top_hits) > 1, paste(top_hits[2:min(26, length(top_hits))], collapse = ", "), ""), candidates = ifelse(length(top_hits) > 1, paste(top_hits[2:min(26, length(top_hits))], collapse = ", "), ""),
@ -312,7 +325,7 @@ as.mo <- function(x,
# save to package env to save time for next time # save to package env to save time for next time
pkg_env$mo_previously_coerced <- unique(rbind(pkg_env$mo_previously_coerced, pkg_env$mo_previously_coerced <- unique(rbind(pkg_env$mo_previously_coerced,
data.frame( data.frame(
x = x_search, x = paste(x_search, min(minimum_matching_score_current, na.rm = TRUE)),
mo = result_mo, mo = result_mo,
stringsAsFactors = FALSE stringsAsFactors = FALSE
), ),
@ -320,30 +333,30 @@ as.mo <- function(x,
)) ))
} }
# the actual result: # the actual result:
result_mo as.character(result_mo)
}) })
# remove progress bar from console # remove progress bar from console
close(progress) close(progress)
# expand from unique again # expand from unique again
out[is.na(out)] <- unlist(x_coerced)[match(x[is.na(out)], x_unique)] out[is.na(out)] <- x_coerced[match(x[is.na(out)], x_unique)]
# Throw note about uncertainties ---- # Throw note about uncertainties ----
if (isTRUE(info) && NROW(pkg_env$mo_uncertainties) > 0) { if (isTRUE(info) && NROW(pkg_env$mo_uncertainties) > 0) {
if (message_not_thrown_before("as.mo", "uncertainties", pkg_env$mo_uncertainties$input)) { if (message_not_thrown_before("as.mo", "uncertainties", pkg_env$mo_uncertainties$original_input)) {
plural <- c("", "this") plural <- c("", "this")
if (length(pkg_env$mo_uncertainties$input) > 1) { if (length(pkg_env$mo_uncertainties$original_input) > 1) {
plural <- c("s", "these uncertainties") plural <- c("s", "these uncertainties")
} }
if (length(pkg_env$mo_uncertainties$input) <= 3) { if (length(pkg_env$mo_uncertainties$original_input) <= 3) {
examples <- vector_and(paste0( examples <- vector_and(paste0(
'"', pkg_env$mo_uncertainties$input, '"', pkg_env$mo_uncertainties$original_input,
'" (assumed ', font_italic(pkg_env$mo_uncertainties$fullname, collapse = NULL), ")" '" (assumed ', font_italic(pkg_env$mo_uncertainties$fullname, collapse = NULL), ")"
), ),
quotes = FALSE quotes = FALSE
) )
} else { } else {
examples <- paste0(nr2char(length(pkg_env$mo_uncertainties$input)), " microorganism", plural[1]) examples <- paste0(nr2char(length(pkg_env$mo_uncertainties$original_input)), " microorganism", plural[1])
} }
msg <- paste0( msg <- paste0(
"Microorganism translation was uncertain for ", examples, "Microorganism translation was uncertain for ", examples,
@ -449,6 +462,10 @@ as.mo <- function(x,
out[out %like_case% "^B_STRPT_SLVR(_|$)"] <- "B_STRPT_GRPK" out[out %like_case% "^B_STRPT_SLVR(_|$)"] <- "B_STRPT_GRPK"
# group L - only S. dysgalactiae which is also group C, so ignore it here # group L - only S. dysgalactiae which is also group C, so ignore it here
} }
# All unknowns ----
out[is.na(out) & !is.na(x)] <- "UNKNOWN"
pkg_env$mo_failures <- unique(x[out == "UNKNOWN" & x != "UNKNOWN" & !is.na(x)])
# Return class ---- # Return class ----
set_clean_class(out, set_clean_class(out,
@ -771,26 +788,21 @@ print.mo_uncertainties <- function(x, ...) {
) )
score_formatted <- trimws(formatC(round(score, 3), format = "f", digits = 3)) score_formatted <- trimws(formatC(round(score, 3), format = "f", digits = 3))
txt <- paste(txt, txt <- paste(txt,
paste0( paste0(
strwrap( paste0(
paste0( '"', x[i, ]$original_input, '"',
'"', x[i, ]$input, '"', " -> ",
" -> ", paste0(
paste0( font_bold(font_italic(x[i, ]$fullname)),
font_bold(font_italic(x[i, ]$fullname)), ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""),
ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""), " (", x[i, ]$mo, ", ", score_set_colour(score_formatted, score), ")"
" (", x[i, ]$mo, )
", ", score_set_colour(score_formatted, score), ),
") " collapse = "\n"
) ),
), ifelse(x[i, ]$original_input != x[i, ]$input, paste0(strrep(" ", nchar(x[i, ]$original_input) + 6), "Based on input \"", x[i, ]$input, "\""), ""),
width = 0.98 * getOption("width"), candidates,
exdent = nchar(x[i, ]$input) + 6 sep = "\n"
),
collapse = "\n"
),
candidates,
sep = "\n"
) )
txt <- paste0(gsub("\n\n", "\n", txt), "\n\n") txt <- paste0(gsub("\n\n", "\n", txt), "\n\n")
} }
@ -803,6 +815,7 @@ mo_reset_session <- function() {
if (NROW(pkg_env$mo_previously_coerced) > 0) { if (NROW(pkg_env$mo_previously_coerced) > 0) {
message_("Reset ", NROW(pkg_env$mo_previously_coerced), " previously matched input values.") message_("Reset ", NROW(pkg_env$mo_previously_coerced), " previously matched input values.")
pkg_env$mo_previously_coerced <- pkg_env$mo_previously_coerced[0, , drop = FALSE] pkg_env$mo_previously_coerced <- pkg_env$mo_previously_coerced[0, , drop = FALSE]
pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, , drop = FALSE]
} else { } else {
message_("No previously matched input values to reset.") message_("No previously matched input values to reset.")
} }
@ -847,13 +860,14 @@ load_mo_uncertainties <- function(metadata) {
pkg_env$mo_uncertainties <- metadata$uncertainties pkg_env$mo_uncertainties <- metadata$uncertainties
} }
trimws2 <- function(x) { trimws2 <- function(..., whitespace = "[\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D\u2028\u2029\u202F\u205F\u2060\u3000\uFEFF]") {
trimws(gsub("[\\s]+", " ", x, perl = TRUE)) # this is even faster than trimws() itself which sets " \t\n\r".
trimws(..., whitespace = whitespace)
} }
parse_and_convert <- function(x) { parse_and_convert <- function(x) {
if (tryCatch(is.character(x) && all(Encoding(x) == "unknown", na.rm = TRUE), error = function(e) FALSE)) { if (tryCatch(is.character(x) && all(Encoding(x) == "unknown", na.rm = TRUE), error = function(e) FALSE)) {
return(x) return(trimws2(x))
} }
tryCatch( tryCatch(
{ {
@ -880,7 +894,7 @@ parse_and_convert <- function(x) {
}, },
error = function(e) stop(e$message, call. = FALSE) error = function(e) stop(e$message, call. = FALSE)
) # this will also be thrown when running `as.mo(no_existing_object)` ) # this will also be thrown when running `as.mo(no_existing_object)`
parsed trimws2(parsed)
} }
replace_old_mo_codes <- function(x, property) { replace_old_mo_codes <- function(x, property) {
@ -1032,7 +1046,7 @@ convert_colloquial_input <- function(x) {
out[x %like_case% "pneumo[ck]o[ck]"] <- "B_STRPT_PNMN" out[x %like_case% "pneumo[ck]o[ck]"] <- "B_STRPT_PNMN"
# unexisting names (xxx and con are WHONET codes) # unexisting names (xxx and con are WHONET codes)
out[x %in% c("xxx", "con", "other", "none", "unknown") | x %like_case% "virus"] <- "UNKNOWN" out[x %in% c("con", "other", "none", "unknown") | x %like_case% "virus"] <- "UNKNOWN"
out out
} }

View File

@ -362,6 +362,20 @@ mo_type <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("A
translate_into_language(out, language = language, only_unknown = FALSE) translate_into_language(out, language = language, only_unknown = FALSE)
} }
#' @rdname mo_property
#' @export
mo_status <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", TRUE), ...) {
if (missing(x)) {
# this tries to find the data and an <mo> column
x <- find_mo_col(fn = "mo_status")
}
meet_criteria(x, allow_NA = TRUE)
language <- validate_language(language)
meet_criteria(keep_synonyms, allow_class = "logical", has_length = 1)
translate_into_language(mo_validate(x = x, property = "status", language = language, keep_synonyms = keep_synonyms, ...), language = language, only_unknown = TRUE)
}
#' @rdname mo_property #' @rdname mo_property
#' @export #' @export
mo_gramstain <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", TRUE), ...) { mo_gramstain <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", TRUE), ...) {
@ -649,7 +663,7 @@ mo_synonyms <- function(x, language = get_AMR_locale(), keep_synonyms = getOptio
syns <- lapply(x.mo, function(y) { syns <- lapply(x.mo, function(y) {
gbif <- AMR::microorganisms$gbif[match(y, AMR::microorganisms$mo)] gbif <- AMR::microorganisms$gbif[match(y, AMR::microorganisms$mo)]
lpsn <- AMR::microorganisms$lpsn[match(y, AMR::microorganisms$mo)] lpsn <- AMR::microorganisms$lpsn[match(y, AMR::microorganisms$mo)]
out <- AMR::microorganisms[which(AMR::microorganisms$lpsn_renamed_to %in% lpsn | AMR::microorganisms$gbif_renamed_to %in% gbif), "fullname", drop = TRUE] out <- AMR::microorganisms[which(AMR::microorganisms$lpsn_renamed_to == lpsn | AMR::microorganisms$gbif_renamed_to == gbif), "fullname", drop = TRUE]
if (length(out) == 0) { if (length(out) == 0) {
NULL NULL
} else { } else {
@ -686,6 +700,7 @@ mo_info <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("A
c( c(
mo_taxonomy(y, language = language, keep_synonyms = keep_synonyms), mo_taxonomy(y, language = language, keep_synonyms = keep_synonyms),
list( list(
status = mo_status(y, language = language, keep_synonyms = keep_synonyms),
synonyms = mo_synonyms(y, keep_synonyms = keep_synonyms), synonyms = mo_synonyms(y, keep_synonyms = keep_synonyms),
gramstain = mo_gramstain(y, language = language, keep_synonyms = keep_synonyms), gramstain = mo_gramstain(y, language = language, keep_synonyms = keep_synonyms),
url = unname(mo_url(y, open = FALSE, keep_synonyms = keep_synonyms)), url = unname(mo_url(y, open = FALSE, keep_synonyms = keep_synonyms)),

View File

@ -27,6 +27,7 @@
pkg_env <- new.env(hash = FALSE) pkg_env <- new.env(hash = FALSE)
pkg_env$mo_uncertainties <- data.frame( pkg_env$mo_uncertainties <- data.frame(
uncertainty = integer(0), uncertainty = integer(0),
original_input = character(0),
input = character(0), input = character(0),
fullname = character(0), fullname = character(0),
mo = character(0), mo = character(0),

View File

@ -27,7 +27,7 @@ MOs <- subset(microorganisms, !is.na(mo) & nchar(mo) > 3)
expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo))) expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo)))
expect_identical( expect_identical(
as.character(as.mo(c("E. coli", "H. influenzae"))), as.character(as.mo(c("E. coli", "H. influenzae"), keep_synonyms = FALSE)),
c("B_ESCHR_COLI", "B_HMPHL_INFL") c("B_ESCHR_COLI", "B_HMPHL_INFL")
) )
@ -41,7 +41,7 @@ expect_equal(as.character(as.mo(" B_ESCHR_COLI ")), "B_ESCHR_COLI")
expect_equal(as.character(as.mo("e coli")), "B_ESCHR_COLI") # not Campylobacter expect_equal(as.character(as.mo("e coli")), "B_ESCHR_COLI") # not Campylobacter
expect_equal(as.character(as.mo("klpn")), "B_KLBSL_PNMN") expect_equal(as.character(as.mo("klpn")), "B_KLBSL_PNMN")
expect_equal(as.character(as.mo("Klebsiella")), "B_KLBSL") expect_equal(as.character(as.mo("Klebsiella")), "B_KLBSL")
expect_equal(as.character(as.mo("K. pneu rhino")), "B_KLBSL_PNMN_RHNS") # K. pneumoniae subspp. rhinoscleromatis expect_equal(as.character(as.mo("K. pneumo rhino")), "B_KLBSL_PNMN_RHNS") # K. pneumoniae subspp. rhinoscleromatis
expect_equal(as.character(as.mo("Bartonella")), "B_BRTNL") expect_equal(as.character(as.mo("Bartonella")), "B_BRTNL")
expect_equal(as.character(as.mo("C. difficile")), "B_CRDDS_DFFC") expect_equal(as.character(as.mo("C. difficile")), "B_CRDDS_DFFC")
expect_equal(as.character(as.mo("L. pneumophila")), "B_LGNLL_PNMP") expect_equal(as.character(as.mo("L. pneumophila")), "B_LGNLL_PNMP")
@ -284,12 +284,12 @@ expect_stdout(print(mo_uncertainties()))
# Salmonella (City) are all actually Salmonella enterica spp (City) # Salmonella (City) are all actually Salmonella enterica spp (City)
expect_equal( expect_equal(
suppressMessages(as.mo(c("Salmonella Goettingen", "Salmonella Typhimurium", "Salmonella Group A"))), suppressMessages(as.mo(c("Salmonella Goettingen", "Salmonella Typhimurium", "Salmonella Group A"), keep_synonyms = FALSE)),
as.mo(c("Salmonella enterica", "Salmonella enterica", "Salmonella")) as.mo(c("Salmonella enterica", "Salmonella enterica", "Salmonella"))
) )
# no viruses # no viruses
expect_equal(as.character(as.mo("Virus")), NA_character_) expect_equal(as.mo("Virus"), as.mo("UNKNOWN"))
# summary # summary
expect_equal(length(summary(example_isolates$mo)), 6) expect_equal(length(summary(example_isolates$mo)), 6)

View File

@ -75,13 +75,10 @@ expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en"))
expect_equal(mo_type("Escherichia coli", language = "de"), "Bakterien") expect_equal(mo_type("Escherichia coli", language = "de"), "Bakterien")
expect_equal(mo_gramstain("Escherichia coli", language = "nl"), "Gram-negatief") expect_equal(mo_gramstain("Escherichia coli", language = "nl"), "Gram-negatief")
expect_stdout(print(mo_gramstain("Escherichia coli", language = "en"))) for (l in LANGUAGES_SUPPORTED) {
expect_stdout(print(mo_gramstain("Escherichia coli", language = "de"))) expect_stdout(print(mo_gramstain("Escherichia coli", language = l)))
expect_stdout(print(mo_gramstain("Escherichia coli", language = "nl"))) print(mo_gramstain("Escherichia coli", language = l))
expect_stdout(print(mo_gramstain("Escherichia coli", language = "es"))) }
expect_stdout(print(mo_gramstain("Escherichia coli", language = "pt")))
expect_stdout(print(mo_gramstain("Escherichia coli", language = "it")))
expect_stdout(print(mo_gramstain("Escherichia coli", language = "fr")))
expect_error(mo_gramstain("Escherichia coli", language = "UNKNOWN")) expect_error(mo_gramstain("Escherichia coli", language = "UNKNOWN"))
dutch <- mo_name(microorganisms$fullname[which(microorganisms$fullname %unlike% "unknown|coagulase")], language = "nl") # should be transformable to English again dutch <- mo_name(microorganisms$fullname[which(microorganisms$fullname %unlike% "unknown|coagulase")], language = "nl") # should be transformable to English again

View File

@ -15,6 +15,7 @@
\alias{mo_kingdom} \alias{mo_kingdom}
\alias{mo_domain} \alias{mo_domain}
\alias{mo_type} \alias{mo_type}
\alias{mo_status}
\alias{mo_gramstain} \alias{mo_gramstain}
\alias{mo_is_gram_negative} \alias{mo_is_gram_negative}
\alias{mo_is_gram_positive} \alias{mo_is_gram_positive}
@ -124,6 +125,13 @@ mo_type(
... ...
) )
mo_status(
x,
language = get_AMR_locale(),
keep_synonyms = getOption("AMR_keep_synonyms", TRUE),
...
)
mo_gramstain( mo_gramstain(
x, x,
language = get_AMR_locale(), language = get_AMR_locale(),