1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-08 14:01:55 +02:00

mo codes for WHONET

This commit is contained in:
2019-02-08 16:06:54 +01:00
parent 3d3366faf7
commit ed30312048
60 changed files with 1103 additions and 615 deletions

View File

@ -134,7 +134,7 @@
#'
#' A data set containing the complete microbial taxonomy of the kingdoms Bacteria, Fungi and Protozoa from ITIS. MO codes can be looked up using \code{\link{as.mo}}.
#' @inheritSection ITIS ITIS
#' @format A \code{\link{data.frame}} with 18,833 observations and 15 variables:
#' @format A \code{\link{data.frame}} with 19,456 observations and 15 variables:
#' \describe{
#' \item{\code{mo}}{ID of microorganism}
#' \item{\code{tsn}}{Taxonomic Serial Number (TSN), as defined by ITIS}
@ -153,6 +153,17 @@
#' \item{\code{ref}}{Author(s) and year of concerning publication as found in ITIS, see Source}
#' }
#' @source Integrated Taxonomic Information System (ITIS) public online database, \url{https://www.itis.gov}.
#' @details Manually added were:
#' \itemize{
#' \item{605 species of Aspergillus (as Aspergillus misses from ITIS, list from https://en.wikipedia.org/wiki/List_of_Aspergillus_species on 2019-02-05)}
#' \item{23 species of Trichophyton (as Trichophyton misses from ITIS, list from https://en.wikipedia.org/wiki/Trichophyton on 2019-02-05)}
#' \item{9 species of Streptococcus (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)}
#' \item{2 species of Straphylococcus (coagulase-negative [CoNS] and coagulase-positive [CoPS])}
#' \item{1 species of Candida (C. glabrata)}
#' \item{2 other undefined (unknown Gram negatives and unknown Gram positives)}
#' }
#'
#' These manual entries have no Taxonomic Serial Number (TSN), so can be looked up with \code{filter(microorganisms, is.na(tsn)}.
#' @inheritSection AMR Read more on our website!
#' @seealso \code{\link{as.mo}} \code{\link{mo_property}} \code{\link{microorganisms.codes}}
"microorganisms"
@ -175,12 +186,13 @@
#' Translation table for microorganism codes
#'
#' A data set containing commonly used codes for microorganisms. Define your own with \code{\link{set_mo_source}}.
#' @format A \code{\link{data.frame}} with 3,303 observations and 2 variables:
#' A data set containing commonly used codes for microorganisms, from laboratory systems and WHONET. Define your own with \code{\link{set_mo_source}}.
#' @format A \code{\link{data.frame}} with 4,731 observations and 2 variables:
#' \describe{
#' \item{\code{certe}}{Commonly used code of a microorganism}
#' \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
#' \item{\code{mo}}{ID of the microorganism in the \code{\link{microorganisms}} data set}
#' }
#' @inheritSection ITIS ITIS
#' @inheritSection AMR Read more on our website!
#' @seealso \code{\link{as.mo}} \code{\link{microorganisms}}
"microorganisms.codes"
@ -246,17 +258,21 @@
#' @name supplementary_data
#' @inheritSection AMR Read more on our website!
# # Renew data:
# # sorted on (1) bacteria, (2) fungi, (3) protozoa and then human pathogenic prevalence and then TSN:
# microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
# # sort on (1) bacteria, (2) fungi, (3) protozoa and then human pathogenic prevalence and then TSN:
# data.table::setkey(microorganismsDT, kingdom, prevalence, fullname)
# microorganisms.prevDT <- microorganismsDT[prevalence == 9999,]
# microorganisms.unprevDT <- microorganismsDT[prevalence != 9999,]
# microorganisms.prevDT <- microorganismsDT[prevalence != 9999,]
# microorganisms.unprevDT <- microorganismsDT[prevalence == 9999,]
# microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
# data.table::setkey(microorganisms.oldDT, tsn, name)
# devtools::use_data(microorganismsDT, overwrite = TRUE)
# devtools::use_data(microorganisms.prevDT, overwrite = TRUE)
# devtools::use_data(microorganisms.unprevDT, overwrite = TRUE)
# devtools::use_data(microorganisms.oldDT, overwrite = TRUE)
# usethis::use_data(microorganismsDT, overwrite = TRUE)
# usethis::use_data(microorganisms.prevDT, overwrite = TRUE)
# usethis::use_data(microorganisms.unprevDT, overwrite = TRUE)
# usethis::use_data(microorganisms.oldDT, overwrite = TRUE)
# rm(microorganismsDT)
# rm(microorganisms.prevDT)
# rm(microorganisms.unprevDT)
# rm(microorganisms.oldDT)
"microorganismsDT"
#' @rdname supplementary_data

View File

@ -25,7 +25,7 @@
#' @param tbl table with antibiotic columns, like e.g. \code{amox} and \code{amcl}
#' @param info print progress
#' @param rules a character vector that specifies which rules should be applied - one or more of \code{c("breakpoints", "expert", "other", "all")}
#' @param verbose a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected
#' @param verbose a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected. It runs all EUCAST rules, but will not be applied to an output - only an informative \code{data.frame} with changes will be returned as output.
#' @param amcl,amik,amox,ampi,azit,azlo,aztr,cefa,cfep,cfot,cfox,cfra,cfta,cftr,cfur,chlo,cipr,clar,clin,clox,coli,czol,dapt,doxy,erta,eryt,fosf,fusi,gent,imip,kana,levo,linc,line,mero,mezl,mino,moxi,nali,neom,neti,nitr,norf,novo,oflo,oxac,peni,pipe,pita,poly,pris,qida,rifa,roxi,siso,teic,tetr,tica,tige,tobr,trim,trsu,vanc column name of an antibiotic, see Antibiotics
#' @param ... parameters that are passed on to \code{eucast_rules}
#' @inheritParams first_isolate
@ -101,7 +101,7 @@
#' @export
#' @importFrom dplyr %>% select pull mutate_at vars
#' @importFrom crayon bold bgGreen bgYellow bgRed black green blue italic strip_style
#' @return The input of \code{tbl}, possibly with edited values of antibiotics. Or, if \code{verbose = TRUE}, a \code{data.frame} with verbose info.
#' @return The input of \code{tbl}, possibly with edited values of antibiotics. Or, if \code{verbose = TRUE}, a \code{data.frame} with all original and new values of the affected bug-drug combinations.
#' @source
#' \itemize{
#' \item{
@ -144,7 +144,9 @@
#' # 4 Klebsiella pneumoniae - - - - - S S
#' # 5 Pseudomonas aeruginosa - - - - - S S
#'
#' b <- eucast_rules(a, "mo") # 18 results are forced as R or S
#'
#' # apply EUCAST rules: 18 results are forced as R or S
#' b <- eucast_rules(a)
#'
#' b
#' # mo vanc amox coli cfta cfur peni cfox
@ -153,6 +155,11 @@
#' # 3 Escherichia coli R - - - - R S
#' # 4 Klebsiella pneumoniae R R - - - R S
#' # 5 Pseudomonas aeruginosa R R - - R R R
#'
#'
#' # do not apply EUCAST rules, but rather get a a data.frame
#' # with 18 rows, containing all details about the transformations:
#' c <- eucast_rules(a, verbose = TRUE)
eucast_rules <- function(tbl,
col_mo = NULL,
info = TRUE,
@ -406,22 +413,31 @@ eucast_rules <- function(tbl,
trsu <- col.list[trsu]
vanc <- col.list[vanc]
number_changed <- 0
number_added_S <- 0
number_added_I <- 0
number_added_R <- 0
number_changed_to_S <- 0
number_changed_to_I <- 0
number_changed_to_R <- 0
number_affected_rows <- integer(0)
verbose_info <- data.frame(rule_type = character(0),
rule_set = character(0),
force_to = character(0),
found = integer(0),
changed = integer(0),
target_columns = integer(0),
target_rows = integer(0),
verbose_info <- data.frame(row = integer(0),
col = character(0),
mo = character(0),
mo_fullname = character(0),
old = character(0),
new = character(0),
rule_source = character(0),
rule_group = character(0),
stringsAsFactors = FALSE)
# helper function for editing the table
edit_rsi <- function(to, rule, rows, cols) {
cols <- unique(cols[!is.na(cols) & !is.null(cols)])
if (length(rows) > 0 & length(cols) > 0) {
before_df <- tbl_original
before <- as.character(unlist(as.list(tbl_original[rows, cols])))
tryCatch(
# insert into original table
tbl_original[rows, cols] <<- to,
@ -442,29 +458,81 @@ eucast_rules <- function(tbl,
suppressWarnings(
tbl[rows, cols] <<- to
))
after <- as.character(unlist(as.list(tbl_original[rows, cols])))
number_changed <<- number_changed + sum(before != after, na.rm = TRUE)
tbl[rows, cols] <<- tbl_original[rows, cols]
number_newly_added_S <- sum(!before %in% c("S", "I", "R") & after == "S", na.rm = TRUE)
number_newly_added_I <- sum(!before %in% c("S", "I", "R") & after == "I", na.rm = TRUE)
number_newly_added_R <- sum(!before %in% c("S", "I", "R") & after == "R", na.rm = TRUE)
number_newly_changed_to_S <- sum(before %in% c("I", "R") & after == "S", na.rm = TRUE)
number_newly_changed_to_I <- sum(before %in% c("S", "R") & after == "I", na.rm = TRUE)
number_newly_changed_to_R <- sum(before %in% c("S", "I") & after == "R", na.rm = TRUE)
# totals
number_added_S <<- number_added_S + number_newly_added_S
number_added_I <<- number_added_I + number_newly_added_I
number_added_R <<- number_added_R + number_newly_added_R
number_changed_to_S <<- number_changed_to_S + number_newly_changed_to_S
number_changed_to_I <<- number_changed_to_I + number_newly_changed_to_I
number_changed_to_R <<- number_changed_to_R + number_newly_changed_to_R
number_affected_rows <<- unique(c(number_affected_rows, rows))
changed_results <<- changed_results + sum(before != after, na.rm = TRUE) # will be reset at start of every rule
# will be reset at start of every rule
changed_results <<- changed_results +
number_newly_added_S +
number_newly_added_I +
number_newly_added_R +
number_newly_changed_to_S +
number_newly_changed_to_I +
number_newly_changed_to_R
if (verbose == TRUE) {
for (i in 1:length(cols)) {
# add new row for every affected column
verbose_new <- data.frame(rule_type = strip_style(rule[1]),
rule_set = strip_style(rule[2]),
force_to = to,
found = length(before),
changed = sum(before != after, na.rm = TRUE),
target_column = cols[i],
stringsAsFactors = FALSE)
verbose_new$target_rows <- list(unname(rows))
rownames(verbose_new) <- NULL
verbose_info <<- rbind(verbose_info, verbose_new)
for (r in 1:length(rows)) {
for (c in 1:length(cols)) {
old <- before_df[rows[r], cols[c]]
new <- tbl[rows[r], cols[c]]
if (!identical(old, new)) {
verbose_new <- data.frame(row = rows[r],
col = cols[c],
mo = tbl_original[rows[r], col_mo],
mo_fullname = "",
old = old,
new = new,
rule_source = strip_style(rule[1]),
rule_group = strip_style(rule[2]),
stringsAsFactors = FALSE)
verbose_info <<- rbind(verbose_info, verbose_new)
}
}
}
# verbose_new <- data.frame(row = integer(0),
# col = character(0),
# old = character(0),
# new = character(0),
# rule_source = character(0),
# rule_group = character(0),
# stringsAsFactors = FALSE)
# a <<- rule
# for (i in 1:length(cols)) {
# # add new row for every affected column
# verbose_new <- data.frame(rule_type = strip_style(rule[1]),
# rule_set = strip_style(rule[2]),
# force_to = to,
# found = length(before),
# changed = sum(before != after, na.rm = TRUE),
# target_column = cols[i],
# stringsAsFactors = FALSE)
# verbose_new$target_rows <- list(unname(rows))
# rownames(verbose_new) <- NULL
# verbose_info <<- rbind(verbose_info, verbose_new)
# }
}
}
}
na.rm <- function(col) {
if (is.null(col)) {
""
@ -489,15 +557,15 @@ eucast_rules <- function(tbl,
# since ampicillin ^= amoxicillin, get the first from the latter (not in original EUCAST table)
if (!is.null(ampi) & !is.null(amox)) {
if (verbose == TRUE) {
cat(bgGreen("\n VERBOSE: transforming",
length(which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R"))),
"empty ampicillin fields to 'S' based on amoxicillin. "))
cat(bgGreen("\n VERBOSE: transforming",
length(which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R"))),
"empty ampicillin fields to 'I' based on amoxicillin. "))
cat(bgGreen("\n VERBOSE: transforming",
length(which(tbl[, amox] == "R" & !tbl[, ampi] %in% c("S", "I", "R"))),
"empty ampicillin fields to 'R' based on amoxicillin. \n"))
cat("\n VERBOSE: transforming",
length(which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R"))),
"empty ampicillin fields to 'S' based on amoxicillin. ")
cat("\n VERBOSE: transforming",
length(which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R"))),
"empty ampicillin fields to 'I' based on amoxicillin. ")
cat("\n VERBOSE: transforming",
length(which(tbl[, amox] == "R" & !tbl[, ampi] %in% c("S", "I", "R"))),
"empty ampicillin fields to 'R' based on amoxicillin. \n")
}
tbl[which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R")), ampi] <- "S"
tbl[which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R")), ampi] <- "I"
@ -1804,22 +1872,46 @@ eucast_rules <- function(tbl,
} else {
wouldve <- ""
}
if (number_changed == 0) {
colour <- green
if (sum(number_added_S, number_added_I, number_added_R,
number_changed_to_S, number_changed_to_I, number_changed_to_R,
na.rm = TRUE) == 0) {
colour <- green # is function
} else {
colour <- blue
colour <- blue # is function
}
decimal.mark <- getOption("OutDec")
big.mark <- ifelse(decimal.mark != ",", ",", ".")
formatnr <- function(x) {
format(x, big.mark = big.mark, decimal.mark = decimal.mark)
}
cat(bold(paste('\n=> EUCAST rules', paste0(wouldve, 'affected'),
number_affected_rows %>% length() %>% format(big.mark = big.mark, decimal.mark = decimal.mark),
'out of', nrow(tbl_original) %>% format(big.mark = big.mark, decimal.mark = decimal.mark),
'rows ->',
colour(paste0(wouldve, 'changed'),
number_changed %>% format(big.mark = big.mark, decimal.mark = decimal.mark), 'test results.\n\n'))))
number_affected_rows %>% length() %>% formatnr(),
'out of', nrow(tbl_original) %>% formatnr(),
'rows\n')))
total_added <- number_added_S + number_added_I + number_added_R
total_changed <- number_changed_to_S + number_changed_to_I + number_changed_to_R
cat(colour(paste0(" -> ", wouldve, "added ",
bold(formatnr(total_added), "test results"),
if(total_added > 0)
paste0(" (", formatnr(number_added_S), " as S; ",
formatnr(number_added_I), " as I; ",
formatnr(number_added_R), " as R)"),
"\n")))
cat(colour(paste0(" -> ", wouldve, "changed ",
bold(formatnr(total_changed), "test results"),
if(total_changed > 0)
paste0(" (", formatnr(number_changed_to_S), " to S; ",
formatnr(number_changed_to_I), " to I; ",
formatnr(number_changed_to_R), " to R)"),
"\n")))
}
if (verbose == TRUE) {
suppressWarnings(
suppressMessages(
verbose_info$mo_fullname <- mo_fullname(verbose_info$mo)
)
)
return(verbose_info)
}

View File

@ -228,7 +228,7 @@ frequency_tbl <- function(x,
x.name <- x.name %>% strsplit("%>%", fixed = TRUE) %>% unlist() %>% .[1] %>% trimws()
}
if (x.name == ".") {
x.name <- "a `data.frame`"
x.name <- "a data.frame"
} else {
x.name <- paste0("`", x.name, "`")
}
@ -797,11 +797,30 @@ print.frequency_tbl <- function(x,
opt <- attr(x, "opt")
opt$header_txt <- header(x)
dots <- list(...)
if ("markdown" %in% names(dots)) {
if (dots$markdown == TRUE) {
opt$tbl_format <- "markdown"
} else {
opt$tbl_format <- "pandoc"
}
}
if (!missing(markdown)) {
if (markdown == TRUE) {
opt$tbl_format <- "markdown"
} else {
opt$tbl_format <- "pandoc"
}
}
if (length(opt$vars) == 0) {
opt$vars <- NULL
}
if (is.null(opt$title)) {
if (isTRUE(opt$data %like% "^a data.frame") & opt$tbl_format == "markdown") {
opt$data <- gsub("data.frame", "`data.frame`", opt$data, fixed = TRUE)
}
if (!is.null(opt$data) & !is.null(opt$vars)) {
title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
} else if (!is.null(opt$data) & is.null(opt$vars)) {
@ -845,21 +864,6 @@ print.frequency_tbl <- function(x,
if (!missing(big.mark)) {
opt$big.mark <- big.mark
}
dots <- list(...)
if ("markdown" %in% names(dots)) {
if (dots$markdown == TRUE) {
opt$tbl_format <- "markdown"
} else {
opt$tbl_format <- "pandoc"
}
}
if (!missing(markdown)) {
if (markdown == TRUE) {
opt$tbl_format <- "markdown"
} else {
opt$tbl_format <- "pandoc"
}
}
if (!missing(header)) {
opt$header <- header
}

267
R/mo.R
View File

@ -54,7 +54,7 @@
#'
#' This function uses Artificial Intelligence (AI) to help getting fast and logical results. It tries to find matches in this order:
#' \itemize{
#' \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
#' \item{Taxonomic kingdom: it first searches in Bacteria, then Fungi, then Protozoa}
#' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
#' \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
#' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
@ -69,13 +69,30 @@
#' }
#' This means that looking up human pathogenic microorganisms takes less time than looking up human \strong{non}-pathogenic microorganisms.
#'
#' When using \code{allow_uncertain = TRUE} (which is the default setting), it will use additional rules if all previous AI rules failed to get valid results. Examples:
#' \strong{UNCERTAIN RESULTS} \cr
#' When using \code{allow_uncertain = TRUE} (which is the default setting), it will use additional rules if all previous AI rules failed to get valid results. These are:
#' \itemize{
#' \item{It tries to look for previously accepted (but now invalid) taxonomic names}
#' \item{It strips off values between brackets and the brackets itself, and re-evaluates the input with all previous rules}
#' \item{It strips off words from the end one by one and re-evaluates the input with all previous rules}
#' \item{It strips off words from the start one by one and re-evaluates the input with all previous rules}
#' \item{It tries to look for some manual changes which are not yet published to the ITIS database (like \emph{Propionibacterium} not yet being \emph{Cutibacterium})}
#' }
#'
#' Examples:
#' \itemize{
#' \item{\code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPTC_GRB}) needs review.}
#' \item{\code{"S. aureus - please mind: MRSA"}. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result \emph{Staphylococcus aureus} (\code{B_STPHY_AUR}) needs review.}
#' \item{\code{"D. spartina"}. This is the abbreviation of an old taxonomic name: \emph{Didymosphaeria spartinae} (the last "e" was missing from the input). This fungus was renamed to \emph{Leptosphaeria obiones}, so a warning will be thrown that this result (\code{F_LPTSP_OBI}) needs review.}
#' \item{\code{"Fluoroquinolone-resistant Neisseria gonorrhoeae"}. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result \emph{Neisseria gonorrhoeae} (\code{B_NESSR_GON}) needs review.}
#' }
#'
#' Use \code{mo_failures()} to get a vector with all values that could not be coerced to a valid value.
#'
#' Use \code{mo_uncertainties()} to get a vector with all values that were coerced to a valid value, but with uncertainty.
#'
#' Use \code{mo_renamed()} to get a vector with all values that could be coerced based on an old, previously accepted taxonomic name.
#'
#' @inheritSection ITIS ITIS
# (source as a section, so it can be inherited by other man pages)
#' @section Source:
@ -154,7 +171,7 @@ is.mo <- function(x) {
#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated filter
#' @importFrom data.table data.table as.data.table setkey
#' @importFrom crayon magenta red italic
#' @importFrom crayon magenta red silver italic has_color
exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
allow_uncertain = TRUE, reference_df = get_mo_source(),
property = "mo", clear_options = TRUE) {
@ -170,6 +187,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
if (clear_options == TRUE) {
options(mo_failures = NULL)
options(mo_uncertainties = NULL)
options(mo_renamed = NULL)
}
@ -194,6 +212,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
}
notes <- character(0)
uncertainties <- character(0)
failures <- character(0)
x_input <- x
# only check the uniques, which is way faster
@ -251,7 +270,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x_backup <- trimws(x, which = "both")
# remove spp and species
x <- trimws(gsub(" +(spp.?|ssp.?|subsp.?|species)", " ", x_backup, ignore.case = TRUE), which = "both")
x <- trimws(gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE), which = "both")
x_species <- paste(x, "species")
# translate to English for supported languages of mo_property
x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE)
@ -259,6 +278,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x <- gsub("(no MO)", "", x, fixed = TRUE)
# remove non-text in case of "E. coli" except dots and spaces
x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x)
# replace minus by a space
x <- gsub("-+", " ", x)
# replace hemolytic by haemolytic
x <- gsub("ha?emoly", "haemoly", x)
# place minus back in streptococci
x <- gsub("(alpha|beta|gamma) haemoly", "\\1-haemolytic", x)
# remove genus as first word
x <- gsub("^Genus ", "", x)
# but spaces before and after should be omitted
x <- trimws(x, which = "both")
@ -272,13 +299,13 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x <- gsub("[ .]+", ".*", x)
# add start en stop regex
x <- paste0('^', x, '$')
x_withspaces_start <- paste0('^', x_withspaces)
x_withspaces <- paste0('^', x_withspaces, '$')
x_withspaces_start_only <- paste0('^', x_withspaces)
x_withspaces_start_end <- paste0('^', x_withspaces, '$')
# cat(paste0('x "', x, '"\n'))
# cat(paste0('x_species "', x_species, '"\n'))
# cat(paste0('x_withspaces_start "', x_withspaces_start, '"\n'))
# cat(paste0('x_withspaces "', x_withspaces, '"\n'))
# cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n'))
# cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n'))
# cat(paste0('x_backup "', x_backup, '"\n'))
# cat(paste0('x_trimmed "', x_trimmed, '"\n'))
# cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n'))
@ -290,16 +317,17 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
progress$tick()$print()
if (identical(x_trimmed[i], "")) {
# empty values
if (tolower(x_trimmed[i]) %in% c("", "xxx", "other", "none", "unknown")) {
# empty and nonsense values, ignore without warning ("xxx" is WHONET code for 'no growth')
x[i] <- NA_character_
next
}
if (nchar(x_trimmed[i]) < 3) {
if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3) {
# check if search term was like "A. species", then return first genus found with ^A
if (x_backup[i] %like% "species" | x_backup[i] %like% "spp[.]?") {
if (x_backup[i] %like% "[a-z]+ species" | x_backup[i] %like% "[a-z] spp[.]?") {
# get mo code of first hit
found <- microorganismsDT[fullname %like% x_withspaces_start[i], mo]
found <- microorganismsDT[fullname %like% x_withspaces_start_only[i], mo]
if (length(found) > 0) {
mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_")
found <- microorganismsDT[mo == mo_code, ..property][[1]]
@ -316,14 +344,13 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
next
}
# no nonsense text
if (toupper(x_trimmed[i]) %in% c('OTHER', 'NONE', 'UNKNOWN')) {
if (x_trimmed[i] %like% "virus") {
# there is no fullname like virus, so don't try to coerce it
x[i] <- NA_character_
failures <- c(failures, x_backup[i])
next
}
# translate known trivial abbreviations to genus + species ----
if (!is.na(x_trimmed[i])) {
if (toupper(x_trimmed[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) {
@ -339,6 +366,10 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) %in% c('EHEC', 'EPEC', 'EIEC', 'STEC', 'ATEC')) {
x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) == 'MRPA') {
# multi resistant P. aeruginosa
x[i] <- microorganismsDT[mo == 'B_PDMNS_AER', ..property][[1]][1L]
@ -398,13 +429,25 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
next
}
if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_trimmed[i])) {
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L]
notes <- c(notes,
magenta(paste0("Note: ", italic(x_trimmed[i]),
" was considered (a subspecies of) ",
italic("Salmonella enterica"),
" (B_SLMNL_ENT)")))
if (x_trimmed[i] %like% "Salmonella group") {
# Salmonella Group A to Z, just return S. species for now
x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L]
notes <- c(notes,
magenta(paste0("Note: ",
italic("Salmonella"), " ", trimws(gsub("Salmonella", "", x_trimmed[i])),
" was considered ",
italic("Salmonella species"),
" (B_SLMNL)")))
} else {
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L]
notes <- c(notes,
magenta(paste0("Note: ",
italic("Salmonella"), " ", trimws(gsub("Salmonella", "", x_trimmed[i])),
" was considered a subspecies of ",
italic("Salmonella enterica"),
" (B_SLMNL_ENT)")))
}
next
}
}
@ -417,14 +460,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x[i] <- found[1L]
next
}
if (nchar(x_trimmed[i]) > 4) {
# not when abbr is esco, stau, klpn, etc.
found <- microorganismsDT[tolower(fullname) %like% gsub(" ", ".*", x_trimmed_species[i], fixed = TRUE), ..property][[1]]
if (nchar(x_trimmed[i]) >= 6) {
found <- microorganismsDT[tolower(fullname) %like% paste0(x_withspaces_start_only[i], "[a-z]+ species"), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
}
# rest of genus only is in allow_uncertain part.
}
# TRY OTHER SOURCES ----
@ -472,29 +515,27 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
next
}
# try any match keeping spaces ----
found <- microorganisms.prevDT[fullname %like% x_withspaces[i], ..property][[1]]
if (length(found) > 0) {
found <- microorganisms.prevDT[fullname %like% x_withspaces_start_end[i], ..property][[1]]
if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
x[i] <- found[1L]
next
}
# try any match keeping spaces, not ending with $ ----
found <- microorganisms.prevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
if (length(found) > 0) {
found <- microorganisms.prevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
x[i] <- found[1L]
next
}
# try any match diregarding spaces ----
found <- microorganisms.prevDT[fullname %like% x[i], ..property][[1]]
if (length(found) > 0) {
if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
x[i] <- found[1L]
next
}
# try splitting of characters in the middle and then find ID ----
# only when text length is 6 or lower
# like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus
@ -512,7 +553,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
# try fullname without start and stop regex, to also find subspecies ----
# like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
found <- microorganisms.prevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.prevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -549,13 +590,13 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
next
}
# try any match keeping spaces ----
found <- microorganisms.unprevDT[fullname %like% x_withspaces[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start_end[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
# try any match keeping spaces, not ending with $ ----
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -583,7 +624,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
# try fullname without start and stop regex, to also find subspecies ----
# like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -594,7 +635,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
# look for old taxonomic names ----
found <- microorganisms.oldDT[tolower(name) == tolower(x_backup[i])
| tsn == x_trimmed[i]
| name %like% x_withspaces[i],]
| name %like% x_withspaces_start_end[i],]
if (NROW(found) > 0) {
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
# mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning)
@ -604,22 +645,36 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
} else {
x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
}
notes <- c(notes,
renamed_note(name_old = found[1, name],
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
ref_old = found[1, ref],
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
mo = microorganismsDT[tsn == found[1, tsn_new], mo]))
was_renamed(name_old = found[1, name],
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
ref_old = found[1, ref],
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
mo = microorganismsDT[tsn == found[1, tsn_new], mo])
next
}
# check for uncertain results ----
if (allow_uncertain == TRUE) {
uncertain_fn <- function(a.x_backup, b.x_trimmed, c.x_withspaces, d.x_withspaces_start, e.x) {
# (1) look again for old taxonomic names, now for G. species ----
found <- microorganisms.oldDT[name %like% c.x_withspaces
| name %like% d.x_withspaces_start
uncertain_fn <- function(a.x_backup, b.x_trimmed, c.x_withspaces_start_end, d.x_withspaces_start_only, e.x) {
# (1) look for genus only, part of name ----
if (nchar(b.x_trimmed) > 4 & !b.x_trimmed %like% " ") {
if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) {
# not when input is like Genustext, because then Neospora would lead to Actinokineospora
found <- microorganismsDT[tolower(fullname) %like% paste(b.x_trimmed, "species"), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
uncertainties <<- c(uncertainties,
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found[1L], fullname][[1]], " (", found[1L], ")"))
return(x)
}
}
}
# (2) look again for old taxonomic names, now for G. species ----
found <- microorganisms.oldDT[name %like% c.x_withspaces_start_end
| name %like% d.x_withspaces_start_only
| name %like% e.x,]
if (NROW(found) > 0 & nchar(b.x_trimmed) >= 6) {
if (property == "ref") {
@ -630,32 +685,29 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
} else {
x <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
}
warning(red(paste0('(UNCERTAIN) "',
a.x_backup, '" >> ', italic(found[1, name]), " (TSN ", found[1, tsn], ")")),
call. = FALSE, immediate. = FALSE)
notes <<- c(notes,
renamed_note(name_old = found[1, name],
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
ref_old = found[1, ref],
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
mo = microorganismsDT[tsn == found[1, tsn_new], mo]))
was_renamed(name_old = found[1, name],
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
ref_old = found[1, ref],
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
mo = microorganismsDT[tsn == found[1, tsn_new], mo])
uncertainties <<- c(uncertainties,
paste0("'", a.x_backup, "' >> ", found[1, name], " (TSN ", found[1, tsn], ")"))
return(x)
}
# (2) strip values between brackets ----
# (3) strip values between brackets ----
a.x_backup_stripped <- gsub("( [(].*[)])", "", a.x_backup)
a.x_backup_stripped <- trimws(gsub(" ", " ", a.x_backup_stripped, fixed = TRUE))
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, clear_options = FALSE, allow_uncertain = FALSE)))
if (!is.na(found) & nchar(b.x_trimmed) >= 6) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
warning(red(paste0('(UNCERTAIN) "',
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
call. = FALSE, immediate. = FALSE)
uncertainties <<- c(uncertainties,
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
return(found[1L])
}
# (3) try to strip off one element and check the remains ----
# (4) try to strip off one element from end and check the remains ----
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
if (length(x_strip) > 1 & nchar(b.x_trimmed) >= 6) {
for (i in 1:(length(x_strip) - 1)) {
@ -664,22 +716,39 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
if (!is.na(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
warning(red(paste0('(UNCERTAIN) "',
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
call. = FALSE, immediate. = FALSE)
uncertainties <<- c(uncertainties,
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
return(found[1L])
}
}
}
# (4) not yet implemented taxonomic changes in ITIS
# (5) try to strip off one element from start and check the remains ----
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
if (length(x_strip) > 1 & nchar(b.x_trimmed) >= 6) {
for (i in 2:(length(x_strip))) {
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, clear_options = FALSE, allow_uncertain = FALSE)))
if (!is.na(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- c(uncertainties,
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
return(found[1L])
}
}
}
# (6) not yet implemented taxonomic changes in ITIS ----
found <- suppressMessages(suppressWarnings(exec_as.mo(TEMPORARY_TAXONOMY(b.x_trimmed), clear_options = FALSE, allow_uncertain = FALSE)))
if (!is.na(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
warning(red(paste0('(UNCERTAIN) "',
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
warning(silver(paste0('Guessed with uncertainty: "',
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
call. = FALSE, immediate. = FALSE)
uncertainties <<- c(uncertainties,
paste0('"', a.x_backup, '" >> ', microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
return(found[1L])
}
@ -687,7 +756,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
return(NA_character_)
}
x[i] <- uncertain_fn(x_backup[i], x_trimmed[i], x_withspaces[i], x_withspaces_start[i], x[i])
x[i] <- uncertain_fn(x_backup[i], x_trimmed[i], x_withspaces_start_end[i], x_withspaces_start_only[i], x[i])
if (!is.na(x[i])) {
next
}
@ -696,26 +765,39 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
# not found ----
x[i] <- NA_character_
failures <- c(failures, x_backup[i])
}
}
# failures
failures <- failures[!failures %in% c(NA, NULL, NaN)]
if (length(failures) > 0) {
options(mo_failures = sort(unique(failures)))
plural <- ""
plural <- c("value", "it")
if (n_distinct(failures) > 1) {
plural <- "s"
plural <- c("values", "them")
}
total_failures <- length(x_input[x_input %in% failures & !x_input %in% c(NA, NULL, NaN)])
total_n <- length(x_input[!x_input %in% c(NA, NULL, NaN)])
msg <- paste0("\n", n_distinct(failures), " unique value", plural,
msg <- paste0("\n", n_distinct(failures), " unique ", plural[1],
" (^= ", percent(total_failures / total_n, round = 1, force_zero = TRUE),
") could not be coerced to a valid MO code")
if (n_distinct(failures) <= 10) {
msg <- paste0(msg, ": ", paste('"', unique(failures), '"', sep = "", collapse = ', '))
}
msg <- paste0(msg, ". Use mo_failures() to review failured input.")
msg <- paste0(msg, ". Use mo_failures() to review ", plural[2], ".")
warning(red(msg),
call. = FALSE,
immediate. = TRUE) # thus will always be shown, even if >= warnings
}
# uncertainties
if (length(uncertainties) > 0) {
options(mo_uncertainties = sort(unique(uncertainties)))
plural <- c("value", "it")
if (n_distinct(failures) > 1) {
plural <- c("values", "them")
}
msg <- paste0("\nResults of ", n_distinct(uncertainties), " input ", plural[1],
" guessed with uncertainty. Use mo_uncertainties() to review ", plural[2], ".")
warning(red(msg),
call. = FALSE,
immediate. = TRUE) # thus will always be shown, even if >= warnings
@ -774,6 +856,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x[x == microorganismsDT[mo == 'B_STRPTC_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRK', ..property][[1]][1L]
}
# Wrap up ----------------------------------------------------------------
# comply to x, which is also unique and without empty values
x_input_unique_nonempty <- unique(x_input[!is.na(x_input) & !is.null(x_input) & !identical(x_input, "")])
@ -794,10 +879,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
x <- as.integer(x)
}
if (length(notes > 0)) {
if (length(mo_renamed()) > 0) {
if (has_color()) {
notes <- getOption("mo_renamed")
} else {
notes <- mo_renamed()
}
notes <- sort(notes)
for (i in 1:length(notes)) {
base::message(notes[i])
base::message(blue(paste("Note:", notes[i])))
}
}
@ -810,7 +900,7 @@ TEMPORARY_TAXONOMY <- function(x) {
}
#' @importFrom crayon blue italic
renamed_note <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") {
was_renamed <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") {
if (!is.na(ref_old)) {
ref_old <- paste0(" (", ref_old, ")")
} else {
@ -828,10 +918,7 @@ renamed_note <- function(name_old, name_new, ref_old = "", ref_new = "", mo = ""
}
msg <- paste0(italic(name_old), ref_old, " was renamed ", italic(name_new), ref_new, mo)
msg <- gsub("et al.", italic("et al."), msg)
msg_plain <- paste0(name_old, ref_old, " >> ", name_new, ref_new)
msg_plain <- c(getOption("mo_renamed", character(0)), msg_plain)
options(mo_renamed = sort(msg_plain))
return(blue(paste("Note:", msg)))
options(mo_renamed = sort(msg))
}
#' @exportMethod print.mo
@ -882,20 +969,20 @@ pull.mo <- function(.data, ...) {
pull(as.data.frame(.data), ...)
}
#' Vector of failed coercion attempts
#'
#' Returns a vector of all failed attempts to coerce values to a valid MO code with \code{\link{as.mo}}.
#' @seealso \code{\link{as.mo}}
#' @rdname as.mo
#' @export
mo_failures <- function() {
getOption("mo_failures")
}
#' Vector of taxonomic renamed items
#'
#' Returns a vector of all renamed items of the last coercion to valid MO codes with \code{\link{as.mo}}.
#' @seealso \code{\link{as.mo}}
#' @rdname as.mo
#' @export
mo_uncertainties <- function() {
getOption("mo_uncertainties")
}
#' @rdname as.mo
#' @export
mo_renamed <- function() {
getOption("mo_renamed")
strip_style(gsub("was renamed", ">>", getOption("mo_renamed"), fixed = TRUE))
}

View File

@ -248,7 +248,11 @@ mo_gramstain <- function(x, language = get_locale(), ...) {
#' @rdname mo_property
#' @export
mo_TSN <- function(x, ...) {
mo_validate(x = x, property = "tsn", ...)
res <- mo_validate(x = x, property = "tsn", ...)
if (any(is.na(res))) {
warning("Some results do not have a TSN, because they are missing from ITIS and were added manually. See ?microorganisms.")
}
res
}
#' @rdname mo_property