mirror of
https://github.com/msberends/AMR.git
synced 2025-07-08 14:01:55 +02:00
mo codes for WHONET
This commit is contained in:
38
R/data.R
38
R/data.R
@ -134,7 +134,7 @@
|
||||
#'
|
||||
#' A data set containing the complete microbial taxonomy of the kingdoms Bacteria, Fungi and Protozoa from ITIS. MO codes can be looked up using \code{\link{as.mo}}.
|
||||
#' @inheritSection ITIS ITIS
|
||||
#' @format A \code{\link{data.frame}} with 18,833 observations and 15 variables:
|
||||
#' @format A \code{\link{data.frame}} with 19,456 observations and 15 variables:
|
||||
#' \describe{
|
||||
#' \item{\code{mo}}{ID of microorganism}
|
||||
#' \item{\code{tsn}}{Taxonomic Serial Number (TSN), as defined by ITIS}
|
||||
@ -153,6 +153,17 @@
|
||||
#' \item{\code{ref}}{Author(s) and year of concerning publication as found in ITIS, see Source}
|
||||
#' }
|
||||
#' @source Integrated Taxonomic Information System (ITIS) public online database, \url{https://www.itis.gov}.
|
||||
#' @details Manually added were:
|
||||
#' \itemize{
|
||||
#' \item{605 species of Aspergillus (as Aspergillus misses from ITIS, list from https://en.wikipedia.org/wiki/List_of_Aspergillus_species on 2019-02-05)}
|
||||
#' \item{23 species of Trichophyton (as Trichophyton misses from ITIS, list from https://en.wikipedia.org/wiki/Trichophyton on 2019-02-05)}
|
||||
#' \item{9 species of Streptococcus (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)}
|
||||
#' \item{2 species of Straphylococcus (coagulase-negative [CoNS] and coagulase-positive [CoPS])}
|
||||
#' \item{1 species of Candida (C. glabrata)}
|
||||
#' \item{2 other undefined (unknown Gram negatives and unknown Gram positives)}
|
||||
#' }
|
||||
#'
|
||||
#' These manual entries have no Taxonomic Serial Number (TSN), so can be looked up with \code{filter(microorganisms, is.na(tsn)}.
|
||||
#' @inheritSection AMR Read more on our website!
|
||||
#' @seealso \code{\link{as.mo}} \code{\link{mo_property}} \code{\link{microorganisms.codes}}
|
||||
"microorganisms"
|
||||
@ -175,12 +186,13 @@
|
||||
|
||||
#' Translation table for microorganism codes
|
||||
#'
|
||||
#' A data set containing commonly used codes for microorganisms. Define your own with \code{\link{set_mo_source}}.
|
||||
#' @format A \code{\link{data.frame}} with 3,303 observations and 2 variables:
|
||||
#' A data set containing commonly used codes for microorganisms, from laboratory systems and WHONET. Define your own with \code{\link{set_mo_source}}.
|
||||
#' @format A \code{\link{data.frame}} with 4,731 observations and 2 variables:
|
||||
#' \describe{
|
||||
#' \item{\code{certe}}{Commonly used code of a microorganism}
|
||||
#' \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
|
||||
#' \item{\code{mo}}{ID of the microorganism in the \code{\link{microorganisms}} data set}
|
||||
#' }
|
||||
#' @inheritSection ITIS ITIS
|
||||
#' @inheritSection AMR Read more on our website!
|
||||
#' @seealso \code{\link{as.mo}} \code{\link{microorganisms}}
|
||||
"microorganisms.codes"
|
||||
@ -246,17 +258,21 @@
|
||||
#' @name supplementary_data
|
||||
#' @inheritSection AMR Read more on our website!
|
||||
# # Renew data:
|
||||
# # sorted on (1) bacteria, (2) fungi, (3) protozoa and then human pathogenic prevalence and then TSN:
|
||||
# microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
|
||||
# # sort on (1) bacteria, (2) fungi, (3) protozoa and then human pathogenic prevalence and then TSN:
|
||||
# data.table::setkey(microorganismsDT, kingdom, prevalence, fullname)
|
||||
# microorganisms.prevDT <- microorganismsDT[prevalence == 9999,]
|
||||
# microorganisms.unprevDT <- microorganismsDT[prevalence != 9999,]
|
||||
# microorganisms.prevDT <- microorganismsDT[prevalence != 9999,]
|
||||
# microorganisms.unprevDT <- microorganismsDT[prevalence == 9999,]
|
||||
# microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
|
||||
# data.table::setkey(microorganisms.oldDT, tsn, name)
|
||||
# devtools::use_data(microorganismsDT, overwrite = TRUE)
|
||||
# devtools::use_data(microorganisms.prevDT, overwrite = TRUE)
|
||||
# devtools::use_data(microorganisms.unprevDT, overwrite = TRUE)
|
||||
# devtools::use_data(microorganisms.oldDT, overwrite = TRUE)
|
||||
# usethis::use_data(microorganismsDT, overwrite = TRUE)
|
||||
# usethis::use_data(microorganisms.prevDT, overwrite = TRUE)
|
||||
# usethis::use_data(microorganisms.unprevDT, overwrite = TRUE)
|
||||
# usethis::use_data(microorganisms.oldDT, overwrite = TRUE)
|
||||
# rm(microorganismsDT)
|
||||
# rm(microorganisms.prevDT)
|
||||
# rm(microorganisms.unprevDT)
|
||||
# rm(microorganisms.oldDT)
|
||||
"microorganismsDT"
|
||||
|
||||
#' @rdname supplementary_data
|
||||
|
176
R/eucast_rules.R
176
R/eucast_rules.R
@ -25,7 +25,7 @@
|
||||
#' @param tbl table with antibiotic columns, like e.g. \code{amox} and \code{amcl}
|
||||
#' @param info print progress
|
||||
#' @param rules a character vector that specifies which rules should be applied - one or more of \code{c("breakpoints", "expert", "other", "all")}
|
||||
#' @param verbose a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected
|
||||
#' @param verbose a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected. It runs all EUCAST rules, but will not be applied to an output - only an informative \code{data.frame} with changes will be returned as output.
|
||||
#' @param amcl,amik,amox,ampi,azit,azlo,aztr,cefa,cfep,cfot,cfox,cfra,cfta,cftr,cfur,chlo,cipr,clar,clin,clox,coli,czol,dapt,doxy,erta,eryt,fosf,fusi,gent,imip,kana,levo,linc,line,mero,mezl,mino,moxi,nali,neom,neti,nitr,norf,novo,oflo,oxac,peni,pipe,pita,poly,pris,qida,rifa,roxi,siso,teic,tetr,tica,tige,tobr,trim,trsu,vanc column name of an antibiotic, see Antibiotics
|
||||
#' @param ... parameters that are passed on to \code{eucast_rules}
|
||||
#' @inheritParams first_isolate
|
||||
@ -101,7 +101,7 @@
|
||||
#' @export
|
||||
#' @importFrom dplyr %>% select pull mutate_at vars
|
||||
#' @importFrom crayon bold bgGreen bgYellow bgRed black green blue italic strip_style
|
||||
#' @return The input of \code{tbl}, possibly with edited values of antibiotics. Or, if \code{verbose = TRUE}, a \code{data.frame} with verbose info.
|
||||
#' @return The input of \code{tbl}, possibly with edited values of antibiotics. Or, if \code{verbose = TRUE}, a \code{data.frame} with all original and new values of the affected bug-drug combinations.
|
||||
#' @source
|
||||
#' \itemize{
|
||||
#' \item{
|
||||
@ -144,7 +144,9 @@
|
||||
#' # 4 Klebsiella pneumoniae - - - - - S S
|
||||
#' # 5 Pseudomonas aeruginosa - - - - - S S
|
||||
#'
|
||||
#' b <- eucast_rules(a, "mo") # 18 results are forced as R or S
|
||||
#'
|
||||
#' # apply EUCAST rules: 18 results are forced as R or S
|
||||
#' b <- eucast_rules(a)
|
||||
#'
|
||||
#' b
|
||||
#' # mo vanc amox coli cfta cfur peni cfox
|
||||
@ -153,6 +155,11 @@
|
||||
#' # 3 Escherichia coli R - - - - R S
|
||||
#' # 4 Klebsiella pneumoniae R R - - - R S
|
||||
#' # 5 Pseudomonas aeruginosa R R - - R R R
|
||||
#'
|
||||
#'
|
||||
#' # do not apply EUCAST rules, but rather get a a data.frame
|
||||
#' # with 18 rows, containing all details about the transformations:
|
||||
#' c <- eucast_rules(a, verbose = TRUE)
|
||||
eucast_rules <- function(tbl,
|
||||
col_mo = NULL,
|
||||
info = TRUE,
|
||||
@ -406,22 +413,31 @@ eucast_rules <- function(tbl,
|
||||
trsu <- col.list[trsu]
|
||||
vanc <- col.list[vanc]
|
||||
|
||||
number_changed <- 0
|
||||
number_added_S <- 0
|
||||
number_added_I <- 0
|
||||
number_added_R <- 0
|
||||
number_changed_to_S <- 0
|
||||
number_changed_to_I <- 0
|
||||
number_changed_to_R <- 0
|
||||
|
||||
number_affected_rows <- integer(0)
|
||||
verbose_info <- data.frame(rule_type = character(0),
|
||||
rule_set = character(0),
|
||||
force_to = character(0),
|
||||
found = integer(0),
|
||||
changed = integer(0),
|
||||
target_columns = integer(0),
|
||||
target_rows = integer(0),
|
||||
verbose_info <- data.frame(row = integer(0),
|
||||
col = character(0),
|
||||
mo = character(0),
|
||||
mo_fullname = character(0),
|
||||
old = character(0),
|
||||
new = character(0),
|
||||
rule_source = character(0),
|
||||
rule_group = character(0),
|
||||
stringsAsFactors = FALSE)
|
||||
|
||||
# helper function for editing the table
|
||||
edit_rsi <- function(to, rule, rows, cols) {
|
||||
cols <- unique(cols[!is.na(cols) & !is.null(cols)])
|
||||
if (length(rows) > 0 & length(cols) > 0) {
|
||||
before_df <- tbl_original
|
||||
before <- as.character(unlist(as.list(tbl_original[rows, cols])))
|
||||
|
||||
tryCatch(
|
||||
# insert into original table
|
||||
tbl_original[rows, cols] <<- to,
|
||||
@ -442,29 +458,81 @@ eucast_rules <- function(tbl,
|
||||
suppressWarnings(
|
||||
tbl[rows, cols] <<- to
|
||||
))
|
||||
|
||||
after <- as.character(unlist(as.list(tbl_original[rows, cols])))
|
||||
number_changed <<- number_changed + sum(before != after, na.rm = TRUE)
|
||||
|
||||
tbl[rows, cols] <<- tbl_original[rows, cols]
|
||||
|
||||
number_newly_added_S <- sum(!before %in% c("S", "I", "R") & after == "S", na.rm = TRUE)
|
||||
number_newly_added_I <- sum(!before %in% c("S", "I", "R") & after == "I", na.rm = TRUE)
|
||||
number_newly_added_R <- sum(!before %in% c("S", "I", "R") & after == "R", na.rm = TRUE)
|
||||
number_newly_changed_to_S <- sum(before %in% c("I", "R") & after == "S", na.rm = TRUE)
|
||||
number_newly_changed_to_I <- sum(before %in% c("S", "R") & after == "I", na.rm = TRUE)
|
||||
number_newly_changed_to_R <- sum(before %in% c("S", "I") & after == "R", na.rm = TRUE)
|
||||
|
||||
# totals
|
||||
number_added_S <<- number_added_S + number_newly_added_S
|
||||
number_added_I <<- number_added_I + number_newly_added_I
|
||||
number_added_R <<- number_added_R + number_newly_added_R
|
||||
number_changed_to_S <<- number_changed_to_S + number_newly_changed_to_S
|
||||
number_changed_to_I <<- number_changed_to_I + number_newly_changed_to_I
|
||||
number_changed_to_R <<- number_changed_to_R + number_newly_changed_to_R
|
||||
number_affected_rows <<- unique(c(number_affected_rows, rows))
|
||||
changed_results <<- changed_results + sum(before != after, na.rm = TRUE) # will be reset at start of every rule
|
||||
|
||||
# will be reset at start of every rule
|
||||
changed_results <<- changed_results +
|
||||
number_newly_added_S +
|
||||
number_newly_added_I +
|
||||
number_newly_added_R +
|
||||
number_newly_changed_to_S +
|
||||
number_newly_changed_to_I +
|
||||
number_newly_changed_to_R
|
||||
|
||||
if (verbose == TRUE) {
|
||||
for (i in 1:length(cols)) {
|
||||
# add new row for every affected column
|
||||
verbose_new <- data.frame(rule_type = strip_style(rule[1]),
|
||||
rule_set = strip_style(rule[2]),
|
||||
force_to = to,
|
||||
found = length(before),
|
||||
changed = sum(before != after, na.rm = TRUE),
|
||||
target_column = cols[i],
|
||||
stringsAsFactors = FALSE)
|
||||
verbose_new$target_rows <- list(unname(rows))
|
||||
rownames(verbose_new) <- NULL
|
||||
verbose_info <<- rbind(verbose_info, verbose_new)
|
||||
for (r in 1:length(rows)) {
|
||||
for (c in 1:length(cols)) {
|
||||
old <- before_df[rows[r], cols[c]]
|
||||
new <- tbl[rows[r], cols[c]]
|
||||
if (!identical(old, new)) {
|
||||
verbose_new <- data.frame(row = rows[r],
|
||||
col = cols[c],
|
||||
mo = tbl_original[rows[r], col_mo],
|
||||
mo_fullname = "",
|
||||
old = old,
|
||||
new = new,
|
||||
rule_source = strip_style(rule[1]),
|
||||
rule_group = strip_style(rule[2]),
|
||||
stringsAsFactors = FALSE)
|
||||
verbose_info <<- rbind(verbose_info, verbose_new)
|
||||
}
|
||||
}
|
||||
}
|
||||
# verbose_new <- data.frame(row = integer(0),
|
||||
# col = character(0),
|
||||
# old = character(0),
|
||||
# new = character(0),
|
||||
# rule_source = character(0),
|
||||
# rule_group = character(0),
|
||||
# stringsAsFactors = FALSE)
|
||||
# a <<- rule
|
||||
# for (i in 1:length(cols)) {
|
||||
# # add new row for every affected column
|
||||
# verbose_new <- data.frame(rule_type = strip_style(rule[1]),
|
||||
# rule_set = strip_style(rule[2]),
|
||||
# force_to = to,
|
||||
# found = length(before),
|
||||
# changed = sum(before != after, na.rm = TRUE),
|
||||
# target_column = cols[i],
|
||||
# stringsAsFactors = FALSE)
|
||||
# verbose_new$target_rows <- list(unname(rows))
|
||||
# rownames(verbose_new) <- NULL
|
||||
# verbose_info <<- rbind(verbose_info, verbose_new)
|
||||
# }
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
na.rm <- function(col) {
|
||||
if (is.null(col)) {
|
||||
""
|
||||
@ -489,15 +557,15 @@ eucast_rules <- function(tbl,
|
||||
# since ampicillin ^= amoxicillin, get the first from the latter (not in original EUCAST table)
|
||||
if (!is.null(ampi) & !is.null(amox)) {
|
||||
if (verbose == TRUE) {
|
||||
cat(bgGreen("\n VERBOSE: transforming",
|
||||
length(which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R"))),
|
||||
"empty ampicillin fields to 'S' based on amoxicillin. "))
|
||||
cat(bgGreen("\n VERBOSE: transforming",
|
||||
length(which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R"))),
|
||||
"empty ampicillin fields to 'I' based on amoxicillin. "))
|
||||
cat(bgGreen("\n VERBOSE: transforming",
|
||||
length(which(tbl[, amox] == "R" & !tbl[, ampi] %in% c("S", "I", "R"))),
|
||||
"empty ampicillin fields to 'R' based on amoxicillin. \n"))
|
||||
cat("\n VERBOSE: transforming",
|
||||
length(which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R"))),
|
||||
"empty ampicillin fields to 'S' based on amoxicillin. ")
|
||||
cat("\n VERBOSE: transforming",
|
||||
length(which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R"))),
|
||||
"empty ampicillin fields to 'I' based on amoxicillin. ")
|
||||
cat("\n VERBOSE: transforming",
|
||||
length(which(tbl[, amox] == "R" & !tbl[, ampi] %in% c("S", "I", "R"))),
|
||||
"empty ampicillin fields to 'R' based on amoxicillin. \n")
|
||||
}
|
||||
tbl[which(tbl[, amox] == "S" & !tbl[, ampi] %in% c("S", "I", "R")), ampi] <- "S"
|
||||
tbl[which(tbl[, amox] == "I" & !tbl[, ampi] %in% c("S", "I", "R")), ampi] <- "I"
|
||||
@ -1804,22 +1872,46 @@ eucast_rules <- function(tbl,
|
||||
} else {
|
||||
wouldve <- ""
|
||||
}
|
||||
if (number_changed == 0) {
|
||||
colour <- green
|
||||
if (sum(number_added_S, number_added_I, number_added_R,
|
||||
number_changed_to_S, number_changed_to_I, number_changed_to_R,
|
||||
na.rm = TRUE) == 0) {
|
||||
colour <- green # is function
|
||||
} else {
|
||||
colour <- blue
|
||||
colour <- blue # is function
|
||||
}
|
||||
decimal.mark <- getOption("OutDec")
|
||||
big.mark <- ifelse(decimal.mark != ",", ",", ".")
|
||||
formatnr <- function(x) {
|
||||
format(x, big.mark = big.mark, decimal.mark = decimal.mark)
|
||||
}
|
||||
cat(bold(paste('\n=> EUCAST rules', paste0(wouldve, 'affected'),
|
||||
number_affected_rows %>% length() %>% format(big.mark = big.mark, decimal.mark = decimal.mark),
|
||||
'out of', nrow(tbl_original) %>% format(big.mark = big.mark, decimal.mark = decimal.mark),
|
||||
'rows ->',
|
||||
colour(paste0(wouldve, 'changed'),
|
||||
number_changed %>% format(big.mark = big.mark, decimal.mark = decimal.mark), 'test results.\n\n'))))
|
||||
number_affected_rows %>% length() %>% formatnr(),
|
||||
'out of', nrow(tbl_original) %>% formatnr(),
|
||||
'rows\n')))
|
||||
total_added <- number_added_S + number_added_I + number_added_R
|
||||
total_changed <- number_changed_to_S + number_changed_to_I + number_changed_to_R
|
||||
cat(colour(paste0(" -> ", wouldve, "added ",
|
||||
bold(formatnr(total_added), "test results"),
|
||||
if(total_added > 0)
|
||||
paste0(" (", formatnr(number_added_S), " as S; ",
|
||||
formatnr(number_added_I), " as I; ",
|
||||
formatnr(number_added_R), " as R)"),
|
||||
"\n")))
|
||||
cat(colour(paste0(" -> ", wouldve, "changed ",
|
||||
bold(formatnr(total_changed), "test results"),
|
||||
if(total_changed > 0)
|
||||
paste0(" (", formatnr(number_changed_to_S), " to S; ",
|
||||
formatnr(number_changed_to_I), " to I; ",
|
||||
formatnr(number_changed_to_R), " to R)"),
|
||||
"\n")))
|
||||
}
|
||||
|
||||
if (verbose == TRUE) {
|
||||
suppressWarnings(
|
||||
suppressMessages(
|
||||
verbose_info$mo_fullname <- mo_fullname(verbose_info$mo)
|
||||
)
|
||||
)
|
||||
return(verbose_info)
|
||||
}
|
||||
|
||||
|
36
R/freq.R
36
R/freq.R
@ -228,7 +228,7 @@ frequency_tbl <- function(x,
|
||||
x.name <- x.name %>% strsplit("%>%", fixed = TRUE) %>% unlist() %>% .[1] %>% trimws()
|
||||
}
|
||||
if (x.name == ".") {
|
||||
x.name <- "a `data.frame`"
|
||||
x.name <- "a data.frame"
|
||||
} else {
|
||||
x.name <- paste0("`", x.name, "`")
|
||||
}
|
||||
@ -797,11 +797,30 @@ print.frequency_tbl <- function(x,
|
||||
opt <- attr(x, "opt")
|
||||
opt$header_txt <- header(x)
|
||||
|
||||
dots <- list(...)
|
||||
if ("markdown" %in% names(dots)) {
|
||||
if (dots$markdown == TRUE) {
|
||||
opt$tbl_format <- "markdown"
|
||||
} else {
|
||||
opt$tbl_format <- "pandoc"
|
||||
}
|
||||
}
|
||||
if (!missing(markdown)) {
|
||||
if (markdown == TRUE) {
|
||||
opt$tbl_format <- "markdown"
|
||||
} else {
|
||||
opt$tbl_format <- "pandoc"
|
||||
}
|
||||
}
|
||||
|
||||
if (length(opt$vars) == 0) {
|
||||
opt$vars <- NULL
|
||||
}
|
||||
|
||||
if (is.null(opt$title)) {
|
||||
if (isTRUE(opt$data %like% "^a data.frame") & opt$tbl_format == "markdown") {
|
||||
opt$data <- gsub("data.frame", "`data.frame`", opt$data, fixed = TRUE)
|
||||
}
|
||||
if (!is.null(opt$data) & !is.null(opt$vars)) {
|
||||
title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
|
||||
} else if (!is.null(opt$data) & is.null(opt$vars)) {
|
||||
@ -845,21 +864,6 @@ print.frequency_tbl <- function(x,
|
||||
if (!missing(big.mark)) {
|
||||
opt$big.mark <- big.mark
|
||||
}
|
||||
dots <- list(...)
|
||||
if ("markdown" %in% names(dots)) {
|
||||
if (dots$markdown == TRUE) {
|
||||
opt$tbl_format <- "markdown"
|
||||
} else {
|
||||
opt$tbl_format <- "pandoc"
|
||||
}
|
||||
}
|
||||
if (!missing(markdown)) {
|
||||
if (markdown == TRUE) {
|
||||
opt$tbl_format <- "markdown"
|
||||
} else {
|
||||
opt$tbl_format <- "pandoc"
|
||||
}
|
||||
}
|
||||
if (!missing(header)) {
|
||||
opt$header <- header
|
||||
}
|
||||
|
267
R/mo.R
267
R/mo.R
@ -54,7 +54,7 @@
|
||||
#'
|
||||
#' This function uses Artificial Intelligence (AI) to help getting fast and logical results. It tries to find matches in this order:
|
||||
#' \itemize{
|
||||
#' \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
|
||||
#' \item{Taxonomic kingdom: it first searches in Bacteria, then Fungi, then Protozoa}
|
||||
#' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
|
||||
#' \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
|
||||
#' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
|
||||
@ -69,13 +69,30 @@
|
||||
#' }
|
||||
#' This means that looking up human pathogenic microorganisms takes less time than looking up human \strong{non}-pathogenic microorganisms.
|
||||
#'
|
||||
#' When using \code{allow_uncertain = TRUE} (which is the default setting), it will use additional rules if all previous AI rules failed to get valid results. Examples:
|
||||
#' \strong{UNCERTAIN RESULTS} \cr
|
||||
#' When using \code{allow_uncertain = TRUE} (which is the default setting), it will use additional rules if all previous AI rules failed to get valid results. These are:
|
||||
#' \itemize{
|
||||
#' \item{It tries to look for previously accepted (but now invalid) taxonomic names}
|
||||
#' \item{It strips off values between brackets and the brackets itself, and re-evaluates the input with all previous rules}
|
||||
#' \item{It strips off words from the end one by one and re-evaluates the input with all previous rules}
|
||||
#' \item{It strips off words from the start one by one and re-evaluates the input with all previous rules}
|
||||
#' \item{It tries to look for some manual changes which are not yet published to the ITIS database (like \emph{Propionibacterium} not yet being \emph{Cutibacterium})}
|
||||
#' }
|
||||
#'
|
||||
#' Examples:
|
||||
#' \itemize{
|
||||
#' \item{\code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPTC_GRB}) needs review.}
|
||||
#' \item{\code{"S. aureus - please mind: MRSA"}. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result \emph{Staphylococcus aureus} (\code{B_STPHY_AUR}) needs review.}
|
||||
#' \item{\code{"D. spartina"}. This is the abbreviation of an old taxonomic name: \emph{Didymosphaeria spartinae} (the last "e" was missing from the input). This fungus was renamed to \emph{Leptosphaeria obiones}, so a warning will be thrown that this result (\code{F_LPTSP_OBI}) needs review.}
|
||||
#' \item{\code{"Fluoroquinolone-resistant Neisseria gonorrhoeae"}. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result \emph{Neisseria gonorrhoeae} (\code{B_NESSR_GON}) needs review.}
|
||||
#' }
|
||||
#'
|
||||
#' Use \code{mo_failures()} to get a vector with all values that could not be coerced to a valid value.
|
||||
#'
|
||||
#' Use \code{mo_uncertainties()} to get a vector with all values that were coerced to a valid value, but with uncertainty.
|
||||
#'
|
||||
#' Use \code{mo_renamed()} to get a vector with all values that could be coerced based on an old, previously accepted taxonomic name.
|
||||
#'
|
||||
#' @inheritSection ITIS ITIS
|
||||
# (source as a section, so it can be inherited by other man pages)
|
||||
#' @section Source:
|
||||
@ -154,7 +171,7 @@ is.mo <- function(x) {
|
||||
|
||||
#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated filter
|
||||
#' @importFrom data.table data.table as.data.table setkey
|
||||
#' @importFrom crayon magenta red italic
|
||||
#' @importFrom crayon magenta red silver italic has_color
|
||||
exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
allow_uncertain = TRUE, reference_df = get_mo_source(),
|
||||
property = "mo", clear_options = TRUE) {
|
||||
@ -170,6 +187,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
|
||||
if (clear_options == TRUE) {
|
||||
options(mo_failures = NULL)
|
||||
options(mo_uncertainties = NULL)
|
||||
options(mo_renamed = NULL)
|
||||
}
|
||||
|
||||
@ -194,6 +212,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
}
|
||||
|
||||
notes <- character(0)
|
||||
uncertainties <- character(0)
|
||||
failures <- character(0)
|
||||
x_input <- x
|
||||
# only check the uniques, which is way faster
|
||||
@ -251,7 +270,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x_backup <- trimws(x, which = "both")
|
||||
|
||||
# remove spp and species
|
||||
x <- trimws(gsub(" +(spp.?|ssp.?|subsp.?|species)", " ", x_backup, ignore.case = TRUE), which = "both")
|
||||
x <- trimws(gsub(" +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)", " ", x_backup, ignore.case = TRUE), which = "both")
|
||||
x_species <- paste(x, "species")
|
||||
# translate to English for supported languages of mo_property
|
||||
x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE)
|
||||
@ -259,6 +278,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x <- gsub("(no MO)", "", x, fixed = TRUE)
|
||||
# remove non-text in case of "E. coli" except dots and spaces
|
||||
x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x)
|
||||
# replace minus by a space
|
||||
x <- gsub("-+", " ", x)
|
||||
# replace hemolytic by haemolytic
|
||||
x <- gsub("ha?emoly", "haemoly", x)
|
||||
# place minus back in streptococci
|
||||
x <- gsub("(alpha|beta|gamma) haemoly", "\\1-haemolytic", x)
|
||||
# remove genus as first word
|
||||
x <- gsub("^Genus ", "", x)
|
||||
|
||||
# but spaces before and after should be omitted
|
||||
x <- trimws(x, which = "both")
|
||||
@ -272,13 +299,13 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x <- gsub("[ .]+", ".*", x)
|
||||
# add start en stop regex
|
||||
x <- paste0('^', x, '$')
|
||||
x_withspaces_start <- paste0('^', x_withspaces)
|
||||
x_withspaces <- paste0('^', x_withspaces, '$')
|
||||
x_withspaces_start_only <- paste0('^', x_withspaces)
|
||||
x_withspaces_start_end <- paste0('^', x_withspaces, '$')
|
||||
|
||||
# cat(paste0('x "', x, '"\n'))
|
||||
# cat(paste0('x_species "', x_species, '"\n'))
|
||||
# cat(paste0('x_withspaces_start "', x_withspaces_start, '"\n'))
|
||||
# cat(paste0('x_withspaces "', x_withspaces, '"\n'))
|
||||
# cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n'))
|
||||
# cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n'))
|
||||
# cat(paste0('x_backup "', x_backup, '"\n'))
|
||||
# cat(paste0('x_trimmed "', x_trimmed, '"\n'))
|
||||
# cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n'))
|
||||
@ -290,16 +317,17 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
|
||||
progress$tick()$print()
|
||||
|
||||
if (identical(x_trimmed[i], "")) {
|
||||
# empty values
|
||||
if (tolower(x_trimmed[i]) %in% c("", "xxx", "other", "none", "unknown")) {
|
||||
# empty and nonsense values, ignore without warning ("xxx" is WHONET code for 'no growth')
|
||||
x[i] <- NA_character_
|
||||
next
|
||||
}
|
||||
if (nchar(x_trimmed[i]) < 3) {
|
||||
|
||||
if (nchar(gsub("[^a-zA-Z]", "", x_trimmed[i])) < 3) {
|
||||
# check if search term was like "A. species", then return first genus found with ^A
|
||||
if (x_backup[i] %like% "species" | x_backup[i] %like% "spp[.]?") {
|
||||
if (x_backup[i] %like% "[a-z]+ species" | x_backup[i] %like% "[a-z] spp[.]?") {
|
||||
# get mo code of first hit
|
||||
found <- microorganismsDT[fullname %like% x_withspaces_start[i], mo]
|
||||
found <- microorganismsDT[fullname %like% x_withspaces_start_only[i], mo]
|
||||
if (length(found) > 0) {
|
||||
mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_")
|
||||
found <- microorganismsDT[mo == mo_code, ..property][[1]]
|
||||
@ -316,14 +344,13 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
next
|
||||
}
|
||||
|
||||
# no nonsense text
|
||||
if (toupper(x_trimmed[i]) %in% c('OTHER', 'NONE', 'UNKNOWN')) {
|
||||
if (x_trimmed[i] %like% "virus") {
|
||||
# there is no fullname like virus, so don't try to coerce it
|
||||
x[i] <- NA_character_
|
||||
failures <- c(failures, x_backup[i])
|
||||
next
|
||||
}
|
||||
|
||||
|
||||
# translate known trivial abbreviations to genus + species ----
|
||||
if (!is.na(x_trimmed[i])) {
|
||||
if (toupper(x_trimmed[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) {
|
||||
@ -339,6 +366,10 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L]
|
||||
next
|
||||
}
|
||||
if (toupper(x_trimmed[i]) %in% c('EHEC', 'EPEC', 'EIEC', 'STEC', 'ATEC')) {
|
||||
x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L]
|
||||
next
|
||||
}
|
||||
if (toupper(x_trimmed[i]) == 'MRPA') {
|
||||
# multi resistant P. aeruginosa
|
||||
x[i] <- microorganismsDT[mo == 'B_PDMNS_AER', ..property][[1]][1L]
|
||||
@ -398,13 +429,25 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
next
|
||||
}
|
||||
if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_trimmed[i])) {
|
||||
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
|
||||
x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L]
|
||||
notes <- c(notes,
|
||||
magenta(paste0("Note: ", italic(x_trimmed[i]),
|
||||
" was considered (a subspecies of) ",
|
||||
italic("Salmonella enterica"),
|
||||
" (B_SLMNL_ENT)")))
|
||||
if (x_trimmed[i] %like% "Salmonella group") {
|
||||
# Salmonella Group A to Z, just return S. species for now
|
||||
x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L]
|
||||
notes <- c(notes,
|
||||
magenta(paste0("Note: ",
|
||||
italic("Salmonella"), " ", trimws(gsub("Salmonella", "", x_trimmed[i])),
|
||||
" was considered ",
|
||||
italic("Salmonella species"),
|
||||
" (B_SLMNL)")))
|
||||
} else {
|
||||
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
|
||||
x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L]
|
||||
notes <- c(notes,
|
||||
magenta(paste0("Note: ",
|
||||
italic("Salmonella"), " ", trimws(gsub("Salmonella", "", x_trimmed[i])),
|
||||
" was considered a subspecies of ",
|
||||
italic("Salmonella enterica"),
|
||||
" (B_SLMNL_ENT)")))
|
||||
}
|
||||
next
|
||||
}
|
||||
}
|
||||
@ -417,14 +460,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
}
|
||||
if (nchar(x_trimmed[i]) > 4) {
|
||||
# not when abbr is esco, stau, klpn, etc.
|
||||
found <- microorganismsDT[tolower(fullname) %like% gsub(" ", ".*", x_trimmed_species[i], fixed = TRUE), ..property][[1]]
|
||||
if (nchar(x_trimmed[i]) >= 6) {
|
||||
found <- microorganismsDT[tolower(fullname) %like% paste0(x_withspaces_start_only[i], "[a-z]+ species"), ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
}
|
||||
}
|
||||
# rest of genus only is in allow_uncertain part.
|
||||
}
|
||||
|
||||
# TRY OTHER SOURCES ----
|
||||
@ -472,29 +515,27 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
next
|
||||
}
|
||||
|
||||
|
||||
# try any match keeping spaces ----
|
||||
found <- microorganisms.prevDT[fullname %like% x_withspaces[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
found <- microorganisms.prevDT[fullname %like% x_withspaces_start_end[i], ..property][[1]]
|
||||
if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
}
|
||||
|
||||
# try any match keeping spaces, not ending with $ ----
|
||||
found <- microorganisms.prevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
found <- microorganisms.prevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
|
||||
if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
}
|
||||
|
||||
# try any match diregarding spaces ----
|
||||
found <- microorganisms.prevDT[fullname %like% x[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
}
|
||||
|
||||
|
||||
# try splitting of characters in the middle and then find ID ----
|
||||
# only when text length is 6 or lower
|
||||
# like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus
|
||||
@ -512,7 +553,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
|
||||
# try fullname without start and stop regex, to also find subspecies ----
|
||||
# like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
|
||||
found <- microorganisms.prevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
|
||||
found <- microorganisms.prevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
@ -549,13 +590,13 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
next
|
||||
}
|
||||
# try any match keeping spaces ----
|
||||
found <- microorganisms.unprevDT[fullname %like% x_withspaces[i], ..property][[1]]
|
||||
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start_end[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
}
|
||||
# try any match keeping spaces, not ending with $ ----
|
||||
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
|
||||
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
@ -583,7 +624,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
|
||||
# try fullname without start and stop regex, to also find subspecies ----
|
||||
# like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
|
||||
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
|
||||
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
x[i] <- found[1L]
|
||||
next
|
||||
@ -594,7 +635,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
# look for old taxonomic names ----
|
||||
found <- microorganisms.oldDT[tolower(name) == tolower(x_backup[i])
|
||||
| tsn == x_trimmed[i]
|
||||
| name %like% x_withspaces[i],]
|
||||
| name %like% x_withspaces_start_end[i],]
|
||||
if (NROW(found) > 0) {
|
||||
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
|
||||
# mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning)
|
||||
@ -604,22 +645,36 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
} else {
|
||||
x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
|
||||
}
|
||||
notes <- c(notes,
|
||||
renamed_note(name_old = found[1, name],
|
||||
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
|
||||
ref_old = found[1, ref],
|
||||
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
|
||||
mo = microorganismsDT[tsn == found[1, tsn_new], mo]))
|
||||
was_renamed(name_old = found[1, name],
|
||||
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
|
||||
ref_old = found[1, ref],
|
||||
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
|
||||
mo = microorganismsDT[tsn == found[1, tsn_new], mo])
|
||||
next
|
||||
}
|
||||
|
||||
# check for uncertain results ----
|
||||
if (allow_uncertain == TRUE) {
|
||||
|
||||
uncertain_fn <- function(a.x_backup, b.x_trimmed, c.x_withspaces, d.x_withspaces_start, e.x) {
|
||||
# (1) look again for old taxonomic names, now for G. species ----
|
||||
found <- microorganisms.oldDT[name %like% c.x_withspaces
|
||||
| name %like% d.x_withspaces_start
|
||||
uncertain_fn <- function(a.x_backup, b.x_trimmed, c.x_withspaces_start_end, d.x_withspaces_start_only, e.x) {
|
||||
|
||||
# (1) look for genus only, part of name ----
|
||||
if (nchar(b.x_trimmed) > 4 & !b.x_trimmed %like% " ") {
|
||||
if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) {
|
||||
# not when input is like Genustext, because then Neospora would lead to Actinokineospora
|
||||
found <- microorganismsDT[tolower(fullname) %like% paste(b.x_trimmed, "species"), ..property][[1]]
|
||||
if (length(found) > 0) {
|
||||
x[i] <- found[1L]
|
||||
uncertainties <<- c(uncertainties,
|
||||
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found[1L], fullname][[1]], " (", found[1L], ")"))
|
||||
return(x)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# (2) look again for old taxonomic names, now for G. species ----
|
||||
found <- microorganisms.oldDT[name %like% c.x_withspaces_start_end
|
||||
| name %like% d.x_withspaces_start_only
|
||||
| name %like% e.x,]
|
||||
if (NROW(found) > 0 & nchar(b.x_trimmed) >= 6) {
|
||||
if (property == "ref") {
|
||||
@ -630,32 +685,29 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
} else {
|
||||
x <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
|
||||
}
|
||||
warning(red(paste0('(UNCERTAIN) "',
|
||||
a.x_backup, '" >> ', italic(found[1, name]), " (TSN ", found[1, tsn], ")")),
|
||||
call. = FALSE, immediate. = FALSE)
|
||||
notes <<- c(notes,
|
||||
renamed_note(name_old = found[1, name],
|
||||
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
|
||||
ref_old = found[1, ref],
|
||||
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
|
||||
mo = microorganismsDT[tsn == found[1, tsn_new], mo]))
|
||||
was_renamed(name_old = found[1, name],
|
||||
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
|
||||
ref_old = found[1, ref],
|
||||
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
|
||||
mo = microorganismsDT[tsn == found[1, tsn_new], mo])
|
||||
uncertainties <<- c(uncertainties,
|
||||
paste0("'", a.x_backup, "' >> ", found[1, name], " (TSN ", found[1, tsn], ")"))
|
||||
return(x)
|
||||
}
|
||||
|
||||
# (2) strip values between brackets ----
|
||||
# (3) strip values between brackets ----
|
||||
a.x_backup_stripped <- gsub("( [(].*[)])", "", a.x_backup)
|
||||
a.x_backup_stripped <- trimws(gsub(" ", " ", a.x_backup_stripped, fixed = TRUE))
|
||||
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, clear_options = FALSE, allow_uncertain = FALSE)))
|
||||
if (!is.na(found) & nchar(b.x_trimmed) >= 6) {
|
||||
found_result <- found
|
||||
found <- microorganismsDT[mo == found, ..property][[1]]
|
||||
warning(red(paste0('(UNCERTAIN) "',
|
||||
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
|
||||
call. = FALSE, immediate. = FALSE)
|
||||
uncertainties <<- c(uncertainties,
|
||||
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
|
||||
return(found[1L])
|
||||
}
|
||||
|
||||
# (3) try to strip off one element and check the remains ----
|
||||
# (4) try to strip off one element from end and check the remains ----
|
||||
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
|
||||
if (length(x_strip) > 1 & nchar(b.x_trimmed) >= 6) {
|
||||
for (i in 1:(length(x_strip) - 1)) {
|
||||
@ -664,22 +716,39 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
if (!is.na(found)) {
|
||||
found_result <- found
|
||||
found <- microorganismsDT[mo == found, ..property][[1]]
|
||||
warning(red(paste0('(UNCERTAIN) "',
|
||||
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
|
||||
call. = FALSE, immediate. = FALSE)
|
||||
uncertainties <<- c(uncertainties,
|
||||
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
|
||||
return(found[1L])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# (4) not yet implemented taxonomic changes in ITIS
|
||||
# (5) try to strip off one element from start and check the remains ----
|
||||
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
|
||||
if (length(x_strip) > 1 & nchar(b.x_trimmed) >= 6) {
|
||||
for (i in 2:(length(x_strip))) {
|
||||
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
|
||||
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, clear_options = FALSE, allow_uncertain = FALSE)))
|
||||
if (!is.na(found)) {
|
||||
found_result <- found
|
||||
found <- microorganismsDT[mo == found, ..property][[1]]
|
||||
uncertainties <<- c(uncertainties,
|
||||
paste0("'", a.x_backup, "' >> ", microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
|
||||
return(found[1L])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# (6) not yet implemented taxonomic changes in ITIS ----
|
||||
found <- suppressMessages(suppressWarnings(exec_as.mo(TEMPORARY_TAXONOMY(b.x_trimmed), clear_options = FALSE, allow_uncertain = FALSE)))
|
||||
if (!is.na(found)) {
|
||||
found_result <- found
|
||||
found <- microorganismsDT[mo == found, ..property][[1]]
|
||||
warning(red(paste0('(UNCERTAIN) "',
|
||||
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
|
||||
warning(silver(paste0('Guessed with uncertainty: "',
|
||||
a.x_backup, '" >> ', italic(microorganismsDT[mo == found_result[1L], fullname][[1]]), " (", found_result[1L], ")")),
|
||||
call. = FALSE, immediate. = FALSE)
|
||||
uncertainties <<- c(uncertainties,
|
||||
paste0('"', a.x_backup, '" >> ', microorganismsDT[mo == found_result[1L], fullname][[1]], " (", found_result[1L], ")"))
|
||||
return(found[1L])
|
||||
}
|
||||
|
||||
@ -687,7 +756,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
return(NA_character_)
|
||||
}
|
||||
|
||||
x[i] <- uncertain_fn(x_backup[i], x_trimmed[i], x_withspaces[i], x_withspaces_start[i], x[i])
|
||||
x[i] <- uncertain_fn(x_backup[i], x_trimmed[i], x_withspaces_start_end[i], x_withspaces_start_only[i], x[i])
|
||||
if (!is.na(x[i])) {
|
||||
next
|
||||
}
|
||||
@ -696,26 +765,39 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
# not found ----
|
||||
x[i] <- NA_character_
|
||||
failures <- c(failures, x_backup[i])
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
# failures
|
||||
failures <- failures[!failures %in% c(NA, NULL, NaN)]
|
||||
if (length(failures) > 0) {
|
||||
options(mo_failures = sort(unique(failures)))
|
||||
plural <- ""
|
||||
plural <- c("value", "it")
|
||||
if (n_distinct(failures) > 1) {
|
||||
plural <- "s"
|
||||
plural <- c("values", "them")
|
||||
}
|
||||
total_failures <- length(x_input[x_input %in% failures & !x_input %in% c(NA, NULL, NaN)])
|
||||
total_n <- length(x_input[!x_input %in% c(NA, NULL, NaN)])
|
||||
msg <- paste0("\n", n_distinct(failures), " unique value", plural,
|
||||
msg <- paste0("\n", n_distinct(failures), " unique ", plural[1],
|
||||
" (^= ", percent(total_failures / total_n, round = 1, force_zero = TRUE),
|
||||
") could not be coerced to a valid MO code")
|
||||
if (n_distinct(failures) <= 10) {
|
||||
msg <- paste0(msg, ": ", paste('"', unique(failures), '"', sep = "", collapse = ', '))
|
||||
}
|
||||
msg <- paste0(msg, ". Use mo_failures() to review failured input.")
|
||||
msg <- paste0(msg, ". Use mo_failures() to review ", plural[2], ".")
|
||||
warning(red(msg),
|
||||
call. = FALSE,
|
||||
immediate. = TRUE) # thus will always be shown, even if >= warnings
|
||||
}
|
||||
# uncertainties
|
||||
if (length(uncertainties) > 0) {
|
||||
options(mo_uncertainties = sort(unique(uncertainties)))
|
||||
plural <- c("value", "it")
|
||||
if (n_distinct(failures) > 1) {
|
||||
plural <- c("values", "them")
|
||||
}
|
||||
msg <- paste0("\nResults of ", n_distinct(uncertainties), " input ", plural[1],
|
||||
" guessed with uncertainty. Use mo_uncertainties() to review ", plural[2], ".")
|
||||
warning(red(msg),
|
||||
call. = FALSE,
|
||||
immediate. = TRUE) # thus will always be shown, even if >= warnings
|
||||
@ -774,6 +856,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x[x == microorganismsDT[mo == 'B_STRPTC_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRK', ..property][[1]][1L]
|
||||
}
|
||||
|
||||
|
||||
# Wrap up ----------------------------------------------------------------
|
||||
|
||||
# comply to x, which is also unique and without empty values
|
||||
x_input_unique_nonempty <- unique(x_input[!is.na(x_input) & !is.null(x_input) & !identical(x_input, "")])
|
||||
|
||||
@ -794,10 +879,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
|
||||
x <- as.integer(x)
|
||||
}
|
||||
|
||||
if (length(notes > 0)) {
|
||||
if (length(mo_renamed()) > 0) {
|
||||
if (has_color()) {
|
||||
notes <- getOption("mo_renamed")
|
||||
} else {
|
||||
notes <- mo_renamed()
|
||||
}
|
||||
notes <- sort(notes)
|
||||
for (i in 1:length(notes)) {
|
||||
base::message(notes[i])
|
||||
base::message(blue(paste("Note:", notes[i])))
|
||||
}
|
||||
}
|
||||
|
||||
@ -810,7 +900,7 @@ TEMPORARY_TAXONOMY <- function(x) {
|
||||
}
|
||||
|
||||
#' @importFrom crayon blue italic
|
||||
renamed_note <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") {
|
||||
was_renamed <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") {
|
||||
if (!is.na(ref_old)) {
|
||||
ref_old <- paste0(" (", ref_old, ")")
|
||||
} else {
|
||||
@ -828,10 +918,7 @@ renamed_note <- function(name_old, name_new, ref_old = "", ref_new = "", mo = ""
|
||||
}
|
||||
msg <- paste0(italic(name_old), ref_old, " was renamed ", italic(name_new), ref_new, mo)
|
||||
msg <- gsub("et al.", italic("et al."), msg)
|
||||
msg_plain <- paste0(name_old, ref_old, " >> ", name_new, ref_new)
|
||||
msg_plain <- c(getOption("mo_renamed", character(0)), msg_plain)
|
||||
options(mo_renamed = sort(msg_plain))
|
||||
return(blue(paste("Note:", msg)))
|
||||
options(mo_renamed = sort(msg))
|
||||
}
|
||||
|
||||
#' @exportMethod print.mo
|
||||
@ -882,20 +969,20 @@ pull.mo <- function(.data, ...) {
|
||||
pull(as.data.frame(.data), ...)
|
||||
}
|
||||
|
||||
#' Vector of failed coercion attempts
|
||||
#'
|
||||
#' Returns a vector of all failed attempts to coerce values to a valid MO code with \code{\link{as.mo}}.
|
||||
#' @seealso \code{\link{as.mo}}
|
||||
#' @rdname as.mo
|
||||
#' @export
|
||||
mo_failures <- function() {
|
||||
getOption("mo_failures")
|
||||
}
|
||||
|
||||
#' Vector of taxonomic renamed items
|
||||
#'
|
||||
#' Returns a vector of all renamed items of the last coercion to valid MO codes with \code{\link{as.mo}}.
|
||||
#' @seealso \code{\link{as.mo}}
|
||||
#' @rdname as.mo
|
||||
#' @export
|
||||
mo_uncertainties <- function() {
|
||||
getOption("mo_uncertainties")
|
||||
}
|
||||
|
||||
#' @rdname as.mo
|
||||
#' @export
|
||||
mo_renamed <- function() {
|
||||
getOption("mo_renamed")
|
||||
strip_style(gsub("was renamed", ">>", getOption("mo_renamed"), fixed = TRUE))
|
||||
}
|
||||
|
@ -248,7 +248,11 @@ mo_gramstain <- function(x, language = get_locale(), ...) {
|
||||
#' @rdname mo_property
|
||||
#' @export
|
||||
mo_TSN <- function(x, ...) {
|
||||
mo_validate(x = x, property = "tsn", ...)
|
||||
res <- mo_validate(x = x, property = "tsn", ...)
|
||||
if (any(is.na(res))) {
|
||||
warning("Some results do not have a TSN, because they are missing from ITIS and were added manually. See ?microorganisms.")
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
#' @rdname mo_property
|
||||
|
Reference in New Issue
Block a user