diff --git a/DESCRIPTION b/DESCRIPTION index 9a9b9fa5..1c069aec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.5.0.9023 -Date: 2019-03-15 +Version: 0.5.0.9024 +Date: 2019-03-18 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NAMESPACE b/NAMESPACE index 5ef332cf..ed636cd8 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -220,6 +220,7 @@ importFrom(crayon,magenta) importFrom(crayon,red) importFrom(crayon,silver) importFrom(crayon,strip_style) +importFrom(crayon,underline) importFrom(crayon,yellow) importFrom(data.table,as.data.table) importFrom(data.table,data.table) diff --git a/R/catalogue_of_life.R b/R/catalogue_of_life.R index 0b9c98fd..3cd597f0 100755 --- a/R/catalogue_of_life.R +++ b/R/catalogue_of_life.R @@ -30,7 +30,7 @@ #' @section Included taxa: #' Included are: #' \itemize{ -#' \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} +#' \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria and Protozoa} #' \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. 
By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} #' \item{All ~2,000 (sub)species from ~100 other relevant genera, from the kingdoms of Animalia and Plantae (like \emph{Strongyloides} and \emph{Taenia})} #' \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} @@ -44,6 +44,8 @@ #' @inheritSection AMR Read more on our website! #' @name catalogue_of_life #' @rdname catalogue_of_life +#' @seealso Data set \code{\link{microorganisms}} for the actual data. \cr +#' Function \code{\link{as.mo}()} to use the data for intelligent determination of microorganisms. #' @examples #' # Get version info of included data set #' catalogue_of_life_version() @@ -77,11 +79,16 @@ NULL #' Version info of included Catalogue of Life #' -#' This function returns a list with info about the included data from the Catalogue of Life. It also shows if the included version is their latest annual release. The Catalogue of Life releases their annual release in March each year. +#' This function returns information about the included data from the Catalogue of Life. It also shows if the included version is their latest annual release. The Catalogue of Life releases their annual release in March each year. #' @seealso \code{\link{microorganisms}} #' @details The list item \code{is_latest_annual_release} is based on the system date. +#' +#' For DSMZ, see \code{?microorganisms}. +#' @return a \code{list}, invisibly #' @inheritSection catalogue_of_life Catalogue of Life #' @inheritSection AMR Read more on our website! 
+#' @importFrom crayon bold underline +#' @importFrom dplyr filter #' @export #' @examples #' library(dplyr) @@ -89,10 +96,34 @@ NULL #' microorganisms %>% group_by(kingdom) %>% freq(phylum, nmax = NULL) catalogue_of_life_version <- function() { # see the `catalogue_of_life` list in R/data.R - list(version = catalogue_of_life$version, - url = catalogue_of_life$url, - # annual release always somewhere in March - is_latest_annual_release = Sys.Date() < as.Date(paste0(catalogue_of_life$year + 1, "-04-01")), - n_species = nrow(AMR::microorganisms), - n_synonyms = nrow(AMR::microorganisms.old)) + lst <- list(catalogue_of_life = + list(version = gsub("{year}", catalogue_of_life$year, catalogue_of_life$version, fixed = TRUE), + url = gsub("{year}", catalogue_of_life$year, catalogue_of_life$url_CoL, fixed = TRUE), + # annual release always somewhere in March, so before April is TRUE, FALSE otherwise + is_latest_annual_release = Sys.Date() < as.Date(paste0(catalogue_of_life$year + 1, "-04-01")), + n = nrow(filter(AMR::microorganisms, source == "CoL"))), + deutsche_sammlung_von_mikroorganismen_und_zellkulturen = + list(version = "Prokaryotic Nomenclature Up-to-Date from DSMZ", + url = catalogue_of_life$url_DSMZ, + yearmonth = catalogue_of_life$yearmonth_DSMZ, + n = nrow(filter(AMR::microorganisms, source == "DSMZ"))), + total_included = + list( + n_total_species = nrow(AMR::microorganisms), + n_total_synonyms = nrow(AMR::microorganisms.old))) + + cat(paste0(bold("Included in this package are:\n\n"), + underline(lst$catalogue_of_life$version), "\n", + " Available at: ", lst$catalogue_of_life$url, "\n", + " Number of included species: ", format(lst$catalogue_of_life$n, big.mark = ","), "\n", + " (based on your system time, this is most likely ", ifelse(lst$catalogue_of_life$is_latest_annual_release, "", "not "), "the latest annual release)\n\n", + underline(paste0(lst$deutsche_sammlung_von_mikroorganismen_und_zellkulturen$version, " (", + 
lst$deutsche_sammlung_von_mikroorganismen_und_zellkulturen$yearmonth, ")")), "\n", + " Available at: ", lst$deutsche_sammlung_von_mikroorganismen_und_zellkulturen$url, "\n", + " Number of included species: ", format(lst$deutsche_sammlung_von_mikroorganismen_und_zellkulturen$n, big.mark = ","), "\n\n", + "Total number of species included: ", format(lst$total_included$n_total_species, big.mark = ","), "\n", + "Total number of synonyms included: ", format(lst$total_included$n_total_synonyms, big.mark = ","), "\n\n", + "See for more info ?microorganisms and ?catalogue_of_life.\n")) + + return(base::invisible(lst)) } diff --git a/R/data.R b/R/data.R index ace145ef..68bd43d8 100755 --- a/R/data.R +++ b/R/data.R @@ -130,11 +130,11 @@ # "antibiotics" -#' Data set with ~60,000 microorganisms +#' Data set with ~65,000 microorganisms #' #' A data set containing the microbial taxonomy of six kingdoms from the Catalogue of Life. MO codes can be looked up using \code{\link{as.mo}}. #' @inheritSection catalogue_of_life Catalogue of Life -#' @format A \code{\link{data.frame}} with 59,985 observations and 15 variables: +#' @format A \code{\link{data.frame}} with 65,629 observations and 16 variables: #' \describe{ #' \item{\code{mo}}{ID of microorganism as used by this package} #' \item{\code{col_id}}{Catalogue of Life ID} @@ -150,30 +150,40 @@ #' \item{\code{rank}}{Taxonomic rank of the microorganism, like \code{"species"} or \code{"genus"}} #' \item{\code{ref}}{Author(s) and year of concerning scientific publication} #' \item{\code{species_id}}{ID of the species as used by the Catalogue of Life} +#' \item{\code{source}}{Either \code{"CoL"}, \code{"DSMZ"} (see source) or "manually added"} #' \item{\code{prevalence}}{Prevalence of the microorganism, see \code{?as.mo}} #' } -#' @source Catalogue of Life: Annual Checklist (public online database), \url{www.catalogueoflife.org}. 
#' @details Manually added were: #' \itemize{ #' \item{9 species of \emph{Streptococcus} (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)} #' \item{2 species of \emph{Staphylococcus} (coagulase-negative [CoNS] and coagulase-positive [CoPS])} -#' \item{2 other undefined (unknown Gram negatives and unknown Gram positives)} +#' \item{3 other undefined (unknown, unknown Gram negatives and unknown Gram positives)} +#' \item{8,830 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life} #' } +#' @section About the records from DSMZ (see source): +#' Names of prokaryotes are defined as being validly published by the International Code of Nomenclature of Bacteria. Validly published are all names which are included in the Approved Lists of Bacterial Names and the names subsequently published in the International Journal of Systematic Bacteriology (IJSB) and, from January 2000, in the International Journal of Systematic and Evolutionary Microbiology (IJSEM) as original articles or in the validation lists. +#' +#' From: \url{https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads/readme.html} +#' @source Catalogue of Life: Annual Checklist (public online taxonomic database), \url{www.catalogueoflife.org} (check included annual version with \code{\link{catalogue_of_life_version}()}). +#' +#' Leibniz Institute DSMZ-German Collection of Microorganisms and Cell Cultures, Germany, Prokaryotic Nomenclature Up-to-Date, \url{http://www.dsmz.de/bacterial-diversity/prokaryotic-nomenclature-up-to-date} (check included version with \code{\link{catalogue_of_life_version}()}). #' @inheritSection AMR Read more on our website! 
#' @seealso \code{\link{as.mo}}, \code{\link{mo_property}}, \code{\link{microorganisms.codes}} "microorganisms" catalogue_of_life <- list( year = 2018, - version = "Catalogue of Life: 2018 Annual Checklist", - url = "http://www.catalogueoflife.org/annual-checklist/2018" + version = "Catalogue of Life: {year} Annual Checklist", + url_CoL = "http://www.catalogueoflife.org/annual-checklist/{year}/", + url_DSMZ = "https://www.dsmz.de/microorganisms/pnu/bacterial_nomenclature_info_mm.php", + yearmonth_DSMZ = "February 2019" ) #' Data set with previously accepted taxonomic names #' #' A data set containing old (previously valid or accepted) taxonomic names according to the Catalogue of Life. This data set is used internally by \code{\link{as.mo}}. #' @inheritSection catalogue_of_life Catalogue of Life -#' @format A \code{\link{data.frame}} with 17,069 observations and 4 variables: +#' @format A \code{\link{data.frame}} with 16,911 observations and 4 variables: #' \describe{ #' \item{\code{col_id}}{Catalogue of Life ID} #' \item{\code{tsn_new}}{New Catalogue of Life ID} diff --git a/R/globals.R b/R/globals.R index 6fbd9834..7c480b9b 100755 --- a/R/globals.R +++ b/R/globals.R @@ -80,6 +80,7 @@ globalVariables(c(".", "phylum", "prevalence", "prevalent", + "property", "psae", "R", "real_first_isolate", diff --git a/R/mdro.R b/R/mdro.R index 51a9c0f8..1602db3d 100755 --- a/R/mdro.R +++ b/R/mdro.R @@ -150,7 +150,7 @@ mdro <- function(tbl, } else if (guideline$country$code == 'nl') { guideline$country$name <- 'The Netherlands' guideline$name <- 'WIP-Richtlijn BRMO' - guideline$version <- 'Revision of December 2017' + guideline$version <- 'Revision as of December 2017' guideline$source <- 'https://www.rivm.nl/Documenten_en_publicaties/Professioneel_Praktisch/Richtlijnen/Infectieziekten/WIP_Richtlijnen/WIP_Richtlijnen/Ziekenhuizen/WIP_richtlijn_BRMO_Bijzonder_Resistente_Micro_Organismen_ZKH' # add here more countries like this: # } else if (country$code == 'xx') { diff --git 
a/R/mo.R b/R/mo.R index f610b9f4..fbadb762 100755 --- a/R/mo.R +++ b/R/mo.R @@ -21,9 +21,9 @@ #' Transform to microorganism ID #' -#' Use this function to determine a valid microorganism ID (\code{mo}). Determination is done using intelligent rules and the complete taxonomic kingdoms Bacteria, Chromista, Protozoa, Archaea, Viruses, and most microbial species from the kingdom Fungi (see Source). The input can be almost anything: a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), an abbreviation known in the field (like \code{"MRSA"}), or just a genus. Please see Examples. +#' Use this function to determine a valid microorganism ID (\code{mo}). Determination is done using intelligent rules and the complete taxonomic kingdoms Bacteria, Chromista, Protozoa, Archaea and most microbial species from the kingdom Fungi (see Source). The input can be almost anything: a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), an abbreviation known in the field (like \code{"MRSA"}), or just a genus. Please see Examples. #' @param x a character vector or a \code{data.frame} with one or two columns -#' @param Becker a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1]. +#' @param Becker a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1]. Note that this does not include species that were newly named after this publication. #' #' This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS". 
#' @param Lancefield a logical to indicate whether beta-haemolytic \emph{Streptococci} should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield [2]. These \emph{Streptococci} will be categorised in their first group, e.g. \emph{Streptococcus dysgalactiae} will be group C, although officially it was also categorised into groups G and L. @@ -50,13 +50,15 @@ #' | | ----> species, a 3-4 letter acronym #' | ----> genus, a 5-7 letter acronym, mostly without vowels #' ----> taxonomic kingdom: A (Archaea), AN (Animalia), B (Bacteria), C (Chromista), -#' F (Fungi), P (Protozoa), PL (Plantae) or V (Viruses) +#' F (Fungi), P (Protozoa) or PL (Plantae) #' } #' #' Values that cannot be coered will be considered 'unknown' and have an MO code \code{UNKNOWN}. #' #' Use the \code{\link{mo_property}_*} functions to get properties based on the returned code, see Examples. #' +#' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}). +#' #' \strong{Self-learning algoritm} \cr #' The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 90-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}. 
#' @@ -65,7 +67,7 @@ #' \itemize{ #' \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations} #' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones (see \emph{Microbial prevalence of pathogens in humans} below)} -#' \item{Taxonomic kingdom: it first searches in Bacteria/Chromista, then Fungi, then Protozoa, then Viruses} +#' \item{Taxonomic kingdom: it first searches in Bacteria/Chromista, then Fungi, then Protozoa} #' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches} #' } #' @@ -82,7 +84,6 @@ #' \itemize{ #' \item{(uncertainty level 1): It tries to look for only matching genera} #' \item{(uncertainty level 1): It tries to look for previously accepted (but now invalid) taxonomic names} -#' \item{(uncertainty level 1): It tries to look for some manual changes which are not (yet) published to the Catalogue of Life (like \emph{Propionibacterium} being \emph{Cutibacterium})} #' \item{(uncertainty level 2): It strips off values between brackets and the brackets itself, and re-evaluates the input with all previous rules} #' \item{(uncertainty level 2): It strips off words from the end one by one and re-evaluates the input with all previous rules} #' \item{(uncertainty level 3): It strips off words from the start one by one and re-evaluates the input with all previous rules} @@ -144,6 +145,12 @@ #' as.mo("VISA") # Vancomycin Intermediate S. aureus #' as.mo("VRSA") # Vancomycin Resistant S. aureus #' +#' # Dyslexia is no problem - these all work: +#' as.mo("Ureaplasma urealyticum") +#' as.mo("Ureaplasma urealyticus") +#' as.mo("Ureaplasmium urealytica") +#' as.mo("Ureaplazma urealitycium") +#' #' as.mo("Streptococcus group A") #' as.mo("GAS") # Group A Streptococci #' as.mo("GBS") # Group B Streptococci @@ -154,13 +161,9 @@ #' as.mo("S. pyogenes") # will remain species: B_STRPT_PYO #' as.mo("S. 
pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRA #' -#' # Use mo_* functions to get a specific property based on `mo` -#' Ecoli <- as.mo("E. coli") # returns `B_ESCHR_COL` -#' mo_genus(Ecoli) # returns "Escherichia" -#' mo_gramstain(Ecoli) # returns "Gram negative" -#' # but it uses as.mo internally too, so you could also just use: +#' # All mo_* functions use as.mo() internally too (see ?mo_property): #' mo_genus("E. coli") # returns "Escherichia" -#' +#' mo_gramstain("E. coli") # returns "Gram negative" #' #' \dontrun{ #' df$mo <- as.mo(df$microorganism_name) @@ -246,13 +249,13 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = TRUE, # save them to history set_mo_history(x, y, force = isTRUE(list(...)$force_mo_history)) - } else { - # will be checked for mo class in validation and uses exec_as.mo internally if necessary - y <- mo_validate(x = x, property = "mo", - Becker = Becker, Lancefield = Lancefield, - allow_uncertain = allow_uncertain, reference_df = reference_df, - force_mo_history = isTRUE(list(...)$force_mo_history)) - } + } else { + # will be checked for mo class in validation and uses exec_as.mo internally if necessary + y <- mo_validate(x = x, property = "mo", + Becker = Becker, Lancefield = Lancefield, + allow_uncertain = allow_uncertain, reference_df = reference_df, + force_mo_history = isTRUE(list(...)$force_mo_history)) + } structure(.Data = y, class = "mo") @@ -270,6 +273,7 @@ is.mo <- function(x) { # param property a column name of AMR::microorganisms # param initial_search logical - is FALSE when coming from uncertain tries, which uses exec_as.mo internally too # param force_mo_history logical - whether found result must be saved with set_mo_history (default FALSE on non-interactive sessions) +# param debug logical - show different lookup texts while searching exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, @@ -277,7 +281,8 @@ exec_as.mo <- function(x, reference_df = 
get_mo_source(), property = "mo", initial_search = TRUE, - force_mo_history = FALSE) { + force_mo_history = FALSE, + debug = FALSE) { if (!"AMR" %in% base::.packages()) { library("AMR") @@ -336,6 +341,7 @@ exec_as.mo <- function(x, & !identical(x, "") & !identical(x, "xxx") & !identical(x, "con")] + x_input_backup <- x # conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life) if (any(x %like% "^[BFP]_[A-Z]{3,7}") & !all(x %in% microorganisms$mo)) { @@ -455,6 +461,9 @@ exec_as.mo <- function(x, x <- gsub("(ph|f|v)+", "(ph|f|v)+", x, ignore.case = TRUE) x <- gsub("(th|t)+", "(th|t)+", x, ignore.case = TRUE) x <- gsub("a+", "a+", x, ignore.case = TRUE) + # allow any ending of -um, -us, -ium, -ius and -a (needs perl for the negative lookahead): + x <- gsub("(um|u\\[sz\\]\\+|\\[iy\\]\\+um|\\[iy\\]\\+u\\[sz\\]\\+|a\\+)(?![a-z[])", + "(um|us|ium|ius|a)", x, ignore.case = TRUE, perl = TRUE) x <- gsub("e+", "e+", x, ignore.case = TRUE) x <- gsub("o+", "o+", x, ignore.case = TRUE) @@ -474,16 +483,18 @@ exec_as.mo <- function(x, x_withspaces_end_only <- paste0(x_withspaces, '$') x_withspaces_start_end <- paste0('^', x_withspaces, '$') - # cat(paste0('x "', x, '"\n')) - # cat(paste0('x_species "', x_species, '"\n')) - # cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n')) - # cat(paste0('x_withspaces_end_only "', x_withspaces_end_only, '"\n')) - # cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n')) - # cat(paste0('x_backup "', x_backup, '"\n')) - # cat(paste0('x_backup_without_spp "', x_backup_without_spp, '"\n')) - # cat(paste0('x_trimmed "', x_trimmed, '"\n')) - # cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) - # cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) + if (debug == TRUE) { + cat(paste0('x "', x, '"\n')) + cat(paste0('x_species "', x_species, '"\n')) + cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n')) + 
cat(paste0('x_withspaces_end_only "', x_withspaces_end_only, '"\n')) + cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n')) + cat(paste0('x_backup "', x_backup, '"\n')) + cat(paste0('x_backup_without_spp "', x_backup_without_spp, '"\n')) + cat(paste0('x_trimmed "', x_trimmed, '"\n')) + cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) + cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) + } progress <- progress_estimated(n = length(x), min_time = 3) @@ -509,13 +520,13 @@ exec_as.mo <- function(x, # most probable: is exact match in fullname if (length(found) > 0) { x[i] <- found[1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } - if (any(x_backup_without_spp[i] %in% c(NA, "", "xxx", "con"))) { + if (any(tolower(x_backup_without_spp[i]) %in% c(NA, "", "xxx", "con", "na", "nan"))) { x[i] <- NA_character_ next } @@ -523,8 +534,8 @@ exec_as.mo <- function(x, if (tolower(x_backup_without_spp[i]) %in% c("other", "none", "unknown")) { # empty and nonsense values, ignore without warning x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -540,8 +551,8 @@ exec_as.mo <- function(x, # return first genus that begins with x_trimmed, e.g. when "E. spp." 
if (length(found) > 0) { x[i] <- found[1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -549,9 +560,9 @@ exec_as.mo <- function(x, } # fewer than 3 chars and not looked for species, add as failure x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - failures <- c(failures, x_backup[i]) - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -559,9 +570,9 @@ exec_as.mo <- function(x, if (x_backup_without_spp[i] %like% "virus") { # there is no fullname like virus, so don't try to coerce it x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - failures <- c(failures, x_backup[i]) - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -570,38 +581,38 @@ exec_as.mo <- function(x, if (!is.na(x_trimmed[i])) { if (toupper(x_backup_without_spp[i]) %in% c('MRSA', 'MSSA', 'VISA', 'VRSA')) { x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (toupper(x_backup_without_spp[i]) %in% c('MRSE', 'MSSE')) { x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = 
force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (toupper(x_backup_without_spp[i]) == "VRE" | x_backup_without_spp[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (toupper(x_backup_without_spp[i]) %in% c("EHEC", "EPEC", "EIEC", "STEC", "ATEC")) { x[i] <- microorganismsDT[mo == 'B_ESCHR_COL', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (toupper(x_backup_without_spp[i]) == 'MRPA') { # multi resistant P. aeruginosa x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -609,40 +620,40 @@ exec_as.mo <- function(x, | toupper(x_backup_without_spp[i]) == 'CRSM') { # co-trim resistant S. maltophilia x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (toupper(x_backup_without_spp[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) { # peni I, peni R, vanco I, vanco R: S. 
pneumoniae x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (x_backup_without_spp[i] %like% '^G[ABCDFGHK]S$') { # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB) x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (x_backup_without_spp[i] %like% '(streptococ|streptokok).* [ABCDFGHK]$') { # Streptococci in different languages, like "estreptococos grupo B" x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } if (x_backup_without_spp[i] %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { # Streptococci in different languages, like "Group A Streptococci" x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_backup_without_spp[i], ignore.case = TRUE), ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -652,8 +663,8 @@ exec_as.mo <- function(x, 
| x_backup_without_spp[i] %like% '[ck]o?ns[^a-z]?$') { # coerce S. coagulase negative x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -662,8 +673,8 @@ exec_as.mo <- function(x, | x_backup_without_spp[i] %like% '[ck]o?ps[^a-z]?$') { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -672,8 +683,8 @@ exec_as.mo <- function(x, | x_trimmed[i] %like% 'gram[ -]?neg.*') { # coerce Gram negatives x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -682,8 +693,8 @@ exec_as.mo <- function(x, | x_trimmed[i] %like% 'gram[ -]?pos.*') { # coerce Gram positives x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -691,8 +702,8 @@ exec_as.mo <- function(x, if (x_backup_without_spp[i] %like% "Salmonella group") { # Salmonella Group A to Z, just return S. 
species for now x[i] <- microorganismsDT[mo == 'B_SLMNL', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } options(mo_renamed = c(getOption("mo_renamed"), magenta(paste0("Note: ", @@ -703,8 +714,8 @@ exec_as.mo <- function(x, } else { # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } options(mo_renamed = c(getOption("mo_renamed"), magenta(paste0("Note: ", @@ -723,8 +734,8 @@ exec_as.mo <- function(x, found <- microorganismsDT[fullname_lower %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]] if (length(found) > 0) { x[i] <- found[1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -732,8 +743,8 @@ exec_as.mo <- function(x, found <- microorganismsDT[fullname_lower %like% paste0("^", unregex(x_backup_without_spp[i]), "[a-z]+"), ..property][[1]] if (length(found) > 0) { x[i] <- found[1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -747,8 +758,8 @@ exec_as.mo <- function(x, mo_found <- AMR::microorganisms.codes[toupper(x_backup[i]) == AMR::microorganisms.codes[, 1], "mo"][1L] if (length(mo_found) > 0) { x[i] <- 
microorganismsDT[mo == mo_found, ..property][[1]][1L] - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -769,9 +780,9 @@ exec_as.mo <- function(x, # allow no codes less than 4 characters long, was already checked for WHONET above if (nchar(x_backup_without_spp[i]) < 4) { x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - failures <- c(failures, x_backup[i]) - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -790,11 +801,6 @@ exec_as.mo <- function(x, if (length(found) > 0) { return(found[1L]) } - found <- data_to_check[fullname_lower %like% b.x_trimmed - | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] - if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { - return(found[1L]) - } # try any match keeping spaces ---- found <- data_to_check[fullname %like% d.x_withspaces_start_end, ..property][[1]] @@ -818,6 +824,14 @@ exec_as.mo <- function(x, return(found[1L]) } + # try a trimmed version + found <- data_to_check[fullname_lower %like% b.x_trimmed + | fullname_lower %like% c.x_trimmed_without_group, ..property][[1]] + if (length(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { + return(found[1L]) + } + + # try splitting of characters in the middle and then find ID ---- # only when text length is 6 or lower # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. 
aureus @@ -854,8 +868,8 @@ exec_as.mo <- function(x, f.x_withspaces_end_only = x_withspaces_end_only[i], g.x_backup_without_spp = x_backup_without_spp[i]) if (!empty_result(x[i])) { - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -869,8 +883,8 @@ exec_as.mo <- function(x, f.x_withspaces_end_only = x_withspaces_end_only[i], g.x_backup_without_spp = x_backup_without_spp[i]) if (!empty_result(x[i])) { - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -884,8 +898,8 @@ exec_as.mo <- function(x, f.x_withspaces_end_only = x_withspaces_end_only[i], g.x_backup_without_spp = x_backup_without_spp[i]) if (!empty_result(x[i])) { - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -910,8 +924,8 @@ exec_as.mo <- function(x, ref_old = found[1, ref], ref_new = microorganismsDT[col_id == found[1, col_id_new], ref], mo = microorganismsDT[col_id == found[1, col_id_new], mo]) - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } next } @@ -954,19 +968,6 @@ exec_as.mo <- function(x, mo = paste("CoL", found[1, col_id]))) return(x) } - - # (2) not yet implemented taxonomic changes in Catalogue of Life ---- - found <- suppressMessages(suppressWarnings(exec_as.mo(TEMPORARY_TAXONOMY(b.x_trimmed), initial_search = FALSE, allow_uncertain = FALSE))) - if 
(!empty_result(found)) { - found_result <- found - found <- microorganismsDT[mo == found, ..property][[1]] - uncertainties <<- rbind(uncertainties, - data.frame(uncertainty = 1, - input = a.x_backup, - fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], - mo = found_result[1L])) - return(found[1L]) - } } if (allow_uncertain >= 2) { @@ -1074,17 +1075,17 @@ exec_as.mo <- function(x, next } - # not found ---- + # no results found: make them UNKNOWN ---- x[i] <- microorganismsDT[mo == "UNKNOWN", ..property][[1]] - failures <- c(failures, x_backup[i]) - if (property == "mo" & initial_search == TRUE) { - set_mo_history(x_backup[i], x[i], force = force_mo_history) + if (initial_search == TRUE) { + failures <- c(failures, x_backup[i]) + set_mo_history(x_backup[i], get_mo_code(x[i], property), force = force_mo_history) } } } # handling failures ---- - failures <- x_input[x == "UNKNOWN"] # failures[!failures %in% c(NA, NULL, NaN)] + failures <- failures[!failures %in% c(NA, NULL, NaN)] if (length(failures) > 0 & initial_search == TRUE) { options(mo_failures = sort(unique(failures))) plural <- c("value", "it", "was") @@ -1172,7 +1173,6 @@ exec_as.mo <- function(x, x[x == microorganismsDT[mo == 'B_STRPT_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRK', ..property][[1]][1L] } - # Wrap up ---------------------------------------------------------------- # comply to x, which is also unique and without empty values @@ -1189,10 +1189,12 @@ exec_as.mo <- function(x, df_input <- data.frame(input = as.character(x_input), stringsAsFactors = FALSE) - x <- df_input %>% - left_join(df_found, - by = "input") %>% - pull(found) + suppressWarnings( + x <- df_input %>% + left_join(df_found, + by = "input") %>% + pull(found) + ) if (property == "mo") { class(x) <- "mo" @@ -1217,11 +1219,6 @@ empty_result <- function(x) { all(x %in% c(NA, "UNKNOWN")) } -TEMPORARY_TAXONOMY <- function(x) { - x[x %like% 'Cutibacterium'] <- gsub('Cutibacterium', 
'Propionibacterium', x[x %like% 'Cutibacterium']) - x -} - #' @importFrom crayon italic was_renamed <- function(name_old, name_new, ref_old = "", ref_new = "", mo = "") { if (!is.na(ref_old)) { @@ -1368,3 +1365,11 @@ nr2char <- function(x) { unregex <- function(x) { gsub("[^a-zA-Z0-9 -]", "", x) } + +get_mo_code <- function(x, property) { + if (property == "mo") { + unique(x) + } else { + AMR::microorganisms[base::which(AMR::microorganisms[, property] %in% x),]$mo + } +} diff --git a/R/mo_history.R b/R/mo_history.R index 43fd3e88..e9c52bf4 100644 --- a/R/mo_history.R +++ b/R/mo_history.R @@ -20,15 +20,21 @@ # ==================================================================== # # print successful as.mo coercions to file, not uncertain ones -#' @importFrom dplyr distinct +#' @importFrom dplyr %>% distinct filter set_mo_history <- function(x, mo, force = FALSE) { file_location <- base::path.expand('~/.Rhistory_mo') if (base::interactive() | force == TRUE) { mo_hist <- read_mo_history(force = force) - df <- distinct(data.frame(x, mo, stringsAsFactors = FALSE), x, .keep_all = TRUE) - x <- df$x + df <- data.frame(x, mo, stringsAsFactors = FALSE) %>% + distinct(x, .keep_all = TRUE) %>% + filter(!is.na(x) & !is.na(mo)) + if (nrow(df) == 0) { + return(base::invisible()) + } + x <- toupper(df$x) mo <- df$mo for (i in 1:length(x)) { + # save package version too, as both the as.mo() algorithm and the reference data set may change if (NROW(mo_hist[base::which(mo_hist$x == x[i] & mo_hist$package_version == utils::packageVersion("AMR")),]) == 0) { base::write(x = c(x[i], mo[i], base::as.character(utils::packageVersion("AMR"))), file = file_location, @@ -46,7 +52,7 @@ get_mo_history <- function(x, force = FALSE) { if (base::is.null(file_read)) { NA } else { - data.frame(x, stringsAsFactors = FALSE) %>% + data.frame(x = toupper(x), stringsAsFactors = FALSE) %>% left_join(file_read, by = "x") %>% pull(mo) } diff --git a/R/mo_property.R b/R/mo_property.R index d8ad30f4..e0fb4a24 
100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -26,7 +26,7 @@ #' @param property one of the column names of one of the \code{\link{microorganisms}} data set or \code{"shortname"} #' @param language language of the returned text, defaults to system language (see \code{\link{get_locale}}) and can also be set with \code{\link{getOption}("AMR_locale")}. Use \code{language = NULL} or \code{language = ""} to prevent translation. #' @param ... other parameters passed on to \code{\link{as.mo}} -#' @param open browse the URL using \code{\link[utils]{browseURL}} +#' @param open browse the URL using \code{\link[utils]{browseURL}()} #' @details All functions will return the most recently known taxonomic property according to the Catalogue of Life, except for \code{mo_ref}, \code{mo_authors} and \code{mo_year}. This leads to the following results: #' \itemize{ #' \item{\code{mo_fullname("Chlamydia psittaci")} will return \code{"Chlamydophila psittaci"} (with a warning about the renaming)} @@ -34,9 +34,9 @@ #' \item{\code{mo_ref("Chlamydophila psittaci")} will return \code{"Everett et al., 1999"} (without a warning)} #' } #' -#' The Gram stain - \code{mo_gramstain()} - will be determined on the taxonomic kingdom and phylum. According to Cavalier-Smith (2002) who defined subkingdoms Negibacteria and Posibacteria, only these phyla are Posibacteria: Actinobacteria, Chloroflexi, Firmicutes and Tenericutes (ref: \url{https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=956097}). These bacteria are considered Gram positive - all other bacteria are considered Gram negative. Species outside the kingdom of Bacteria will return a value \code{NA}. +#' The Gram stain - \code{mo_gramstain()} - will be determined on the taxonomic kingdom and phylum. According to Cavalier-Smith (2002) who defined subkingdoms Negibacteria and Posibacteria, only these phyla are Posibacteria: Actinobacteria, Chloroflexi, Firmicutes and Tenericutes. 
These bacteria are considered Gram positive - all other bacteria are considered Gram negative. Species outside the kingdom of Bacteria will return a value \code{NA}. #' -#' The function \code{mo_url()} will return the direct URL to the species in the Catalogue of Life. +#' The function \code{mo_url()} will return the direct URL to the online database entry, which also shows the scientific reference of the concerned species. #' @inheritSection get_locale Supported languages #' @inheritSection catalogue_of_life Catalogue of Life #' @inheritSection as.mo Source @@ -99,7 +99,7 @@ #' #' # Becker classification, see ?as.mo #' mo_fullname("S. epi") # "Staphylococcus epidermidis" -#' mo_fullname("S. epi", Becker = TRUE) # "Coagulase Negative Staphylococcus (CoNS)" +#' mo_fullname("S. epi", Becker = TRUE) # "Coagulase-negative Staphylococcus (CoNS)" #' mo_shortname("S. epi") # "S. epidermidis" #' mo_shortname("S. epi", Becker = TRUE) # "CoNS" #' @@ -320,14 +320,24 @@ mo_taxonomy <- function(x, language = get_locale(), ...) { #' @rdname mo_property #' @importFrom utils browseURL +#' @importFrom dplyr %>% left_join select mutate case_when #' @export mo_url <- function(x, open = FALSE, ...) { - u <- mo_validate(x = x, property = "species_id", ...) - u[u != ""] <- paste0(catalogue_of_life$url, "/details/species/id/", u) - names(u) <- mo_fullname(x = x, ... = ...) + mo <- AMR::as.mo(x = x, ... = ...) 
+ df <- data.frame(mo, stringsAsFactors = FALSE) %>% + left_join(select(AMR::microorganisms, mo, source, species_id), by = "mo") %>% + mutate(url = case_when(source == "CoL" ~ + paste0(gsub("{year}", catalogue_of_life$year, catalogue_of_life$url_CoL, fixed = TRUE), "details/species/id/", species_id), + source == "DSMZ" ~ + paste0(catalogue_of_life$url_DSMZ, "?bnu_no=", species_id, "#", species_id), + TRUE ~ + NA_character_)) + + u <- df$url + names(u) <- mo_fullname(mo) if (open == TRUE) { if (length(u) > 1) { - warning("only the first URL will be opened, as `browseURL` only suports one string.") + warning("only the first URL will be opened, as `browseURL()` only suports one string.") } browseURL(u[1L]) } @@ -364,7 +374,7 @@ mo_translate <- function(x, language) { } x_tobetranslated <- grepl(x = x, - pattern = "(Coagulase Negative Staphylococcus|Coagulase Positive Staphylococcus|Beta-haemolytic Streptococcus|unknown Gram negatives|unknown Gram positives|unknown name|unknown kingdom|unknown phylum|unknown class|unknown order|unknown family|unknown genus|unknown species|unknown subspecies|unknown rank|CoNS|CoPS|Gram negative|Gram positive|Bacteria|Fungi|Protozoa|biogroup|biotype|vegetative|group|Group)") + pattern = "(Coagulase-negative Staphylococcus|Coagulase-positive Staphylococcus|Beta-haemolytic Streptococcus|unknown Gram negatives|unknown Gram positives|unknown name|unknown kingdom|unknown phylum|unknown class|unknown order|unknown family|unknown genus|unknown species|unknown subspecies|unknown rank|CoNS|CoPS|Gram negative|Gram positive|Bacteria|Fungi|Protozoa|biogroup|biotype|vegetative|group|Group)") if (sum(x_tobetranslated, na.rm = TRUE) == 0) { return(x) @@ -374,8 +384,8 @@ mo_translate <- function(x, language) { x[x_tobetranslated] <- case_when( # German language == "de" ~ x[x_tobetranslated] %>% - gsub("Coagulase Negative Staphylococcus","Koagulase-negative Staphylococcus", ., fixed = TRUE) %>% - gsub("Coagulase Positive 
Staphylococcus","Koagulase-positive Staphylococcus", ., fixed = TRUE) %>% + gsub("Coagulase-negative Staphylococcus","Koagulase-negative Staphylococcus", ., fixed = TRUE) %>% + gsub("Coagulase-positive Staphylococcus","Koagulase-positive Staphylococcus", ., fixed = TRUE) %>% gsub("Beta-haemolytic Streptococcus", "Beta-h\u00e4molytischer Streptococcus", ., fixed = TRUE) %>% gsub("unknown Gram negatives", "unbekannte Gramnegativen", ., fixed = TRUE) %>% gsub("unknown Gram positives", "unbekannte Grampositiven", ., fixed = TRUE) %>% @@ -405,8 +415,8 @@ mo_translate <- function(x, language) { # Dutch language == "nl" ~ x[x_tobetranslated] %>% - gsub("Coagulase Negative Staphylococcus","Coagulase-negatieve Staphylococcus", ., fixed = TRUE) %>% - gsub("Coagulase Positive Staphylococcus","Coagulase-positieve Staphylococcus", ., fixed = TRUE) %>% + gsub("Coagulase-negative Staphylococcus","Coagulase-negatieve Staphylococcus", ., fixed = TRUE) %>% + gsub("Coagulase-positive Staphylococcus","Coagulase-positieve Staphylococcus", ., fixed = TRUE) %>% gsub("Beta-haemolytic Streptococcus", "Beta-hemolytische Streptococcus", ., fixed = TRUE) %>% gsub("unknown Gram negatives", "onbekende Gram-negatieven", ., fixed = TRUE) %>% gsub("unknown Gram positives", "onbekende Gram-positieven", ., fixed = TRUE) %>% @@ -436,8 +446,8 @@ mo_translate <- function(x, language) { # Spanish language == "es" ~ x[x_tobetranslated] %>% - gsub("Coagulase Negative Staphylococcus","Staphylococcus coagulasa negativo", ., fixed = TRUE) %>% - gsub("Coagulase Positive Staphylococcus","Staphylococcus coagulasa positivo", ., fixed = TRUE) %>% + gsub("Coagulase-negative Staphylococcus","Staphylococcus coagulasa negativo", ., fixed = TRUE) %>% + gsub("Coagulase-positive Staphylococcus","Staphylococcus coagulasa positivo", ., fixed = TRUE) %>% gsub("Beta-haemolytic Streptococcus", "Streptococcus Beta-hemol\u00edtico", ., fixed = TRUE) %>% gsub("unknown Gram negatives", "Gram negativos desconocidos", ., fixed = 
TRUE) %>% gsub("unknown Gram positives", "Gram positivos desconocidos", ., fixed = TRUE) %>% @@ -465,8 +475,8 @@ mo_translate <- function(x, language) { # Italian language == "it" ~ x[x_tobetranslated] %>% - gsub("Coagulase Negative Staphylococcus","Staphylococcus negativo coagulasi", ., fixed = TRUE) %>% - gsub("Coagulase Positive Staphylococcus","Staphylococcus positivo coagulasi", ., fixed = TRUE) %>% + gsub("Coagulase-negative Staphylococcus","Staphylococcus negativo coagulasi", ., fixed = TRUE) %>% + gsub("Coagulase-positive Staphylococcus","Staphylococcus positivo coagulasi", ., fixed = TRUE) %>% gsub("Beta-haemolytic Streptococcus", "Streptococcus Beta-emolitico", ., fixed = TRUE) %>% gsub("unknown Gram negatives", "Gram negativi sconosciuti", ., fixed = TRUE) %>% gsub("unknown Gram positives", "Gram positivi sconosciuti", ., fixed = TRUE) %>% @@ -493,8 +503,8 @@ mo_translate <- function(x, language) { # French language == "fr" ~ x[x_tobetranslated] %>% - gsub("Coagulase Negative Staphylococcus","Staphylococcus \u00e0 coagulase n\u00e9gative", ., fixed = TRUE) %>% - gsub("Coagulase Positive Staphylococcus","Staphylococcus \u00e0 coagulase positif", ., fixed = TRUE) %>% + gsub("Coagulase-negative Staphylococcus","Staphylococcus \u00e0 coagulase n\u00e9gative", ., fixed = TRUE) %>% + gsub("Coagulase-positive Staphylococcus","Staphylococcus \u00e0 coagulase positif", ., fixed = TRUE) %>% gsub("Beta-haemolytic Streptococcus", "Streptococcus B\u00eata-h\u00e9molytique", ., fixed = TRUE) %>% gsub("unknown Gram negatives", "Gram n\u00e9gatifs inconnus", ., fixed = TRUE) %>% gsub("unknown Gram positives", "Gram positifs inconnus", ., fixed = TRUE) %>% @@ -522,8 +532,8 @@ mo_translate <- function(x, language) { # Portuguese language == "pt" ~ x[x_tobetranslated] %>% - gsub("Coagulase Negative Staphylococcus","Staphylococcus coagulase negativo", ., fixed = TRUE) %>% - gsub("Coagulase Positive Staphylococcus","Staphylococcus coagulase positivo", ., fixed = TRUE) %>% + 
gsub("Coagulase-negative Staphylococcus","Staphylococcus coagulase negativo", ., fixed = TRUE) %>% + gsub("Coagulase-positive Staphylococcus","Staphylococcus coagulase positivo", ., fixed = TRUE) %>% gsub("Beta-haemolytic Streptococcus", "Streptococcus Beta-hemol\u00edtico", ., fixed = TRUE) %>% gsub("unknown Gram negatives", "Gram negativos desconhecidos", ., fixed = TRUE) %>% gsub("unknown Gram positives", "Gram positivos desconhecidos", ., fixed = TRUE) %>% @@ -550,7 +560,6 @@ mo_translate <- function(x, language) { iconv(to = "UTF-8")) x - } mo_validate <- function(x, property, ...) { diff --git a/data/microorganisms.codes.rda b/data/microorganisms.codes.rda index 8b7b005e..9e30f40f 100644 Binary files a/data/microorganisms.codes.rda and b/data/microorganisms.codes.rda differ diff --git a/data/microorganisms.old.rda b/data/microorganisms.old.rda index 7ea93cfe..a6082ecb 100644 Binary files a/data/microorganisms.old.rda and b/data/microorganisms.old.rda differ diff --git a/data/microorganisms.rda b/data/microorganisms.rda index db3e616c..d2e4c1d1 100755 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/data/septic_patients.rda b/data/septic_patients.rda index e189e0ae..095af455 100755 Binary files a/data/septic_patients.rda and b/data/septic_patients.rda differ diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index fc84075e..5fab3eb1 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index 578d71ef..90cf473b 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -40,7 +40,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 @@ -192,7 +192,7 @@

Benchmarks

Matthijs S. Berends

-

15 March 2019

+

18 March 2019

@@ -217,14 +217,14 @@ times = 10) print(S.aureus, unit = "ms", signif = 2) #> Unit: milliseconds -#> expr min lq mean median uq max neval -#> as.mo("sau") 17.0 17.0 22.0 17.0 19.0 59.0 10 -#> as.mo("stau") 41.0 41.0 46.0 41.0 44.0 83.0 10 -#> as.mo("staaur") 17.0 17.0 26.0 17.0 18.0 74.0 10 -#> as.mo("STAAUR") 17.0 17.0 29.0 17.0 52.0 62.0 10 -#> as.mo("S. aureus") 31.0 31.0 32.0 31.0 32.0 32.0 10 -#> as.mo("S. aureus") 31.0 31.0 48.0 32.0 73.0 110.0 10 -#> as.mo("Staphylococcus aureus") 7.4 7.4 7.7 7.4 8.2 8.6 10 +#> expr min lq mean median uq max neval +#> as.mo("sau") 18.0 18.0 22 18.0 18.0 61 10 +#> as.mo("stau") 49.0 50.0 62 50.0 50.0 130 10 +#> as.mo("staaur") 18.0 18.0 27 18.0 18.0 66 10 +#> as.mo("STAAUR") 18.0 18.0 23 18.0 19.0 66 10 +#> as.mo("S. aureus") 29.0 29.0 39 29.0 42.0 73 10 +#> as.mo("S. aureus") 29.0 29.0 38 29.0 31.0 72 10 +#> as.mo("Staphylococcus aureus") 8.3 8.3 12 8.3 8.8 44 10

In the table above, all measurements are shown in milliseconds (thousandths of a second). A value of 5 milliseconds means it can determine 200 input values per second. In case of 100 milliseconds, this is only 10 input values per second. The second input is the only one that has to be looked up thoroughly. All the others are known codes (the first one is a WHONET code) or common laboratory codes, or common full organism names like the last one. Full organism names are always preferred.

To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Thermus islandicus (B_THERMS_ISL), a bug probably never found before in humans:

T.islandicus <- microbenchmark(as.mo("theisl"),
@@ -236,12 +236,12 @@
 print(T.islandicus, unit = "ms", signif = 2)
 #> Unit: milliseconds
 #>                         expr min  lq mean median  uq max neval
-#>              as.mo("theisl") 420 430  450    470 470 470    10
-#>              as.mo("THEISL") 420 440  480    470 480 680    10
-#>       as.mo("T. islandicus") 290 290  310    300 330 350    10
-#>      as.mo("T.  islandicus") 300 300  330    330 350 350    10
-#>  as.mo("Thermus islandicus")  67  67   86     68 110 120    10
-

That takes 11 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

+#> as.mo("theisl") 470 470 490 470 510 520 10 +#> as.mo("THEISL") 470 470 500 500 520 530 10 +#> as.mo("T. islandicus") 74 74 84 75 77 130 10 +#> as.mo("T. islandicus") 74 74 84 74 75 120 10 +#> as.mo("Thermus islandicus") 74 78 100 120 120 130 10 +

That takes 7.9 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Thermus islandicus (which is very uncommon):

par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16)
 
@@ -290,8 +290,8 @@
 print(run_it, unit = "ms", signif = 3)
 #> Unit: milliseconds
 #>            expr min  lq mean median  uq max neval
-#>  mo_fullname(x) 738 813  847    819 921 975    10
-

So transforming 500,000 values (!!) of 50 unique values only takes 0.82 seconds (818 ms). You only lose time on your unique input values.

+#> mo_fullname(x) 770 811 822 817 824 952 10 +

So transforming 500,000 values (!!) of 50 unique values only takes 0.82 seconds (816 ms). You only lose time on your unique input values.

@@ -304,10 +304,10 @@ print(run_it, unit = "ms", signif = 3) #> Unit: milliseconds #> expr min lq mean median uq max neval -#> A 11.000 11.100 15.700 11.300 11.400 52.900 10 -#> B 28.700 28.900 29.400 29.200 29.500 30.500 10 -#> C 0.322 0.556 0.523 0.568 0.581 0.586 10

-

So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0006 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

+#> A 12.000 12.600 12.900 13.200 13.200 13.300 10 +#> B 26.100 26.200 27.200 26.600 28.100 30.400 10 +#> C 0.394 0.738 0.745 0.774 0.869 0.982 10 +

So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0008 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

run_it <- microbenchmark(A = mo_species("aureus"),
                          B = mo_genus("Staphylococcus"),
                          C = mo_fullname("Staphylococcus aureus"),
@@ -320,14 +320,14 @@
 print(run_it, unit = "ms", signif = 3)
 #> Unit: milliseconds
 #>  expr   min    lq  mean median    uq   max neval
-#>     A 0.314 0.339 0.399  0.380 0.460 0.507    10
-#>     B 0.347 0.387 0.455  0.402 0.493 0.684    10
-#>     C 0.429 0.505 0.566  0.588 0.656 0.660    10
-#>     D 0.321 0.340 0.383  0.367 0.412 0.490    10
-#>     E 0.303 0.328 0.369  0.379 0.403 0.449    10
-#>     F 0.251 0.323 0.346  0.348 0.391 0.400    10
-#>     G 0.286 0.305 0.345  0.338 0.389 0.398    10
-#>     H 0.272 0.297 0.355  0.338 0.427 0.450    10
+#> A 0.316 0.382 0.407 0.430 0.434 0.457 10 +#> B 0.371 0.470 0.497 0.498 0.548 0.579 10 +#> C 0.410 0.465 0.662 0.606 0.851 0.944 10 +#> D 0.366 0.376 0.406 0.393 0.429 0.493 10 +#> E 0.301 0.318 0.374 0.350 0.426 0.476 10 +#> F 0.304 0.331 0.387 0.392 0.438 0.482 10 +#> G 0.303 0.331 0.381 0.374 0.432 0.473 10 +#> H 0.316 0.374 0.430 0.399 0.443 0.709 10

Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.

@@ -335,7 +335,7 @@ Results in other languages

When the system language is non-English and supported by this AMR package, some functions will have a translated result. This takes almost no extra time:

mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
-#> [1] "Coagulase Negative Staphylococcus (CoNS)"
+#> [1] "Coagulase-negative Staphylococcus (CoNS)"
 
 mo_fullname("CoNS", language = "es") # or just mo_fullname("CoNS") on a Spanish system
 #> [1] "Staphylococcus coagulasa negativo (CoNS)"
@@ -354,13 +354,13 @@
 print(run_it, unit = "ms", signif = 4)
 #> Unit: milliseconds
 #>  expr   min    lq  mean median    uq   max neval
-#>    en 18.05 18.11 19.33  18.25 18.65 25.12    10
-#>    de 30.15 30.84 43.57  31.08 72.47 73.96    10
-#>    nl 30.30 30.63 34.96  30.71 30.73 73.40    10
-#>    es 30.24 30.49 31.39  30.97 32.20 33.68    10
-#>    it 30.53 30.71 31.18  30.83 31.71 32.38    10
-#>    fr 29.64 30.49 35.32  30.84 32.25 73.00    10
-#>    pt 30.73 30.81 39.47  31.09 32.29 73.25    10
+#> en 19.22 19.33 20.42 19.58 19.84 28.13 10 +#> de 31.28 31.62 41.16 32.79 34.86 75.79 10 +#> nl 31.56 31.71 36.86 31.97 33.34 78.40 10 +#> es 31.32 31.94 42.76 32.98 41.72 81.33 10 +#> it 31.31 31.67 31.96 31.90 32.15 33.26 10 +#> fr 31.09 31.43 37.49 32.53 33.73 75.53 10 +#> pt 31.24 31.82 36.35 31.95 32.12 76.57 10

Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png index 9cd7c1ca..d79e8873 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png index 18c316e1..858c8cd7 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index 9b46fe1f..331cf09a 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 diff --git a/docs/authors.html b/docs/authors.html index c149c7c1..f1ee4ea0 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 diff --git a/docs/index.html b/docs/index.html index 5bb1144c..8a6ba10e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 @@ -197,12 +197,12 @@

(TLDR - to find out how to conduct AMR analysis, please continue reading here to get started.)


AMR is a free and open-source R package to simplify the analysis and prediction of Antimicrobial Resistance (AMR) and to work with microbial and antimicrobial properties by using evidence-based methods. It supports any data format, including WHONET/EARS-Net data.

-

After installing this package, R knows almost all ~60,000 microorganisms and ~500 antibiotics by name and code, and knows all about valid RSI and MIC values.

+

After installing this package, R knows ~65,000 microorganisms and ~500 antibiotics by name and code, and knows all about valid RSI and MIC values.

Used to SPSS? Read our tutorial on how to import data from SPSS, SAS or Stata and learn in which ways R outclasses any of these statistical packages.

We created this package for both academic research and routine analysis at the Faculty of Medical Sciences of the University of Groningen, the Netherlands, and the Medical Microbiology & Infection Prevention (MMBI) department of the University Medical Center Groningen (UMCG). This R package is actively maintained and is free software; you can freely use and distribute it for both personal and commercial (but not patent) purposes under the terms of the GNU General Public License version 2.0 (GPL-2), as published by the Free Software Foundation. Read the full license here.

This package can be used for:

diff --git a/docs/news/index.html b/docs/news/index.html index c6f0e65c..4c8c600a 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 diff --git a/docs/reference/as.mo.html b/docs/reference/as.mo.html index 547eebbb..eaa6f33b 100644 --- a/docs/reference/as.mo.html +++ b/docs/reference/as.mo.html @@ -47,7 +47,7 @@ - + @@ -80,7 +80,7 @@ AMR (for R) - 0.5.0.9023 + 0.5.0.9024 @@ -237,7 +237,7 @@
-

Use this function to determine a valid microorganism ID (mo). Determination is done using intelligent rules and the complete taxonomic kingdoms Bacteria, Chromista, Protozoa, Archaea, Viruses, and most microbial species from the kingdom Fungi (see Source). The input can be almost anything: a full name (like "Staphylococcus aureus"), an abbreviated name (like "S. aureus"), an abbreviation known in the field (like "MRSA"), or just a genus. Please see Examples.

+

Use this function to determine a valid microorganism ID (mo). Determination is done using intelligent rules and the complete taxonomic kingdoms Bacteria, Chromista, Protozoa, Archaea and most microbial species from the kingdom Fungi (see Source). The input can be almost anything: a full name (like "Staphylococcus aureus"), an abbreviated name (like "S. aureus"), an abbreviation known in the field (like "MRSA"), or just a genus. Please see Examples.

@@ -263,7 +263,7 @@ Becker -

a logical to indicate whether Staphylococci should be categorised into Coagulase Negative Staphylococci ("CoNS") and Coagulase Positive Staphylococci ("CoPS") instead of their own species, according to Karsten Becker et al. [1].

+

a logical to indicate whether Staphylococci should be categorised into Coagulase Negative Staphylococci ("CoNS") and Coagulase Positive Staphylococci ("CoPS") instead of their own species, according to Karsten Becker et al. [1]. Note that this does not include species that were newly named after this publication.

This excludes Staphylococcus aureus at default, use Becker = "all" to also categorise S. aureus as "CoPS".

@@ -304,17 +304,18 @@ A microbial ID from this package (class: mo) typically looks like t | | ----> species, a 3-4 letter acronym | ----> genus, a 5-7 letter acronym, mostly without vowels ----> taxonomic kingdom: A (Archaea), AN (Animalia), B (Bacteria), C (Chromista), - F (Fungi), P (Protozoa), PL (Plantae) or V (Viruses) + F (Fungi), P (Protozoa) or PL (Plantae)

Values that cannot be coerced will be considered 'unknown' and have an MO code UNKNOWN.

Use the mo_property_* functions to get properties based on the returned code, see Examples.

+

The algorithm uses data from the Catalogue of Life (see below) and from one other source (see ?microorganisms).

Self-learning algorithm
The as.mo() function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use clean_mo_history() to reset the algorithms. Only experience from your current AMR package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 90-95% faster than the first try. The algorithm saves its previous findings to ~/.Rhistory_mo.

Intelligent rules
This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:

A couple of effects because of these rules: