speed improvement as.mo, freq title

This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-10-31 12:10:49 +01:00
parent 3d4c4c678b
commit 9cd4ab928a
27 changed files with 289 additions and 224 deletions

View File

@ -8,17 +8,20 @@ stages:
- build
- deploy
variables:
WARNINGS_ARE_ERRORS: 1
R 3:
image: rocker/r-ver:3 # rocker/r-base
stage: build
allow_failure: false
script:
- Rscript -e "options()" # to check for variables to circumvent testthat::skip_on_cran
- apt-get update
# install dependencies for package
- apt-get install --yes --no-install-recommends libxml2-dev libssl-dev libcurl4-openssl-dev zlib1g-dev
- R -e 'install.packages(c("devtools", "rlang"))'
- R -e 'devtools::install_deps(dependencies = c("Depends", "Imports", "Suggests"), repos = "https://cran.rstudio.com")'
- R -e 'print(as.data.frame(installed.packages(), row.names = FALSE)[, c("Package", "Version")])'
# remove vignettes folder and get VignetteBuilder field out of DESCRIPTION file
- rm -rf vignettes
- R -e 'd <- read.dcf("DESCRIPTION"); d[, colnames(d) == "VignetteBuilder"] <- NA; write.dcf(d, "DESCRIPTION")'
@ -27,13 +30,17 @@ R 3:
- R CMD check "${PKG_FILE_NAME}" --no-build-vignettes --no-manual --as-cran
# code coverage
- apt-get install --yes git
- R -e 'covr::codecov(token = "50ffa0aa-fee0-4f8b-a11d-8c7edc6d32ca")'
- Rscript -e 'cc <- covr::covr::package_coverage(); covr::codecov(coverage = cc, token = "50ffa0aa-fee0-4f8b-a11d-8c7edc6d32ca"); cat("Code coverage:", covr::percent_coverage(cc))'
coverage: '/Code coverage: \d+\.\d+/'
artifacts:
paths:
- '*.Rcheck/*'
name: 'Latest Rcheck log'
- '*.Rcheck\**\*.log'
- '*.Rcheck\**\*.out'
- '*.Rcheck\**\*.fail'
- '*.Rcheck\**\*.Rout'
name: 'Rcheck log'
expire_in: '1 month'
R 3.4:
image: rocker/r-ver:3.4 # rocker/r-base
stage: build
@ -44,14 +51,13 @@ R 3.4:
- apt-get install --yes --no-install-recommends libxml2-dev libssl-dev libcurl4-openssl-dev zlib1g-dev
- R -e 'install.packages("devtools")'
- R -e 'devtools::install_deps(dependencies = c("Depends", "Imports", "Suggests"))'
- R -e 'print(as.data.frame(installed.packages(), row.names = FALSE)[, c("Package", "Version")])'
# remove vignettes folder and get VignetteBuilder field out of DESCRIPTION file
- rm -rf vignettes
- R -e 'd <- read.dcf("DESCRIPTION"); d[, colnames(d) == "VignetteBuilder"] <- NA; write.dcf(d, "DESCRIPTION")'
- R CMD build . --no-build-vignettes --no-manual
- PKG_FILE_NAME=$(ls -1t *.tar.gz | head -n 1)
- R CMD check "${PKG_FILE_NAME}" --no-build-vignettes --no-manual --as-cran
R 3.5:
image: rocker/r-ver:3.5 # rocker/r-base
stage: build
@ -62,49 +68,15 @@ R 3.5:
- apt-get install --yes --no-install-recommends libxml2-dev libssl-dev libcurl4-openssl-dev zlib1g-dev
- R -e 'install.packages("devtools")'
- R -e 'devtools::install_deps(dependencies = c("Depends", "Imports", "Suggests"))'
- R -e 'print(as.data.frame(installed.packages(), row.names = FALSE)[, c("Package", "Version")])'
# remove vignettes folder and get VignetteBuilder field out of DESCRIPTION file
- rm -rf vignettes
- R -e 'd <- read.dcf("DESCRIPTION"); d[, colnames(d) == "VignetteBuilder"] <- NA; write.dcf(d, "DESCRIPTION")'
- R CMD build . --no-build-vignettes --no-manual
- PKG_FILE_NAME=$(ls -1t *.tar.gz | head -n 1)
- R CMD check "${PKG_FILE_NAME}" --no-build-vignettes --no-manual --as-cran
#R latest:
# image: rocker/r-ver:latest # rocker/r-base
# stage: build
# allow_failure: true
# script:
# - apt-get update
# install dependencies for package
# - apt-get install --yes --no-install-recommends libxml2-dev libssl-dev libcurl4-openssl-dev zlib1g-dev
# - R -e 'install.packages("devtools")'
# - R -e 'devtools::install_deps(dependencies = c("Depends", "Imports", "Suggests"))'
# - R -e 'print(as.data.frame(installed.packages(), row.names = FALSE)[, c("Package", "Version")])'
# remove vignettes folder and get VignetteBuilder field out of DESCRIPTION file
# - rm -rf vignettes
# - R -e 'd <- read.dcf("DESCRIPTION"); d[, colnames(d) == "VignetteBuilder"] <- NA; write.dcf(d, "DESCRIPTION")'
# - R CMD build . --no-build-vignettes --no-manual
# - PKG_FILE_NAME=$(ls -1t *.tar.gz | head -n 1)
# - R CMD check "${PKG_FILE_NAME}" --no-build-vignettes --no-manual --as-cran
# R devel:
# image: rocker/r-ver:devel # rocker/r-base
# stage: build
# allow_failure: true
# script:
# - apt-get update
# install dependencies for package
#- apt-get install --yes --no-install-recommends libxml2-dev libssl-dev libcurl4-openssl-dev zlib1g-dev
#- R -e 'install.packages("devtools")'
# - R -e 'devtools::install_deps(dependencies = c("Depends", "Imports", "Suggests"))'
#- R -e 'print(as.data.frame(installed.packages(), row.names = FALSE)[, c("Package", "Version")])'
# remove vignettes folder and get VignetteBuilder field out of DESCRIPTION file
#- rm -rf vignettes
# - R -e 'd <- read.dcf("DESCRIPTION"); d[, colnames(d) == "VignetteBuilder"] <- NA; write.dcf(d, "DESCRIPTION")'
coverage_job:
# image: rocker/r-ver:3
stage: deploy
script:
- echo "future packaging part"
- ls

View File

@ -1,6 +1,6 @@
Package: AMR
Version: 0.4.0.9007
Date: 2018-10-29
Version: 0.4.0.9008
Date: 2018-10-31
Title: Antimicrobial Resistance Analysis
Authors@R: c(
person(

View File

@ -3,6 +3,7 @@
#### New
* Repository moved to GitLab: https://gitlab.com/msberends/AMR
* Function `count_all` to get all available isolates (that like all `portion_*` and `count_*` functions also supports `summarise` and `group_by`), the old `n_rsi` is now an alias of `count_all`
* Data sets `microorganismsDT`, `microorganisms.prevDT`, `microorganisms.unprevDT` and `microorganisms.oldDT` to improve the speed of `as.mo`. They are for reference only, since they are primarily for internal use of `as.mo`.
#### Changed
* Big changes to the `EUCAST_rules` function:
@ -10,12 +11,12 @@
* New parameter `rules` to specify which rules should be applied (expert rules, breakpoints, others or all)
* New parameter `verbose` which can be set to `TRUE` to get very specific messages about which columns and rows were affected
* Better error handling when rules cannot be applied (i.e. new values could not be inserted)
* The amount of affected values will now only be measured once per row/column combination
* The number of affected values will now only be measured once per row/column combination
* Data set `septic_patients` now reflects these changes
* Tremendous speed improvement for `as.mo` (and consequently all `mo_*` functions), as empty values wil be ignored a priori
* Tremendous speed improvement for `as.mo` (and subsequently all `mo_*` functions), as empty values wil be ignored *a priori*
* Fewer than 3 characters as input for `as.mo` will return NA
* Added parameter `combine_IR` (TRUE/FALSE) to functions `portion_df` and `count_df`, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible)
* Fix for `portion_*(..., as_percent = TRUE)` when minimal amount of isolates would not be met
* Fix for `portion_*(..., as_percent = TRUE)` when minimal number of isolates would not be met
* Added parameter `also_single_tested` for `portion_*` and `count_*` functions to also include cases where not all antibiotics were tested but at least one of the tested antibiotics includes the target antimicribial interpretation, see `?portion`
* Using `portion_*` functions now throws a warning when total available isolate is below parameter `minimum`
* Functions `as.mo`, `as.rsi`, `as.mic`, `as.atc` and `freq` will not set package name as attribute anymore
@ -26,6 +27,7 @@
* Gained `na` parameter, to choose with character to print for empty values
* Support for class `difftime`
* New parameter `header` to turn it off (default when `markdown = TRUE`)
* New parameter `title` to replace the automatically set title
* `first_isolate` now tries to find columns to use as input when parameters are left blank
* Improvement for MDRO algorithm
* Data set `septic_patients` is now a `data.frame`, not a tibble anymore

View File

@ -19,7 +19,7 @@
#' Data set with 423 antibiotics
#'
#' A data set containing all antibiotics with a J0 code and some other antimicrobial agents, with their DDDs. Except for trade names and abbreviations, all properties were downloaded from the WHO, see Source.
#' @format A \code{\link{tibble}} with 423 observations and 18 variables:
#' @format A \code{\link{data.frame}} with 423 observations and 18 variables:
#' \describe{
#' \item{\code{atc}}{ATC code, like \code{J01CR02}}
#' \item{\code{certe}}{Certe code, like \code{amcl}}
@ -139,7 +139,7 @@
#' \item{\code{subkingdom}}{Taxonomic subkingdom of the microorganism as found in ITIS, see Source}
#' \item{\code{gramstain}}{Gram of microorganism, like \code{"Gram negative"}}
#' \item{\code{type}}{Type of microorganism, like \code{"Bacteria"} and \code{"Fungi"}}
#' \item{\code{prevalence}}{A rounded integer based on prevalence of the microorganism. Used internally by \code{\link{as.mo}}, otherwise quite meaningless.}
#' \item{\code{prevalence}}{An integer based on estimated prevalence of the microorganism in humans. Used internally by \code{\link{as.mo}}, otherwise quite meaningless. It has a value of 25 for manually added items and a value of 1000 for all unprevalent microorganisms whose genus was somewhere in the top 250 (with another species).}
#' \item{\code{ref}}{Author(s) and year of concerning publication as found in ITIS, see Source}
#' }
#' @source [3] Integrated Taxonomic Information System (ITIS) on-line database, \url{https://www.itis.gov}.
@ -164,7 +164,7 @@
#' Translation table for UMCG
#'
#' A data set containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{MO}s with \code{\link{guess_mo}}.
#' @format A \code{\link{tibble}} with 1,095 observations and 2 variables:
#' @format A \code{\link{data.frame}} with 1,095 observations and 2 variables:
#' \describe{
#' \item{\code{umcg}}{Code of microorganism according to UMCG MMB}
#' \item{\code{certe}}{Code of microorganism according to Certe MMB}
@ -175,7 +175,7 @@
#' Translation table for Certe
#'
#' A data set containing all bacteria codes of Certe MMB. These codes can be joined to data with an ID from \code{\link{microorganisms}$mo} (using \code{\link{left_join_microorganisms}}). GLIMS codes can also be translated to valid \code{MO}s with \code{\link{guess_mo}}.
#' @format A \code{\link{tibble}} with 2,665 observations and 2 variables:
#' @format A \code{\link{data.frame}} with 2,665 observations and 2 variables:
#' \describe{
#' \item{\code{certe}}{Code of microorganism according to Certe MMB}
#' \item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}
@ -239,3 +239,31 @@
#' summarise(n = n_rsi(amcl),
#' p = portion_IR(amcl, minimum = 20))
"septic_patients"
#' Supplementary Data
#'
#' These \code{\link{data.table}s} are transformed from the \code{\link{microorganisms}} and \code{\link{microorganisms}} data sets to improve speed of \code{\link{as.mo}}. They are meant for internal use only, and are only mentioned here for reference.
#' @rdname supplementary_data
#' @name supplementary_data
# # Renew data:
# microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
# # sort on (1) bacteria, (2) fungi, (3) protozoa and then human pathogenic prevalence and then TSN:
# data.table::setkey(microorganismsDT, type, prevalence, fullname)
# microorganisms.prevDT <- microorganismsDT[prevalence == 9999,]
# microorganisms.unprevDT <- microorganismsDT[prevalence != 9999,]
# microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
# data.table::setkey(microorganisms.oldDT, tsn, name)
# devtools::use_data(microorganismsDT, overwrite = TRUE)
# devtools::use_data(microorganisms.prevDT, overwrite = TRUE)
# devtools::use_data(microorganisms.unprevDT, overwrite = TRUE)
# devtools::use_data(microorganisms.oldDT, overwrite = TRUE)
"microorganismsDT"
#' @rdname supplementary_data
"microorganisms.prevDT"
#' @rdname supplementary_data
"microorganisms.unprevDT"
#' @rdname supplementary_data
"microorganisms.oldDT"

View File

@ -29,6 +29,7 @@
#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})
#' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes
#' @param header a logical value indicating whether an informative header should be printed
#' @param title text to show above frequency table, at default to tries to coerce from the variables passed to \code{x}
#' @param na a character string to should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})
#' @param sep a character string to separate the terms when selecting multiple columns
#' @param f a frequency table
@ -151,6 +152,7 @@ frequency_tbl <- function(x,
digits = 2,
quote = FALSE,
header = !markdown,
title = NULL,
na = "<NA>",
sep = " ") {
@ -395,6 +397,11 @@ frequency_tbl <- function(x,
tbl_format <- 'pandoc'
}
if (!is.null(title)) {
x.name <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE))
cols <- NULL
}
structure(.Data = df,
class = c('frequency_tbl', class(df)),
opt = list(data = x.name,
@ -522,7 +529,7 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
}
}
title <- paste("Frequency table", title)
title <- paste("Frequency table", trimws(title))
# bold title
if (opt$tbl_format == "pandoc") {

View File

@ -49,6 +49,10 @@ globalVariables(c(".",
"mic",
"microorganisms",
"microorganisms.old",
"microorganismsDT",
"microorganisms.prevDT",
"microorganisms.unprevDT",
"microorganisms.oldDT",
"mo",
"mo.old",
"n",

204
R/mo.R
View File

@ -49,7 +49,15 @@
#'
#' Use the \code{\link{mo_property}} functions to get properties based on the returned code, see Examples.
#'
#' This function uses Artificial Intelligence (AI) to help getting more logical results, based on type of input and known prevalence of human pathogens. For example:
#' This function uses Artificial Intelligence (AI) to help getting fast and logical results. It tries to find matches in this order:
#' \itemize{
#' \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
#' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
#' \item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations}
#' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
#' }
#'
#' A couple of effects because of these rules
#' \itemize{
#' \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first}
#' \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae} for the same reason}
@ -63,10 +71,11 @@
#' \if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr}
#' This package contains the \strong{complete microbial taxonomic data} (with all seven taxonomic ranks - from subkingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}).
#'
#' All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens.
#' All (sub)species from the \strong{taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This \strong{allows users to use authoritative taxonomic information} for their data analysis on any microorganism, not only human pathogens. It also helps to \strong{quickly determine the Gram stain of bacteria}, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.
#'
#' ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].
# (source as a section, so it can be inherited by other man pages:)
#'
# (source as a section, so it can be inherited by other man pages)
#' @section Source:
#' [1] Becker K \emph{et al.} \strong{Coagulase-Negative Staphylococci}. 2014. Clin Microbiol Rev. 27(4): 870926. \url{https://dx.doi.org/10.1128/CMR.00109-13}
#'
@ -129,9 +138,10 @@
#' mutate(mo = guess_mo(paste(genus, species)))
#' }
as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) {
exec_as.mo(x = x, Becker = Becker, Lancefield = Lancefield,
allow_uncertain = allow_uncertain, reference_df = reference_df,
property = "mo")
structure(mo_validate(x = x, property = "mo",
Becker = Becker, Lancefield = Lancefield,
allow_uncertain = allow_uncertain, reference_df = reference_df),
class = "mo")
}
#' @rdname as.mo
@ -147,8 +157,15 @@ is.mo <- function(x) {
guess_mo <- as.mo
#' @importFrom dplyr %>% pull left_join
#' @importFrom data.table as.data.table setkey
#' @importFrom data.table data.table as.data.table setkey
exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL, property = "mo") {
# These data.tables are available as data sets when the AMR package is loaded:
# microorganismsDT # this one is sorted by kingdom (B<F<P), prevalence, TSN
# microorganisms.prevDT # same as microorganismsDT, but with prevalence != 9999
# microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999
# microorganisms.oldDT # old taxonomic names, sorted by name (genus+species), TSN
if (NCOL(x) == 2) {
# support tidyverse selection like: df %>% select(colA, colB)
# paste these columns together
@ -176,12 +193,6 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# remove empty values (to later fill them in again)
x <- x[!is.na(x) & !is.null(x) & !identical(x, "")]
# These data.tables are available because of .onAttach:
# MOs
# MOs_mostprevalent
# MOs_allothers
# MOs_old
# defined df to check for
if (!is.null(reference_df)) {
if (!is.data.frame(reference_df) | NCOL(reference_df) < 2) {
@ -193,18 +204,12 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
)
}
if (all(x %in% AMR::microorganisms[, property])) {
# already existing mo
} else if (all(x %in% AMR::microorganisms[, "mo"])) {
# existing mo codes when not looking for property "mo"
suppressWarnings(
x <- data.frame(mo = x, stringsAsFactors = FALSE) %>%
left_join(AMR::microorganisms, by = "mo") %>%
pull(property)
)
if (all(x %in% microorganismsDT[["mo"]])) {
# existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL")
x <- microorganismsDT[data.table(mo = x), on = "mo", ..property][[1]]
} else if (!is.null(reference_df)
& all(x %in% reference_df[, 1])
& all(reference_df[, 2] %in% AMR::microorganisms$mo)) {
& all(reference_df[, 2] %in% microorganismsDT[["mo"]])) {
# manually defined reference
colnames(reference_df)[1] <- "x"
colnames(reference_df)[2] <- "mo"
@ -214,24 +219,12 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
left_join(AMR::microorganisms, by = "mo") %>%
pull(property)
)
} else if (all(x %in% AMR::microorganisms.certe[, "certe"])) {
} else if (all(toupper(x) %in% AMR::microorganisms.certe[, "certe"])) {
# old Certe codes
suppressWarnings(
x <- data.frame(certe = x, stringsAsFactors = FALSE) %>%
left_join(AMR::microorganisms.certe, by = "certe") %>%
left_join(AMR::microorganisms, by = "mo") %>%
pull(property)
)
} else if (all(x %in% AMR::microorganisms.umcg[, "umcg"])) {
# old UMCG codes
suppressWarnings(
x <- data.frame(umcg = x, stringsAsFactors = FALSE) %>%
left_join(AMR::microorganisms.umcg, by = "umcg") %>%
left_join(AMR::microorganisms.certe, by = "certe") %>%
left_join(AMR::microorganisms, by = "mo") %>%
pull(property)
)
} else {
y <- as.data.table(AMR::microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ]
x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]]
} else if (!all(x %in% microorganismsDT[[property]])) {
x_backup <- trimws(x, which = "both")
x_species <- paste(x_backup, "species")
@ -280,36 +273,36 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
| toupper(x_trimmed[i]) == 'MSSA'
| toupper(x_trimmed[i]) == 'VISA'
| toupper(x_trimmed[i]) == 'VRSA') {
x[i] <- MOs[mo == 'B_STPHY_AUR', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) == 'MRSE'
| toupper(x_trimmed[i]) == 'MSSE') {
x[i] <- MOs[mo == 'B_STPHY_EPI', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) == 'VRE') {
x[i] <- MOs[mo == 'B_ENTRC', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) == 'MRPA') {
# multi resistant P. aeruginosa
x[i] <- MOs[mo == 'B_PDMNS_AER', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_PDMNS_AER', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) == 'CRS'
| toupper(x_trimmed[i]) == 'CRSM') {
# co-trim resistant S. maltophilia
x[i] <- MOs[mo == 'B_STNTR_MAL', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_STNTR_MAL', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) {
# peni I, peni R, vanco I, vanco R: S. pneumoniae
x[i] <- MOs[mo == 'B_STRPTC_PNE', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_STRPTC_PNE', ..property][[1]][1L]
next
}
if (toupper(x_trimmed[i]) %like% '^G[ABCDFGHK]S$') {
x[i] <- MOs[mo == gsub("G([ABCDFGHK])S", "B_STRPTC_GR\\1", x_trimmed[i]), ..property][[1]][1L]
x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPTC_GR\\1", x_trimmed[i]), ..property][[1]][1L]
next
}
# CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ----
@ -317,14 +310,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
| tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] negatie?[vf]'
| tolower(x[i]) %like% '[ck]o?ns[^a-z]?$') {
# coerce S. coagulase negative
x[i] <- MOs[mo == 'B_STPHY_CNS', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L]
next
}
if (tolower(x[i]) %like% '[ck]oagulas[ea] positie?[vf]'
| tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] positie?[vf]'
| tolower(x[i]) %like% '[ck]o?ps[^a-z]?$') {
# coerce S. coagulase positive
x[i] <- MOs[mo == 'B_STPHY_CPS', ..property][[1]][1L]
x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L]
next
}
}
@ -332,14 +325,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# FIRST TRY FULLNAMES AND CODES
# if only genus is available, return only genus
if (all(!c(x[i], x_trimmed[i]) %like% " ")) {
found <- MOs[tolower(fullname) %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]]
found <- microorganismsDT[tolower(fullname) %in% tolower(c(x_species[i], x_trimmed_species[i])), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
if (nchar(x_trimmed[i]) > 4) {
# not when abbr is esco, stau, klpn, etc.
found <- MOs[tolower(fullname) %like% gsub(" ", ".*", x_trimmed_species[i], fixed = TRUE), ..property][[1]]
found <- microorganismsDT[tolower(fullname) %like% gsub(" ", ".*", x_trimmed_species[i], fixed = TRUE), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -348,20 +341,22 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
}
# TRY OTHER SOURCES ----
if (x_backup[i] %in% AMR::microorganisms.certe[, 1]) {
x[i] <- MOs[mo == AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == x_backup[i], 2], ..property][[1]][1L]
next
if (x_backup[i] %in% AMR::microorganisms.certe$certe) {
x[i] <- microorganismsDT[mo == AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == x_backup[i], 2], ..property][[1]][1L]
# x[i] <- exec_as.mo(x = AMR::microorganisms.certe[AMR::microorganisms.certe$certe == x_backup[i], "mo"],
# property = property)
# next
}
if (x_backup[i] %in% AMR::microorganisms.umcg[, 1]) {
ref_certe <- AMR::microorganisms.umcg[AMR::microorganisms.umcg[, 1] == x_backup[i], 2]
ref_mo <- AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == ref_certe, 2]
x[i] <- MOs[mo == ref_mo, ..property][[1]][1L]
x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L]
next
}
if (x_backup[i] %in% reference_df[, 1]) {
ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2]
if (ref_mo %in% MOs[, mo]) {
x[i] <- MOs[mo == ref_mo, ..property][[1]][1L]
if (ref_mo %in% microorganismsDT[, mo]) {
x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L]
next
} else {
warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE)
@ -369,20 +364,19 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
}
# TRY FIRST THOUSAND MOST PREVALENT IN HUMAN INFECTIONS ----
found <- MOs_mostprevalent[tolower(fullname) %in% tolower(c(x_backup[i], x_trimmed[i])), ..property][[1]]
found <- microorganisms.prevDT[tolower(fullname) %in% tolower(c(x_backup[i], x_trimmed[i])), ..property][[1]]
# most probable: is exact match in fullname
if (length(found) > 0) {
x[i] <- found[1L]
next
}
found <- MOs_mostprevalent[tsn == x_trimmed[i], ..property][[1]]
found <- microorganisms.prevDT[tsn == x_trimmed[i], ..property][[1]]
# is a valid TSN
if (length(found) > 0) {
x[i] <- found[1L]
next
}
found <- MOs_mostprevalent[mo == toupper(x_backup[i]), ..property][[1]]
found <- microorganisms.prevDT[mo == toupper(x_backup[i]), ..property][[1]]
# is a valid mo
if (length(found) > 0) {
x[i] <- found[1L]
@ -390,21 +384,21 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
}
# try any match keeping spaces ----
found <- MOs_mostprevalent[fullname %like% x_withspaces[i], ..property][[1]]
found <- microorganisms.prevDT[fullname %like% x_withspaces[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
# try any match keeping spaces, not ending with $ ----
found <- MOs_mostprevalent[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.prevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
# try any match diregarding spaces ----
found <- MOs_mostprevalent[fullname %like% x[i], ..property][[1]]
found <- microorganisms.prevDT[fullname %like% x[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -412,7 +406,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# try fullname without start and stop regex, to also find subspecies ----
# like "K. pneu rhino" -> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
found <- MOs_mostprevalent[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.prevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -427,7 +421,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
x_split[i] <- paste0(x_trimmed[i] %>% substr(1, x_length / 2) %>% trimws(),
'.* ',
x_trimmed[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws())
found <- MOs_mostprevalent[fullname %like% paste0('^', x_split[i]), ..property][[1]]
found <- microorganisms.prevDT[fullname %like% paste0('^', x_split[i]), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -442,7 +436,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# x_trimmed[i] <- trimws(x_trimmed[i], which = "both")
# }
# if (!is.na(x_trimmed[i])) {
# found <- MOs_mostprevalent[fullname %like% x_trimmed[i], ..property][[1]]
# found <- microorganisms.prevDT[fullname %like% x_trimmed[i], ..property][[1]]
# if (length(found) > 0) {
# x[i] <- found[1L]
# next
@ -450,25 +444,25 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# }
# THEN TRY ALL OTHERS ----
found <- MOs_allothers[tolower(fullname) == tolower(x_backup[i]), ..property][[1]]
found <- microorganisms.unprevDT[tolower(fullname) == tolower(x_backup[i]), ..property][[1]]
# most probable: is exact match in fullname
if (length(found) > 0) {
x[i] <- found[1L]
next
}
found <- MOs_allothers[tolower(fullname) == tolower(x_trimmed[i]), ..property][[1]]
found <- microorganisms.unprevDT[tolower(fullname) == tolower(x_trimmed[i]), ..property][[1]]
# most probable: is exact match in fullname
if (length(found) > 0) {
x[i] <- found[1L]
next
}
found <- MOs_allothers[tsn == x_trimmed[i], ..property][[1]]
found <- microorganisms.unprevDT[tsn == x_trimmed[i], ..property][[1]]
# is a valid TSN
if (length(found) > 0) {
x[i] <- found[1L]
next
}
found <- MOs_allothers[mo == toupper(x_backup[i]), ..property][[1]]
found <- microorganisms.unprevDT[mo == toupper(x_backup[i]), ..property][[1]]
# is a valid mo
if (length(found) > 0) {
x[i] <- found[1L]
@ -476,21 +470,21 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
}
# try any match keeping spaces ----
found <- MOs_allothers[fullname %like% x_withspaces[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x_withspaces[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
# try any match keeping spaces, not ending with $ ----
found <- MOs_allothers[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
}
# try any match diregarding spaces ----
found <- MOs_allothers[fullname %like% x[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -498,7 +492,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# try fullname without start and stop regex, to also find subspecies ----
# like "K. pneu rhino" -> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
found <- MOs_allothers[fullname %like% x_withspaces_start[i], ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -513,7 +507,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
x_split[i] <- paste0(x_trimmed[i] %>% substr(1, x_length / 2) %>% trimws(),
'.* ',
x_trimmed[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws())
found <- MOs_allothers[fullname %like% paste0('^', x_split[i]), ..property][[1]]
found <- microorganisms.unprevDT[fullname %like% paste0('^', x_split[i]), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
next
@ -528,7 +522,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# x_trimmed[i] <- trimws(x_trimmed[i], which = "both")
# }
# if (!is.na(x_trimmed[i])) {
# found <- MOs_allothers[fullname %like% x_trimmed[i], ..property][[1]]
# found <- microorganisms.unprevDT[fullname %like% x_trimmed[i], ..property][[1]]
# if (length(found) > 0) {
# x[i] <- found[1L]
# next
@ -538,33 +532,33 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
# MISCELLANEOUS ----
# look for old taxonomic names ----
found <- MOs_old[tolower(name) == tolower(x_backup[i])
| tsn == x_trimmed[i]
| name %like% x_withspaces[i],]
found <- microorganisms.oldDT[tolower(name) == tolower(x_backup[i])
| tsn == x_trimmed[i]
| name %like% x_withspaces[i],]
if (NROW(found) > 0) {
x[i] <- MOs[tsn == found[1, tsn_new], ..property][[1]]
x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
renamed_note(name_old = found[1, name],
name_new = MOs[tsn == found[1, tsn_new], fullname],
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
ref_old = found[1, ref],
ref_new = MOs[tsn == found[1, tsn_new], ref])
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref])
next
}
# check for uncertain results ----
if (allow_uncertain == TRUE) {
# (1) look again for old taxonomic names, now for G. species ----
found <- MOs_old[name %like% x_withspaces[i]
| name %like% x_withspaces_start[i]
| name %like% x[i],]
found <- microorganisms.oldDT[name %like% x_withspaces[i]
| name %like% x_withspaces_start[i]
| name %like% x[i],]
if (NROW(found) > 0) {
x[i] <- MOs[tsn == found[1, tsn_new], ..property][[1]]
x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
warning("Uncertain interpretation: '",
x_backup[i], "' -> '", found[1, name], "'",
call. = FALSE, immediate. = TRUE)
renamed_note(name_old = found[1, name],
name_new = MOs[tsn == found[1, tsn_new], fullname],
name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
ref_old = found[1, ref],
ref_new = MOs[tsn == found[1, tsn_new], ref])
ref_new = microorganismsDT[tsn == found[1, tsn_new], ref])
next
}
@ -574,7 +568,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
x[i] <- suppressWarnings(suppressMessages(as.mo(x_strip)))
if (!is.na(x[i])) {
warning("Uncertain interpretation: '",
x_backup[i], "' -> '", MOs[mo == x[i], fullname], "' (", x[i], ")",
x_backup[i], "' -> '", microorganismsDT[mo == x[i], fullname], "' (", x[i], ")",
call. = FALSE, immediate. = TRUE)
next
}
@ -599,7 +593,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
if (Becker == TRUE | Becker == "all") {
# See Source. It's this figure:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/
MOs_staph <- MOs[genus == "Staphylococcus"]
MOs_staph <- microorganismsDT[genus == "Staphylococcus"]
setkey(MOs_staph, species)
CoNS <- MOs_staph[species %in% c("arlettae", "auricularis", "capitis",
"caprae", "carnosus", "cohnii", "condimenti",
@ -617,35 +611,35 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
"hyicus", "intermedius",
"pseudintermedius", "pseudointermedius",
"schleiferi"), ..property][[1]]
x[x %in% CoNS] <- MOs[mo == 'B_STPHY_CNS', ..property][[1]][1L]
x[x %in% CoPS] <- MOs[mo == 'B_STPHY_CPS', ..property][[1]][1L]
x[x %in% CoNS] <- microorganismsDT[mo == 'B_STPHY_CNS', ..property][[1]][1L]
x[x %in% CoPS] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L]
if (Becker == "all") {
x[x == MOs[mo == 'B_STPHY_AUR', ..property][[1]][1L]] <- MOs[mo == 'B_STPHY_CPS', ..property][[1]][1L]
x[x == microorganismsDT[mo == 'B_STPHY_AUR', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L]
}
}
# Lancefield ----
if (Lancefield == TRUE | Lancefield == "all") {
# group A - S. pyogenes
x[x == MOs[mo == 'B_STRPTC_PYO', ..property][[1]][1L]] <- MOs[mo == 'B_STRPTC_GRA', ..property][[1]][1L]
x[x == microorganismsDT[mo == 'B_STRPTC_PYO', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRA', ..property][[1]][1L]
# group B - S. agalactiae
x[x == MOs[mo == 'B_STRPTC_AGA', ..property][[1]][1L]] <- MOs[mo == 'B_STRPTC_GRB', ..property][[1]][1L]
x[x == microorganismsDT[mo == 'B_STRPTC_AGA', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRB', ..property][[1]][1L]
# group C
S_groupC <- MOs %>% filter(genus == "Streptococcus",
species %in% c("equisimilis", "equi",
"zooepidemicus", "dysgalactiae")) %>%
S_groupC <- microorganismsDT %>% filter(genus == "Streptococcus",
species %in% c("equisimilis", "equi",
"zooepidemicus", "dysgalactiae")) %>%
pull(property)
x[x %in% S_groupC] <- MOs[mo == 'B_STRPTC_GRC', ..property][[1]][1L]
x[x %in% S_groupC] <- microorganismsDT[mo == 'B_STRPTC_GRC', ..property][[1]][1L]
if (Lancefield == "all") {
# all Enterococci
x[x %like% "^(Enterococcus|B_ENTRC)"] <- MOs[mo == 'B_STRPTC_GRD', ..property][[1]][1L]
x[x %like% "^(Enterococcus|B_ENTRC)"] <- microorganismsDT[mo == 'B_STRPTC_GRD', ..property][[1]][1L]
}
# group F - S. anginosus
x[x == MOs[mo == 'B_STRPTC_ANG', ..property][[1]][1L]] <- MOs[mo == 'B_STRPTC_GRF', ..property][[1]][1L]
x[x == microorganismsDT[mo == 'B_STRPTC_ANG', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRF', ..property][[1]][1L]
# group H - S. sanguinis
x[x == MOs[mo == 'B_STRPTC_SAN', ..property][[1]][1L]] <- MOs[mo == 'B_STRPTC_GRH', ..property][[1]][1L]
x[x == microorganismsDT[mo == 'B_STRPTC_SAN', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRH', ..property][[1]][1L]
# group K - S. salivarius
x[x == MOs[mo == 'B_STRPTC_SAL', ..property][[1]][1L]] <- MOs[mo == 'B_STRPTC_GRK', ..property][[1]][1L]
x[x == microorganismsDT[mo == 'B_STRPTC_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRK', ..property][[1]][1L]
}
# comply to x, which is also unique and without empty values
@ -700,7 +694,7 @@ print.mo <- function(x, ...) {
#' @export
#' @noRd
as.data.frame.mo <- function (x, ...) {
# same as as.data.frame.character but with removed stringsAsFactors
# same as as.data.frame.character but with removed stringsAsFactors, since it will be class "mo"
nm <- paste(deparse(substitute(x), width.cutoff = 500L),
collapse = " ")
if (!"nm" %in% names(list(...))) {

View File

@ -383,7 +383,9 @@ mo_validate <- function(x, property, ...) {
Lancefield <- FALSE
}
if (!all(x %in% AMR::microorganisms[, property]) | Becker %in% c(TRUE, "all") | Lancefield == TRUE) {
if (!all(x %in% microorganismsDT[[property]])
| Becker %in% c(TRUE, "all")
| Lancefield %in% c(TRUE, "all")) {
exec_as.mo(x, property = property, ...)
} else {
x

32
R/zzz.R
View File

@ -50,21 +50,25 @@ NULL
.onAttach <- function(libname, pkgname) {
# save data.tables to improve speed of as.mo:
MOs <- data.table::as.data.table(AMR::microorganisms)
data.table::setkey(MOs, prevalence, tsn)
base::assign(x = "MOs",
value = MOs,
envir = base::as.environment("package:AMR"))
base::assign(x = "MOs_mostprevalent",
value = MOs[prevalence != 9999,],
envir = base::as.environment("package:AMR"))
base::assign(x = "MOs_allothers",
value = MOs[prevalence == 9999,],
envir = base::as.environment("package:AMR"))
# microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
# microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
#
# data.table::setkey(microorganismsDT, prevalence, tsn)
# data.table::setkey(microorganisms.oldDT, tsn, name)
base::assign(x = "MOs_old",
value = data.table::as.data.table(AMR::microorganisms.old),
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganismsDT",
value = microorganismsDT,
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganisms.prevDT",
value = microorganismsDT[prevalence != 9999,],
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganisms.unprevDT",
value = microorganismsDT[prevalence == 9999,],
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganisms.oldDT",
value = microorganisms.oldDT,
envir = base::as.environment("package:AMR"))
}

View File

@ -65,7 +65,7 @@ The `AMR` package basically does four important things:
3. It **analyses the data** with convenient functions that use well-known methods.
* Calculate the resistance (and even co-resistance) of microbial isolates with the `portion_R`, `portion_IR`, `portion_I`, `portion_SI` and `portion_S` functions. Similarly, the *amount* of isolates can be determined with the `count_R`, `count_IR`, `count_I`, `count_SI` and `count_S` functions. All these functions can be used [with the `dplyr` package](https://dplyr.tidyverse.org/#usage) (e.g. in conjunction with [`summarise`](https://dplyr.tidyverse.org/reference/summarise.html))
* Calculate the resistance (and even co-resistance) of microbial isolates with the `portion_R`, `portion_IR`, `portion_I`, `portion_SI` and `portion_S` functions. Similarly, the *number* of isolates can be determined with the `count_R`, `count_IR`, `count_I`, `count_SI` and `count_S` functions. All these functions can be used [with the `dplyr` package](https://dplyr.tidyverse.org/#usage) (e.g. in conjunction with [`summarise`](https://dplyr.tidyverse.org/reference/summarise.html))
* Plot AMR results with `geom_rsi`, a function made for the `ggplot2` package
* Predict antimicrobial resistance for the nextcoming years using logistic regression models with the `resistance_predict` function
* Conduct descriptive statistics to enhance base R: calculate kurtosis, skewness and create frequency tables
@ -83,7 +83,7 @@ The `AMR` package basically does four important things:
This package contains the **complete microbial taxonomic data** (with all seven taxonomic ranks - from subkingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, https://www.itis.gov).
All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens.
All (sub)species from the **taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package**, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This **allows users to use authoritative taxonomic information** for their data analysis on any microorganism, not only human pathogens. It also helps to **quickly determine the Gram stain of bacteria**, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.
ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists.
@ -102,6 +102,12 @@ mo_class("E. coli")
mo_family("E. coli")
# [1] "Enterobacteriaceae"
mo_subkingdom("E. coli")
# [1] "Negibacteria"
mo_gramstain("E. coli") # based on subkingdom
# [1] "Gram negative"
mo_ref("E. coli")
# [1] "Castellani and Chalmers, 1919"
```
@ -453,7 +459,7 @@ Using the `microbenchmark` package, we can review the calculation performance of
library(microbenchmark)
```
In the next test, we try to 'coerce' different input values for *Staphylococcus aureus*. The actual result is the same every time: it returns its MO code `B_STAPHY_AUR` (*B* stands for *Bacteria*, the taxonomic kingdom).
In the next test, we try to 'coerce' different input values for *Staphylococcus aureus*. The actual result is the same every time: it returns its MO code `B_STPHY_AUR` (*B* stands for *Bacteria*, the taxonomic kingdom).
But the calculation time differs a lot. Here, the AI effect can be reviewed best:
@ -464,56 +470,61 @@ microbenchmark(A = as.mo("stau"),
D = as.mo("S. aureus"),
E = as.mo("STAAUR"),
F = as.mo("Staphylococcus aureus"),
G = as.mo("B_STPHY_AUR"),
times = 10,
unit = "ms")
# Unit: milliseconds
# expr min lq mean median uq max neval
# A 36.05088 36.14782 36.65635 36.24466 36.43075 39.78544 10
# B 16.43575 16.46885 16.67816 16.66053 16.84858 16.95299 10
# C 14.44150 14.52182 16.81197 14.59173 14.67854 36.75244 10
# D 14.49765 14.58153 16.71666 14.59414 14.61094 35.50731 10
# E 14.45212 14.75146 14.82033 14.85559 14.96433 15.03438 10
# F 10.69445 10.73852 10.80334 10.79596 10.86856 10.97465 10
# expr min lq mean median uq max neval
# A 38.864859 38.923316 42.5410391 39.172790 39.394955 70.512389 10
# B 13.912175 14.002899 14.1044062 14.084962 14.254467 14.281845 10
# C 11.492663 11.555520 76.6953055 11.652670 11.864149 662.026786 10
# D 11.616702 11.683261 12.1807189 11.873159 12.142327 14.761724 10
# E 13.761108 14.012048 14.1360584 14.106509 14.293229 14.547522 10
# F 6.743735 6.785151 6.8962407 6.871335 7.000961 7.158383 10
# G 0.119220 0.137030 0.1411503 0.142512 0.145061 0.176909 10
```
The more an input value resembles a full name, the faster the result will be found. In the table above, all measurements are in milliseconds, tested on a quite regular Linux server from 2007 with 2 GB RAM. A value of 10.8 milliseconds means it will roughly determine 93 different (unique) input values per second. It case of 36.2 milliseconds, this is only 28 input values per second.
In the table above, all measurements are shown in milliseconds (thousands of seconds), tested on a quite regular Linux server from 2007 (Core 2 Duo 2.7 GHz, 2 GB DDR2 RAM). A value of 6.9 milliseconds means it will roughly determine 144 different (unique) input values per second. It case of 39.2 milliseconds, this is only 26 input values per second. The more an input value resembles a full name (like C, D and F), the faster the result will be found. In case of G, the input is already a valid MO code, so it only almost takes no time at all (0.0001 seconds on our server).
To improve speed, the `as.mo` function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined far less faster. See this example for the ID of *Burkholderia nodosa* (`B_BRKHL_NOD`):
To achieve this speed, the `as.mo` function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined far less faster. See this example for the ID of *Burkholderia nodosa* (`B_BRKHL_NOD`):
```r
microbenchmark(B = as.mo("burnod"),
microbenchmark(A = as.mo("buno"),
B = as.mo("burnod"),
C = as.mo("B. nodosa"),
D = as.mo("B. nodosa"),
E = as.mo("BURNOD"),
F = as.mo("Burkholderia nodosa"),
G = as.mo("B_BRKHL_NOD"),
times = 10,
unit = "ms")
# Unit: milliseconds
# expr min lq mean median uq max neval
# B 175.9446 176.80440 179.18240 177.00131 177.62021 198.86286 10
# C 88.1902 88.57705 89.08851 88.84293 89.15498 91.76621 10
# D 110.2641 110.67497 113.66290 111.20534 111.80744 134.44699 10
# E 175.0728 177.04235 207.83542 190.38109 200.33448 388.12177 10
# F 45.0778 45.31617 52.72430 45.62962 67.85262 70.42250 10
# expr min lq mean median uq max neval
# A 124.175427 124.474837 125.8610536 125.3750560 126.160945 131.485994 10
# B 154.249713 155.364729 160.9077032 156.8738940 157.136183 197.315105 10
# C 66.066571 66.162393 66.5538611 66.4488130 66.698077 67.623404 10
# D 86.747693 86.918665 90.7831016 87.8149725 89.440982 116.767991 10
# E 154.863827 155.208563 162.6535954 158.4062465 168.593785 187.378088 10
# F 32.427028 32.638648 32.9929454 32.7860475 32.992813 34.674241 10
# G 0.213155 0.216578 0.2369226 0.2338985 0.253734 0.285581 10
```
(Note: `A` is missing here, because `as.mo("buno")` returns `F_BUELL_NOT`: the ID of the fungus *Buellia notabilis*)
That takes up to 12 times as much time! A value of 190.4 milliseconds means it can only determine ~5 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance.
That takes up to 11 times as much time! A value of 158.4 milliseconds means it can only determine ~6 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance.
To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetive results** and **already precalculated results**.
Let's set up 25,000 entries of `"Staphylococcus aureus"` and check its speed:
```r
repetive_results <- rep("Staphylococcus aureus", 25000)
microbenchmark(A = as.mo(repetive_results),
microbenchmark(F = as.mo(repetive_results),
times = 10,
unit = "ms")
# Unit: milliseconds
# expr min lq mean median uq max neval
# A 14.61282 14.6372 14.70817 14.72597 14.76124 14.78498 10
# F 12.24381 12.34707 13.84736 12.37689 12.43266 40.36833 100
```
So transforming 25,000 times (!) `"Staphylococcus aureus"` only takes 4 ms (0.004 seconds) more than transforming it once. You only lose time on your unique input values.
So transforming 25,000 times (!) `"Staphylococcus aureus"` only takes 6 ms (0.006 seconds) more than transforming it once. You only lose time on your unique input values.
What about precalculated results? This package also contains helper functions for specific microbial properties, for example `mo_fullname`. It returns the full microbial name (genus, species and possibly subspecies) and uses `as.mo` internally. If the input is however an already precalculated result, it almost doesn't take any time at all (see 'C' below):
@ -524,13 +535,13 @@ microbenchmark(A = mo_fullname("B_STPHY_AUR"),
times = 10,
unit = "ms")
# Unit: milliseconds
# expr min lq mean median uq max neval
# A 13.548652 13.74588 13.8052969 13.813594 13.881165 14.090969 10
# B 15.079781 15.16785 15.3835842 15.374477 15.395115 16.072995 10
# C 0.171182 0.18563 0.2306307 0.203413 0.224610 0.492312 10
# expr min lq mean median uq max neval
# A 11.364086 11.460537 11.5104799 11.4795330 11.524860 11.818263 10
# B 11.976454 12.012352 12.1704592 12.0853020 12.210004 12.881737 10
# C 0.095823 0.102528 0.1167754 0.1153785 0.132629 0.140661 10
```
So going from `mo_fullname("Staphylococcus aureus")` to `"Staphylococcus aureus"` takes 0.0002 seconds - it doesn't even start calculating *if the result would be the same as the expected resulting value*. That goes for all helper functions:
So going from `mo_fullname("Staphylococcus aureus")` to `"Staphylococcus aureus"` takes 0.0001 seconds - it doesn't even start calculating *if the result would be the same as the expected resulting value*. That goes for all helper functions:
```r
microbenchmark(A = mo_species("aureus"),
@ -545,17 +556,17 @@ microbenchmark(A = mo_species("aureus"),
unit = "ms")
# Unit: milliseconds
# expr min lq mean median uq max neval
# A 0.145270 0.158750 0.1908419 0.1693655 0.218255 0.300528 10
# B 0.182985 0.184522 0.2025408 0.1970235 0.209944 0.243328 10
# C 0.176280 0.201632 0.2618147 0.2303025 0.339499 0.388249 10
# D 0.136890 0.139054 0.1552231 0.1518010 0.168738 0.193042 10
# E 0.100921 0.116496 0.1321823 0.1222930 0.129976 0.230477 10
# F 0.103017 0.110281 0.1214480 0.1199880 0.124319 0.147506 10
# G 0.099246 0.110280 0.1195553 0.1188705 0.125436 0.149741 10
# H 0.114331 0.117264 0.1249819 0.1220830 0.129557 0.143385 10
# A 0.096801 0.120966 0.1264836 0.1262045 0.135773 0.158192 10
# B 0.102807 0.123899 0.1258339 0.1286835 0.132420 0.143245 10
# C 0.122503 0.128299 0.1374623 0.1292070 0.139683 0.187315 10
# D 0.087372 0.093239 0.1053774 0.1026330 0.113633 0.128299 10
# E 0.084020 0.098617 0.1124383 0.1094420 0.113423 0.178515 10
# F 0.080667 0.085346 0.1068579 0.1128295 0.115030 0.133537 10
# G 0.087443 0.090026 0.1030171 0.0995250 0.106369 0.152325 10
# H 0.084648 0.103156 0.1058313 0.1095120 0.112864 0.117265 10
```
Of course, when running `mo_phylum("Firmicutes")` the function has zero knowledge about the actual microorganism, namely *S. aureus*. But since the result would be `"Firmicutes"` too, there is no point in calculating the result. And since this package 'knows' all phyla of all known microorganisms (according to ITIS), it can just return the initial value immediately.
Of course, when running `mo_phylum("Firmicutes")` the function has zero knowledge about the actual microorganism, namely *S. aureus*. But since the result would be `"Firmicutes"` too, there is no point in calculating the result. And because this package 'knows' all phyla of all known microorganisms (according to ITIS), it can just return the initial value immediately.
## Copyright

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
data/microorganismsDT.rda Normal file

Binary file not shown.

2
man/antibiotics.Rd Executable file → Normal file
View File

@ -4,7 +4,7 @@
\name{antibiotics}
\alias{antibiotics}
\title{Data set with 423 antibiotics}
\format{A \code{\link{tibble}} with 423 observations and 18 variables:
\format{A \code{\link{data.frame}} with 423 observations and 18 variables:
\describe{
\item{\code{atc}}{ATC code, like \code{J01CR02}}
\item{\code{certe}}{Certe code, like \code{amcl}}

View File

@ -54,7 +54,15 @@ A microbial ID from this package (class: \code{mo}) typically looks like these e
Use the \code{\link{mo_property}} functions to get properties based on the returned code, see Examples.
This function uses Artificial Intelligence (AI) to help getting more logical results, based on type of input and known prevalence of human pathogens. For example:
This function uses Artificial Intelligence (AI) to help getting fast and logical results. It tries to find matches in this order:
\itemize{
\item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
\item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
\item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations}
\item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
}
A couple of effects because of these rules
\itemize{
\item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first}
\item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae} for the same reason}
@ -70,7 +78,7 @@ This means that looking up human pathogenic microorganisms takes less time than
\if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr}
This package contains the \strong{complete microbial taxonomic data} (with all seven taxonomic ranks - from subkingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}).
All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens.
All (sub)species from the \strong{taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This \strong{allows users to use authoritative taxonomic information} for their data analysis on any microorganism, not only human pathogens. It also helps to \strong{quickly determine the Gram stain of bacteria}, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.
ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].
}

View File

@ -10,12 +10,12 @@
frequency_tbl(x, ..., sort.count = TRUE,
nmax = getOption("max.print.freq"), na.rm = TRUE, row.names = TRUE,
markdown = !interactive(), digits = 2, quote = FALSE,
header = !markdown, na = "<NA>", sep = " ")
header = !markdown, title = NULL, na = "<NA>", sep = " ")
freq(x, ..., sort.count = TRUE, nmax = getOption("max.print.freq"),
na.rm = TRUE, row.names = TRUE, markdown = !interactive(),
digits = 2, quote = FALSE, header = !markdown, na = "<NA>",
sep = " ")
digits = 2, quote = FALSE, header = !markdown, title = NULL,
na = "<NA>", sep = " ")
top_freq(f, n)
@ -43,6 +43,8 @@ top_freq(f, n)
\item{header}{a logical value indicating whether an informative header should be printed}
\item{title}{text to show above frequency table, at default to tries to coerce from the variables passed to \code{x}}
\item{na}{a character string to should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})}
\item{sep}{a character string to separate the terms when selecting multiple columns}

View File

@ -19,7 +19,7 @@
\item{\code{subkingdom}}{Taxonomic subkingdom of the microorganism as found in ITIS, see Source}
\item{\code{gramstain}}{Gram of microorganism, like \code{"Gram negative"}}
\item{\code{type}}{Type of microorganism, like \code{"Bacteria"} and \code{"Fungi"}}
\item{\code{prevalence}}{A rounded integer based on prevalence of the microorganism. Used internally by \code{\link{as.mo}}, otherwise quite meaningless.}
\item{\code{prevalence}}{An integer based on estimated prevalence of the microorganism in humans. Used internally by \code{\link{as.mo}}, otherwise quite meaningless. It has a value of 25 for manually added items and a value of 1000 for all unprevalent microorganisms whose genus was somewhere in the top 250 (with another species).}
\item{\code{ref}}{Author(s) and year of concerning publication as found in ITIS, see Source}
}}
\source{
@ -36,7 +36,7 @@ A data set containing the complete microbial taxonomy of the kingdoms Bacteria,
\if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr}
This package contains the \strong{complete microbial taxonomic data} (with all seven taxonomic ranks - from subkingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}).
All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens.
All (sub)species from the \strong{taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This \strong{allows users to use authoritative taxonomic information} for their data analysis on any microorganism, not only human pathogens. It also helps to \strong{quickly determine the Gram stain of bacteria}, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.
ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].
}

View File

@ -4,7 +4,7 @@
\name{microorganisms.certe}
\alias{microorganisms.certe}
\title{Translation table for Certe}
\format{A \code{\link{tibble}} with 2,665 observations and 2 variables:
\format{A \code{\link{data.frame}} with 2,665 observations and 2 variables:
\describe{
\item{\code{certe}}{Code of microorganism according to Certe MMB}
\item{\code{mo}}{Code of microorganism in \code{\link{microorganisms}}}

View File

@ -25,7 +25,7 @@ A data set containing old (previously valid or accepted) taxonomic names accordi
\if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr}
This package contains the \strong{complete microbial taxonomic data} (with all seven taxonomic ranks - from subkingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}).
All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens.
All (sub)species from the \strong{taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This \strong{allows users to use authoritative taxonomic information} for their data analysis on any microorganism, not only human pathogens. It also helps to \strong{quickly determine the Gram stain of bacteria}, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.
ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].
}

View File

@ -4,7 +4,7 @@
\name{microorganisms.umcg}
\alias{microorganisms.umcg}
\title{Translation table for UMCG}
\format{A \code{\link{tibble}} with 1,095 observations and 2 variables:
\format{A \code{\link{data.frame}} with 1,095 observations and 2 variables:
\describe{
\item{\code{umcg}}{Code of microorganism according to UMCG MMB}
\item{\code{certe}}{Code of microorganism according to Certe MMB}

View File

@ -71,7 +71,7 @@ Use these functions to return a specific property of a microorganism from the \c
\if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr}
This package contains the \strong{complete microbial taxonomic data} (with all seven taxonomic ranks - from subkingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}).
All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens.
All (sub)species from the \strong{taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This \strong{allows users to use authoritative taxonomic information} for their data analysis on any microorganism, not only human pathogens. It also helps to \strong{quickly determine the Gram stain of bacteria}, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.
ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].
}

24
man/supplementary_data.Rd Normal file
View File

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{supplementary_data}
\alias{supplementary_data}
\alias{microorganismsDT}
\alias{microorganisms.prevDT}
\alias{microorganisms.unprevDT}
\alias{microorganisms.oldDT}
\title{Supplementary Data}
\format{An object of class \code{data.table} (inherits from \code{data.frame}) with 18833 rows and 15 columns.}
\usage{
microorganismsDT
microorganisms.prevDT
microorganisms.unprevDT
microorganisms.oldDT
}
\description{
These \code{\link{data.table}s} are transformed from the \code{\link{microorganisms}} and \code{\link{microorganisms}} data sets to improve speed of \code{\link{as.mo}}. They are meant for internal use only, and are only mentioned here for reference.
}
\keyword{datasets}

View File

@ -21,6 +21,7 @@ test_that("frequency table works", {
expect_output(print(freq(septic_patients$age, markdown = TRUE), markdown = TRUE))
expect_output(print(freq(septic_patients$age[0])))
expect_output(print(freq(septic_patients$age, quote = TRUE)))
expect_output(print(freq(septic_patients$age, markdown = TRUE, title = "TITLE")))
# character
expect_output(print(freq(septic_patients$mo)))

View File

@ -53,4 +53,10 @@ test_that("mo_property works", {
expect_identical(mo_property("E. coli", property = "species"),
mo_species("E. coli"))
# check vector with random values
library(dplyr)
df_sample <- AMR::microorganisms %>% sample_n(100)
expect_identical(df_sample %>% pull(mo) %>% mo_fullname(),
df_sample %>% pull(fullname))
})

View File

@ -39,7 +39,7 @@ The `AMR` package basically does four important things:
3. It **analyses the data** with convenient functions that use well-known methods.
* Calculate the resistance (and even co-resistance) of microbial isolates with the `portion_R`, `portion_IR`, `portion_I`, `portion_SI` and `portion_S` functions. Similarly, the *amount* of isolates can be determined with the `count_R`, `count_IR`, `count_I`, `count_SI` and `count_S` functions. All these functions can be used [with the `dplyr` package](https://dplyr.tidyverse.org/#usage) (e.g. in conjunction with [`summarise`](https://dplyr.tidyverse.org/reference/summarise.html))
* Calculate the resistance (and even co-resistance) of microbial isolates with the `portion_R`, `portion_IR`, `portion_I`, `portion_SI` and `portion_S` functions. Similarly, the *number* of isolates can be determined with the `count_R`, `count_IR`, `count_I`, `count_SI` and `count_S` functions. All these functions can be used [with the `dplyr` package](https://dplyr.tidyverse.org/#usage) (e.g. in conjunction with [`summarise`](https://dplyr.tidyverse.org/reference/summarise.html))
* Plot AMR results with `geom_rsi`, a function made for the `ggplot2` package
* Predict antimicrobial resistance for the nextcoming years using logistic regression models with the `resistance_predict` function
* Conduct descriptive statistics to enhance base R: calculate kurtosis, skewness and create frequency tables