diff --git a/DESCRIPTION b/DESCRIPTION index c9648b26..9a86ca29 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR Version: 0.5.0.9009 -Date: 2019-01-11 +Date: 2019-01-12 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/R/ab_property.R b/R/ab_property.R old mode 100644 new mode 100755 diff --git a/R/abname.R b/R/abname.R old mode 100644 new mode 100755 diff --git a/R/age.R b/R/age.R old mode 100644 new mode 100755 index 399e7ea0..265b1c9e --- a/R/age.R +++ b/R/age.R @@ -21,11 +21,11 @@ #' Age in years of individuals #' -#' Calculates age in years based on a reference date, which is the sytem time at default. +#' Calculates age in years based on a reference date, which is the sytem date at default. #' @param x date(s), will be coerced with \code{\link{as.POSIXlt}} -#' @param reference reference date(s) (defaults to today), will be coerced with \code{\link{as.POSIXlt}} +#' @param reference reference date(s) (defaults to today), will be coerced with \code{\link{as.POSIXlt}} and cannot be lower than \code{x} #' @return Integer (no decimals) -#' @seealso \code{\link{age_groups}} to splits age into groups +#' @seealso \code{\link{age_groups}} to split age into age groups #' @importFrom dplyr if_else #' @inheritSection AMR Read more on our website! #' @export @@ -45,8 +45,8 @@ age <- function(x, reference = Sys.Date()) { years_gap <- reference$year - x$year # from https://stackoverflow.com/a/25450756/4575331 ages <- if_else(reference$mon < x$mon | (reference$mon == x$mon & reference$mday < x$mday), - as.integer(years_gap - 1), - as.integer(years_gap)) + as.integer(years_gap - 1), + as.integer(years_gap)) if (any(ages > 120)) { warning("Some ages are > 120.") } @@ -60,7 +60,7 @@ age <- function(x, reference = Sys.Date()) { #' @param split_at values to split \code{x} at, defaults to age groups 0-11, 12-24, 26-54, 55-74 and 75+. See Details. #' @details To split ages, the input can be: #' \itemize{ -#' \item{A numeric vector. A vector of \code{c(10, 20)} will split on 0-9, 10-19 and 20+. A value of only \code{50} will split on 0-49 and 50+. +#' \item{A numeric vector. A vector of e.g. \code{c(10, 20)} will split on 0-9, 10-19 and 20+. A value of only \code{50} will split on 0-49 and 50+. #' The default is to split on young children (0-11), youth (12-24), young adults (26-54), middle-aged adults (55-74) and elderly (75+).} #' \item{A character:} #' \itemize{ @@ -139,8 +139,7 @@ age_groups <- function(x, split_at = c(12, 25, 55, 75)) { for (i in 1:length(split_at)) { y[x >= split_at[i]] <- i # create labels - # when age group consists of only one age - labs[i - 1] <- paste0(unique(c(split_at[i - 1], split_at[i] - 1)), collapse = "-") + labs[i - 1] <- paste0(unique(c(split_at[i - 1], split_at[i] - 1)), collapse = "-") } # last category diff --git a/R/count.R b/R/count.R old mode 100644 new mode 100755 diff --git a/R/deprecated.R b/R/deprecated.R old mode 100644 new mode 100755 diff --git a/R/g.test.R b/R/g.test.R old mode 100644 new mode 100755 index 7a6d05c5..3d05d079 --- a/R/g.test.R +++ b/R/g.test.R @@ -44,7 +44,7 @@ #' #' It is also possible to do a \emph{G}-test of independence with more than two nominal variables. For example, Jackson et al. (2013) also had data for children under 3, so you could do an analysis of old vs. young, thigh vs. arm, and reaction vs. no reaction, all analyzed together. #' -#' Fisher's exact test (\code{\link{fisher.test}}) is more accurate than the \emph{G}-test of independence when the expected numbers are small, so it is recommend to only use the \emph{G}-test if your total sample size is greater than 1000. +#' Fisher's exact test (\code{\link{fisher.test}}) is an \strong{exact} test, where the \emph{G}-test is still only an \strong{approximation}. For any 2x2 table, Fisher's Exact test may be slower but will still run in seconds, even if the sum of your observations is multiple millions. #' #' The \emph{G}-test of independence is an alternative to the chi-square test of independence (\code{\link{chisq.test}}), and they will give approximately the same results. #' @section How the test works: @@ -155,6 +155,9 @@ g.test <- function(x, nc <- as.integer(ncol(x)) if (is.na(nr) || is.na(nc) || is.na(nr * nc)) stop("invalid nrow(x) or ncol(x)", domain = NA) + # add fisher.test suggestion + if (nr == 2 && nc == 2) + warning("`fisher.test()` is always more reliable for 2x2 tables and although must slower, often only takes seconds.") sr <- rowSums(x) sc <- colSums(x) E <- outer(sr, sc, "*")/n @@ -221,15 +224,9 @@ g.test <- function(x, } names(STATISTIC) <- "X-squared" names(PARAMETER) <- "df" - # if (any(E < 5) && is.finite(PARAMETER)) - # warning("G-statistic approximation may be incorrect") + if (any(E < 5) && is.finite(PARAMETER)) + warning("G-statistic approximation may be incorrect due to E < 5") - # suggest fisher.test when total is < 1000 (John McDonald, Handbook of Biological Statistics, 2014) - if (sum(x, na.rm = TRUE) < 1000 && is.finite(PARAMETER)) { - warning("G-statistic approximation may be incorrect, consider Fisher's Exact test") - } else if (any(E < 5) && is.finite(PARAMETER)) { - warning("G-statistic approximation may be incorrect, consider Fisher's Exact test") - } structure(list(statistic = STATISTIC, parameter = PARAMETER, p.value = PVAL, method = METHOD, data.name = DNAME, observed = x, expected = E, residuals = (x - E)/sqrt(E), diff --git a/R/get_locale.R b/R/get_locale.R old mode 100644 new mode 100755 diff --git a/R/ggplot_rsi.R b/R/ggplot_rsi.R old mode 100644 new mode 100755 diff --git a/R/guess_ab_col.R b/R/guess_ab_col.R old mode 100644 new mode 100755 diff --git a/R/itis.R b/R/itis.R old mode 100644 new mode 100755 index e4bc2477..f1e74752 --- a/R/itis.R +++ b/R/itis.R @@ -26,7 +26,7 @@ #' \if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr} #' This package contains the \strong{complete microbial taxonomic data} (with all nine taxonomic ranks - from kingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}). #' -#' All (sub)species from \strong{the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since all bacteria are classified into subkingdom Negibacteria or Posibacteria. +#' All ~20,000 (sub)species from \strong{the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all ~2,500 previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since all bacteria are classified into subkingdom Negibacteria or Posibacteria. #' #' ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3]. #' @inheritSection AMR Read more on our website! diff --git a/R/key_antibiotics.R b/R/key_antibiotics.R old mode 100644 new mode 100755 diff --git a/R/kurtosis.R b/R/kurtosis.R old mode 100644 new mode 100755 diff --git a/R/like.R b/R/like.R old mode 100644 new mode 100755 diff --git a/R/mo.R b/R/mo.R old mode 100644 new mode 100755 diff --git a/R/mo_property.R b/R/mo_property.R old mode 100644 new mode 100755 diff --git a/R/p.symbol.R b/R/p.symbol.R old mode 100644 new mode 100755 diff --git a/R/read.4d.R b/R/read.4d.R old mode 100644 new mode 100755 diff --git a/R/resistance_predict.R b/R/resistance_predict.R old mode 100644 new mode 100755 diff --git a/R/rsi.R b/R/rsi.R old mode 100644 new mode 100755 diff --git a/R/rsi_calc.R b/R/rsi_calc.R old mode 100644 new mode 100755 diff --git a/R/skewness.R b/R/skewness.R old mode 100644 new mode 100755 diff --git a/docs/articles/AMR.html b/docs/articles/AMR.html index d586dc73..79ab6ab5 100644 --- a/docs/articles/AMR.html +++ b/docs/articles/AMR.html @@ -178,7 +178,7 @@
AMR.Rmd
Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 11 January 2019.
+Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 12 January 2019.
As with many uses in R, we need some additional packages for AMR analysis. The most important one is dplyr
, which tremendously improves the way we work with data - it allows for a very natural way of writing syntaxes in R. Another important dependency is ggplot2
. This package can be used to create beautiful plots in R.
Our AMR
package depends on these packages and even extends their use and functions.
library(dplyr) # the data science package
-library(AMR) # this package, to simplify and automate AMR analysis
-library(ggplot2) # for appealing plots
library(dplyr) # the data science package
+library(AMR) # this package, to simplify and automate AMR analysis
+library(ggplot2) # for appealing plots
To start with patients, we need a unique list of patients.
- +The LETTERS
object is available in R - it’s a vector with 26 characters: A
to Z
. The patients
object we just created is now a vector of length 260, with values (patient IDs) varying from A1
to Z10
. Now we we also set the gender of our patients, by putting the ID and the gender in a table:
patients_table <- data.frame(patient_id = patients,
+ gender = c(rep("M", 135),
+ rep("F", 125)))
The first 135 patient IDs are now male, the other 125 are female.
Let’s pretend that our data consists of blood cultures isolates from 1 January 2010 until 1 January 2018.
- +This dates
object now contains all days in our date range.
For this tutorial, we will uses four different microorganisms: Escherichia coli, Staphylococcus aureus, Streptococcus pneumoniae, and Klebsiella pneumoniae:
-bacteria <- c("Escherichia coli", "Staphylococcus aureus",
- "Streptococcus pneumoniae", "Klebsiella pneumoniae")
bacteria <- c("Escherichia coli", "Staphylococcus aureus",
+ "Streptococcus pneumoniae", "Klebsiella pneumoniae")
For completeness, we can also add the hospital where the patients was admitted and we need to define valid antibmicrobial results for our randomisation:
- +Using the sample()
function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob
parameter.
data <- data.frame(date = sample(dates, 5000, replace = TRUE),
- patient_id = sample(patients, 5000, replace = TRUE),
- hospital = sample(hospitals, 5000, replace = TRUE, prob = c(0.30, 0.35, 0.15, 0.20)),
- bacteria = sample(bacteria, 5000, replace = TRUE, prob = c(0.50, 0.25, 0.15, 0.10)),
- amox = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.60, 0.05, 0.35)),
- amcl = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.75, 0.10, 0.15)),
- cipr = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.80, 0.00, 0.20)),
- gent = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.92, 0.00, 0.08))
- )
Using the left_join()
function from the dplyr
package, we can ‘map’ the gender to the patient ID using the patients_table
object we created earlier:
Using the sample()
function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob
parameter.
data <- data.frame(date = sample(dates, 5000, replace = TRUE),
+ patient_id = sample(patients, 5000, replace = TRUE),
+ hospital = sample(hospitals, 5000, replace = TRUE, prob = c(0.30, 0.35, 0.15, 0.20)),
+ bacteria = sample(bacteria, 5000, replace = TRUE, prob = c(0.50, 0.25, 0.15, 0.10)),
+ amox = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.60, 0.05, 0.35)),
+ amcl = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.75, 0.10, 0.15)),
+ cipr = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.80, 0.00, 0.20)),
+ gent = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.92, 0.00, 0.08))
+ )
Using the left_join()
function from the dplyr
package, we can ‘map’ the gender to the patient ID using the patients_table
object we created earlier:
data <- data %>% left_join(patients_table)
The resulting data set contains 5,000 blood culture isolates. With the head()
function we can preview the first 6 values of this data set:
head(data)
date | @@ -303,70 +303,70 @@|||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-02-02 | -P8 | -Hospital D | +2013-07-18 | +Z9 | +Hospital C | Escherichia coli | S | -R | +S | S | S | F | |||
2013-10-26 | -Q1 | -Hospital B | +2010-02-24 | +P1 | +Hospital D | Escherichia coli | -I | -R | R | S | +S | +S | F | ||
2017-06-12 | -E5 | -Hospital D | +2014-04-02 | +A10 | +Hospital B | Streptococcus pneumoniae | S | -R | +S | S | S | M | |||
2013-06-16 | -K7 | +2016-05-30 | +A7 | Hospital B | Escherichia coli | -S | -I | -S | +R | +R | +R | S | M | ||
2013-01-11 | -M4 | -Hospital B | -Staphylococcus aureus | +2017-11-22 | +O6 | +Hospital D | +Klebsiella pneumoniae | +R | I | S | S | -S | -M | +F | |
2016-11-18 | -W10 | -Hospital A | -Staphylococcus aureus | +2017-03-08 | +L4 | +Hospital B | +Escherichia coli | +R | +R | S | S | -S | -S | -F | +M |
Use the frequency table function freq()
to look specifically for unique values in any variable. For example, for the gender
variable:
data %>% freq(gender) # this would be the same: freq(data$gender)
# Frequency table of `gender`
# Class: factor (numeric)
# Levels: F, M
@@ -386,67 +386,67 @@
#
# Item Count Percent Cum. Count Cum. Percent
# --- ----- ------ -------- ----------- -------------
-# 1 M 2,598 52.0% 2,598 52.0%
-# 2 F 2,402 48.0% 5,000 100.0%
+# 1 M 2,632 52.6% 2,632 52.6%
+# 2 F 2,368 47.4% 5,000 100.0%
So, we can draw at least two conclusions immediately. From a data scientist perspective, the data looks clean: only values M
and F
. From a researcher perspective: there are slightly more men. Nothing we didn’t already know.
The data is already quite clean, but we still need to transform some variables. The bacteria
column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate()
function of the dplyr
package makes this really easy:
We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi()
function ensures reliability and reproducibility in these kind of variables. The mutate_at()
will run the as.rsi()
function on defined variables:
The data is already quite clean, but we still need to transform some variables. The bacteria
column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate()
function of the dplyr
package makes this really easy:
We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi()
function ensures reliability and reproducibility in these kind of variables. The mutate_at()
will run the as.rsi()
function on defined variables:
Finally, we will apply EUCAST rules on our antimicrobial results. In Europe, most medical microbiological laboratories already apply these rules. Our package features their latest insights on intrinsic resistance and exceptional phenotypes. Moreover, the eucast_rules()
function can also apply additional rules, like forcing
Because the amoxicillin (column amox
) and amoxicillin/clavulanic acid (column amcl
) in our data were generated randomly, some rows will undoubtedly contain amox = S and amcl = R, which is technically impossible. The eucast_rules()
fixes this:
data <- eucast_rules(data, col_mo = "bacteria")
-#
-# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
-#
-# EUCAST Clinical Breakpoints (v8.1, 2018)
-# Enterobacteriales (Order) (no changes)
-# Staphylococcus (no changes)
-# Enterococcus (no changes)
-# Streptococcus groups A, B, C, G (no changes)
-# Streptococcus pneumoniae (no changes)
-# Viridans group streptococci (no changes)
-# Haemophilus influenzae (no changes)
-# Moraxella catarrhalis (no changes)
-# Anaerobic Gram positives (no changes)
-# Anaerobic Gram negatives (no changes)
-# Pasteurella multocida (no changes)
-# Campylobacter jejuni and C. coli (no changes)
-# Aerococcus sanguinicola and A. urinae (no changes)
-# Kingella kingae (no changes)
-#
-# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
-# Table 1: Intrinsic resistance in Enterobacteriaceae (333 changes)
-# Table 2: Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
-# Table 3: Intrinsic resistance in other Gram-negative bacteria (no changes)
-# Table 4: Intrinsic resistance in Gram-positive bacteria (692 changes)
-# Table 8: Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
-# Table 9: Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
-# Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
-# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no changes)
-# Table 12: Interpretive rules for aminoglycosides (no changes)
-# Table 13: Interpretive rules for quinolones (no changes)
-#
-# Other rules
-# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (no changes)
-# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no changes)
-# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no changes)
-# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (no changes)
-# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
-# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
-#
-# => EUCAST rules affected 1,830 out of 5,000 rows -> changed 1,025 test results.
data <- eucast_rules(data, col_mo = "bacteria")
+#
+# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
+#
+# EUCAST Clinical Breakpoints (v8.1, 2018)
+# Enterobacteriales (Order) (no changes)
+# Staphylococcus (no changes)
+# Enterococcus (no changes)
+# Streptococcus groups A, B, C, G (no changes)
+# Streptococcus pneumoniae (no changes)
+# Viridans group streptococci (no changes)
+# Haemophilus influenzae (no changes)
+# Moraxella catarrhalis (no changes)
+# Anaerobic Gram positives (no changes)
+# Anaerobic Gram negatives (no changes)
+# Pasteurella multocida (no changes)
+# Campylobacter jejuni and C. coli (no changes)
+# Aerococcus sanguinicola and A. urinae (no changes)
+# Kingella kingae (no changes)
+#
+# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
+# Table 1: Intrinsic resistance in Enterobacteriaceae (307 changes)
+# Table 2: Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
+# Table 3: Intrinsic resistance in other Gram-negative bacteria (no changes)
+# Table 4: Intrinsic resistance in Gram-positive bacteria (708 changes)
+# Table 8: Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
+# Table 9: Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
+# Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
+# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no changes)
+# Table 12: Interpretive rules for aminoglycosides (no changes)
+# Table 13: Interpretive rules for quinolones (no changes)
+#
+# Other rules
+# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (no changes)
+# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no changes)
+# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no changes)
+# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (no changes)
+# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
+# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
+#
+# => EUCAST rules affected 1,871 out of 5,000 rows -> changed 1,015 test results.
Now that we have the microbial ID, we can add some taxonomic properties:
-data <- data %>%
- mutate(gramstain = mo_gramstain(bacteria),
- genus = mo_genus(bacteria),
- species = mo_species(bacteria))
data <- data %>%
+ mutate(gramstain = mo_gramstain(bacteria),
+ genus = mo_genus(bacteria),
+ species = mo_species(bacteria))
(…) When preparing a cumulative antibiogram to guide clinical decisions about empirical antimicrobial therapy of initial infections, only the first isolate of a given species per patient, per analysis period (eg, one year) should be included, irrespective of body site, antimicrobial susceptibility profile, or other phenotypical characteristics (eg, biotype). The first isolate is easily identified, and cumulative antimicrobial susceptibility test data prepared using the first isolate are generally comparable to cumulative antimicrobial susceptibility test data calculated by other methods, providing duplicate isolates are excluded.
M39-A4 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition. CLSI, 2014. Chapter 6.4
This AMR
package includes this methodology with the first_isolate()
function. It adopts the episode of a year (can be changed by user) and it starts counting days after every selected isolate. This new variable can easily be added to our data:
data <- data %>%
- mutate(first = first_isolate(.))
-# NOTE: Using column `bacteria` as input for `col_mo`.
-# NOTE: Using column `date` as input for `col_date`.
-# NOTE: Using column `patient_id` as input for `col_patient_id`.
-# => Found 2,962 first isolates (59.2% of total)
So only 59.2% is suitable for resistance analysis! We can now filter on is with the filter()
function, also from the dplyr
package:
data <- data %>%
+ mutate(first = first_isolate(.))
+# NOTE: Using column `bacteria` as input for `col_mo`.
+# NOTE: Using column `date` as input for `col_date`.
+# NOTE: Using column `patient_id` as input for `col_patient_id`.
+# => Found 2,951 first isolates (59.0% of total)
So only 59% is suitable for resistance analysis! We can now filter on is with the filter()
function, also from the dplyr
package:
data_1st <- data %>%
+ filter(first == TRUE)
For future use, the above two syntaxes can be shortened with the filter_first_isolate()
function:
data_1st <- data %>%
+ filter_first_isolate()
Only 3 isolates are marked as ‘first’ according to CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and show be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics()
function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.
If a column exists with a name like ‘key(…)ab’ the first_isolate()
function will automatically use it and determine the first weighted isolates. Mind the NOTEs in below output:
data <- data %>%
- mutate(keyab = key_antibiotics(.)) %>%
- mutate(first_weighted = first_isolate(.))
-# NOTE: Using column `bacteria` as input for `col_mo`.
-# amox amcl cipr gent
-# "amox" "amcl" "cipr" "gent"
-# [1] "amox" "amcl" "cipr"
-# amox amcl cipr gent
-# "amox" "amcl" "cipr" "gent"
-# NOTE: Using column `bacteria` as input for `col_mo`.
-# NOTE: Using column `date` as input for `col_date`.
-# NOTE: Using column `patient_id` as input for `col_patient_id`.
-# NOTE: Using column `keyab` as input for `col_keyantibiotics`. Use col_keyantibiotics = FALSE to prevent this.
-# [Criterion] Inclusion based on key antibiotics, ignoring I.
-# => Found 4,399 first weighted isolates (88.0% of total)
data <- data %>%
+ mutate(keyab = key_antibiotics(.)) %>%
+ mutate(first_weighted = first_isolate(.))
+# NOTE: Using column `bacteria` as input for `col_mo`.
+# amox amcl cipr gent
+# "amox" "amcl" "cipr" "gent"
+# NOTE: Using column `bacteria` as input for `col_mo`.
+# NOTE: Using column `date` as input for `col_date`.
+# NOTE: Using column `patient_id` as input for `col_patient_id`.
+# NOTE: Using column `keyab` as input for `col_keyantibiotics`. Use col_keyantibiotics = FALSE to prevent this.
+# [Criterion] Inclusion based on key antibiotics, ignoring I.
+# => Found 4,431 first weighted isolates (88.6% of total)
isolate | @@ -632,118 +629,118 @@||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | -2010-05-23 | -E7 | +2010-03-01 | +Y4 | B_ESCHR_COL | -R | S | -R | +S | +S | S | TRUE | TRUE | |
2 | -2010-08-03 | -E7 | +2010-11-02 | +Y4 | B_ESCHR_COL | S | S | S | S | FALSE | -TRUE | +FALSE | ||
3 | -2011-01-20 | -E7 | +2011-06-12 | +Y4 | B_ESCHR_COL | R | +S | R | S | -S | -FALSE | +TRUE | TRUE | |
4 | -2011-02-21 | -E7 | +2011-09-03 | +Y4 | B_ESCHR_COL | S | -R | -R | +S | +S | S | FALSE | TRUE | |
5 | -2011-08-04 | -E7 | +2011-09-07 | +Y4 | B_ESCHR_COL | S | S | +R | S | -S | -TRUE | +FALSE | TRUE | |
6 | -2011-11-15 | -E7 | +2012-06-11 | +Y4 | B_ESCHR_COL | -R | S | S | S | -FALSE | +S | +TRUE | TRUE | |
7 | -2012-01-13 | -E7 | +2012-08-16 | +Y4 | B_ESCHR_COL | S | S | -R | -R | +S | +S | +FALSE | FALSE | -TRUE |
8 | -2012-03-10 | -E7 | +2012-11-24 | +Y4 | B_ESCHR_COL | S | +I | S | -S | -S | +R | FALSE | TRUE | |
9 | -2012-11-09 | -E7 | +2012-12-12 | +Y4 | B_ESCHR_COL | S | S | +S | R | -S | -TRUE | -TRUE | +FALSE | +FALSE |
10 | -2013-04-06 | -E7 | +2013-02-02 | +Y4 | B_ESCHR_COL | -S | +R | S | S | S | @@ -752,16 +749,16 @@
Instead of 3, now 10 isolates are flagged. In total, 88% of all isolates are marked ‘first weighted’ - 147.2% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.
+Instead of 3, now 7 isolates are flagged. In total, 88.6% of all isolates are marked ‘first weighted’ - 147.6% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.
As with filter_first_isolate()
, there’s a shortcut for this new algorithm too:
So we end up with 4,399 isolates for analysis.
+data_1st <- data %>%
+ filter_first_weighted_isolate()
So we end up with 4,431 isolates for analysis.
We can remove unneeded columns:
- +Now our data looks like:
- +head(data_1st)
@@ -782,12 +779,12 @@ | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | -2014-02-02 | -P8 | -Hospital D | +2013-07-18 | +Z9 | +Hospital C | B_ESCHR_COL | S | -R | +S | S | S | F | @@ -798,14 +795,14 @@|||||||||
2 | -2013-10-26 | -Q1 | -Hospital B | +2010-02-24 | +P1 | +Hospital D | B_ESCHR_COL | -I | -R | R | S | +S | +S | F | Gram negative | Escherichia | @@ -814,12 +811,12 @@||||||
3 | -2017-06-12 | -E5 | -Hospital D | +2014-04-02 | +A10 | +Hospital B | B_STRPTC_PNE | S | -R | +S | S | R | M | @@ -830,13 +827,13 @@|||||||||
4 | -2013-06-16 | -K7 | +2016-05-30 | +A7 | Hospital B | B_ESCHR_COL | -S | -I | -S | +R | +R | +R | S | M | Gram negative | @@ -845,30 +842,30 @@TRUE | ||||||
5 | -2013-01-11 | -M4 | +6 | +2017-03-08 | +L4 | Hospital B | -B_STPHY_AUR | -I | -S | +B_ESCHR_COL | +R | +R | S | S | M | -Gram positive | -Staphylococcus | -aureus | +Gram negative | +Escherichia | +coli | TRUE |
7 | -2012-01-05 | -S10 | -Hospital D | +2014-06-24 | +O7 | +Hospital B | B_STPHY_AUR | -R | -I | -R | +S | +S | +S | S | F | Gram positive | @@ -886,12 +883,12 @@ Analysing the data||||||
1 | Escherichia coli | -2,138 | -48.6% | -2,138 | -48.6% | +2,143 | +48.4% | +2,143 | +48.4% | |||||||||||||
2 | Staphylococcus aureus | -1,070 | -24.3% | -3,208 | -72.9% | +1,130 | +25.5% | +3,273 | +73.9% | |||||||||||||
3 | Streptococcus pneumoniae | -697 | -15.8% | -3,905 | -88.8% | +709 | +16.0% | +3,982 | +89.9% | |||||||||||||
4 | Klebsiella pneumoniae | -494 | -11.2% | -4,399 | +449 | +10.1% | +4,431 | 100.0% |
hospital | @@ -957,27 +954,27 @@ Longest: 24||
---|---|---|
Hospital A | -0.4591382 | +0.4626060 |
Hospital B | -0.5000000 | +0.4652956 |
Hospital C | -0.4682171 | +0.4336283 |
Hospital D | -0.4953380 | +0.4677778 |
Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi()
can be used, which works exactly like n_distinct()
from the dplyr
package. It counts all isolates available for every group (i.e. values S, I or R):
data_1st %>%
- group_by(hospital) %>%
- summarise(amoxicillin = portion_IR(amox),
- available = n_rsi(amox))
Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi()
can be used, which works exactly like n_distinct()
from the dplyr
package. It counts all isolates available for every group (i.e. values S, I or R):
data_1st %>%
+ group_by(hospital) %>%
+ summarise(amoxicillin = portion_IR(amox),
+ available = n_rsi(amox))
hospital | @@ -987,32 +984,32 @@ Longest: 24||||
---|---|---|---|---|
Hospital A | -0.4591382 | -1346 | +0.4626060 | +1297 |
Hospital B | -0.5000000 | -1550 | +0.4652956 | +1556 |
Hospital C | -0.4682171 | -645 | +0.4336283 | +678 |
Hospital D | -0.4953380 | -858 | +0.4677778 | +900 |
These functions can also be used to get the portion of multiple antibiotics, to calculate co-resistance very easily:
-data_1st %>%
- group_by(genus) %>%
- summarise(amoxicillin = portion_S(amcl),
- gentamicin = portion_S(gent),
- "amox + gent" = portion_S(amcl, gent))
data_1st %>%
+ group_by(genus) %>%
+ summarise(amoxicillin = portion_S(amcl),
+ gentamicin = portion_S(gent),
+ "amox + gent" = portion_S(amcl, gent))
genus | @@ -1023,94 +1020,94 @@ Longest: 24||||||
---|---|---|---|---|---|---|
Escherichia | -0.7193639 | -0.9111319 | -0.9742750 | +0.7312179 | +0.9094727 | +0.9752683 |
Klebsiella | -0.7226721 | -0.9028340 | -0.9777328 | +0.7750557 | +0.9064588 | +0.9866370 |
Staphylococcus | -0.7392523 | -0.9084112 | -0.9831776 | +0.7469027 | +0.9247788 | +0.9743363 |
Streptococcus | -0.7532281 | +0.7602257 | 0.0000000 | -0.7532281 | +0.7602257 |
To make a transition to the next part, let’s see how this difference could be plotted:
-data_1st %>%
- group_by(genus) %>%
- summarise("1. Amoxicillin" = portion_S(amcl),
- "2. Gentamicin" = portion_S(gent),
- "3. Amox + gent" = portion_S(amcl, gent)) %>%
- tidyr::gather("Antibiotic", "S", -genus) %>%
- ggplot(aes(x = genus,
- y = S,
- fill = Antibiotic)) +
- geom_col(position = "dodge2")
data_1st %>%
+ group_by(genus) %>%
+ summarise("1. Amoxicillin" = portion_S(amcl),
+ "2. Gentamicin" = portion_S(gent),
+ "3. Amox + gent" = portion_S(amcl, gent)) %>%
+ tidyr::gather("Antibiotic", "S", -genus) %>%
+ ggplot(aes(x = genus,
+ y = S,
+ fill = Antibiotic)) +
+ geom_col(position = "dodge2")
To show results in plots, most R users would nowadays use the ggplot2
package. This package lets you create plots in layers. You can read more about it on their website. A quick example would look like these syntaxes:
ggplot(data = a_data_set,
- mapping = aes(x = year,
- y = value)) +
- geom_col() +
- labs(title = "A title",
- subtitle = "A subtitle",
- x = "My X axis",
- y = "My Y axis")
-
-ggplot(a_data_set,
- aes(year, value) +
- geom_bar()
ggplot(data = a_data_set,
+ mapping = aes(x = year,
+ y = value)) +
+ geom_col() +
+ labs(title = "A title",
+ subtitle = "A subtitle",
+ x = "My X axis",
+ y = "My Y axis")
+
+ggplot(a_data_set,
+ aes(year, value) +
+ geom_bar()
The AMR
package contains functions to extend this ggplot2
package, for example geom_rsi()
. It automatically transforms data with count_df()
or portion_df()
and show results in stacked bars. Its simplest and shortest example:
Omit the translate_ab = FALSE
to have the antibiotic codes (amox, amcl, cipr, gent) translated to official WHO names (amoxicillin, amoxicillin and betalactamase inhibitor, ciprofloxacin, gentamicin).
If we group on e.g. the genus
column and add some additional functions from our package, we can create this:
# group the data on `genus`
-ggplot(data_1st %>% group_by(genus)) +
- # create bars with genus on x axis
- # it looks for variables with class `rsi`,
- # of which we have 4 (earlier created with `as.rsi`)
- geom_rsi(x = "genus") +
- # split plots on antibiotic
- facet_rsi(facet = "Antibiotic") +
- # make R red, I yellow and S green
- scale_rsi_colours() +
- # show percentages on y axis
- scale_y_percent(breaks = 0:4 * 25) +
- # turn 90 degrees, make it bars instead of columns
- coord_flip() +
- # add labels
- labs(title = "Resistance per genus and antibiotic",
- subtitle = "(this is fake data)") +
- # and print genus in italic to follow our convention
- # (is now y axis because we turned the plot)
- theme(axis.text.y = element_text(face = "italic"))
# group the data on `genus`
+ggplot(data_1st %>% group_by(genus)) +
+ # create bars with genus on x axis
+ # it looks for variables with class `rsi`,
+ # of which we have 4 (earlier created with `as.rsi`)
+ geom_rsi(x = "genus") +
+ # split plots on antibiotic
+ facet_rsi(facet = "Antibiotic") +
+ # make R red, I yellow and S green
+ scale_rsi_colours() +
+ # show percentages on y axis
+ scale_y_percent(breaks = 0:4 * 25) +
+ # turn 90 degrees, make it bars instead of columns
+ coord_flip() +
+ # add labels
+ labs(title = "Resistance per genus and antibiotic",
+ subtitle = "(this is fake data)") +
+ # and print genus in italic to follow our convention
+ # (is now y axis because we turned the plot)
+ theme(axis.text.y = element_text(face = "italic"))
To simplify this, we also created the ggplot_rsi()
function, which combines almost all above functions:
data_1st %>%
- group_by(genus) %>%
- ggplot_rsi(x = "genus",
- facet = "Antibiotic",
- breaks = 0:4 * 25,
- datalabels = FALSE) +
- coord_flip()
data_1st %>%
+ group_by(genus) %>%
+ ggplot_rsi(x = "genus",
+ facet = "Antibiotic",
+ breaks = 0:4 * 25,
+ datalabels = FALSE) +
+ coord_flip()
We can transform the data and apply the test in only a couple of lines:
-septic_patients %>%
- filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
- select(hospital_id, fosf) %>% # select the hospitals and fosfomycin
- group_by(hospital_id) %>% # group on the hospitals
- count_df(combine_IR = TRUE) %>% # count all isolates per group (hospital_id)
- tidyr::spread(hospital_id, Value) %>% # transform output so A and D are columns
- select(A, D) %>% # and select these only
- as.matrix() %>% # transform to good old matrix for fisher.test()
- fisher.test() # do Fisher's Exact Test
-#
-# Fisher's Exact Test for Count Data
-#
-# data: .
-# p-value = 0.03104
-# alternative hypothesis: true odds ratio is not equal to 1
-# 95 percent confidence interval:
-# 1.054283 4.735995
-# sample estimates:
-# odds ratio
-# 2.228006
septic_patients %>%
+ filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
+ select(hospital_id, fosf) %>% # select the hospitals and fosfomycin
+ group_by(hospital_id) %>% # group on the hospitals
+ count_df(combine_IR = TRUE) %>% # count all isolates per group (hospital_id)
+ tidyr::spread(hospital_id, Value) %>% # transform output so A and D are columns
+ select(A, D) %>% # and select these only
+ as.matrix() %>% # transform to good old matrix for fisher.test()
+ fisher.test() # do Fisher's Exact Test
+#
+# Fisher's Exact Test for Count Data
+#
+# data: .
+# p-value = 0.03104
+# alternative hypothesis: true odds ratio is not equal to 1
+# 95 percent confidence interval:
+# 1.054283 4.735995
+# sample estimates:
+# odds ratio
+# 2.228006
As can be seen, the p value is 0.03, which means that the fosfomycin resistances found in hospital A and D are really different.
diff --git a/docs/articles/AMR_files/figure-html/plot 1-1.png b/docs/articles/AMR_files/figure-html/plot 1-1.png index c9ff2753..b7934e12 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 1-1.png and b/docs/articles/AMR_files/figure-html/plot 1-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 3-1.png b/docs/articles/AMR_files/figure-html/plot 3-1.png index 6bc000b5..c742e8ea 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 3-1.png and b/docs/articles/AMR_files/figure-html/plot 3-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 4-1.png b/docs/articles/AMR_files/figure-html/plot 4-1.png index 53cfc234..51fcf483 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 4-1.png and b/docs/articles/AMR_files/figure-html/plot 4-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 5-1.png b/docs/articles/AMR_files/figure-html/plot 5-1.png index 5cc23525..311e8370 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 5-1.png and b/docs/articles/AMR_files/figure-html/plot 5-1.png differ diff --git a/docs/articles/EUCAST.html b/docs/articles/EUCAST.html index 48d7603c..b736e229 100644 --- a/docs/articles/EUCAST.html +++ b/docs/articles/EUCAST.html @@ -178,7 +178,7 @@EUCAST.Rmd
G_test.Rmd
Predict.Rmd
benchmarks.Rmd
One of the most important features of this package is the complete microbial taxonomic database, supplied by ITIS (https://www.itis.gov). We created a function as.mo()
that transforms any user input value to a valid microbial ID by using AI (Artificial Intelligence) and based on the taxonomic tree of ITIS.
Using the microbenchmark
package, we can review the calculation performance of this function.
library(microbenchmark)
In the next test, we try to ‘coerce’ different input values for Staphylococcus aureus. The actual result is the same every time: it returns its MO code B_STPHY_AUR
(B stands for Bacteria, the taxonomic kingdom).
But the calculation time differs a lot. Here, the AI effect can be reviewed best:
-microbenchmark(A = as.mo("stau"),
- B = as.mo("staaur"),
- C = as.mo("S. aureus"),
- D = as.mo("S. aureus"),
- E = as.mo("STAAUR"),
- F = as.mo("Staphylococcus aureus"),
- G = as.mo("B_STPHY_AUR"),
- times = 10,
- unit = "ms")
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# A 34.745551 34.798630 35.2596102 34.8994810 35.258325 38.067062 10
-# B 7.095386 7.125348 7.2219948 7.1613865 7.240377 7.495857 10
-# C 11.677114 11.733826 11.8304789 11.7715050 11.843756 12.317559 10
-# D 11.694435 11.730054 11.9859313 11.8775585 12.206371 12.750016 10
-# E 7.044402 7.117387 7.2271630 7.1923610 7.246104 7.742396 10
-# F 6.642326 6.778446 6.8988042 6.8753165 6.923577 7.513945 10
-# G 0.106788 0.131023 0.1351229 0.1357725 0.144014 0.146458 10
microbenchmark(A = as.mo("stau"),
+ B = as.mo("staaur"),
+ C = as.mo("S. aureus"),
+ D = as.mo("S. aureus"),
+ E = as.mo("STAAUR"),
+ F = as.mo("Staphylococcus aureus"),
+ G = as.mo("B_STPHY_AUR"),
+ times = 10,
+ unit = "ms")
+# Unit: milliseconds
+# expr min lq mean median uq max neval
+# A 34.745551 34.798630 35.2596102 34.8994810 35.258325 38.067062 10
+# B 7.095386 7.125348 7.2219948 7.1613865 7.240377 7.495857 10
+# C 11.677114 11.733826 11.8304789 11.7715050 11.843756 12.317559 10
+# D 11.694435 11.730054 11.9859313 11.8775585 12.206371 12.750016 10
+# E 7.044402 7.117387 7.2271630 7.1923610 7.246104 7.742396 10
+# F 6.642326 6.778446 6.8988042 6.8753165 6.923577 7.513945 10
+# G 0.106788 0.131023 0.1351229 0.1357725 0.144014 0.146458 10
In the table above, all measurements are shown in milliseconds (thousands of seconds), tested on a quite regular Linux server from 2007 (Core 2 Duo 2.7 GHz, 2 GB DDR2 RAM). A value of 6.9 milliseconds means it will roughly determine 144 input values per second. It case of 39.2 milliseconds, this is only 26 input values per second. The more an input value resembles a full name (like C, D and F), the faster the result will be found. In case of G, the input is already a valid MO code, so it only almost takes no time at all (0.0001 seconds on our server).
To achieve this speed, the as.mo
function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined far less faster. See this example for the ID of Burkholderia nodosa (B_BRKHL_NOD
):
microbenchmark(A = as.mo("buno"),
- B = as.mo("burnod"),
- C = as.mo("B. nodosa"),
- D = as.mo("B. nodosa"),
- E = as.mo("BURNOD"),
- F = as.mo("Burkholderia nodosa"),
- G = as.mo("B_BRKHL_NOD"),
- times = 10,
- unit = "ms")
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# A 124.175427 124.474837 125.8610536 125.3750560 126.160945 131.485994 10
-# B 154.249713 155.364729 160.9077032 156.8738940 157.136183 197.315105 10
-# C 66.066571 66.162393 66.5538611 66.4488130 66.698077 67.623404 10
-# D 86.747693 86.918665 90.7831016 87.8149725 89.440982 116.767991 10
-# E 154.863827 155.208563 162.6535954 158.4062465 168.593785 187.378088 10
-# F 32.427028 32.638648 32.9929454 32.7860475 32.992813 34.674241 10
-# G 0.213155 0.216578 0.2369226 0.2338985 0.253734 0.285581 10
microbenchmark(A = as.mo("buno"),
+ B = as.mo("burnod"),
+ C = as.mo("B. nodosa"),
+ D = as.mo("B. nodosa"),
+ E = as.mo("BURNOD"),
+ F = as.mo("Burkholderia nodosa"),
+ G = as.mo("B_BRKHL_NOD"),
+ times = 10,
+ unit = "ms")
+# Unit: milliseconds
+# expr min lq mean median uq max neval
+# A 124.175427 124.474837 125.8610536 125.3750560 126.160945 131.485994 10
+# B 154.249713 155.364729 160.9077032 156.8738940 157.136183 197.315105 10
+# C 66.066571 66.162393 66.5538611 66.4488130 66.698077 67.623404 10
+# D 86.747693 86.918665 90.7831016 87.8149725 89.440982 116.767991 10
+# E 154.863827 155.208563 162.6535954 158.4062465 168.593785 187.378088 10
+# F 32.427028 32.638648 32.9929454 32.7860475 32.992813 34.674241 10
+# G 0.213155 0.216578 0.2369226 0.2338985 0.253734 0.285581 10
That takes up to 11 times as much time! A value of 158.4 milliseconds means it can only determine ~6 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance.
To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.
Repetitive results mean that unique values are present more than once. Unique values will only be calculated once by as.mo()
. We will use mo_fullname()
for this test - a helper function that returns the full microbial name (genus, species and possibly subspecies) and uses as.mo()
internally.
library(dplyr)
-# take 500,000 random MO codes from the septic_patients data set
-x = septic_patients %>%
- sample_n(500000, replace = TRUE) %>%
- pull(mo)
-
-# got the right length?
-length(x)
-# [1] 500000
-
-# and how many unique values do we have?
-n_distinct(x)
-# [1] 96
-
-# only 96, but distributed in 500,000 results. now let's see:
-microbenchmark(X = mo_fullname(x),
- times = 10,
- unit = "ms")
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# X 114.9342 117.1076 129.6448 120.2047 131.5005 168.6371 10
library(dplyr)
+# take 500,000 random MO codes from the septic_patients data set
+x = septic_patients %>%
+ sample_n(500000, replace = TRUE) %>%
+ pull(mo)
+
+# got the right length?
+length(x)
+# [1] 500000
+
+# and how many unique values do we have?
+n_distinct(x)
+# [1] 96
+
+# only 96, but distributed in 500,000 results. now let's see:
+microbenchmark(X = mo_fullname(x),
+ times = 10,
+ unit = "ms")
+# Unit: milliseconds
+# expr min lq mean median uq max neval
+# X 114.9342 117.1076 129.6448 120.2047 131.5005 168.6371 10
So transforming 500,000 values (!) of 96 unique values only takes 0.12 seconds (120 ms). You only lose time on your unique input values.
Results of a tenfold - 5,000,000 values:
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# X 882.9045 901.3011 1001.677 940.3421 1168.088 1226.846 10
# Unit: milliseconds
+# expr min lq mean median uq max neval
+# X 882.9045 901.3011 1001.677 940.3421 1168.088 1226.846 10
Even the full names of 5 Million values are calculated within a second.
What about precalculated results? If the input is an already precalculated result of a helper function like mo_fullname()
, it almost doesn’t take any time at all (see ‘C’ below):
microbenchmark(A = mo_fullname("B_STPHY_AUR"),
- B = mo_fullname("S. aureus"),
- C = mo_fullname("Staphylococcus aureus"),
- times = 10,
- unit = "ms")
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# A 11.364086 11.460537 11.5104799 11.4795330 11.524860 11.818263 10
-# B 11.976454 12.012352 12.1704592 12.0853020 12.210004 12.881737 10
-# C 0.095823 0.102528 0.1167754 0.1153785 0.132629 0.140661 10
microbenchmark(A = mo_fullname("B_STPHY_AUR"),
+ B = mo_fullname("S. aureus"),
+ C = mo_fullname("Staphylococcus aureus"),
+ times = 10,
+ unit = "ms")
+# Unit: milliseconds
+# expr min lq mean median uq max neval
+# A 11.364086 11.460537 11.5104799 11.4795330 11.524860 11.818263 10
+# B 11.976454 12.012352 12.1704592 12.0853020 12.210004 12.881737 10
+# C 0.095823 0.102528 0.1167754 0.1153785 0.132629 0.140661 10
So going from mo_fullname("Staphylococcus aureus")
to "Staphylococcus aureus"
takes 0.0001 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:
microbenchmark(A = mo_species("aureus"),
- B = mo_genus("Staphylococcus"),
- C = mo_fullname("Staphylococcus aureus"),
- D = mo_family("Staphylococcaceae"),
- E = mo_order("Bacillales"),
- F = mo_class("Bacilli"),
- G = mo_phylum("Firmicutes"),
- H = mo_subkingdom("Posibacteria"),
- I = mo_kingdom("Bacteria"),
- times = 10,
- unit = "ms")
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# A 0.105181 0.121314 0.1478538 0.1465265 0.166711 0.211409 10
-# B 0.132558 0.146388 0.1584278 0.1499835 0.164895 0.208477 10
-# C 0.135492 0.160355 0.2341847 0.1884665 0.348857 0.395931 10
-# D 0.109650 0.115727 0.1270481 0.1264130 0.128648 0.168317 10
-# E 0.081574 0.096940 0.0992582 0.0980915 0.101479 0.120477 10
-# F 0.081575 0.088489 0.0988463 0.0989650 0.103365 0.126482 10
-# G 0.091981 0.095333 0.1043568 0.1001530 0.111327 0.129625 10
-# H 0.092610 0.093169 0.1009135 0.0985455 0.101828 0.120406 10
-# I 0.087371 0.091213 0.1069758 0.0941815 0.109302 0.192831 10
microbenchmark(A = mo_species("aureus"),
+ B = mo_genus("Staphylococcus"),
+ C = mo_fullname("Staphylococcus aureus"),
+ D = mo_family("Staphylococcaceae"),
+ E = mo_order("Bacillales"),
+ F = mo_class("Bacilli"),
+ G = mo_phylum("Firmicutes"),
+ H = mo_subkingdom("Posibacteria"),
+ I = mo_kingdom("Bacteria"),
+ times = 10,
+ unit = "ms")
+# Unit: milliseconds
+# expr min lq mean median uq max neval
+# A 0.105181 0.121314 0.1478538 0.1465265 0.166711 0.211409 10
+# B 0.132558 0.146388 0.1584278 0.1499835 0.164895 0.208477 10
+# C 0.135492 0.160355 0.2341847 0.1884665 0.348857 0.395931 10
+# D 0.109650 0.115727 0.1270481 0.1264130 0.128648 0.168317 10
+# E 0.081574 0.096940 0.0992582 0.0980915 0.101479 0.120477 10
+# F 0.081575 0.088489 0.0988463 0.0989650 0.103365 0.126482 10
+# G 0.091981 0.095333 0.1043568 0.1001530 0.111327 0.129625 10
+# H 0.092610 0.093169 0.1009135 0.0985455 0.101828 0.120406 10
+# I 0.087371 0.091213 0.1069758 0.0941815 0.109302 0.192831 10
Of course, when running mo_phylum("Firmicutes")
the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes"
too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known microorganisms (according to ITIS), it can just return the initial value immediately.
When the system language is non-English and supported by this AMR
package, some functions take a little while longer:
mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
-# "Coagulase Negative Staphylococcus (CoNS)"
-
-mo_fullname("CoNS", language = "fr") # or just mo_fullname("CoNS") on a French system
-# "Staphylococcus à coagulase négative (CoNS)"
-
-microbenchmark(en = mo_fullname("CoNS", language = "en"),
- de = mo_fullname("CoNS", language = "de"),
- nl = mo_fullname("CoNS", language = "nl"),
- es = mo_fullname("CoNS", language = "es"),
- it = mo_fullname("CoNS", language = "it"),
- fr = mo_fullname("CoNS", language = "fr"),
- pt = mo_fullname("CoNS", language = "pt"),
- times = 10,
- unit = "ms")
-# Unit: milliseconds
-# expr min lq mean median uq max neval
-# en 6.093583 6.51724 6.555105 6.562986 6.630663 6.99698 100
-# de 13.934874 14.35137 16.891587 14.462210 14.764658 43.63956 100
-# nl 13.900092 14.34729 15.943268 14.424565 14.581535 43.76283 100
-# es 13.833813 14.34596 14.574783 14.439757 14.653994 17.49168 100
-# it 13.811883 14.36621 15.179060 14.453515 14.812359 43.64284 100
-# fr 13.798683 14.37019 16.344731 14.468775 14.697610 48.62923 100
-# pt 13.789674 14.36244 15.706321 14.443772 14.679905 44.76701 100
mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
+# "Coagulase Negative Staphylococcus (CoNS)"
+
+mo_fullname("CoNS", language = "fr") # or just mo_fullname("CoNS") on a French system
+# "Staphylococcus à coagulase négative (CoNS)"
+
+microbenchmark(en = mo_fullname("CoNS", language = "en"),
+ de = mo_fullname("CoNS", language = "de"),
+ nl = mo_fullname("CoNS", language = "nl"),
+ es = mo_fullname("CoNS", language = "es"),
+ it = mo_fullname("CoNS", language = "it"),
+ fr = mo_fullname("CoNS", language = "fr"),
+ pt = mo_fullname("CoNS", language = "pt"),
+ times = 10,
+ unit = "ms")
+# Unit: milliseconds
+# expr min lq mean median uq max neval
+# en 6.093583 6.51724 6.555105 6.562986 6.630663 6.99698 100
+# de 13.934874 14.35137 16.891587 14.462210 14.764658 43.63956 100
+# nl 13.900092 14.34729 15.943268 14.424565 14.581535 43.76283 100
+# es 13.833813 14.34596 14.574783 14.439757 14.653994 17.49168 100
+# it 13.811883 14.36621 15.179060 14.453515 14.812359 43.64284 100
+# fr 13.798683 14.37019 16.344731 14.468775 14.697610 48.62923 100
+# pt 13.789674 14.36244 15.706321 14.443772 14.679905 44.76701 100
Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.
freq.Rmd
To only show and quickly review the content of one variable, you can just select this variable in various ways. Let’s say we want to get the frequencies of the gender
variable of the septic_patients
dataset:
septic_patients %>% freq(gender)
Frequency table of gender