diff --git a/DESCRIPTION b/DESCRIPTION index c9648b26..9a86ca29 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR Version: 0.5.0.9009 -Date: 2019-01-11 +Date: 2019-01-12 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/R/ab_property.R b/R/ab_property.R old mode 100644 new mode 100755 diff --git a/R/abname.R b/R/abname.R old mode 100644 new mode 100755 diff --git a/R/age.R b/R/age.R old mode 100644 new mode 100755 index 399e7ea0..265b1c9e --- a/R/age.R +++ b/R/age.R @@ -21,11 +21,11 @@ #' Age in years of individuals #' -#' Calculates age in years based on a reference date, which is the sytem time at default. +#' Calculates age in years based on a reference date, which is the sytem date at default. #' @param x date(s), will be coerced with \code{\link{as.POSIXlt}} -#' @param reference reference date(s) (defaults to today), will be coerced with \code{\link{as.POSIXlt}} +#' @param reference reference date(s) (defaults to today), will be coerced with \code{\link{as.POSIXlt}} and cannot be lower than \code{x} #' @return Integer (no decimals) -#' @seealso \code{\link{age_groups}} to splits age into groups +#' @seealso \code{\link{age_groups}} to split age into age groups #' @importFrom dplyr if_else #' @inheritSection AMR Read more on our website! #' @export @@ -45,8 +45,8 @@ age <- function(x, reference = Sys.Date()) { years_gap <- reference$year - x$year # from https://stackoverflow.com/a/25450756/4575331 ages <- if_else(reference$mon < x$mon | (reference$mon == x$mon & reference$mday < x$mday), - as.integer(years_gap - 1), - as.integer(years_gap)) + as.integer(years_gap - 1), + as.integer(years_gap)) if (any(ages > 120)) { warning("Some ages are > 120.") } @@ -60,7 +60,7 @@ age <- function(x, reference = Sys.Date()) { #' @param split_at values to split \code{x} at, defaults to age groups 0-11, 12-24, 26-54, 55-74 and 75+. See Details. #' @details To split ages, the input can be: #' \itemize{ -#' \item{A numeric vector. A vector of \code{c(10, 20)} will split on 0-9, 10-19 and 20+. A value of only \code{50} will split on 0-49 and 50+. +#' \item{A numeric vector. A vector of e.g. \code{c(10, 20)} will split on 0-9, 10-19 and 20+. A value of only \code{50} will split on 0-49 and 50+. #' The default is to split on young children (0-11), youth (12-24), young adults (26-54), middle-aged adults (55-74) and elderly (75+).} #' \item{A character:} #' \itemize{ @@ -139,8 +139,7 @@ age_groups <- function(x, split_at = c(12, 25, 55, 75)) { for (i in 1:length(split_at)) { y[x >= split_at[i]] <- i # create labels - # when age group consists of only one age - labs[i - 1] <- paste0(unique(c(split_at[i - 1], split_at[i] - 1)), collapse = "-") + labs[i - 1] <- paste0(unique(c(split_at[i - 1], split_at[i] - 1)), collapse = "-") } # last category diff --git a/R/count.R b/R/count.R old mode 100644 new mode 100755 diff --git a/R/deprecated.R b/R/deprecated.R old mode 100644 new mode 100755 diff --git a/R/g.test.R b/R/g.test.R old mode 100644 new mode 100755 index 7a6d05c5..3d05d079 --- a/R/g.test.R +++ b/R/g.test.R @@ -44,7 +44,7 @@ #' #' It is also possible to do a \emph{G}-test of independence with more than two nominal variables. For example, Jackson et al. (2013) also had data for children under 3, so you could do an analysis of old vs. young, thigh vs. arm, and reaction vs. no reaction, all analyzed together. #' -#' Fisher's exact test (\code{\link{fisher.test}}) is more accurate than the \emph{G}-test of independence when the expected numbers are small, so it is recommend to only use the \emph{G}-test if your total sample size is greater than 1000. +#' Fisher's exact test (\code{\link{fisher.test}}) is an \strong{exact} test, where the \emph{G}-test is still only an \strong{approximation}. For any 2x2 table, Fisher's Exact test may be slower but will still run in seconds, even if the sum of your observations is multiple millions. #' #' The \emph{G}-test of independence is an alternative to the chi-square test of independence (\code{\link{chisq.test}}), and they will give approximately the same results. #' @section How the test works: @@ -155,6 +155,9 @@ g.test <- function(x, nc <- as.integer(ncol(x)) if (is.na(nr) || is.na(nc) || is.na(nr * nc)) stop("invalid nrow(x) or ncol(x)", domain = NA) + # add fisher.test suggestion + if (nr == 2 && nc == 2) + warning("`fisher.test()` is always more reliable for 2x2 tables and although must slower, often only takes seconds.") sr <- rowSums(x) sc <- colSums(x) E <- outer(sr, sc, "*")/n @@ -221,15 +224,9 @@ g.test <- function(x, } names(STATISTIC) <- "X-squared" names(PARAMETER) <- "df" - # if (any(E < 5) && is.finite(PARAMETER)) - # warning("G-statistic approximation may be incorrect") + if (any(E < 5) && is.finite(PARAMETER)) + warning("G-statistic approximation may be incorrect due to E < 5") - # suggest fisher.test when total is < 1000 (John McDonald, Handbook of Biological Statistics, 2014) - if (sum(x, na.rm = TRUE) < 1000 && is.finite(PARAMETER)) { - warning("G-statistic approximation may be incorrect, consider Fisher's Exact test") - } else if (any(E < 5) && is.finite(PARAMETER)) { - warning("G-statistic approximation may be incorrect, consider Fisher's Exact test") - } structure(list(statistic = STATISTIC, parameter = PARAMETER, p.value = PVAL, method = METHOD, data.name = DNAME, observed = x, expected = E, residuals = (x - E)/sqrt(E), diff --git a/R/get_locale.R b/R/get_locale.R old mode 100644 new mode 100755 diff --git a/R/ggplot_rsi.R b/R/ggplot_rsi.R old mode 100644 new mode 100755 diff --git a/R/guess_ab_col.R b/R/guess_ab_col.R old mode 100644 new mode 100755 diff --git a/R/itis.R b/R/itis.R old mode 100644 new mode 100755 index e4bc2477..f1e74752 --- a/R/itis.R +++ b/R/itis.R @@ -26,7 +26,7 @@ #' \if{html}{\figure{itis_logo.jpg}{options: height=60px style=margin-bottom:5px} \cr} #' This package contains the \strong{complete microbial taxonomic data} (with all nine taxonomic ranks - from kingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, \url{https://www.itis.gov}). #' -#' All (sub)species from \strong{the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since all bacteria are classified into subkingdom Negibacteria or Posibacteria. +#' All ~20,000 (sub)species from \strong{the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package}, as well as all ~2,500 previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since all bacteria are classified into subkingdom Negibacteria or Posibacteria. #' #' ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3]. #' @inheritSection AMR Read more on our website! diff --git a/R/key_antibiotics.R b/R/key_antibiotics.R old mode 100644 new mode 100755 diff --git a/R/kurtosis.R b/R/kurtosis.R old mode 100644 new mode 100755 diff --git a/R/like.R b/R/like.R old mode 100644 new mode 100755 diff --git a/R/mo.R b/R/mo.R old mode 100644 new mode 100755 diff --git a/R/mo_property.R b/R/mo_property.R old mode 100644 new mode 100755 diff --git a/R/p.symbol.R b/R/p.symbol.R old mode 100644 new mode 100755 diff --git a/R/read.4d.R b/R/read.4d.R old mode 100644 new mode 100755 diff --git a/R/resistance_predict.R b/R/resistance_predict.R old mode 100644 new mode 100755 diff --git a/R/rsi.R b/R/rsi.R old mode 100644 new mode 100755 diff --git a/R/rsi_calc.R b/R/rsi_calc.R old mode 100644 new mode 100755 diff --git a/R/skewness.R b/R/skewness.R old mode 100644 new mode 100755 diff --git a/docs/articles/AMR.html b/docs/articles/AMR.html index d586dc73..79ab6ab5 100644 --- a/docs/articles/AMR.html +++ b/docs/articles/AMR.html @@ -178,7 +178,7 @@

How to conduct AMR analysis

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

@@ -187,7 +187,7 @@ -

Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 11 January 2019.

+

Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 12 January 2019.

Introduction

@@ -203,21 +203,21 @@ -2019-01-11 +2019-01-12 abcd Escherichia coli S S -2019-01-11 +2019-01-12 abcd Escherichia coli S R -2019-01-11 +2019-01-12 efgh Escherichia coli R @@ -231,9 +231,9 @@ Needed R packages

As with many uses in R, we need some additional packages for AMR analysis. The most important one is dplyr, which tremendously improves the way we work with data - it allows for a very natural way of writing syntaxes in R. Another important dependency is ggplot2. This package can be used to create beautiful plots in R.

Our AMR package depends on these packages and even extends their use and functions.

-
library(dplyr)   # the data science package
-library(AMR)     # this package, to simplify and automate AMR analysis
-library(ggplot2) # for appealing plots
+
library(dplyr)   # the data science package
+library(AMR)     # this package, to simplify and automate AMR analysis
+library(ggplot2) # for appealing plots

@@ -244,51 +244,51 @@

Patients

To start with patients, we need a unique list of patients.

-
patients <- unlist(lapply(LETTERS, paste0, 1:10))
+
patients <- unlist(lapply(LETTERS, paste0, 1:10))

The LETTERS object is available in R - it’s a vector with 26 characters: A to Z. The patients object we just created is now a vector of length 260, with values (patient IDs) varying from A1 to Z10. Now we we also set the gender of our patients, by putting the ID and the gender in a table:

-
patients_table <- data.frame(patient_id = patients,
-                             gender = c(rep("M", 135),
-                                        rep("F", 125)))
+
patients_table <- data.frame(patient_id = patients,
+                             gender = c(rep("M", 135),
+                                        rep("F", 125)))

The first 135 patient IDs are now male, the other 125 are female.

Dates

Let’s pretend that our data consists of blood cultures isolates from 1 January 2010 until 1 January 2018.

-
dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day")
+
dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day")

This dates object now contains all days in our date range.

Microorganisms

For this tutorial, we will uses four different microorganisms: Escherichia coli, Staphylococcus aureus, Streptococcus pneumoniae, and Klebsiella pneumoniae:

-
bacteria <- c("Escherichia coli", "Staphylococcus aureus",
-              "Streptococcus pneumoniae", "Klebsiella pneumoniae")
+
bacteria <- c("Escherichia coli", "Staphylococcus aureus",
+              "Streptococcus pneumoniae", "Klebsiella pneumoniae")

Other variables

For completeness, we can also add the hospital where the patients was admitted and we need to define valid antibmicrobial results for our randomisation:

-
hospitals <- c("Hospital A", "Hospital B", "Hospital C", "Hospital D")
-ab_interpretations <- c("S", "I", "R")
+
hospitals <- c("Hospital A", "Hospital B", "Hospital C", "Hospital D")
+ab_interpretations <- c("S", "I", "R")

Put everything together

-

Using the sample() function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob parameter.

-
data <- data.frame(date = sample(dates, 5000, replace = TRUE),
-                   patient_id = sample(patients, 5000, replace = TRUE),
-                   hospital = sample(hospitals, 5000, replace = TRUE, prob = c(0.30, 0.35, 0.15, 0.20)),
-                   bacteria = sample(bacteria, 5000, replace = TRUE, prob = c(0.50, 0.25, 0.15, 0.10)),
-                   amox = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.60, 0.05, 0.35)),
-                   amcl = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.75, 0.10, 0.15)),
-                   cipr = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.80, 0.00, 0.20)),
-                   gent = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.92, 0.00, 0.08))
-                   )
-

Using the left_join() function from the dplyr package, we can ‘map’ the gender to the patient ID using the patients_table object we created earlier:

-
data <- data %>% left_join(patients_table)
+

Using the sample() function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob parameter.

+
data <- data.frame(date = sample(dates, 5000, replace = TRUE),
+                   patient_id = sample(patients, 5000, replace = TRUE),
+                   hospital = sample(hospitals, 5000, replace = TRUE, prob = c(0.30, 0.35, 0.15, 0.20)),
+                   bacteria = sample(bacteria, 5000, replace = TRUE, prob = c(0.50, 0.25, 0.15, 0.10)),
+                   amox = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.60, 0.05, 0.35)),
+                   amcl = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.75, 0.10, 0.15)),
+                   cipr = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.80, 0.00, 0.20)),
+                   gent = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.92, 0.00, 0.08))
+                   )
+

Using the left_join() function from the dplyr package, we can ‘map’ the gender to the patient ID using the patients_table object we created earlier:

+
data <- data %>% left_join(patients_table)

The resulting data set contains 5,000 blood culture isolates. With the head() function we can preview the first 6 values of this data set:

-
head(data)
+
head(data)
@@ -303,70 +303,70 @@ - - - + + + - + - - - + + + - - + + - - - + + + - + - - + + - - - + + + - - - - + + + + + - - + - - - - + + + + + + - - - +
date
2014-02-02P8Hospital D2013-07-18Z9Hospital C Escherichia coli SRS S S F
2013-10-26Q1Hospital B2010-02-24P1Hospital D Escherichia coliIR R SSS F
2017-06-12E5Hospital D2014-04-02A10Hospital B Streptococcus pneumoniae SRS S S M
2013-06-16K72016-05-30A7 Hospital B Escherichia coliSISRRR S M
2013-01-11M4Hospital BStaphylococcus aureus2017-11-22O6Hospital DKlebsiella pneumoniaeR I S SSMF
2016-11-18W10Hospital AStaphylococcus aureus2017-03-08L4Hospital BEscherichia coliRR S SSSFM
@@ -377,7 +377,7 @@

Cleaning the data

Use the frequency table function freq() to look specifically for unique values in any variable. For example, for the gender variable:

-
data %>% freq(gender) # this would be the same: freq(data$gender)
+
data %>% freq(gender) # this would be the same: freq(data$gender)
# Frequency table of `gender` 
 # Class:   factor (numeric)  
 # Levels:  F, M  
@@ -386,67 +386,67 @@
 # 
 #      Item    Count   Percent   Cum. Count   Cum. Percent
 # ---  -----  ------  --------  -----------  -------------
-# 1    M       2,598     52.0%        2,598          52.0%
-# 2    F       2,402     48.0%        5,000         100.0%
+# 1 M 2,632 52.6% 2,632 52.6% +# 2 F 2,368 47.4% 5,000 100.0%

So, we can draw at least two conclusions immediately. From a data scientist perspective, the data looks clean: only values M and F. From a researcher perspective: there are slightly more men. Nothing we didn’t already know.

-

The data is already quite clean, but we still need to transform some variables. The bacteria column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate() function of the dplyr package makes this really easy:

-
data <- data %>%
-  mutate(bacteria = as.mo(bacteria))
-

We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi() function ensures reliability and reproducibility in these kind of variables. The mutate_at() will run the as.rsi() function on defined variables:

-
data <- data %>%
-  mutate_at(vars(amox:gent), as.rsi)
+

The data is already quite clean, but we still need to transform some variables. The bacteria column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate() function of the dplyr package makes this really easy:

+
data <- data %>%
+  mutate(bacteria = as.mo(bacteria))
+

We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi() function ensures reliability and reproducibility in these kind of variables. The mutate_at() will run the as.rsi() function on defined variables:

+
data <- data %>%
+  mutate_at(vars(amox:gent), as.rsi)

Finally, we will apply EUCAST rules on our antimicrobial results. In Europe, most medical microbiological laboratories already apply these rules. Our package features their latest insights on intrinsic resistance and exceptional phenotypes. Moreover, the eucast_rules() function can also apply additional rules, like forcing ampicillin = R when amoxicillin/clavulanic acid = R.

Because the amoxicillin (column amox) and amoxicillin/clavulanic acid (column amcl) in our data were generated randomly, some rows will undoubtedly contain amox = S and amcl = R, which is technically impossible. The eucast_rules() fixes this:

-
data <- eucast_rules(data, col_mo = "bacteria")
-# 
-# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
-# 
-# EUCAST Clinical Breakpoints (v8.1, 2018)
-# Enterobacteriales (Order) (no changes)
-# Staphylococcus (no changes)
-# Enterococcus (no changes)
-# Streptococcus groups A, B, C, G (no changes)
-# Streptococcus pneumoniae (no changes)
-# Viridans group streptococci (no changes)
-# Haemophilus influenzae (no changes)
-# Moraxella catarrhalis (no changes)
-# Anaerobic Gram positives (no changes)
-# Anaerobic Gram negatives (no changes)
-# Pasteurella multocida (no changes)
-# Campylobacter jejuni and C. coli (no changes)
-# Aerococcus sanguinicola and A. urinae (no changes)
-# Kingella kingae (no changes)
-# 
-# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
-# Table 1:  Intrinsic resistance in Enterobacteriaceae (333 changes)
-# Table 2:  Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
-# Table 3:  Intrinsic resistance in other Gram-negative bacteria (no changes)
-# Table 4:  Intrinsic resistance in Gram-positive bacteria (692 changes)
-# Table 8:  Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
-# Table 9:  Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
-# Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
-# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no changes)
-# Table 12: Interpretive rules for aminoglycosides (no changes)
-# Table 13: Interpretive rules for quinolones (no changes)
-# 
-# Other rules
-# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (no changes)
-# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no changes)
-# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no changes)
-# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (no changes)
-# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
-# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
-# 
-# => EUCAST rules affected 1,830 out of 5,000 rows -> changed 1,025 test results.
+
data <- eucast_rules(data, col_mo = "bacteria")
+# 
+# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
+# 
+# EUCAST Clinical Breakpoints (v8.1, 2018)
+# Enterobacteriales (Order) (no changes)
+# Staphylococcus (no changes)
+# Enterococcus (no changes)
+# Streptococcus groups A, B, C, G (no changes)
+# Streptococcus pneumoniae (no changes)
+# Viridans group streptococci (no changes)
+# Haemophilus influenzae (no changes)
+# Moraxella catarrhalis (no changes)
+# Anaerobic Gram positives (no changes)
+# Anaerobic Gram negatives (no changes)
+# Pasteurella multocida (no changes)
+# Campylobacter jejuni and C. coli (no changes)
+# Aerococcus sanguinicola and A. urinae (no changes)
+# Kingella kingae (no changes)
+# 
+# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
+# Table 1:  Intrinsic resistance in Enterobacteriaceae (307 changes)
+# Table 2:  Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
+# Table 3:  Intrinsic resistance in other Gram-negative bacteria (no changes)
+# Table 4:  Intrinsic resistance in Gram-positive bacteria (708 changes)
+# Table 8:  Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
+# Table 9:  Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
+# Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
+# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no changes)
+# Table 12: Interpretive rules for aminoglycosides (no changes)
+# Table 13: Interpretive rules for quinolones (no changes)
+# 
+# Other rules
+# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (no changes)
+# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no changes)
+# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no changes)
+# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (no changes)
+# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
+# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
+# 
+# => EUCAST rules affected 1,871 out of 5,000 rows -> changed 1,015 test results.

Adding new variables

Now that we have the microbial ID, we can add some taxonomic properties:

-
data <- data %>% 
-  mutate(gramstain = mo_gramstain(bacteria),
-         genus = mo_genus(bacteria),
-         species = mo_species(bacteria))
+
data <- data %>% 
+  mutate(gramstain = mo_gramstain(bacteria),
+         genus = mo_genus(bacteria),
+         species = mo_species(bacteria))

First isolates

@@ -457,18 +457,18 @@

(…) When preparing a cumulative antibiogram to guide clinical decisions about empirical antimicrobial therapy of initial infections, only the first isolate of a given species per patient, per analysis period (eg, one year) should be included, irrespective of body site, antimicrobial susceptibility profile, or other phenotypical characteristics (eg, biotype). The first isolate is easily identified, and cumulative antimicrobial susceptibility test data prepared using the first isolate are generally comparable to cumulative antimicrobial susceptibility test data calculated by other methods, providing duplicate isolates are excluded.
M39-A4 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition. CLSI, 2014. Chapter 6.4

This AMR package includes this methodology with the first_isolate() function. It adopts the episode of a year (can be changed by user) and it starts counting days after every selected isolate. This new variable can easily be added to our data:

- -

So only 59.2% is suitable for resistance analysis! We can now filter on is with the filter() function, also from the dplyr package:

- +
data <- data %>% 
+  mutate(first = first_isolate(.))
+# NOTE: Using column `bacteria` as input for `col_mo`.
+# NOTE: Using column `date` as input for `col_date`.
+# NOTE: Using column `patient_id` as input for `col_patient_id`.
+# => Found 2,951 first isolates (59.0% of total)
+

So only 59% is suitable for resistance analysis! We can now filter on is with the filter() function, also from the dplyr package:

+
data_1st <- data %>% 
+  filter(first == TRUE)

For future use, the above two syntaxes can be shortened with the filter_first_isolate() function:

- +
data_1st <- data %>% 
+  filter_first_isolate()

@@ -489,19 +489,19 @@ 1 -2010-05-23 -E7 +2010-03-01 +Y4 B_ESCHR_COL -R S -R +S +S S TRUE 2 -2010-08-03 -E7 +2010-11-02 +Y4 B_ESCHR_COL S S @@ -511,30 +511,41 @@ 3 -2011-01-20 -E7 +2011-06-12 +Y4 B_ESCHR_COL R +S R S -S -FALSE +TRUE 4 -2011-02-21 -E7 +2011-09-03 +Y4 B_ESCHR_COL S -R -R +S +S S FALSE 5 -2011-08-04 -E7 +2011-09-07 +Y4 +B_ESCHR_COL +S +S +R +S +FALSE + + +6 +2012-06-11 +Y4 B_ESCHR_COL S S @@ -542,56 +553,45 @@ S TRUE - -6 -2011-11-15 -E7 -B_ESCHR_COL -R -S -S -S -FALSE - 7 -2012-01-13 -E7 +2012-08-16 +Y4 B_ESCHR_COL S S -R -R +S +S FALSE 8 -2012-03-10 -E7 +2012-11-24 +Y4 B_ESCHR_COL S +I S -S -S +R FALSE 9 -2012-11-09 -E7 +2012-12-12 +Y4 B_ESCHR_COL S S +S R -S -TRUE +FALSE 10 -2013-04-06 -E7 +2013-02-02 +Y4 B_ESCHR_COL -S +R S S S @@ -601,21 +601,18 @@

Only 3 isolates are marked as ‘first’ according to CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and show be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics() function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.

If a column exists with a name like ‘key(…)ab’ the first_isolate() function will automatically use it and determine the first weighted isolates. Mind the NOTEs in below output:

- +
data <- data %>% 
+  mutate(keyab = key_antibiotics(.)) %>% 
+  mutate(first_weighted = first_isolate(.))
+# NOTE: Using column `bacteria` as input for `col_mo`.
+#   amox   amcl   cipr   gent 
+# "amox" "amcl" "cipr" "gent"
+# NOTE: Using column `bacteria` as input for `col_mo`.
+# NOTE: Using column `date` as input for `col_date`.
+# NOTE: Using column `patient_id` as input for `col_patient_id`.
+# NOTE: Using column `keyab` as input for `col_keyantibiotics`. Use col_keyantibiotics = FALSE to prevent this.
+# [Criterion] Inclusion based on key antibiotics, ignoring I.
+# => Found 4,431 first weighted isolates (88.6% of total)
@@ -632,118 +629,118 @@ - - + + - - + + - - + + - + - - + + + - - + - - + + - - + + - - + + + - - + - - + + - - + + - - + + - - + + + - - - + + + - - + - - + + + - - - + + - - + + - + @@ -752,16 +749,16 @@
isolate
12010-05-23E72010-03-01Y4 B_ESCHR_COLR SRSS S TRUE TRUE
22010-08-03E72010-11-02Y4 B_ESCHR_COL S S S S FALSETRUEFALSE
32011-01-20E72011-06-12Y4 B_ESCHR_COL RS R SSFALSETRUE TRUE
42011-02-21E72011-09-03Y4 B_ESCHR_COL SRRSS S FALSE TRUE
52011-08-04E72011-09-07Y4 B_ESCHR_COL S SR SSTRUEFALSE TRUE
62011-11-15E72012-06-11Y4 B_ESCHR_COLR S S SFALSESTRUE TRUE
72012-01-13E72012-08-16Y4 B_ESCHR_COL S SRRSSFALSE FALSETRUE
82012-03-10E72012-11-24Y4 B_ESCHR_COL SI SSSR FALSE TRUE
92012-11-09E72012-12-12Y4 B_ESCHR_COL S SS RSTRUETRUEFALSEFALSE
102013-04-06E72013-02-02Y4 B_ESCHR_COLSR S S S
-

Instead of 3, now 10 isolates are flagged. In total, 88% of all isolates are marked ‘first weighted’ - 147.2% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

+

Instead of 3, now 7 isolates are flagged. In total, 88.6% of all isolates are marked ‘first weighted’ - 147.6% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

As with filter_first_isolate(), there’s a shortcut for this new algorithm too:

- -

So we end up with 4,399 isolates for analysis.

+
data_1st <- data %>% 
+  filter_first_weighted_isolate()
+

So we end up with 4,431 isolates for analysis.

We can remove unneeded columns:

- +
data_1st <- data_1st %>% 
+  select(-c(first, keyab))

Now our data looks like:

-
head(data_1st)
+
head(data_1st)
@@ -782,12 +779,12 @@ - - - + + + - + @@ -798,14 +795,14 @@ - - - + + + - - + + @@ -814,12 +811,12 @@ - - - + + + - + @@ -830,13 +827,13 @@ - - + + - - - + + + @@ -845,30 +842,30 @@ - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + @@ -886,12 +883,12 @@ Analysing the data

You might want to start by getting an idea of how the data is distributed. It’s an important start, because it also decides how you will continue your analysis. ## Dispersion of species To just get an idea how the species are distributed, create a frequency table with our freq() function. We created the genus and species column earlier based on the microbial ID. With paste(), we can concatenate them together.

The freq() function can be used like the base R language was intended:

-
freq(paste(data_1st$genus, data_1st$species))
+
freq(paste(data_1st$genus, data_1st$species))

Or can be used like the dplyr way, which is easier readable:

-
data_1st %>% freq(genus, species)
+
data_1st %>% freq(genus, species)

Frequency table of genus and species
Columns: 2
-Length: 4,399 (of which NA: 0 = 0.00%)
+Length: 4,431 (of which NA: 0 = 0.00%)
Unique: 4

Shortest: 16
Longest: 24

@@ -908,33 +905,33 @@ Longest: 24

- - - - + + + + - - - - + + + + - - - - + + + + - - - + + + @@ -943,12 +940,12 @@ Longest: 24

Resistance percentages

The functions portion_R, portion_RI, portion_I, portion_IS and portion_S can be used to determine the portion of a specific antimicrobial outcome. They can be used on their own:

- -

Or can be used in conjuction with group_by() and summarise(), both from the dplyr package:

-
data_1st %>% 
-  group_by(hospital) %>% 
-  summarise(amoxicillin = portion_IR(amox))
+
data_1st %>% portion_IR(amox)
+# [1] 0.460167
+

Or can be used in conjuction with group_by() and summarise(), both from the dplyr package:

+
data_1st %>% 
+  group_by(hospital) %>% 
+  summarise(amoxicillin = portion_IR(amox))
12014-02-02P8Hospital D2013-07-18Z9Hospital C B_ESCHR_COL SRS S S F
22013-10-26Q1Hospital B2010-02-24P1Hospital D B_ESCHR_COLIR R SSS F Gram negative Escherichia
32017-06-12E5Hospital D2014-04-02A10Hospital B B_STRPTC_PNE SRS S R M
42013-06-16K72016-05-30A7 Hospital B B_ESCHR_COLSISRRR S M Gram negativeTRUE
52013-01-11M462017-03-08L4 Hospital BB_STPHY_AURISB_ESCHR_COLRR S S MGram positiveStaphylococcusaureusGram negativeEscherichiacoli TRUE
72012-01-05S10Hospital D2014-06-24O7Hospital B B_STPHY_AURRIRSSS S F Gram positive
1 Escherichia coli2,13848.6%2,13848.6%2,14348.4%2,14348.4%
2 Staphylococcus aureus1,07024.3%3,20872.9%1,13025.5%3,27373.9%
3 Streptococcus pneumoniae69715.8%3,90588.8%70916.0%3,98289.9%
4 Klebsiella pneumoniae49411.2%4,39944910.1%4,431 100.0%
@@ -957,27 +954,27 @@ Longest: 24

- + - + - + - +
hospital
Hospital A0.45913820.4626060
Hospital B0.50000000.4652956
Hospital C0.46821710.4336283
Hospital D0.49533800.4677778
-

Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi() can be used, which works exactly like n_distinct() from the dplyr package. It counts all isolates available for every group (i.e. values S, I or R):

-
data_1st %>% 
-  group_by(hospital) %>% 
-  summarise(amoxicillin = portion_IR(amox),
-            available = n_rsi(amox))
+

Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi() can be used, which works exactly like n_distinct() from the dplyr package. It counts all isolates available for every group (i.e. values S, I or R):

+
data_1st %>% 
+  group_by(hospital) %>% 
+  summarise(amoxicillin = portion_IR(amox),
+            available = n_rsi(amox))
@@ -987,32 +984,32 @@ Longest: 24

- - + + - - + + - - + + - - + +
hospital
Hospital A0.459138213460.46260601297
Hospital B0.500000015500.46529561556
Hospital C0.46821716450.4336283678
Hospital D0.49533808580.4677778900

These functions can also be used to get the portion of multiple antibiotics, to calculate co-resistance very easily:

-
data_1st %>% 
-  group_by(genus) %>% 
-  summarise(amoxicillin = portion_S(amcl),
-            gentamicin = portion_S(gent),
-            "amox + gent" = portion_S(amcl, gent))
+
data_1st %>% 
+  group_by(genus) %>% 
+  summarise(amoxicillin = portion_S(amcl),
+            gentamicin = portion_S(gent),
+            "amox + gent" = portion_S(amcl, gent))
@@ -1023,94 +1020,94 @@ Longest: 24

- - - + + + - - - + + + - - - + + + - + - +
genus
Escherichia0.71936390.91113190.97427500.73121790.90947270.9752683
Klebsiella0.72267210.90283400.97773280.77505570.90645880.9866370
Staphylococcus0.73925230.90841120.98317760.74690270.92477880.9743363
Streptococcus0.75322810.7602257 0.00000000.75322810.7602257

To make a transition to the next part, let’s see how this difference could be plotted:

-
data_1st %>% 
-  group_by(genus) %>% 
-  summarise("1. Amoxicillin" = portion_S(amcl),
-            "2. Gentamicin" = portion_S(gent),
-            "3. Amox + gent" = portion_S(amcl, gent)) %>% 
-  tidyr::gather("Antibiotic", "S", -genus) %>%
-  ggplot(aes(x = genus,
-             y = S,
-             fill = Antibiotic)) +
-  geom_col(position = "dodge2")
+
data_1st %>% 
+  group_by(genus) %>% 
+  summarise("1. Amoxicillin" = portion_S(amcl),
+            "2. Gentamicin" = portion_S(gent),
+            "3. Amox + gent" = portion_S(amcl, gent)) %>% 
+  tidyr::gather("Antibiotic", "S", -genus) %>%
+  ggplot(aes(x = genus,
+             y = S,
+             fill = Antibiotic)) +
+  geom_col(position = "dodge2")

Plots

To show results in plots, most R users would nowadays use the ggplot2 package. This package lets you create plots in layers. You can read more about it on their website. A quick example would look like these syntaxes:

-
ggplot(data = a_data_set,
-       mapping = aes(x = year,
-                     y = value)) +
-  geom_col() +
-  labs(title = "A title",
-       subtitle = "A subtitle",
-       x = "My X axis",
-       y = "My Y axis")
-
-ggplot(a_data_set,
-       aes(year, value) +
-  geom_bar()
+
ggplot(data = a_data_set,
+       mapping = aes(x = year,
+                     y = value)) +
+  geom_col() +
+  labs(title = "A title",
+       subtitle = "A subtitle",
+       x = "My X axis",
+       y = "My Y axis")
+
+ggplot(a_data_set,
+       aes(year, value) +
+  geom_bar()

The AMR package contains functions to extend this ggplot2 package, for example geom_rsi(). It automatically transforms data with count_df() or portion_df() and show results in stacked bars. Its simplest and shortest example:

-
ggplot(data_1st) +
-  geom_rsi(translate_ab = FALSE)
+
ggplot(data_1st) +
+  geom_rsi(translate_ab = FALSE)

Omit the translate_ab = FALSE to have the antibiotic codes (amox, amcl, cipr, gent) translated to official WHO names (amoxicillin, amoxicillin and betalactamase inhibitor, ciprofloxacin, gentamicin).

If we group on e.g. the genus column and add some additional functions from our package, we can create this:

- +
# group the data on `genus`
+ggplot(data_1st %>% group_by(genus)) + 
+  # create bars with genus on x axis
+  # it looks for variables with class `rsi`,
+  # of which we have 4 (earlier created with `as.rsi`)
+  geom_rsi(x = "genus") + 
+  # split plots on antibiotic
+  facet_rsi(facet = "Antibiotic") +
+  # make R red, I yellow and S green
+  scale_rsi_colours() +
+  # show percentages on y axis
+  scale_y_percent(breaks = 0:4 * 25) +
+  # turn 90 degrees, make it bars instead of columns
+  coord_flip() +
+  # add labels
+  labs(title = "Resistance per genus and antibiotic", 
+       subtitle = "(this is fake data)") +
+  # and print genus in italic to follow our convention
+  # (is now y axis because we turned the plot)
+  theme(axis.text.y = element_text(face = "italic"))

To simplify this, we also created the ggplot_rsi() function, which combines almost all above functions:

- +
data_1st %>% 
+  group_by(genus) %>%
+  ggplot_rsi(x = "genus",
+             facet = "Antibiotic",
+             breaks = 0:4 * 25,
+             datalabels = FALSE) +
+  coord_flip()

@@ -1138,26 +1135,26 @@ Longest: 24

We can transform the data and apply the test in only a couple of lines:

-
septic_patients %>%
-  filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
-  select(hospital_id, fosf) %>%            # select the hospitals and fosfomycin
-  group_by(hospital_id) %>%                # group on the hospitals
-  count_df(combine_IR = TRUE) %>%          # count all isolates per group (hospital_id)
-  tidyr::spread(hospital_id, Value) %>%    # transform output so A and D are columns
-  select(A, D) %>%                         # and select these only
-  as.matrix() %>%                          # transform to good old matrix for fisher.test()
-  fisher.test()                            # do Fisher's Exact Test
-# 
-#   Fisher's Exact Test for Count Data
-# 
-# data:  .
-# p-value = 0.03104
-# alternative hypothesis: true odds ratio is not equal to 1
-# 95 percent confidence interval:
-#  1.054283 4.735995
-# sample estimates:
-# odds ratio 
-#   2.228006
+
septic_patients %>%
+  filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
+  select(hospital_id, fosf) %>%            # select the hospitals and fosfomycin
+  group_by(hospital_id) %>%                # group on the hospitals
+  count_df(combine_IR = TRUE) %>%          # count all isolates per group (hospital_id)
+  tidyr::spread(hospital_id, Value) %>%    # transform output so A and D are columns
+  select(A, D) %>%                         # and select these only
+  as.matrix() %>%                          # transform to good old matrix for fisher.test()
+  fisher.test()                            # do Fisher's Exact Test
+# 
+#   Fisher's Exact Test for Count Data
+# 
+# data:  .
+# p-value = 0.03104
+# alternative hypothesis: true odds ratio is not equal to 1
+# 95 percent confidence interval:
+#  1.054283 4.735995
+# sample estimates:
+# odds ratio 
+#   2.228006

As can be seen, the p value is 0.03, which means that the fosfomycin resistances found in hospital A and D are really different.

diff --git a/docs/articles/AMR_files/figure-html/plot 1-1.png b/docs/articles/AMR_files/figure-html/plot 1-1.png index c9ff2753..b7934e12 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 1-1.png and b/docs/articles/AMR_files/figure-html/plot 1-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 3-1.png b/docs/articles/AMR_files/figure-html/plot 3-1.png index 6bc000b5..c742e8ea 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 3-1.png and b/docs/articles/AMR_files/figure-html/plot 3-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 4-1.png b/docs/articles/AMR_files/figure-html/plot 4-1.png index 53cfc234..51fcf483 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 4-1.png and b/docs/articles/AMR_files/figure-html/plot 4-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 5-1.png b/docs/articles/AMR_files/figure-html/plot 5-1.png index 5cc23525..311e8370 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 5-1.png and b/docs/articles/AMR_files/figure-html/plot 5-1.png differ diff --git a/docs/articles/EUCAST.html b/docs/articles/EUCAST.html index 48d7603c..b736e229 100644 --- a/docs/articles/EUCAST.html +++ b/docs/articles/EUCAST.html @@ -178,7 +178,7 @@

How to apply EUCAST rules

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

diff --git a/docs/articles/G_test.html b/docs/articles/G_test.html index fb6b4173..c7c88ea3 100644 --- a/docs/articles/G_test.html +++ b/docs/articles/G_test.html @@ -178,7 +178,7 @@

How to use the G-test

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

diff --git a/docs/articles/Predict.html b/docs/articles/Predict.html index 7cbb6fbd..fd98e68b 100644 --- a/docs/articles/Predict.html +++ b/docs/articles/Predict.html @@ -178,7 +178,7 @@

How to predict antimicrobial resistance

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index a3249e69..8e87d706 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -178,7 +178,7 @@

Benchmarks

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

@@ -189,148 +189,148 @@

One of the most important features of this package is the complete microbial taxonomic database, supplied by ITIS (https://www.itis.gov). We created a function as.mo() that transforms any user input value to a valid microbial ID by using AI (Artificial Intelligence) and based on the taxonomic tree of ITIS.

Using the microbenchmark package, we can review the calculation performance of this function.

-
library(microbenchmark)
+
library(microbenchmark)

In the next test, we try to ‘coerce’ different input values for Staphylococcus aureus. The actual result is the same every time: it returns its MO code B_STPHY_AUR (B stands for Bacteria, the taxonomic kingdom).

But the calculation time differs a lot. Here, the AI effect can be reviewed best:

-
microbenchmark(A = as.mo("stau"),
-               B = as.mo("staaur"),
-               C = as.mo("S. aureus"),
-               D = as.mo("S.  aureus"),
-               E = as.mo("STAAUR"),
-               F = as.mo("Staphylococcus aureus"),
-               G = as.mo("B_STPHY_AUR"),
-               times = 10,
-               unit = "ms")
-# Unit: milliseconds
-#  expr       min        lq       mean     median        uq       max neval
-#     A 34.745551 34.798630 35.2596102 34.8994810 35.258325 38.067062    10
-#     B  7.095386  7.125348  7.2219948  7.1613865  7.240377  7.495857    10
-#     C 11.677114 11.733826 11.8304789 11.7715050 11.843756 12.317559    10
-#     D 11.694435 11.730054 11.9859313 11.8775585 12.206371 12.750016    10
-#     E  7.044402  7.117387  7.2271630  7.1923610  7.246104  7.742396    10
-#     F  6.642326  6.778446  6.8988042  6.8753165  6.923577  7.513945    10
-#     G  0.106788  0.131023  0.1351229  0.1357725  0.144014  0.146458    10
+
microbenchmark(A = as.mo("stau"),
+               B = as.mo("staaur"),
+               C = as.mo("S. aureus"),
+               D = as.mo("S.  aureus"),
+               E = as.mo("STAAUR"),
+               F = as.mo("Staphylococcus aureus"),
+               G = as.mo("B_STPHY_AUR"),
+               times = 10,
+               unit = "ms")
+# Unit: milliseconds
+#  expr       min        lq       mean     median        uq       max neval
+#     A 34.745551 34.798630 35.2596102 34.8994810 35.258325 38.067062    10
+#     B  7.095386  7.125348  7.2219948  7.1613865  7.240377  7.495857    10
+#     C 11.677114 11.733826 11.8304789 11.7715050 11.843756 12.317559    10
+#     D 11.694435 11.730054 11.9859313 11.8775585 12.206371 12.750016    10
+#     E  7.044402  7.117387  7.2271630  7.1923610  7.246104  7.742396    10
+#     F  6.642326  6.778446  6.8988042  6.8753165  6.923577  7.513945    10
+#     G  0.106788  0.131023  0.1351229  0.1357725  0.144014  0.146458    10

In the table above, all measurements are shown in milliseconds (thousands of seconds), tested on a quite regular Linux server from 2007 (Core 2 Duo 2.7 GHz, 2 GB DDR2 RAM). A value of 6.9 milliseconds means it will roughly determine 144 input values per second. It case of 39.2 milliseconds, this is only 26 input values per second. The more an input value resembles a full name (like C, D and F), the faster the result will be found. In case of G, the input is already a valid MO code, so it only almost takes no time at all (0.0001 seconds on our server).

To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined far less faster. See this example for the ID of Burkholderia nodosa (B_BRKHL_NOD):

-
microbenchmark(A = as.mo("buno"),
-               B = as.mo("burnod"),
-               C = as.mo("B. nodosa"),
-               D = as.mo("B.  nodosa"),
-               E = as.mo("BURNOD"),
-               F = as.mo("Burkholderia nodosa"),
-               G = as.mo("B_BRKHL_NOD"),
-               times = 10,
-               unit = "ms")
-# Unit: milliseconds
-#  expr        min         lq        mean      median         uq        max neval
-#     A 124.175427 124.474837 125.8610536 125.3750560 126.160945 131.485994    10
-#     B 154.249713 155.364729 160.9077032 156.8738940 157.136183 197.315105    10
-#     C  66.066571  66.162393  66.5538611  66.4488130  66.698077  67.623404    10
-#     D  86.747693  86.918665  90.7831016  87.8149725  89.440982 116.767991    10
-#     E 154.863827 155.208563 162.6535954 158.4062465 168.593785 187.378088    10
-#     F  32.427028  32.638648  32.9929454  32.7860475  32.992813  34.674241    10
-#     G   0.213155   0.216578   0.2369226   0.2338985   0.253734   0.285581    10
+
microbenchmark(A = as.mo("buno"),
+               B = as.mo("burnod"),
+               C = as.mo("B. nodosa"),
+               D = as.mo("B.  nodosa"),
+               E = as.mo("BURNOD"),
+               F = as.mo("Burkholderia nodosa"),
+               G = as.mo("B_BRKHL_NOD"),
+               times = 10,
+               unit = "ms")
+# Unit: milliseconds
+#  expr        min         lq        mean      median         uq        max neval
+#     A 124.175427 124.474837 125.8610536 125.3750560 126.160945 131.485994    10
+#     B 154.249713 155.364729 160.9077032 156.8738940 157.136183 197.315105    10
+#     C  66.066571  66.162393  66.5538611  66.4488130  66.698077  67.623404    10
+#     D  86.747693  86.918665  90.7831016  87.8149725  89.440982 116.767991    10
+#     E 154.863827 155.208563 162.6535954 158.4062465 168.593785 187.378088    10
+#     F  32.427028  32.638648  32.9929454  32.7860475  32.992813  34.674241    10
+#     G   0.213155   0.216578   0.2369226   0.2338985   0.253734   0.285581    10

That takes up to 11 times as much time! A value of 158.4 milliseconds means it can only determine ~6 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance.

To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.

Repetitive results

Repetitive results mean that unique values are present more than once. Unique values will only be calculated once by as.mo(). We will use mo_fullname() for this test - a helper function that returns the full microbial name (genus, species and possibly subspecies) and uses as.mo() internally.

-
library(dplyr)
-# take 500,000 random MO codes from the septic_patients data set
-x = septic_patients %>%
-  sample_n(500000, replace = TRUE) %>%
-  pull(mo)
-  
-# got the right length?
-length(x)
-# [1] 500000
-
-# and how many unique values do we have?
-n_distinct(x)
-# [1] 96
-
-# only 96, but distributed in 500,000 results. now let's see:
-microbenchmark(X = mo_fullname(x),
-               times = 10,
-               unit = "ms")
-# Unit: milliseconds
-#  expr      min       lq     mean   median       uq      max neval
-#     X 114.9342 117.1076 129.6448 120.2047 131.5005 168.6371    10
+
library(dplyr)
+# take 500,000 random MO codes from the septic_patients data set
+x = septic_patients %>%
+  sample_n(500000, replace = TRUE) %>%
+  pull(mo)
+  
+# got the right length?
+length(x)
+# [1] 500000
+
+# and how many unique values do we have?
+n_distinct(x)
+# [1] 96
+
+# only 96, but distributed in 500,000 results. now let's see:
+microbenchmark(X = mo_fullname(x),
+               times = 10,
+               unit = "ms")
+# Unit: milliseconds
+#  expr      min       lq     mean   median       uq      max neval
+#     X 114.9342 117.1076 129.6448 120.2047 131.5005 168.6371    10

So transforming 500,000 values (!) of 96 unique values only takes 0.12 seconds (120 ms). You only lose time on your unique input values.

Results of a tenfold - 5,000,000 values:

-
# Unit: milliseconds
-#  expr      min       lq     mean   median       uq      max neval
-#     X 882.9045 901.3011 1001.677 940.3421 1168.088 1226.846    10
+
# Unit: milliseconds
+#  expr      min       lq     mean   median       uq      max neval
+#     X 882.9045 901.3011 1001.677 940.3421 1168.088 1226.846    10

Even the full names of 5 Million values are calculated within a second.

Precalculated results

What about precalculated results? If the input is an already precalculated result of a helper function like mo_fullname(), it almost doesn’t take any time at all (see ‘C’ below):

-
microbenchmark(A = mo_fullname("B_STPHY_AUR"),
-               B = mo_fullname("S. aureus"),
-               C = mo_fullname("Staphylococcus aureus"),
-               times = 10,
-               unit = "ms")
-# Unit: milliseconds
-#  expr       min        lq       mean     median        uq       max neval
-#     A 11.364086 11.460537 11.5104799 11.4795330 11.524860 11.818263    10
-#     B 11.976454 12.012352 12.1704592 12.0853020 12.210004 12.881737    10
-#     C  0.095823  0.102528  0.1167754  0.1153785  0.132629  0.140661    10
+
microbenchmark(A = mo_fullname("B_STPHY_AUR"),
+               B = mo_fullname("S. aureus"),
+               C = mo_fullname("Staphylococcus aureus"),
+               times = 10,
+               unit = "ms")
+# Unit: milliseconds
+#  expr       min        lq       mean     median        uq       max neval
+#     A 11.364086 11.460537 11.5104799 11.4795330 11.524860 11.818263    10
+#     B 11.976454 12.012352 12.1704592 12.0853020 12.210004 12.881737    10
+#     C  0.095823  0.102528  0.1167754  0.1153785  0.132629  0.140661    10

So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0001 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

-
microbenchmark(A = mo_species("aureus"),
-               B = mo_genus("Staphylococcus"),
-               C = mo_fullname("Staphylococcus aureus"),
-               D = mo_family("Staphylococcaceae"),
-               E = mo_order("Bacillales"),
-               F = mo_class("Bacilli"),
-               G = mo_phylum("Firmicutes"),
-               H = mo_subkingdom("Posibacteria"),
-               I = mo_kingdom("Bacteria"),
-               times = 10,
-               unit = "ms")
-# Unit: milliseconds
-#  expr      min       lq      mean    median       uq      max neval
-#     A 0.105181 0.121314 0.1478538 0.1465265 0.166711 0.211409    10
-#     B 0.132558 0.146388 0.1584278 0.1499835 0.164895 0.208477    10
-#     C 0.135492 0.160355 0.2341847 0.1884665 0.348857 0.395931    10
-#     D 0.109650 0.115727 0.1270481 0.1264130 0.128648 0.168317    10
-#     E 0.081574 0.096940 0.0992582 0.0980915 0.101479 0.120477    10
-#     F 0.081575 0.088489 0.0988463 0.0989650 0.103365 0.126482    10
-#     G 0.091981 0.095333 0.1043568 0.1001530 0.111327 0.129625    10
-#     H 0.092610 0.093169 0.1009135 0.0985455 0.101828 0.120406    10
-#     I 0.087371 0.091213 0.1069758 0.0941815 0.109302 0.192831    10
+
microbenchmark(A = mo_species("aureus"),
+               B = mo_genus("Staphylococcus"),
+               C = mo_fullname("Staphylococcus aureus"),
+               D = mo_family("Staphylococcaceae"),
+               E = mo_order("Bacillales"),
+               F = mo_class("Bacilli"),
+               G = mo_phylum("Firmicutes"),
+               H = mo_subkingdom("Posibacteria"),
+               I = mo_kingdom("Bacteria"),
+               times = 10,
+               unit = "ms")
+# Unit: milliseconds
+#  expr      min       lq      mean    median       uq      max neval
+#     A 0.105181 0.121314 0.1478538 0.1465265 0.166711 0.211409    10
+#     B 0.132558 0.146388 0.1584278 0.1499835 0.164895 0.208477    10
+#     C 0.135492 0.160355 0.2341847 0.1884665 0.348857 0.395931    10
+#     D 0.109650 0.115727 0.1270481 0.1264130 0.128648 0.168317    10
+#     E 0.081574 0.096940 0.0992582 0.0980915 0.101479 0.120477    10
+#     F 0.081575 0.088489 0.0988463 0.0989650 0.103365 0.126482    10
+#     G 0.091981 0.095333 0.1043568 0.1001530 0.111327 0.129625    10
+#     H 0.092610 0.093169 0.1009135 0.0985455 0.101828 0.120406    10
+#     I 0.087371 0.091213 0.1069758 0.0941815 0.109302 0.192831    10

Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known microorganisms (according to ITIS), it can just return the initial value immediately.

Results in other languages

When the system language is non-English and supported by this AMR package, some functions take a little while longer:

-
mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
-# "Coagulase Negative Staphylococcus (CoNS)"
-
-mo_fullname("CoNS", language = "fr") # or just mo_fullname("CoNS") on a French system
-# "Staphylococcus à coagulase négative (CoNS)"
-
-microbenchmark(en = mo_fullname("CoNS", language = "en"),
-               de = mo_fullname("CoNS", language = "de"),
-               nl = mo_fullname("CoNS", language = "nl"),
-               es = mo_fullname("CoNS", language = "es"),
-               it = mo_fullname("CoNS", language = "it"),
-               fr = mo_fullname("CoNS", language = "fr"),
-               pt = mo_fullname("CoNS", language = "pt"),
-               times = 10,
-               unit = "ms")
-# Unit: milliseconds
-#  expr       min       lq      mean    median        uq      max neval
-#    en  6.093583  6.51724  6.555105  6.562986  6.630663  6.99698   100
-#    de 13.934874 14.35137 16.891587 14.462210 14.764658 43.63956   100
-#    nl 13.900092 14.34729 15.943268 14.424565 14.581535 43.76283   100
-#    es 13.833813 14.34596 14.574783 14.439757 14.653994 17.49168   100
-#    it 13.811883 14.36621 15.179060 14.453515 14.812359 43.64284   100
-#    fr 13.798683 14.37019 16.344731 14.468775 14.697610 48.62923   100
-#    pt 13.789674 14.36244 15.706321 14.443772 14.679905 44.76701   100
+
mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
+# "Coagulase Negative Staphylococcus (CoNS)"
+
+mo_fullname("CoNS", language = "fr") # or just mo_fullname("CoNS") on a French system
+# "Staphylococcus à coagulase négative (CoNS)"
+
+microbenchmark(en = mo_fullname("CoNS", language = "en"),
+               de = mo_fullname("CoNS", language = "de"),
+               nl = mo_fullname("CoNS", language = "nl"),
+               es = mo_fullname("CoNS", language = "es"),
+               it = mo_fullname("CoNS", language = "it"),
+               fr = mo_fullname("CoNS", language = "fr"),
+               pt = mo_fullname("CoNS", language = "pt"),
+               times = 10,
+               unit = "ms")
+# Unit: milliseconds
+#  expr       min       lq      mean    median        uq      max neval
+#    en  6.093583  6.51724  6.555105  6.562986  6.630663  6.99698   100
+#    de 13.934874 14.35137 16.891587 14.462210 14.764658 43.63956   100
+#    nl 13.900092 14.34729 15.943268 14.424565 14.581535 43.76283   100
+#    es 13.833813 14.34596 14.574783 14.439757 14.653994 17.49168   100
+#    it 13.811883 14.36621 15.179060 14.453515 14.812359 43.64284   100
+#    fr 13.798683 14.37019 16.344731 14.468775 14.697610 48.62923   100
+#    pt 13.789674 14.36244 15.706321 14.443772 14.679905 44.76701   100

Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

diff --git a/docs/articles/freq.html b/docs/articles/freq.html index 656b19d0..a294b649 100644 --- a/docs/articles/freq.html +++ b/docs/articles/freq.html @@ -178,7 +178,7 @@

How to create frequency tables

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

@@ -196,7 +196,7 @@

Frequencies of one variable

To only show and quickly review the content of one variable, you can just select this variable in various ways. Let’s say we want to get the frequencies of the gender variable of the septic_patients dataset:

-
septic_patients %>% freq(gender)
+
septic_patients %>% freq(gender)

Frequency table of gender

@@ -233,21 +233,21 @@ Frequencies of more than one variable

Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table.

For illustration, we could add some more variables to the septic_patients dataset to learn about bacterial properties:

-
my_patients <- septic_patients %>% left_join_microorganisms()
-# Joining, by = "mo"
+
my_patients <- septic_patients %>% left_join_microorganisms()
+# Joining, by = "mo"

Now all variables of the microorganisms dataset have been joined to the septic_patients dataset. The microorganisms dataset consists of the following variables:

-
colnames(microorganisms)
-#  [1] "mo"         "tsn"        "genus"      "species"    "subspecies"
-#  [6] "fullname"   "family"     "order"      "class"      "phylum"    
-# [11] "subkingdom" "kingdom"    "gramstain"  "prevalence" "ref"
+
colnames(microorganisms)
+#  [1] "mo"         "tsn"        "genus"      "species"    "subspecies"
+#  [6] "fullname"   "family"     "order"      "class"      "phylum"    
+# [11] "subkingdom" "kingdom"    "gramstain"  "prevalence" "ref"

If we compare the dimensions between the old and new dataset, we can see that these 14 variables were added:

-
dim(septic_patients)
-# [1] 2000   49
-dim(my_patients)
-# [1] 2000   63
+
dim(septic_patients)
+# [1] 2000   49
+dim(my_patients)
+# [1] 2000   63

So now the genus and species variables are available. A frequency table of these combined variables can be created like this:

-
my_patients %>%
-  freq(genus, species, nmax = 15)
+
my_patients %>%
+  freq(genus, species, nmax = 15)

Frequency table of genus and species

@@ -388,10 +388,10 @@ Frequencies of numeric values

Frequency tables can be created of any input.

In case of numeric values (like integers, doubles, etc.) additional descriptive statistics will be calculated and shown into the header:

-
# # get age distribution of unique patients
-septic_patients %>% 
-  distinct(patient_id, .keep_all = TRUE) %>% 
-  freq(age, nmax = 5, header = TRUE)
+
# # get age distribution of unique patients
+septic_patients %>% 
+  distinct(patient_id, .keep_all = TRUE) %>% 
+  freq(age, nmax = 5, header = TRUE)

Frequency table of age
Class: numeric
Length: 981 (of which NA: 0 = 0.00%)
@@ -469,8 +469,8 @@ Outliers: 15 (unique count: 12)

Frequencies of factors

To sort frequencies of factors on factor level instead of item count, use the sort.count parameter.

sort.count is TRUE by default. Compare this default behaviour…

-
septic_patients %>%
-  freq(hospital_id)
+
septic_patients %>%
+  freq(hospital_id)

Frequency table of hospital_id

@@ -517,8 +517,8 @@ Outliers: 15 (unique count: 12)

… with this, where items are now sorted on count:

-
septic_patients %>%
-  freq(hospital_id, sort.count = FALSE)
+
septic_patients %>%
+  freq(hospital_id, sort.count = FALSE)

Frequency table of hospital_id

@@ -565,8 +565,8 @@ Outliers: 15 (unique count: 12)

All classes will be printed into the header (default is FALSE when using markdown like this document). Variables with the new rsi class of this AMR package are actually ordered factors and have three classes (look at Class in the header):

-
septic_patients %>%
-  freq(amox, header = TRUE)
+
septic_patients %>%
+  freq(amox, header = TRUE)

Frequency table of amox
Class: factor > ordered > rsi (numeric)
Levels: S < I < R
@@ -614,8 +614,8 @@ Unique: 3

Frequencies of dates

Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them:

-
septic_patients %>%
-  freq(date, nmax = 5, header = TRUE)
+
septic_patients %>%
+  freq(date, nmax = 5, header = TRUE)

Frequency table of date
Class: Date (numeric)
Length: 2,000 (of which NA: 0 = 0.00%)
@@ -681,11 +681,11 @@ Median: 31 July 2009 (47.39%)

Assigning a frequency table to an object

A frequency table is actaually a regular data.frame, with the exception that it contains an additional class.

-
my_df <- septic_patients %>% freq(age)
-class(my_df)
+
my_df <- septic_patients %>% freq(age)
+class(my_df)

[1] “frequency_tbl” “data.frame”

Because of this additional class, a frequency table prints like the examples above. But the object itself contains the complete table without a row limitation:

-
dim(my_df)
+
dim(my_df)

[1] 74 5

@@ -696,8 +696,8 @@ Median: 31 July 2009 (47.39%)

Parameter na.rm

With the na.rm parameter (defaults to TRUE, but they will always be shown into the header), you can include NA values in the frequency table:

-
septic_patients %>%
-  freq(amox, na.rm = FALSE)
+
septic_patients %>%
+  freq(amox, na.rm = FALSE)

Frequency table of amox

@@ -749,8 +749,8 @@ Median: 31 July 2009 (47.39%)

Parameter row.names

The default frequency tables shows row indices. To remove them, use row.names = FALSE:

-
septic_patients %>%
-  freq(hospital_id, row.names = FALSE)
+
septic_patients %>%
+  freq(hospital_id, row.names = FALSE)

Frequency table of hospital_id

@@ -797,8 +797,8 @@ Median: 31 July 2009 (47.39%)

Parameter markdown

The markdown parameter is TRUE at default in non-interactive sessions, like in reports created with R Markdown. This will always print all rows, unless nmax is set.

-
septic_patients %>%
-  freq(hospital_id, markdown = TRUE)
+
septic_patients %>%
+  freq(hospital_id, markdown = TRUE)

Frequency table of hospital_id

diff --git a/docs/articles/mo_property.html b/docs/articles/mo_property.html index 8c22946d..97ba39c2 100644 --- a/docs/articles/mo_property.html +++ b/docs/articles/mo_property.html @@ -178,7 +178,7 @@

How to get properties of a microorganism

Matthijs S. Berends

-

11 January 2019

+

12 January 2019

diff --git a/docs/index.html b/docs/index.html index 66eca03e..347bc253 100644 --- a/docs/index.html +++ b/docs/index.html @@ -196,7 +196,6 @@

Microbial Ecology:

- -

diff --git a/docs/news/index.html b/docs/news/index.html index 49aaec0b..384d2edc 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -229,47 +229,31 @@

Changed

@@ -597,21 +494,15 @@ New

- -
  • Determining bacterial ID: -
      +
    • Determining bacterial ID:
    • New functions as.bactid and is.bactid to transform/ look up microbial ID’s.
    • The existing function guess_bactid is now an alias of as.bactid
    • New Becker classification for Staphylococcus to categorise them into Coagulase Negative Staphylococci (CoNS) and Coagulase Positve Staphylococci (CoPS)
    • New Lancefield classification for Streptococcus to categorise them into Lancefield groups
    • -
    -
  • For convience, new descriptive statistical functions kurtosis and skewness that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices
  • Function g.test to perform the Χ2 distributed G-test, which use is the same as chisq.test
  • -
  • -Function ratio to transform a vector of values to a preset ratio - -
  • Support for Addins menu in RStudio to quickly insert %in% or %like% (and give them keyboard shortcuts), or to view the datasets that come with this package
  • Function p.symbol to transform p values to their related symbols: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Functions clipboard_import and clipboard_export as helper functions to quickly copy and paste from/to software like Excel and SPSS. These functions use the clipr package, but are a little altered to also support headless Linux servers (so you can use it in RStudio Server)
  • -
  • New for frequency tables (function freq): -
      +
    • New for frequency tables (function freq):
    • A vignette to explain its usage
    • Support for rsi (antimicrobial resistance) to use as input
    • Support for table to use as input: freq(table(x, y)) @@ -662,8 +543,6 @@
    • Header of frequency tables now also show Mean Absolute Deviaton (MAD) and Interquartile Range (IQR)
    • Possibility to globally set the default for the amount of items to print, with options(max.print.freq = n) where n is your preset value
    -
  • -

    @@ -685,27 +564,21 @@
  • Small improvements to the microorganisms dataset (especially for Salmonella) and the column bactid now has the new class "bactid"
  • -
  • Combined MIC/RSI values will now be coerced by the rsi and mic functions: - -
  • Now possible to coerce MIC values with a space between operator and value, i.e. as.mic("<= 0.002") now works
  • Classes rsi and mic do not add the attribute package.version anymore
  • Added "groups" option for atc_property(..., property). It will return a vector of the ATC hierarchy as defined by the WHO. The new function atc_groups is a convenient wrapper around this.
  • Build-in host check for atc_property as it requires the host set by url to be responsive
  • Improved first_isolate algorithm to exclude isolates where bacteria ID or genus is unavailable
  • Fix for warning hybrid evaluation forced for row_number (924b62) from the dplyr package v0.7.5 and above
  • -
  • Support for empty values and for 1 or 2 columns as input for guess_bactid (now called as.bactid) -
      +
    • Support for empty values and for 1 or 2 columns as input for guess_bactid (now called as.bactid)
    • So yourdata %>% select(genus, species) %>% as.bactid() now also works
    • -
    -
  • Other small fixes
  • @@ -713,14 +586,11 @@

    Other

    @@ -739,13 +609,10 @@
  • Function guess_bactid to determine the ID of a microorganism based on genus/species or known abbreviations like MRSA
  • Function guess_atc to determine the ATC of an antibiotic based on name, trade name, or known abbreviations
  • Function freq to create frequency tables, with additional info in a header
  • -
  • Function MDRO to determine Multi Drug Resistant Organisms (MDRO) with support for country-specific guidelines. - -
  • New algorithm to determine weighted isolates, can now be "points" or "keyantibiotics", see ?first_isolate
  • New print format for tibbles and data.tables
  • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 31227eab..3c396dbb 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,4 +1,4 @@ -pandoc: 2.3.1 +pandoc: 1.17.2 pkgdown: 1.3.0 pkgdown_sha: ~ articles: diff --git a/docs/reference/ITIS.html b/docs/reference/ITIS.html index 8463d5f7..43278681 100644 --- a/docs/reference/ITIS.html +++ b/docs/reference/ITIS.html @@ -233,7 +233,7 @@


    This package contains the complete microbial taxonomic data (with all nine taxonomic ranks - from kingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, https://www.itis.gov).

    -

    All (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.

    +

    All ~20,000 (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all ~2,500 previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since all bacteria are classified into subkingdom Negibacteria or Posibacteria.

    ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].

    Read more on our website!

    diff --git a/docs/reference/age.html b/docs/reference/age.html index f3b014a1..b7d10a4c 100644 --- a/docs/reference/age.html +++ b/docs/reference/age.html @@ -47,7 +47,7 @@ - + @@ -223,7 +223,7 @@
    -

    Calculates age in years based on a reference date, which is the sytem time at default.

    +

    Calculates age in years based on a reference date, which is the sytem date at default.

    @@ -238,7 +238,7 @@ - +
    reference

    reference date(s) (defaults to today), will be coerced with as.POSIXlt

    reference date(s) (defaults to today), will be coerced with as.POSIXlt and cannot be lower than x

    @@ -254,7 +254,7 @@ On our website https://msberends.gitla

    See also

    -

    age_groups to splits age into groups

    +

    age_groups to split age into age groups

    diff --git a/docs/reference/age_groups.html b/docs/reference/age_groups.html index 7ab69c23..82a47bf0 100644 --- a/docs/reference/age_groups.html +++ b/docs/reference/age_groups.html @@ -249,7 +249,7 @@

    Details

    To split ages, the input can be: