diff --git a/DESCRIPTION b/DESCRIPTION index d98e45ab..dcb6027d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR Version: 0.5.0.9017 -Date: 2019-02-10 +Date: 2019-02-11 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NAMESPACE b/NAMESPACE index 79cac6fa..4a7ba715 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -253,6 +253,7 @@ importFrom(graphics,axis) importFrom(graphics,barplot) importFrom(graphics,hist) importFrom(graphics,plot) +importFrom(graphics,points) importFrom(graphics,text) importFrom(hms,is.hms) importFrom(knitr,kable) diff --git a/R/resistance_predict.R b/R/resistance_predict.R index 2380b533..55e60d92 100755 --- a/R/resistance_predict.R +++ b/R/resistance_predict.R @@ -35,6 +35,7 @@ #' @param preserve_measurements a logical to indicate whether predictions of years that are actually available in the data should be overwritten by the original data. The standard errors of those years will be \code{NA}. #' @param info a logical to indicate whether textual analysis should be printed with the name and \code{\link{summary}} of the statistical model. #' @param main title of the plot +#' @param ribbon a logical to indicate whether a ribbon should be shown (default) or error bars #' @details Valid options for the statistical model are: #' \itemize{ #' \item{\code{"binomial"} or \code{"binom"} or \code{"logit"}: a generalised linear regression model with binomial distribution} @@ -51,6 +52,7 @@ #' \item{\code{observed}, the original observed resistant percentages} #' \item{\code{estimated}, the estimated resistant percentages, calculated by the model} #' } +#' Furthermore, the model itself is available as an attribute: \code{attributes(x)$model}, see Examples. #' @seealso The \code{\link{portion}} function to calculate resistance, \cr \code{\link{lm}} \code{\link{glm}} #' @rdname resistance_predict #' @export @@ -71,7 +73,12 @@ #' plot(x) #' #' -#' # create nice plots with ggplot yourself +#' # get the model from the object +#' mymodel <- attributes(x)$model +#' summary(mymodel) +#' +#' +#' # create nice plots with ggplot2 yourself #' if (!require(ggplot2)) { #' #' data <- septic_patients %>% @@ -295,8 +302,8 @@ rsi_predict <- resistance_predict #' @exportMethod plot.mic #' @export -#' @importFrom dplyr %>% group_by summarise -#' @importFrom graphics plot axis arrows +#' @importFrom dplyr filter +#' @importFrom graphics plot axis arrows points #' @rdname resistance_predict plot.resistance_predict <- function(x, main = paste("Resistance prediction of", attributes(x)$ab), ...) { if (attributes(x)$I_as_R == TRUE) { @@ -316,18 +323,30 @@ plot.resistance_predict <- function(x, main = paste("Resistance prediction of", ", model: ", attributes(x)$model_title, ")"), cex.sub = 0.75) + axis(side = 2, at = seq(0, 1, 0.1), labels = paste0(0:10 * 10, "%")) - # arrows hack: https://stackoverflow.com/a/22037078/4575331 + # hack for error bars: https://stackoverflow.com/a/22037078/4575331 arrows(x0 = x$year, y0 = x$se_min, x1 = x$year, - y1 = x$se_max, length = 0.05, angle = 90, code = 3) + y1 = x$se_max, + length = 0.05, angle = 90, code = 3, lwd = 1.5) + + # overlay grey points for prediction + points(x = filter(x, is.na(observations))$year, + y = filter(x, is.na(observations))$value, + pch = 19, + col = "grey40") } #' @rdname resistance_predict +#' @importFrom dplyr filter #' @export -ggplot_rsi_predict <- function(x, main = paste("Resistance prediction of", attributes(x)$ab), ...) { +ggplot_rsi_predict <- function(x, + main = paste("Resistance prediction of", attributes(x)$ab), + ribbon = TRUE, + ...) { if (!"resistance_predict" %in% class(x)) { stop("`x` must be a resistance prediction model created with resistance_predict().") @@ -338,15 +357,26 @@ ggplot_rsi_predict <- function(x, main = paste("Resistance prediction of", attri } else { ylab <- "%R" } - suppressWarnings( - ggplot2::ggplot(x, ggplot2::aes(x = year, y = value)) + - ggplot2::geom_point(size = 2) + - ggplot2::geom_errorbar(ggplot2::aes(ymin = se_min, ymax = se_max), na.rm = TRUE, width = 0.5) + - scale_y_percent(limits = c(0, 1)) + - ggplot2::labs(title = main, - y = paste0("Percentage (", ylab, ")"), - x = "Year", - caption = paste0("(n = ", sum(x$observations, na.rm = TRUE), - ", model: ", attributes(x)$model_title, ")")) - ) + + p <- ggplot2::ggplot(x, ggplot2::aes(x = year, y = value)) + + ggplot2::geom_point(data = filter(x, !is.na(observations)), + size = 2) + + scale_y_percent(limits = c(0, 1)) + + ggplot2::labs(title = main, + y = paste0("Percentage (", ylab, ")"), + x = "Year", + caption = paste0("(n = ", sum(x$observations, na.rm = TRUE), + ", model: ", attributes(x)$model_title, ")")) + + if (ribbon == TRUE) { + p <- p + ggplot2::geom_ribbon(ggplot2::aes(ymin = se_min, ymax = se_max), alpha = 0.25) + } else { + p <- p + ggplot2::geom_errorbar(ggplot2::aes(ymin = se_min, ymax = se_max), na.rm = TRUE, width = 0.5) + } + p <- p + + # overlay grey points for prediction + ggplot2::geom_point(data = filter(x, is.na(observations)), + size = 2, + colour = "grey40") + p } diff --git a/README.md b/README.md index c90a692b..8c69270c 100755 --- a/README.md +++ b/README.md @@ -23,11 +23,11 @@ Bhanu N.M. Sinha - - - - + + + + + ## How to get this package All stable versions of this package [are published on CRAN](https://CRAN.R-project.org/package=AMR), the official R network with a peer-reviewed submission process. diff --git a/_pkgdown.yml b/_pkgdown.yml index 01e534e4..1f73ca9e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -37,7 +37,7 @@ navbar: href: 'articles/AMR.html' - text: 'Predict antimicrobial resistance' icon: 'fa-dice' - href: 'articles/Predict.html' + href: 'articles/resistance_predict.html' - text: 'Work with WHONET data' icon: 'fa-globe-americas' href: 'articles/WHONET.html' @@ -46,10 +46,12 @@ navbar: href: 'articles/EUCAST.html' - text: 'Get properties of a microorganism' icon: 'fa-bug' - href: 'articles/mo_property.html' + # href: 'articles/mo_property.html' + href: 'reference/mo_property.html' - text: 'Get properties of an antibiotic' icon: 'fa-capsules' - href: 'articles/ab_property.html' + # href: 'articles/atc_property.html' + href: 'reference/atc_property.html' - text: 'Create frequency tables' icon: 'fa-sort-amount-down' href: 'articles/freq.html' diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index d7f04512..b66d2788 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -108,7 +108,7 @@
AMR.Rmd
Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 09 February 2019.
+Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 11 February 2019.
As with many uses in R, we need some additional packages for AMR analysis. Our package works closely together with the tidyverse packages dplyr
and ggplot2
by Dr Hadley Wickham. The tidyverse tremendously improves the way we conduct data science - it allows for a very natural way of writing syntaxes and creating beautiful plots in R.
Our AMR
package depends on these packages and even extends their use and functions.
library(dplyr)
-library(ggplot2)
-library(AMR)
-
-# (if not yet installed, install with:)
-# install.packages(c("tidyverse", "AMR"))
library(dplyr)
+library(ggplot2)
+library(AMR)
+
+# (if not yet installed, install with:)
+# install.packages(c("tidyverse", "AMR"))
To start with patients, we need a unique list of patients.
- +The LETTERS
object is available in R - it’s a vector with 26 characters: A
to Z
. The patients
object we just created is now a vector of length 260, with values (patient IDs) varying from A1
to Z10
. Now we we also set the gender of our patients, by putting the ID and the gender in a table:
The first 135 patient IDs are now male, the other 125 are female.
Let’s pretend that our data consists of blood cultures isolates from 1 January 2010 until 1 January 2018.
- +This dates
object now contains all days in our date range.
For this tutorial, we will uses four different microorganisms: Escherichia coli, Staphylococcus aureus, Streptococcus pneumoniae, and Klebsiella pneumoniae:
- +For completeness, we can also add the hospital where the patients was admitted and we need to define valid antibmicrobial results for our randomisation:
- +Using the sample()
function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob
parameter.
data <- data.frame(date = sample(dates, 5000, replace = TRUE),
- patient_id = sample(patients, 5000, replace = TRUE),
- hospital = sample(hospitals, 5000, replace = TRUE, prob = c(0.30, 0.35, 0.15, 0.20)),
- bacteria = sample(bacteria, 5000, replace = TRUE, prob = c(0.50, 0.25, 0.15, 0.10)),
- amox = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.60, 0.05, 0.35)),
- amcl = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.75, 0.10, 0.15)),
- cipr = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.80, 0.00, 0.20)),
- gent = sample(ab_interpretations, 5000, replace = TRUE, prob = c(0.92, 0.00, 0.08))
- )
Using the left_join()
function from the dplyr
package, we can ‘map’ the gender to the patient ID using the patients_table
object we created earlier:
Using the sample()
function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob
parameter.
sample_size <- 20000
+data <- data.frame(date = sample(dates, size = sample_size, replace = TRUE),
+ patient_id = sample(patients, size = sample_size, replace = TRUE),
+ hospital = sample(hospitals, size = sample_size, replace = TRUE,
+ prob = c(0.30, 0.35, 0.15, 0.20)),
+ bacteria = sample(bacteria, size = sample_size, replace = TRUE,
+ prob = c(0.50, 0.25, 0.15, 0.10)),
+ amox = sample(ab_interpretations, size = sample_size, replace = TRUE,
+ prob = c(0.60, 0.05, 0.35)),
+ amcl = sample(ab_interpretations, size = sample_size, replace = TRUE,
+ prob = c(0.75, 0.10, 0.15)),
+ cipr = sample(ab_interpretations, size = sample_size, replace = TRUE,
+ prob = c(0.80, 0.00, 0.20)),
+ gent = sample(ab_interpretations, size = sample_size, replace = TRUE,
+ prob = c(0.92, 0.00, 0.08))
+ )
Using the left_join()
function from the dplyr
package, we can ‘map’ the gender to the patient ID using the patients_table
object we created earlier:
The resulting data set contains 5,000 blood culture isolates. With the head()
function we can preview the first 6 values of this data set:
date | @@ -313,41 +320,63 @@||||||||||
---|---|---|---|---|---|---|---|---|---|---|
2012-10-05 | -F3 | +2013-12-28 | +M2 | +Hospital B | +Streptococcus pneumoniae | +R | +S | +S | +S | +M | +
2017-03-17 | +Z5 | +Hospital B | +Escherichia coli | +R | +S | +S | +S | +F | +||
2011-05-13 | +G5 | +Hospital B | +Streptococcus pneumoniae | +S | +S | +S | +S | +M | +||
2011-03-06 | +X3 | +Hospital D | +Escherichia coli | +S | +S | +S | +S | +F | +||
2016-11-01 | +C9 | Hospital D | Escherichia coli | R | S | -S | -S | -M | -||
2012-11-18 | -X7 | -Hospital A | -Escherichia coli | -S | -S | -S | -S | -F | -||
2012-04-27 | -K2 | -Hospital B | -Escherichia coli | -R | -S | R | S | M | ||
2014-09-03 | -M6 | +2010-12-16 | +E2 | Hospital A | Streptococcus pneumoniae | S | @@ -356,28 +385,6 @@S | M | ||
2015-09-13 | -R4 | -Hospital A | -Escherichia coli | -S | -S | -S | -S | -F | -||
2011-09-19 | -D6 | -Hospital B | -Streptococcus pneumoniae | -S | -I | -S | -S | -M | -
Now, let’s start the cleaning and the analysis!
@@ -387,78 +394,78 @@Use the frequency table function freq()
to look specifically for unique values in any variable. For example, for the gender
variable:
# Frequency table of `gender` from a data.frame (5,000 x 9)
-# Class: factor (numeric)
-# Levels: F, M
-# Length: 5,000 (of which NA: 0 = 0.00%)
-# Unique: 2
-#
-# Item Count Percent Cum. Count Cum. Percent
-# --- ----- ------ -------- ----------- -------------
-# 1 M 2,565 51.3% 2,565 51.3%
-# 2 F 2,435 48.7% 5,000 100.0%
+
+#> Frequency table of `gender` from a data.frame (20,000 x 9)
+#> Class: factor (numeric)
+#> Levels: F, M
+#> Length: 20,000 (of which NA: 0 = 0.00%)
+#> Unique: 2
+#>
+#> Item Count Percent Cum. Count Cum. Percent
+#> --- ----- ------- -------- ----------- -------------
+#> 1 M 10,390 52.0% 10,390 52.0%
+#> 2 F 9,610 48.1% 20,000 100.0%
So, we can draw at least two conclusions immediately. From a data scientist perspective, the data looks clean: only values M
and F
. From a researcher perspective: there are slightly more men. Nothing we didn’t already know.
The data is already quite clean, but we still need to transform some variables. The bacteria
column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate()
function of the dplyr
package makes this really easy:
We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi()
function ensures reliability and reproducibility in these kind of variables. The mutate_at()
will run the as.rsi()
function on defined variables:
The data is already quite clean, but we still need to transform some variables. The bacteria
column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate()
function of the dplyr
package makes this really easy:
We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi()
function ensures reliability and reproducibility in these kind of variables. The mutate_at()
will run the as.rsi()
function on defined variables:
Finally, we will apply EUCAST rules on our antimicrobial results. In Europe, most medical microbiological laboratories already apply these rules. Our package features their latest insights on intrinsic resistance and exceptional phenotypes. Moreover, the eucast_rules()
function can also apply additional rules, like forcing
Because the amoxicillin (column amox
) and amoxicillin/clavulanic acid (column amcl
) in our data were generated randomly, some rows will undoubtedly contain amox = S and amcl = R, which is technically impossible. The eucast_rules()
fixes this:
data <- eucast_rules(data, col_mo = "bacteria")
-#
-# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
-#
-# EUCAST Clinical Breakpoints (v9.0, 2019)
-# Enterobacteriales (Order) (no changes)
-# Staphylococcus (no changes)
-# Enterococcus (no changes)
-# Streptococcus groups A, B, C, G (no changes)
-# Streptococcus pneumoniae (no changes)
-# Viridans group streptococci (no changes)
-# Haemophilus influenzae (no changes)
-# Moraxella catarrhalis (no changes)
-# Anaerobic Gram positives (no changes)
-# Anaerobic Gram negatives (no changes)
-# Pasteurella multocida (no changes)
-# Campylobacter jejuni and C. coli (no changes)
-# Aerococcus sanguinicola and A. urinae (no changes)
-# Kingella kingae (no changes)
-#
-# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
-# Table 1: Intrinsic resistance in Enterobacteriaceae (306 changes)
-# Table 2: Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
-# Table 3: Intrinsic resistance in other Gram-negative bacteria (no changes)
-# Table 4: Intrinsic resistance in Gram-positive bacteria (681 changes)
-# Table 8: Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
-# Table 9: Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
-# Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
-# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no changes)
-# Table 12: Interpretive rules for aminoglycosides (no changes)
-# Table 13: Interpretive rules for quinolones (no changes)
-#
-# Other rules
-# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (no changes)
-# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no changes)
-# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no changes)
-# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (no changes)
-# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
-# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
-#
-# => EUCAST rules affected 1,796 out of 5,000 rows
-# -> added 0 test results
-# -> changed 987 test results (0 to S; 0 to I; 987 to R)
data <- eucast_rules(data, col_mo = "bacteria")
+#>
+#> Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
+#>
+#> EUCAST Clinical Breakpoints (v9.0, 2019)
+#> Enterobacteriales (Order) (no changes)
+#> Staphylococcus (no changes)
+#> Enterococcus (no changes)
+#> Streptococcus groups A, B, C, G (no changes)
+#> Streptococcus pneumoniae (no changes)
+#> Viridans group streptococci (no changes)
+#> Haemophilus influenzae (no changes)
+#> Moraxella catarrhalis (no changes)
+#> Anaerobic Gram positives (no changes)
+#> Anaerobic Gram negatives (no changes)
+#> Pasteurella multocida (no changes)
+#> Campylobacter jejuni and C. coli (no changes)
+#> Aerococcus sanguinicola and A. urinae (no changes)
+#> Kingella kingae (no changes)
+#>
+#> EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
+#> Table 1: Intrinsic resistance in Enterobacteriaceae (1288 changes)
+#> Table 2: Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
+#> Table 3: Intrinsic resistance in other Gram-negative bacteria (no changes)
+#> Table 4: Intrinsic resistance in Gram-positive bacteria (2832 changes)
+#> Table 8: Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
+#> Table 9: Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
+#> Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
+#> Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no changes)
+#> Table 12: Interpretive rules for aminoglycosides (no changes)
+#> Table 13: Interpretive rules for quinolones (no changes)
+#>
+#> Other rules
+#> Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (no changes)
+#> Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no changes)
+#> Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no changes)
+#> Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (no changes)
+#> Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
+#> Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
+#>
+#> => EUCAST rules affected 7,405 out of 20,000 rows
+#> -> added 0 test results
+#> -> changed 4,120 test results (0 to S; 0 to I; 4,120 to R)
Now that we have the microbial ID, we can add some taxonomic properties:
-data <- data %>%
- mutate(gramstain = mo_gramstain(bacteria),
- genus = mo_genus(bacteria),
- species = mo_species(bacteria))
data <- data %>%
+ mutate(gramstain = mo_gramstain(bacteria),
+ genus = mo_genus(bacteria),
+ species = mo_species(bacteria))
(…) When preparing a cumulative antibiogram to guide clinical decisions about empirical antimicrobial therapy of initial infections, only the first isolate of a given species per patient, per analysis period (eg, one year) should be included, irrespective of body site, antimicrobial susceptibility profile, or other phenotypical characteristics (eg, biotype). The first isolate is easily identified, and cumulative antimicrobial susceptibility test data prepared using the first isolate are generally comparable to cumulative antimicrobial susceptibility test data calculated by other methods, providing duplicate isolates are excluded.
M39-A4 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition. CLSI, 2014. Chapter 6.4
This AMR
package includes this methodology with the first_isolate()
function. It adopts the episode of a year (can be changed by user) and it starts counting days after every selected isolate. This new variable can easily be added to our data:
data <- data %>%
- mutate(first = first_isolate(.))
-# NOTE: Using column `bacteria` as input for `col_mo`.
-# NOTE: Using column `date` as input for `col_date`.
-# NOTE: Using column `patient_id` as input for `col_patient_id`.
-# => Found 2,948 first isolates (59.0% of total)
So only 59% is suitable for resistance analysis! We can now filter on it with the filter()
function, also from the dplyr
package:
data <- data %>%
+ mutate(first = first_isolate(.))
+#> NOTE: Using column `bacteria` as input for `col_mo`.
+#> NOTE: Using column `date` as input for `col_date`.
+#> NOTE: Using column `patient_id` as input for `col_patient_id`.
+#> => Found 5,647 first isolates (28.2% of total)
So only 28.2% is suitable for resistance analysis! We can now filter on it with the filter()
function, also from the dplyr
package:
For future use, the above two syntaxes can be shortened with the filter_first_isolate()
function:
Only 4 isolates are marked as ‘first’ according to CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and should be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics()
function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.
Only 2 isolates are marked as ‘first’ according to CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and should be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics()
function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.
If a column exists with a name like ‘key(…)ab’ the first_isolate()
function will automatically use it and determine the first weighted isolates. Mind the NOTEs in below output:
data <- data %>%
- mutate(keyab = key_antibiotics(.)) %>%
- mutate(first_weighted = first_isolate(.))
-# NOTE: Using column `bacteria` as input for `col_mo`.
-# NOTE: Using column `bacteria` as input for `col_mo`.
-# NOTE: Using column `date` as input for `col_date`.
-# NOTE: Using column `patient_id` as input for `col_patient_id`.
-# NOTE: Using column `keyab` as input for `col_keyantibiotics`. Use col_keyantibiotics = FALSE to prevent this.
-# [Criterion] Inclusion based on key antibiotics, ignoring I.
-# => Found 4,405 first weighted isolates (88.1% of total)
data <- data %>%
+ mutate(keyab = key_antibiotics(.)) %>%
+ mutate(first_weighted = first_isolate(.))
+#> NOTE: Using column `bacteria` as input for `col_mo`.
+#> NOTE: Using column `bacteria` as input for `col_mo`.
+#> NOTE: Using column `date` as input for `col_date`.
+#> NOTE: Using column `patient_id` as input for `col_patient_id`.
+#> NOTE: Using column `keyab` as input for `col_keyantibiotics`. Use col_keyantibiotics = FALSE to prevent this.
+#> [Criterion] Inclusion based on key antibiotics, ignoring I.
+#> => Found 15,789 first weighted isolates (78.9% of total)
isolate | @@ -639,11 +646,11 @@|||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | -2010-05-12 | -B4 | +2010-01-02 | +F5 | B_ESCHR_COL | R | -I | +S | S | S | TRUE | @@ -651,34 +658,34 @@||
2 | -2010-05-12 | -B4 | +2010-04-18 | +F5 | B_ESCHR_COL | +S | +I | R | S | -R | -R | FALSE | TRUE |
3 | -2010-07-18 | -B4 | +2010-05-05 | +F5 | B_ESCHR_COL | R | S | -R | +S | S | FALSE | TRUE | |
4 | -2011-01-18 | -B4 | +2010-11-03 | +F5 | B_ESCHR_COL | -R | +S | S | S | S | @@ -687,88 +694,88 @@|||
5 | -2011-11-20 | -B4 | +2010-12-16 | +F5 | B_ESCHR_COL | S | +S | R | S | -S | -TRUE | +FALSE | TRUE |
6 | -2011-12-03 | -B4 | +2011-02-23 | +F5 | B_ESCHR_COL | S | S | S | S | -FALSE | +TRUE | TRUE | |
7 | -2012-09-09 | -B4 | +2011-04-26 | +F5 | B_ESCHR_COL | -R | +S | S | S | S | FALSE | -TRUE | +FALSE |
8 | -2013-01-24 | -B4 | +2011-08-01 | +F5 | B_ESCHR_COL | -S | +R | I | S | S | -TRUE | +FALSE | TRUE |
9 | -2013-04-25 | -B4 | +2011-09-09 | +F5 | B_ESCHR_COL | S | S | S | S | FALSE | -FALSE | +TRUE | |
10 | -2014-02-01 | -B4 | +2011-09-11 | +F5 | B_ESCHR_COL | S | -S | R | S | -TRUE | +S | +FALSE | TRUE |
Instead of 4, now 9 isolates are flagged. In total, 88.1% of all isolates are marked ‘first weighted’ - 29.1% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.
+Instead of 2, now 9 isolates are flagged. In total, 78.9% of all isolates are marked ‘first weighted’ - 50.7% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.
As with filter_first_isolate()
, there’s a shortcut for this new algorithm too:
So we end up with 4,405 isolates for analysis.
+ +So we end up with 15,789 isolates for analysis.
We can remove unneeded columns:
- +Now our data looks like:
- +@@ -788,13 +795,29 @@ | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | -2012-11-18 | -X7 | -Hospital A | -B_ESCHR_COL | +1 | +2013-12-28 | +M2 | +Hospital B | +B_STRPTC_PNE | +R | S | S | +R | +M | +Gram positive | +Streptococcus | +pneumoniae | +TRUE | +
2 | +2017-03-17 | +Z5 | +Hospital B | +B_ESCHR_COL | +R | +S | S | S | F | @@ -803,27 +826,11 @@coli | TRUE | |||||||
3 | -2012-04-27 | -K2 | -Hospital B | -B_ESCHR_COL | -R | -S | -R | -S | -M | -Gram negative | -Escherichia | -coli | -TRUE | -|||||
4 | -2014-09-03 | -M6 | -Hospital A | +3 | +2011-05-13 | +G5 | +Hospital B | B_STRPTC_PNE | S | S | @@ -837,15 +844,15 @@||||||||
5 | -2015-09-13 | -R4 | -Hospital A | +2016-11-01 | +C9 | +Hospital D | B_ESCHR_COL | +R | S | +R | S | -S | -S | -F | +M | Gram negative | Escherichia | coli | @@ -853,12 +860,12 @@
6 | -2011-09-19 | -D6 | -Hospital B | +2010-12-16 | +E2 | +Hospital A | B_STRPTC_PNE | S | -I | +S | S | R | M | @@ -869,15 +876,15 @@|||||
7 | -2013-01-13 | -Q3 | +2013-06-03 | +G1 | Hospital A | B_STRPTC_PNE | R | S | S | R | -F | +M | Gram positive | Streptococcus | pneumoniae | @@ -897,12 +904,12 @@ Dispersion of species|||
1 | Escherichia coli | -2,160 | -49.0% | -2,160 | -49.0% | +7,787 | +49.3% | +7,787 | +49.3% | |||||||||
2 | Staphylococcus aureus | -1,109 | -25.2% | -3,269 | +3,922 | +24.8% | +11,709 | 74.2% | ||||||||||
3 | Streptococcus pneumoniae | -690 | -15.7% | -3,959 | -89.9% | +2,567 | +16.3% | +14,276 | +90.4% | |||||||||
4 | Klebsiella pneumoniae | -446 | -10.1% | -4,405 | +1,513 | +9.6% | +15,789 | 100.0% |
hospital | @@ -969,27 +976,27 @@ Longest: 24||
---|---|---|
Hospital A | -0.4711316 | +0.4824505 |
Hospital B | -0.4910714 | +0.4689469 |
Hospital C | -0.4597701 | +0.4689858 |
Hospital D | -0.4488698 | +0.4772799 |
Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi()
can be used, which works exactly like n_distinct()
from the dplyr
package. It counts all isolates available for every group (i.e. values S, I or R):
data_1st %>%
- group_by(hospital) %>%
- summarise(amoxicillin = portion_IR(amox),
- available = n_rsi(amox))
Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi()
can be used, which works exactly like n_distinct()
from the dplyr
package. It counts all isolates available for every group (i.e. values S, I or R):
data_1st %>%
+ group_by(hospital) %>%
+ summarise(amoxicillin = portion_IR(amox),
+ available = n_rsi(amox))
hospital | @@ -999,32 +1006,32 @@ Longest: 24||||
---|---|---|---|---|
Hospital A | -0.4711316 | -1299 | +0.4824505 | +4701 |
Hospital B | -0.4910714 | -1568 | +0.4689469 | +5555 |
Hospital C | -0.4597701 | -609 | +0.4689858 | +2386 |
Hospital D | -0.4488698 | -929 | +0.4772799 | +3147 |
These functions can also be used to get the portion of multiple antibiotics, to calculate co-resistance very easily:
-data_1st %>%
- group_by(genus) %>%
- summarise(amoxicillin = portion_S(amcl),
- gentamicin = portion_S(gent),
- "amox + gent" = portion_S(amcl, gent))
data_1st %>%
+ group_by(genus) %>%
+ summarise(amoxicillin = portion_S(amcl),
+ gentamicin = portion_S(gent),
+ "amox + gent" = portion_S(amcl, gent))
genus | @@ -1035,94 +1042,94 @@ Longest: 24||||||
---|---|---|---|---|---|---|
Escherichia | -0.7407407 | -0.9000000 | -0.9763889 | +0.7303198 | +0.9056119 | +0.9745730 |
Klebsiella | -0.7533632 | -0.9147982 | -0.9843049 | +0.7343027 | +0.8955717 | +0.9643093 |
Staphylococcus | -0.7303877 | -0.9305681 | -0.9819657 | +0.7304946 | +0.9196838 | +0.9821520 |
Streptococcus | -0.7173913 | +0.7308142 | 0.0000000 | -0.7173913 | +0.7308142 |
To make a transition to the next part, let’s see how this difference could be plotted:
-data_1st %>%
- group_by(genus) %>%
- summarise("1. Amoxicillin" = portion_S(amcl),
- "2. Gentamicin" = portion_S(gent),
- "3. Amox + gent" = portion_S(amcl, gent)) %>%
- tidyr::gather("Antibiotic", "S", -genus) %>%
- ggplot(aes(x = genus,
- y = S,
- fill = Antibiotic)) +
- geom_col(position = "dodge2")
data_1st %>%
+ group_by(genus) %>%
+ summarise("1. Amoxicillin" = portion_S(amcl),
+ "2. Gentamicin" = portion_S(gent),
+ "3. Amox + gent" = portion_S(amcl, gent)) %>%
+ tidyr::gather("Antibiotic", "S", -genus) %>%
+ ggplot(aes(x = genus,
+ y = S,
+ fill = Antibiotic)) +
+ geom_col(position = "dodge2")
To show results in plots, most R users would nowadays use the ggplot2
package. This package lets you create plots in layers. You can read more about it on their website. A quick example would look like these syntaxes:
ggplot(data = a_data_set,
- mapping = aes(x = year,
- y = value)) +
- geom_col() +
- labs(title = "A title",
- subtitle = "A subtitle",
- x = "My X axis",
- y = "My Y axis")
-
-ggplot(a_data_set,
- aes(year, value) +
- geom_bar()
ggplot(data = a_data_set,
+ mapping = aes(x = year,
+ y = value)) +
+ geom_col() +
+ labs(title = "A title",
+ subtitle = "A subtitle",
+ x = "My X axis",
+ y = "My Y axis")
+
+ggplot(a_data_set,
+ aes(year, value) +
+ geom_bar()
The AMR
package contains functions to extend this ggplot2
package, for example geom_rsi()
. It automatically transforms data with count_df()
or portion_df()
and show results in stacked bars. Its simplest and shortest example:
Omit the translate_ab = FALSE
to have the antibiotic codes (amox, amcl, cipr, gent) translated to official WHO names (amoxicillin, amoxicillin and betalactamase inhibitor, ciprofloxacin, gentamicin).
If we group on e.g. the genus
column and add some additional functions from our package, we can create this:
# group the data on `genus`
-ggplot(data_1st %>% group_by(genus)) +
- # create bars with genus on x axis
- # it looks for variables with class `rsi`,
- # of which we have 4 (earlier created with `as.rsi`)
- geom_rsi(x = "genus") +
- # split plots on antibiotic
- facet_rsi(facet = "Antibiotic") +
- # make R red, I yellow and S green
- scale_rsi_colours() +
- # show percentages on y axis
- scale_y_percent(breaks = 0:4 * 25) +
- # turn 90 degrees, make it bars instead of columns
- coord_flip() +
- # add labels
- labs(title = "Resistance per genus and antibiotic",
- subtitle = "(this is fake data)") +
- # and print genus in italic to follow our convention
- # (is now y axis because we turned the plot)
- theme(axis.text.y = element_text(face = "italic"))
# group the data on `genus`
+ggplot(data_1st %>% group_by(genus)) +
+ # create bars with genus on x axis
+ # it looks for variables with class `rsi`,
+ # of which we have 4 (earlier created with `as.rsi`)
+ geom_rsi(x = "genus") +
+ # split plots on antibiotic
+ facet_rsi(facet = "Antibiotic") +
+ # make R red, I yellow and S green
+ scale_rsi_colours() +
+ # show percentages on y axis
+ scale_y_percent(breaks = 0:4 * 25) +
+ # turn 90 degrees, make it bars instead of columns
+ coord_flip() +
+ # add labels
+ labs(title = "Resistance per genus and antibiotic",
+ subtitle = "(this is fake data)") +
+ # and print genus in italic to follow our convention
+ # (is now y axis because we turned the plot)
+ theme(axis.text.y = element_text(face = "italic"))
To simplify this, we also created the ggplot_rsi()
function, which combines almost all above functions:
We can transform the data and apply the test in only a couple of lines:
-septic_patients %>%
- filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
- select(hospital_id, fosf) %>% # select the hospitals and fosfomycin
- group_by(hospital_id) %>% # group on the hospitals
- count_df(combine_IR = TRUE) %>% # count all isolates per group (hospital_id)
- tidyr::spread(hospital_id, Value) %>% # transform output so A and D are columns
- select(A, D) %>% # and select these only
- as.matrix() %>% # transform to good old matrix for fisher.test()
- fisher.test() # do Fisher's Exact Test
-#
-# Fisher's Exact Test for Count Data
-#
-# data: .
-# p-value = 0.03104
-# alternative hypothesis: true odds ratio is not equal to 1
-# 95 percent confidence interval:
-# 1.054283 4.735995
-# sample estimates:
-# odds ratio
-# 2.228006
septic_patients %>%
+ filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
+ select(hospital_id, fosf) %>% # select the hospitals and fosfomycin
+ group_by(hospital_id) %>% # group on the hospitals
+ count_df(combine_IR = TRUE) %>% # count all isolates per group (hospital_id)
+ tidyr::spread(hospital_id, Value) %>% # transform output so A and D are columns
+ select(A, D) %>% # and select these only
+ as.matrix() %>% # transform to good old matrix for fisher.test()
+ fisher.test() # do Fisher's Exact Test
+#>
+#> Fisher's Exact Test for Count Data
+#>
+#> data: .
+#> p-value = 0.03104
+#> alternative hypothesis: true odds ratio is not equal to 1
+#> 95 percent confidence interval:
+#> 1.054283 4.735995
+#> sample estimates:
+#> odds ratio
+#> 2.228006
As can be seen, the p value is 0.03, which means that the fosfomycin resistances found in hospital A and D are really different.
EUCAST.Rmd
atc_property.Rmd
(will be available soon)
+mo_property.Rmd
resistance_predict.Rmd
As with many uses in R, we need some additional packages for AMR analysis. Our package works closely together with the tidyverse packages dplyr
and ggplot2
by Dr Hadley Wickham. The tidyverse tremendously improves the way we conduct data science - it allows for a very natural way of writing syntaxes and creating beautiful plots in R.
Our AMR
package depends on these packages and even extends their use and functions.
Our package contains a function resistance_predict()
, which takes the same input as functions for other AMR analysis. Based on a date column, it calculates cases per year and uses a regression model to predict antimicrobial resistance.
It is basically as easy as:
+# resistance prediction of piperacillin/tazobactam (pita):
+resistance_predict(tbl = septic_patients, col_date = "date", col_ab = "pita")
+
+# or:
+septic_patients %>%
+ resistance_predict(col_ab = "pita")
+
+# to bind it to object 'predict_pita' for example:
+predict_pita <- septic_patients %>%
+ resistance_predict(col_ab = "pita")
The function will look for a date column itself if col_date
is not set.
When running any of these commands, a summary of the regression model will be printed unless using resistance_predict(..., info = FALSE)
.
#> NOTE: Using column `date` as input for `col_date`.
+#>
+#> Logistic regression model (logit) with binomial distribution
+#> ------------------------------------------------------------
+#>
+#> Call:
+#> glm(formula = df_matrix ~ year, family = binomial)
+#>
+#> Deviance Residuals:
+#> Min 1Q Median 3Q Max
+#> -2.9224 -1.3120 0.0170 0.7586 3.1932
+#>
+#> Coefficients:
+#> Estimate Std. Error z value Pr(>|z|)
+#> (Intercept) -222.92857 45.93922 -4.853 1.22e-06 ***
+#> year 0.10994 0.02284 4.814 1.48e-06 ***
+#> ---
+#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#>
+#> (Dispersion parameter for binomial family taken to be 1)
+#>
+#> Null deviance: 59.794 on 14 degrees of freedom
+#> Residual deviance: 35.191 on 13 degrees of freedom
+#> AIC: 93.464
+#>
+#> Number of Fisher Scoring iterations: 4
+This text is only a printed summary - the actual result (output) of the function is a data.frame
containing for each year: the number of observations, the actual observed resistance, the estimated resistance and the standard error below and above the estimation:
predict_pita
+#> year value se_min se_max observations observed estimated
+#> 1 2003 0.06250000 NA NA 32 0.06250000 0.06177594
+#> 2 2004 0.08536585 NA NA 82 0.08536585 0.06846343
+#> 3 2005 0.10000000 NA NA 60 0.10000000 0.07581637
+#> 4 2006 0.05084746 NA NA 59 0.05084746 0.08388789
+#> 5 2007 0.12121212 NA NA 66 0.12121212 0.09273250
+#> 6 2008 0.04166667 NA NA 72 0.04166667 0.10240539
+#> 7 2009 0.01639344 NA NA 61 0.01639344 0.11296163
+#> 8 2010 0.09433962 NA NA 53 0.09433962 0.12445516
+#> 9 2011 0.18279570 NA NA 93 0.18279570 0.13693759
+#> 10 2012 0.30769231 NA NA 65 0.30769231 0.15045682
+#> 11 2013 0.08620690 NA NA 58 0.08620690 0.16505550
+#> 12 2014 0.15254237 NA NA 59 0.15254237 0.18076926
+#> 13 2015 0.27272727 NA NA 55 0.27272727 0.19762493
+#> 14 2016 0.25000000 NA NA 84 0.25000000 0.21563859
+#> 15 2017 0.16279070 NA NA 86 0.16279070 0.23481370
+#> 16 2018 0.25513926 0.2228376 0.2874409 NA NA 0.25513926
+#> 17 2019 0.27658825 0.2386811 0.3144954 NA NA 0.27658825
+#> 18 2020 0.29911630 0.2551715 0.3430611 NA NA 0.29911630
+#> 19 2021 0.32266085 0.2723340 0.3729877 NA NA 0.32266085
+#> 20 2022 0.34714076 0.2901847 0.4040968 NA NA 0.34714076
+#> 21 2023 0.37245666 0.3087318 0.4361815 NA NA 0.37245666
+#> 22 2024 0.39849187 0.3279750 0.4690088 NA NA 0.39849187
+#> 23 2025 0.42511415 0.3479042 0.5023241 NA NA 0.42511415
+#> 24 2026 0.45217796 0.3684992 0.5358568 NA NA 0.45217796
+#> 25 2027 0.47952757 0.3897276 0.5693275 NA NA 0.47952757
+#> 26 2028 0.50700045 0.4115444 0.6024565 NA NA 0.50700045
+#> 27 2029 0.53443111 0.4338908 0.6349714 NA NA 0.53443111
The function plot
is available in base R, and can be extended by other packages to depend the output based on the type of input. We extended its function to cope with resistance predictions:
This is the fastest way to plot the result. It automatically adds the right axes, error bars, titles, number of available observations and type of model.
+We also support the ggplot2
package with our custom function ggplot_rsi_predict()
to create more appealing plots:
Resistance is not easily predicted; if we look at vancomycin resistance in Gram positives, the spread (i.e. standard error) is enormous:
+septic_patients %>%
+ filter(mo_gramstain(mo) == "Gram positive") %>%
+ resistance_predict(col_ab = "vanc", year_min = 2010, info = FALSE) %>%
+ ggplot_rsi_predict()
+#> NOTE: Using column `date` as input for `col_date`.
Vancomycin resistance could be 100% in ten years, but might also stay around 0%.
+You can define the model with the model
parameter. The default model is a generalised linear regression model using a binomial distribution, assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance.
Valid values are:
+Input values | +Function used by R | +Type of model | +
---|---|---|
+"binomial" or "binom" or "logit"
+ |
+glm(..., family = binomial) |
+Generalised linear model with binomial distribution | +
+"loglin" or "poisson"
+ |
+glm(..., family = poisson) |
+Generalised linear model with poisson distribution | +
+"lin" or "linear"
+ |
+lm() |
+Linear model | +
For the vancomycin resistance in Gram positive bacteria, a linear model might be more appropriate since no (left half of a) binomial distribution is to be expected based on the observed years:
+septic_patients %>%
+ filter(mo_gramstain(mo) == "Gram positive") %>%
+ resistance_predict(col_ab = "vanc", year_min = 2010, info = FALSE, model = "linear") %>%
+ ggplot_rsi_predict()
+#> NOTE: Using column `date` as input for `col_date`.
This seems more likely, doesn’t it?
+The model itself is also available from the object, as an attribute
:
model <- attributes(predict_pita)$model
+
+summary(model)$family
+#>
+#> Family: binomial
+#> Link function: logit
+
+summary(model)$coefficients
+#> Estimate Std. Error z value Pr(>|z|)
+#> (Intercept) -222.9285736 45.93922388 -4.852685 1.218012e-06
+#> year 0.1099391 0.02283501 4.814500 1.475690e-06
Learn R reading this great book!
' + - 'Or read it free online: r4ds.co.nz.
' + - 'This package can be used for:
as.atc()
n_rsi
to count cases where antibiotic test results were available, to be used in conjunction with dplyr::summarise
, see ?rsin_rsi
to count cases where antibiotic test results were available, to be used in conjunction with dplyr::summarise
, see ?rsiguess_bactid
to determine the ID of a microorganism based on genus/species or known abbreviations like MRSAguess_atc
to determine the ATC of an antibiotic based on name, trade name, or known abbreviationsfreq
to create frequency tables, with additional info in a headermodel | -the statistical model of choice. Valid values are |
+ the statistical model of choice. Defaults to a generalised linear regression model with binomial distribution, assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options. |
---|---|---|
I_as_R | @@ -312,6 +312,10 @@... | parameters passed on to the |
ribbon | +a logical to indicate whether a ribbon should be shown (default) or error bars |
+
observations
, the total number of available observations in that year, i.e. S + I + R
observed
, the original observed resistant percentages
estimated
, the estimated resistant percentages, calculated by the model
Furthermore, the model itself is available as an attribute: attributes(x)$model
, see Examples.
Valid options for the statistical model are:
"binomial"
or "binom"
or "logit"
: a generalised linear regression model with binomial distribution
"loglin"
or "poisson"
: a generalised log-linear regression model with poisson distribution
"lin"
or "linear"
: a linear regression model
Learn R reading this great book!
' + - 'Or read it free online: r4ds.co.nz.
' + - '