diff --git a/404.html b/404.html index 8dc7e520..9cfc0b9f 100644 --- a/404.html +++ b/404.html @@ -36,7 +36,7 @@ AMR (for R) - 1.8.2.9146 + 1.8.2.9147 @@ -59,11 +59,6 @@ How to - - - - User- Or Team-specific Package Settings - @@ -82,7 +77,12 @@ - Data Sets for Download / Own Use + Download Data Sets for Own Use + + + + + Set User- Or Team-specific Package Settings diff --git a/LICENSE-text.html b/LICENSE-text.html index e45d9d08..bb4437a6 100644 --- a/LICENSE-text.html +++ b/LICENSE-text.html @@ -10,7 +10,7 @@ AMR (for R) - 1.8.2.9146 + 1.8.2.9147 @@ -32,11 +32,6 @@ How to - - - - User- Or Team-specific Package Settings - @@ -55,7 +50,12 @@ - Data Sets for Download / Own Use + Download Data Sets for Own Use + + + + + Set User- Or Team-specific Package Settings diff --git a/articles/AMR.html b/articles/AMR.html index 685b4cb1..9d35fa9f 100644 --- a/articles/AMR.html +++ b/articles/AMR.html @@ -38,7 +38,7 @@ AMR (for R) - 1.8.2.9146 + 1.8.2.9147 @@ -61,11 +61,6 @@ How to - - - - User- Or Team-specific Package Settings - @@ -84,7 +79,12 @@ - Data Sets for Download / Own Use + Download Data Sets for Own Use + + + + + Set User- Or Team-specific Package Settings @@ -187,7 +187,7 @@ website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was -generated on 24 February 2023. +generated on 26 February 2023. Introduction @@ -195,7 +195,7 @@ generated on 24 February 2023. knowledge from different scientific fields, which makes it hard to do right. At least, it requires: -Good questions (always start with those!) +Good questions (always start with those!) and reliable data A thorough understanding of (clinical) epidemiology, to understand the clinical and epidemiological relevance and possible bias of results @@ -243,21 +243,21 @@ make the structure of your data generally look like this: -2023-02-24 +2023-02-26 abcd Escherichia coli S S -2023-02-24 +2023-02-26 abcd Escherichia coli S R -2023-02-24 +2023-02-26 efgh Escherichia coli R @@ -279,309 +279,185 @@ for cleaning data and creating frequency tables. library(dplyr) library(ggplot2) library(AMR) -library(cleaner) # (if not yet installed, install with:) -# install.packages(c("dplyr", "ggplot2", "AMR", "cleaner")) - - - -Creation of data - -We will create some fake example data to use for analysis. For AMR -data analysis, we need at least: a patient ID, name or code of a -microorganism, a date and antimicrobial results (an antibiogram). It -could also include a specimen type (e.g. to filter on blood or urine), -the ward type (e.g. to filter on ICUs). -With additional columns (like a hospital name, the patients gender of -even [well-defined] clinical properties) you can do a comparative -analysis, as this tutorial will demonstrate too. - -Patients - -To start with patients, we need a unique list of patients. +# install.packages(c("dplyr", "ggplot2", "AMR")) +The AMR package contains a data set +example_isolates_unclean, which might look data that users +have extracted from their laboratory systems: -patients <- unlist(lapply(LETTERS, paste0, 1:10)) -The LETTERS object is available in R - it’s a vector -with 26 characters: A to Z. The -patients object we just created is now a vector of length -260, with values (patient IDs) varying from A1 to -Z10. Now we we also set the gender of our patients, by -putting the ID and the gender in a table: - -patients_table <- data.frame( - patient_id = patients, - gender = c( - rep("M", 135), - rep("F", 125) - ) -) -The first 135 patient IDs are now male, the other 125 are female. - - -Dates - -Let’s pretend that our data consists of blood cultures isolates from -between 1 January 2010 and 1 January 2018. - -dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day") -This dates object now contains all days in our date -range. - -Microorganisms - -For this tutorial, we will uses four different microorganisms: -Escherichia coli, Staphylococcus aureus, -Streptococcus pneumoniae, and Klebsiella -pneumoniae: - -bacteria <- c( - "Escherichia coli", "Staphylococcus aureus", - "Streptococcus pneumoniae", "Klebsiella pneumoniae" -) - - - -Put everything together - -Using the sample() function, we can randomly select -items from all objects we defined earlier. To let our fake data reflect -reality a bit, we will also approximately define the probabilities of -bacteria and the antibiotic results, using the random_sir() -function. - -sample_size <- 20000 -data <- data.frame( - date = sample(dates, size = sample_size, replace = TRUE), - patient_id = sample(patients, size = sample_size, replace = TRUE), - hospital = sample( - c( - "Hospital A", - "Hospital B", - "Hospital C", - "Hospital D" - ), - size = sample_size, replace = TRUE, - prob = c(0.30, 0.35, 0.15, 0.20) - ), - bacteria = sample(bacteria, - size = sample_size, replace = TRUE, - prob = c(0.50, 0.25, 0.15, 0.10) - ), - AMX = random_sir(sample_size, prob_sir = c(0.35, 0.60, 0.05)), - AMC = random_sir(sample_size, prob_sir = c(0.15, 0.75, 0.10)), - CIP = random_sir(sample_size, prob_sir = c(0.20, 0.80, 0.00)), - GEN = random_sir(sample_size, prob_sir = c(0.08, 0.92, 0.00)) -) -Using the left_join() function from the -dplyr package, we can ‘map’ the gender to the patient ID -using the patients_table object we created earlier: - -data <- data %>% left_join(patients_table) -The resulting data set contains 20 000 blood culture isolates. With -the head() function we can preview the first 6 rows of this -data set: - -head(data) - - - - - - - - - - - - - -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender - - - -2015-09-16 -Z5 -Hospital A -Escherichia coli -S -S -I -R -F - - -2017-09-08 -V1 -Hospital B -Streptococcus pneumoniae -S -S -R -S -F - - -2016-08-05 -B1 -Hospital B -Streptococcus pneumoniae -I -R -R -R -M - - -2010-08-24 -E10 -Hospital C -Escherichia coli -S -S -I -S -M - - -2011-06-19 -R10 -Hospital D -Staphylococcus aureus -R -S -S -I -F - - -2013-07-14 -L4 -Hospital D -Staphylococcus aureus -I -S -I -R -M - - - -Now, let’s start the cleaning and the analysis! - - - -Cleaning the data - -We also created a package dedicated to data cleaning and checking, -called the cleaner package. It freq() function -can be used to create frequency tables. -For example, for the gender variable: - -data %>% freq(gender) -Frequency table -Class: character -Length: 20,000 -Available: 20,000 (100%, NA: 0 = 0%) -Unique: 2 -Shortest: 1 -Longest: 1 - - - -Item -Count -Percent -Cum. Count -Cum. Percent - - - -1 -M -10,355 -51.78% -10,355 -51.78% - - -2 -F -9,645 -48.23% -20,000 -100.00% - - - -So, we can draw at least two conclusions immediately. From a data -scientists perspective, the data looks clean: only values M -and F. From a researchers perspective: there are slightly -more men. Nothing we didn’t already know. -The data is already quite clean, but we still need to transform some -variables. The bacteria column now consists of text, and we -want to add more variables based on microbial IDs later on. So, we will -transform this column to valid IDs. The mutate() function -of the dplyr package makes this really easy: - -data <- data %>% - mutate(bacteria = as.mo(bacteria)) -We also want to transform the antibiotics, because in real life data -we don’t know if they are really clean. The as.sir() -function ensures reliability and reproducibility in these kind of -variables. The is_sir_eligible() can check which columns -are probably columns with SIR test results. Using mutate() -and across(), we can apply the transformation to the formal -<rsi> class: - -is_sir_eligible(data) -#> [1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE -colnames(data)[is_sir_eligible(data)] -#> [1] "AMX" "AMC" "CIP" "GEN" +example_isolates_unclean +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <chr> <chr> <chr> <chr> <chr> +#> 1 J3 A 2012-11-21 E. coli R I S S +#> 2 R7 A 2018-04-03 K. pneumoniae R I S S +#> 3 P3 A 2014-09-19 E. coli R S S S +#> 4 P10 A 2015-12-10 E. coli S I S S +#> 5 B7 A 2015-03-02 E. coli S S S S +#> 6 W3 A 2018-03-31 S. aureus R S R S +#> 7 J8 A 2016-06-14 E. coli R S S S +#> 8 M3 A 2015-10-25 E. coli R S S S +#> 9 J3 A 2019-06-19 E. coli S S S S +#> 10 G6 A 2015-04-27 S. aureus S S S S +#> # … with 2,990 more rows -data <- data %>% - mutate(across(where(is_sir_eligible), as.sir)) -Finally, we will apply EUCAST -rules on our antimicrobial results. In Europe, most medical -microbiological laboratories already apply these rules. Our package -features their latest insights on intrinsic resistance and exceptional -phenotypes. Moreover, the eucast_rules() function can also -apply additional rules, like forcing -ampicillin = R when -amoxicillin/clavulanic acid = R. -Because the amoxicillin (column AMX) and -amoxicillin/clavulanic acid (column AMC) in our data were -generated randomly, some rows will undoubtedly contain AMX = S and AMC = -R, which is technically impossible. The eucast_rules() -fixes this: - -data <- eucast_rules(data, col_mo = "bacteria", rules = "all") +# we will use 'our_data' as the data set name for this tutorial +our_data <- example_isolates_unclean +For AMR data analysis, we would like the microorganism column to +contain valid, up-to-date taxonomy, and the antibiotic columns to be +cleaned as SIR values as well. + + +Taxonomy of microorganisms + +With as.mo(), users can transform arbitrary +microorganism names or codes to current taxonomy. The AMR +package contains up-to-date taxonomic data. To be specific, currently +included data were retrieved on 11 Dec 2022. +The codes of the AMR packages that come from as.mo() are +short, but still human readable. More importantly, as.mo() +supports all kinds of input: + +as.mo("Klebsiella pneumoniae") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("K. pneumoniae") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("KLEPNE") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("KLPN") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +The first character in above codes denote their taxonomic kingdom, +such as Bacteria (B), Fungi (F), and Protozoa (P). +The AMR package also contain functions to directly +retrieve taxonomic properties, such as the name, genus, species, family, +order, and even Gram-stain. They all start with mo_ and +they use as.mo() internally, so that still any arbitrary +user input can be used: + +mo_family("K. pneumoniae") +#> [1] "Enterobacteriaceae" +mo_genus("K. pneumoniae") +#> [1] "Klebsiella" +mo_species("K. pneumoniae") +#> [1] "pneumoniae" + +mo_gramstain("Klebsiella pneumoniae") +#> [1] "Gram-negative" + +mo_ref("K. pneumoniae") +#> [1] "Trevisan, 1887" + +mo_snomed("K. pneumoniae") +#> [[1]] +#> [1] "1098101000112102" "446870005" "1098201000112108" "409801009" +#> [5] "56415008" "714315002" "713926009" +Now we can thus clean our data: + +our_data$bacteria <- as.mo(our_data$bacteria, info = TRUE) +#> ℹ Microorganism translation was uncertain for four microorganisms. Run +#> mo_uncertainties() to review these uncertainties, or use +#> add_custom_microorganisms() to add custom entries. +Apparently, there was some uncertainty about the translation to +taxonomic codes. Let’s check this: + +mo_uncertainties() +#> Matching scores are based on the resemblance between the input and the full +#> taxonomic name, and the pathogenicity in humans. See ?mo_matching_score. +#> +#> -------------------------------------------------------------------------------- +#> "E. coli" -> Escherichia coli (B_ESCHR_COLI, 0.688) +#> Based on input "E coli" +#> Also matched: Enterobacter cowanii (0.600), Eubacterium combesii +#> (0.600), Eggerthia catenaformis (0.591), Eubacterium callanderi +#> (0.591), Enterocloster citroniae (0.587), Eubacterium cylindroides +#> (0.583), Enterococcus casseliflavus (0.577), Enterobacter cloacae +#> cloacae (0.571), Ehrlichia canis (0.567), and Enterobacter cloacae +#> dissolvens (0.565) +#> -------------------------------------------------------------------------------- +#> "K. pneumoniae" -> Klebsiella pneumoniae (B_KLBSL_PNMN, 0.786) +#> Based on input "K pneumoniae" +#> Also matched: Klebsiella pneumoniae ozaenae (0.707), Klebsiella +#> pneumoniae pneumoniae (0.688), Klebsiella pneumoniae rhinoscleromatis +#> (0.658), Klebsiella pasteurii (0.500), Klebsiella planticola (0.500), +#> Kingella potus (0.400), Kosakonia pseudosacchari (0.361), Kaistella +#> palustris (0.333), Kocuria palustris (0.333), and Kocuria pelophila +#> (0.333) +#> -------------------------------------------------------------------------------- +#> "S. aureus" -> Staphylococcus aureus (B_STPHY_AURS, 0.690) +#> Based on input "S aureus" +#> Also matched: Staphylococcus aureus aureus (0.643), Staphylococcus +#> argenteus (0.625), Staphylococcus aureus anaerobius (0.625), +#> Streptomyces argenteolus (0.483), Streptomyces aureus (0.474), +#> Streptomyces azureus (0.467), Streptomyces aureorectus (0.444), +#> Streptomyces auratus (0.433), Streptomyces aurantiogriseus (0.429), and +#> Streptomyces aureocirculatus (0.429) +#> -------------------------------------------------------------------------------- +#> "S. pneumoniae" -> Streptococcus pneumoniae (B_STRPT_PNMN, 0.750) +#> Based on input "S pneumoniae" +#> Also matched: Streptococcus pseudopneumoniae (0.700), Serratia +#> proteamaculans quinovora (0.545), Streptococcus pseudoporcinus (0.536), +#> Staphylococcus pseudintermedius (0.532), Serratia proteamaculans +#> proteamaculans (0.526), Salmonella Portanigra (0.524), Sphingomonas +#> paucimobilis (0.520), Streptococcus pluranimalium (0.519), +#> Streptococcus constellatus pharyngis (0.514), and Salmonella Pakistan +#> (0.500) +#> +#> Only the first 10 other matches of each record are shown. Run +#> print(mo_uncertainties(), n = ...) to view more entries, or save +#> mo_uncertainties() to an object. +That’s all good. + + +Antibiotic results + +The column with antibiotic test results must also be cleaned. The +AMR package comes with three new data types to work with +such test results: mic for minimal inhibitory +concentrations (MIC), disk for disk diffusion diameters, +and sir for SIR data that have been interpreted already. +This package can also determine SIR values based on MIC or disk +diffusion values, read more about that on the as.sir() +page. +For now, we will just clean the SIR columns in our data using +dplyr: + +# method 1, be explicit about the columns: +our_data <- our_data %>% + mutate_at(vars(AMX:GEN), as.sir) + +# method 2, let the AMR package determine the eligible columns +our_data <- our_data %>% + mutate_if(is_sir_eligible, as.sir) + +# result: +our_data +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S +#> 3 P3 A 2014-09-19 B_ESCHR_COLI R S S S +#> 4 P10 A 2015-12-10 B_ESCHR_COLI S I S S +#> 5 B7 A 2015-03-02 B_ESCHR_COLI S S S S +#> 6 W3 A 2018-03-31 B_STPHY_AURS R S R S +#> 7 J8 A 2016-06-14 B_ESCHR_COLI R S S S +#> 8 M3 A 2015-10-25 B_ESCHR_COLI R S S S +#> 9 J3 A 2019-06-19 B_ESCHR_COLI S S S S +#> 10 G6 A 2015-04-27 B_STPHY_AURS S S S S +#> # … with 2,990 more rows +This is basically it for the cleaning, time to start the data +inclusion. - -Adding new variables - -Now that we have the microbial ID, we can add some taxonomic -properties: - -data <- data %>% - mutate( - gramstain = mo_gramstain(bacteria), - genus = mo_genus(bacteria), - species = mo_species(bacteria) - ) First isolates -We also need to know which isolates we can actually use for -analysis. +We need to know which isolates we can actually use for +analysis without repetition bias. To conduct an analysis of antimicrobial resistance, you must only include the first isolate of every patient per episode (Hindler et al., Clin Infect Dis. 2007). If you would not do this, you could easily get an @@ -614,13 +490,11 @@ different methods as defined by first_isolate() page. +method to properly correct for most duplicate isolates. Read more about +the methods on the first_isolate() page. The outcome of the function can easily be added to our data: - -data <- data %>% + +our_data <- our_data %>% mutate(first = first_isolate(info = TRUE)) #> Determining first isolates using an episode length of 365 days #> ℹ Using column 'bacteria' as input for col_mo. @@ -629,496 +503,698 @@ takes into account the antimicrobial susceptibility test results using #> Basing inclusion on all antimicrobial results, using a points threshold of #> 2 #> Including isolates from ICU. -#> => Found 12,372 'phenotype-based' first isolates (61.9% of total where a -#> microbial ID was available) -So only 61.9% is suitable for resistance analysis! We can now filter -on it with the filter() function, also from the +#> => Found 2,626 'phenotype-based' first isolates (87.6% within scope and +#> 87.5% of total where a microbial ID was available) +So only 88% is suitable for resistance analysis! We can now filter on +it with the filter() function, also from the dplyr package: - -data_1st <- data %>% + +our_data_1st <- our_data %>% filter(first == TRUE) For future use, the above two syntaxes can be shortened: - -data_1st <- data %>% + +our_data_1st <- our_data %>% filter_first_isolate() -So we end up with 12 372 isolates for analysis. Now our data looks +So we end up with 2 626 isolates for analysis. Now our data looks like: - -head(data_1st) - - - - - - - - - - - - - - - - - - - -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender -gramstain -genus -species -first - - - -1 -2015-09-16 -Z5 -Hospital A -B_ESCHR_COLI -S -S -I -R -F -Gram-negative -Escherichia -coli -TRUE - - -2 -2017-09-08 -V1 -Hospital B -B_STRPT_PNMN -S -S -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - -3 -2016-08-05 -B1 -Hospital B -B_STRPT_PNMN -R -R -R -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -4 -2010-08-24 -E10 -Hospital C -B_ESCHR_COLI -S -S -I -S -M -Gram-negative -Escherichia -coli -TRUE - - -7 -2011-06-03 -D1 -Hospital A -B_STRPT_PNMN -R -R -S -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -8 -2016-04-15 -T10 -Hospital B -B_STRPT_PNMN -R -R -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - - -Time for the analysis! + +our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows +Time for the analysis. Analysing the data -You might want to start by getting an idea of how the data is -distributed. It’s an important start, because it also decides how you -will continue your analysis. Although this package contains a convenient -function to make frequency tables, exploratory data analysis (EDA) is -not the primary scope of this package. Use a package like DataExplorer -for that, or read the free online book Exploratory Data Analysis -with R by Roger D. Peng. +The base R summary() function gives a good first +impression, as it comes with support for the new mo and +sir classes that we now have in our data set: + +summary(our_data_1st) +#> patient_id hospital date +#> Length:2626 Length:2626 Min. :2011-01-01 +#> Class :character Class :character 1st Qu.:2013-04-14 +#> Mode :character Mode :character Median :2015-06-05 +#> Mean :2015-06-15 +#> 3rd Qu.:2017-08-23 +#> Max. :2020-01-01 +#> bacteria AMX AMC +#> Class :mo Class:sir Class:sir +#> <NA> :0 %R :43.2% (n=1134) %R :36.1% (n=947) +#> Unique:4 %SI :56.8% (n=1492) %SI :63.9% (n=1679) +#> #1 :B_ESCHR_COLI - %S :41.1% (n=1080) - %S :52.7% (n=1383) +#> #2 :B_STPHY_AURS - %I :15.7% (n=412) - %I :11.3% (n=296) +#> #3 :B_STRPT_PNMN +#> CIP GEN first +#> Class:sir Class:sir Mode:logical +#> %R :42.0% (n=1102) %R :37.0% (n=971) TRUE:2626 +#> %SI :58.0% (n=1524) %SI :63.0% (n=1655) +#> - %S :51.9% (n=1362) - %S :59.9% (n=1574) +#> - %I : 6.2% (n=162) - %I : 3.1% (n=81) +#> + +glimpse(our_data_1st) +#> Rows: 2,626 +#> Columns: 9 +#> $ patient_id <chr> "J3", "R7", "P10", "B7", "W3", "J8", "M3", "J3", "G6", "P4"… +#> $ hospital <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",… +#> $ date <date> 2012-11-21, 2018-04-03, 2015-12-10, 2015-03-02, 2018-03-31… +#> $ bacteria <mo> "B_ESCHR_COLI", "B_KLBSL_PNMN", "B_ESCHR_COLI", "B_ESCHR_COL… +#> $ AMX <sir> R, R, S, S, R, R, R, S, S, S, S, R, S, S, R, R, R, R, I, S,… +#> $ AMC <sir> I, I, I, S, S, S, S, S, S, S, S, S, S, S, S, S, S, R, S, R,… +#> $ CIP <sir> S, S, S, S, R, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,… +#> $ GEN <sir> S, S, S, S, S, S, S, S, S, S, S, R, S, S, S, S, S, S, S, S,… +#> $ first <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,… + +# number of unique values per column: +sapply(our_data_1st, n_distinct) +#> patient_id hospital date bacteria AMX AMC CIP +#> 260 3 1808 4 3 3 3 +#> GEN first +#> 3 1 -Dispersion of species +Availability of species To just get an idea how the species are distributed, create a -frequency table with our freq() function. We created the -genus and species column earlier based on the -microbial ID. With paste(), we can concatenate them -together. -The freq() function can be used like the base R language -was intended: - -freq(paste(data_1st$genus, data_1st$species)) -Or can be used like the dplyr way, which is easier -readable: - -data_1st %>% freq(genus, species) -Frequency table -Class: character -Length: 12,372 -Available: 12,372 (100%, NA: 0 = 0%) -Unique: 4 -Shortest: 16 -Longest: 24 - - - - - - - - - - - -Item -Count -Percent -Cum. Count -Cum. Percent - - - -1 -Escherichia coli -5,982 -48.35% -5,982 -48.35% - - -2 -Staphylococcus aureus -3,218 -26.01% -9,200 -74.36% - - -3 -Streptococcus pneumoniae -1,867 -15.09% -11,067 -89.45% - - -4 -Klebsiella pneumoniae -1,305 -10.55% -12,372 -100.00% - - - +frequency table with count() based on the name of the +microorganisms: + +our_data %>% + count(mo_name(bacteria), sort = TRUE) +#> # A tibble: 4 × 2 +#> `mo_name(bacteria)` n +#> <chr> <int> +#> 1 Escherichia coli 1518 +#> 2 Staphylococcus aureus 730 +#> 3 Streptococcus pneumoniae 426 +#> 4 Klebsiella pneumoniae 326 + +our_data_1st %>% + count(mo_name(bacteria), sort = TRUE) +#> # A tibble: 4 × 2 +#> `mo_name(bacteria)` n +#> <chr> <int> +#> 1 Escherichia coli 1250 +#> 2 Staphylococcus aureus 661 +#> 3 Streptococcus pneumoniae 399 +#> 4 Klebsiella pneumoniae 316 -Overview of different bug/drug combinations +Select and filter with antibiotic selectors -Using tidyverse -selections, you can also select or filter columns based on the -antibiotic class they are in: - -data_1st %>% - filter(any(aminoglycosides() == "R")) -#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +Using so-called antibiotic class selectors, you can select or filter +columns based on the antibiotic class that your antibiotic results are +in: + +our_data_1st %>% + select(date, aminoglycosides()) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +#> # A tibble: 2,626 × 2 +#> date GEN +#> <date> <sir> +#> 1 2012-11-21 S +#> 2 2018-04-03 S +#> 3 2015-12-10 S +#> 4 2015-03-02 S +#> 5 2018-03-31 S +#> 6 2016-06-14 S +#> 7 2015-10-25 S +#> 8 2019-06-19 S +#> 9 2015-04-27 S +#> 10 2011-06-21 S +#> # … with 2,616 more rows + +our_data_1st %>% + select(bacteria, betalactams()) +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 2,626 × 3 +#> bacteria AMX AMC +#> <mo> <sir> <sir> +#> 1 B_ESCHR_COLI R I +#> 2 B_KLBSL_PNMN R I +#> 3 B_ESCHR_COLI S I +#> 4 B_ESCHR_COLI S S +#> 5 B_STPHY_AURS R S +#> 6 B_ESCHR_COLI R S +#> 7 B_ESCHR_COLI R S +#> 8 B_ESCHR_COLI S S +#> 9 B_STPHY_AURS S S +#> 10 B_ESCHR_COLI S S +#> # … with 2,616 more rows + +our_data_1st %>% + select(bacteria, where(is.sir)) +#> # A tibble: 2,626 × 5 +#> bacteria AMX AMC CIP GEN +#> <mo> <sir> <sir> <sir> <sir> +#> 1 B_ESCHR_COLI R I S S +#> 2 B_KLBSL_PNMN R I S S +#> 3 B_ESCHR_COLI S I S S +#> 4 B_ESCHR_COLI S S S S +#> 5 B_STPHY_AURS R S R S +#> 6 B_ESCHR_COLI R S S S +#> 7 B_ESCHR_COLI R S S S +#> 8 B_ESCHR_COLI S S S S +#> 9 B_STPHY_AURS S S S S +#> 10 B_ESCHR_COLI S S S S +#> # … with 2,616 more rows + +# filtering using AB selectors is also possible: +our_data_1st %>% + filter(any(aminoglycosides() == "R")) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +#> # A tibble: 971 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J5 A 2017-12-25 B_STRPT_PNMN R S S R TRUE +#> 2 X1 A 2017-07-04 B_STPHY_AURS R S S R TRUE +#> 3 B3 A 2016-07-24 B_ESCHR_COLI S S S R TRUE +#> 4 V7 A 2012-04-03 B_ESCHR_COLI S S S R TRUE +#> 5 C9 A 2017-03-23 B_ESCHR_COLI S S S R TRUE +#> 6 R1 A 2018-06-10 B_STPHY_AURS S S S R TRUE +#> 7 S2 A 2013-07-19 B_STRPT_PNMN S S S R TRUE +#> 8 P5 A 2019-03-09 B_STPHY_AURS S S S R TRUE +#> 9 Q8 A 2019-08-10 B_STPHY_AURS S S S R TRUE +#> 10 K5 A 2013-03-15 B_STRPT_PNMN S S S R TRUE +#> # … with 961 more rows + +our_data_1st %>% + filter(all(betalactams() == "R")) +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 471 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 M7 A 2013-07-22 B_STRPT_PNMN R R S S TRUE +#> 2 R10 A 2013-12-20 B_STPHY_AURS R R S S TRUE +#> 3 R7 A 2015-10-25 B_STPHY_AURS R R S S TRUE +#> 4 R8 A 2019-10-25 B_STPHY_AURS R R S S TRUE +#> 5 I7 A 2015-08-19 B_ESCHR_COLI R R S S TRUE +#> 6 N3 A 2014-12-29 B_STRPT_PNMN R R R S TRUE +#> 7 Q2 A 2019-09-22 B_ESCHR_COLI R R S S TRUE +#> 8 X7 A 2011-03-20 B_ESCHR_COLI R R S R TRUE +#> 9 C5 A 2015-08-30 B_KLBSL_PNMN R R S R TRUE +#> 10 W9 A 2013-10-02 B_ESCHR_COLI R R S S TRUE +#> # … with 461 more rows + +# even works in base R (since R 3.0): +our_data_1st[all(betalactams() == "R"), ] +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 471 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 M7 A 2013-07-22 B_STRPT_PNMN R R S S TRUE +#> 2 R10 A 2013-12-20 B_STPHY_AURS R R S S TRUE +#> 3 R7 A 2015-10-25 B_STPHY_AURS R R S S TRUE +#> 4 R8 A 2019-10-25 B_STPHY_AURS R R S S TRUE +#> 5 I7 A 2015-08-19 B_ESCHR_COLI R R S S TRUE +#> 6 N3 A 2014-12-29 B_STRPT_PNMN R R R S TRUE +#> 7 Q2 A 2019-09-22 B_ESCHR_COLI R R S S TRUE +#> 8 X7 A 2011-03-20 B_ESCHR_COLI R R S R TRUE +#> 9 C5 A 2015-08-30 B_KLBSL_PNMN R R S R TRUE +#> 10 W9 A 2013-10-02 B_ESCHR_COLI R R S S TRUE +#> # … with 461 more rows + + +Generate antibiograms + +This package comes with antibiogram(), a function that +automatically generates traditional, combined, syndromic, and even +weighted-incidence syndromic combination antibiograms (WISCA). For R +Markdown (such as this page) it automatically prints in the right table +format. +Below are some suggestions for how to generate the different +antibiograms: + +# traditional: +antibiogram(our_data_1st) + + +Pathogen (N min-max) +AMC +AMX +CIP +GEN + + + + +E. coli (1250-1250) +64 +58 +58 +63 + + + +K. pneumoniae (316-316) +63 +53 +59 +60 + + + +S. aureus (661-661) +64 +57 +57 +63 + + + +S. pneumoniae (399-399) +64 +56 +60 +66 + + + + +antibiogram(our_data_1st, + ab_transform = "name") - - - - - - - - - - - - - + + + + + -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender -gramstain -genus -species -first +Pathogen (N min-max) +Amoxicillin +Amoxicillin/clavulanic acid +Ciprofloxacin +Gentamicin -2015-09-16 -Z5 -Hospital A -B_ESCHR_COLI -S -S -I -R -F -Gram-negative -Escherichia -coli -TRUE + +E. coli (1250-1250) +58 +64 +58 +63 -2017-09-08 -V1 -Hospital B -B_STRPT_PNMN -S -S -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE + +K. pneumoniae (316-316) +53 +63 +59 +60 -2016-08-05 -B1 -Hospital B -B_STRPT_PNMN -R -R -R -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE + +S. aureus (661-661) +57 +64 +57 +63 -2011-06-03 -D1 -Hospital A -B_STRPT_PNMN -R -R -S -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -2016-04-15 -T10 -Hospital B -B_STRPT_PNMN -R -R -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - -2014-03-09 -X8 -Hospital A -B_ESCHR_COLI -R -R -R -R -F -Gram-negative -Escherichia -coli -TRUE + +S. pneumoniae (399-399) +56 +64 +60 +66 -If you want to get a quick glance of the number of isolates in -different bug/drug combinations, you can use the -bug_drug_combinations() function: + +antibiogram(our_data_1st, + ab_transform = "name", + language = "es") # support for 20 languages + + + + + + + + + +Patógeno (N min-max) +Amoxicilina +Amoxicilina/ácido clavulánico +Ciprofloxacina +Gentamicina + + + + +E. coli (1250-1250) +58 +64 +58 +63 + + + +K. pneumoniae (316-316) +53 +63 +59 +60 + + + +S. aureus (661-661) +57 +64 +57 +63 + + + +S. pneumoniae (399-399) +56 +64 +60 +66 + + + + +# combined: +antibiogram(our_data_1st, + antibiotics = c("AMC", "AMC+CIP", "AMC+GEN")) + + +Pathogen (N min-max) +AMC +AMC + CIP +AMC + GEN + + + + +E. coli (1250-1250) +64 +76 +75 + + + +K. pneumoniae (316-316) +63 +78 +74 + + + +S. aureus (661-661) +64 +77 +75 + + + +S. pneumoniae (399-399) +64 +77 +76 + + + + +# for a syndromic antibiogram, we must fake some clinical conditions: +our_data_1st$condition <- sample(c("Cardial", "Respiratory", "Rheumatic"), + size = nrow(our_data_1st), + replace = TRUE) + +# syndromic: +antibiogram(our_data_1st, + syndromic_group = "condition") + + +Syndromic Group +Pathogen (N min-max) +AMC +AMX +CIP +GEN + + + +Cardial + +E. coli (431-431) +63 +54 +57 +63 + + +Respiratory + +E. coli (396-396) +64 +61 +58 +63 + + +Rheumatic + +E. coli (423-423) +65 +60 +58 +63 + + +Cardial + +K. pneumoniae (105-105) +63 +54 +59 +61 + + +Respiratory + +K. pneumoniae (110-110) +60 +54 +55 +63 + + +Rheumatic + +K. pneumoniae (101-101) +66 +50 +62 +56 + + +Cardial + +S. aureus (225-225) +66 +57 +53 +61 + + +Respiratory + +S. aureus (217-217) +60 +55 +61 +64 + + +Rheumatic + +S. aureus (219-219) +67 +58 +58 +64 + + +Cardial + +S. pneumoniae (152-152) +67 +57 +64 +63 + + +Respiratory + +S. pneumoniae (122-122) +67 +57 +64 +67 + + +Rheumatic + +S. pneumoniae (125-125) +57 +53 +50 +67 + + + + +antibiogram(our_data_1st, + # you can use AB selectors here as well: + antibiotics = c(penicillins(), aminoglycosides()), + syndromic_group = "condition", + mo_transform = "gramstain") +#> ℹ For penicillins() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) + + +Syndromic Group +Pathogen (N min-max) +AMC +AMX +GEN + + + +Cardial +Gram-negative (536-536) +63 +54 +62 + + +Respiratory +Gram-negative (506-506) +63 +59 +63 + + +Rheumatic +Gram-negative (524-524) +65 +58 +62 + + +Cardial +Gram-positive (377-377) +67 +57 +62 + + +Respiratory +Gram-positive (339-339) +63 +56 +65 + + +Rheumatic +Gram-positive (344-344) +63 +56 +65 + + + + +# WISCA: +# (we lack some details, but it could contain a filter on e.g. >65 year-old males) +wisca <- antibiogram(our_data_1st, + antibiotics = c("AMC", "AMC+CIP", "AMC+GEN"), + syndromic_group = "condition", + mo_transform = "gramstain") +wisca + + +Syndromic Group +Pathogen (N min-max) +AMC +AMC + CIP +AMC + GEN + + + +Cardial +Gram-negative (536-536) +63 +75 +75 + + +Respiratory +Gram-negative (506-506) +63 +77 +75 + + +Rheumatic +Gram-negative (524-524) +65 +76 +75 + + +Cardial +Gram-positive (377-377) +67 +78 +75 + + +Respiratory +Gram-positive (339-339) +63 +77 +76 + + +Rheumatic +Gram-positive (344-344) +63 +77 +76 + + + +Antibiograms can be plotted using autoplot() from the +ggplot2 packages, since this package provides an extension +to that function: -data_1st %>% - bug_drug_combinations() %>% - head() # show first 6 rows - - -mo -ab -S -I -R -total - - - -E. coli -AMC -2826 -1184 -1972 -5982 - - -E. coli -AMX -1517 -1305 -3160 -5982 - - -E. coli -CIP -2041 -1879 -2062 -5982 - - -E. coli -GEN -2065 -1813 -2104 -5982 - - -K. pneumoniae -AMC -587 -263 -455 -1305 - - -K. pneumoniae -AMX -0 -0 -1305 -1305 - - - - -data_1st %>% - select(bacteria, aminoglycosides()) %>% - bug_drug_combinations() -#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) - - -mo -ab -S -I -R -total - - - -E. coli -GEN -2065 -1813 -2104 -5982 - - -K. pneumoniae -GEN -440 -426 -439 -1305 - - -S. aureus -GEN -1124 -1044 -1050 -3218 - - -S. pneumoniae -GEN -0 -0 -1867 -1867 - - - -This will only give you the crude numbers in the data. To calculate -antimicrobial resistance in a more sensible way, also by correcting for -too few results, we use the resistance() and +autoplot(wisca) + +To calculate antimicrobial resistance in a more sensible way, also by +correcting for too few results, we use the resistance() and susceptibility() functions. @@ -1142,397 +1218,23 @@ proportion of R (proportion_R() I (proportion_SI(), equal to susceptibility()). These functions can be used on their own: - -data_1st %>% resistance(AMX) -#> [1] 0.5813935 + +our_data_1st %>% resistance(AMX) +#> [1] 0.4318355 Or can be used in conjunction with group_by() and summarise(), both from the dplyr package: - -data_1st %>% + +our_data_1st %>% group_by(hospital) %>% - summarise(amoxicillin = resistance(AMX)) - - -hospital -amoxicillin - - - -Hospital A -0.5902375 - - -Hospital B -0.5636153 - - -Hospital C -0.5920537 - - -Hospital D -0.5911290 - - - -Of course it would be very convenient to know the number of isolates -responsible for the percentages. For that purpose the -n_sir() can be used, which works exactly like -n_distinct() from the dplyr package. It counts -all isolates available for every group (i.e. values S, I or R): - -data_1st %>% - group_by(hospital) %>% - summarise( - amoxicillin = resistance(AMX), - available = n_sir(AMX) - ) - - -hospital -amoxicillin -available - - - -Hospital A -0.5902375 -3790 - - -Hospital B -0.5636153 -4315 - - -Hospital C -0.5920537 -1787 - - -Hospital D -0.5911290 -2480 - - - -These functions can also be used to get the proportion of multiple -antibiotics, to calculate empiric susceptibility of combination -therapies very easily: - -data_1st %>% - group_by(genus) %>% - summarise( - amoxiclav = susceptibility(AMC), - gentamicin = susceptibility(GEN), - amoxiclav_genta = susceptibility(AMC, GEN) - ) - - -genus -amoxiclav -gentamicin -amoxiclav_genta - - - -Escherichia -0.6703444 -0.6482782 -0.8836510 - - -Klebsiella -0.6513410 -0.6636015 -0.8919540 - - -Staphylococcus -0.6615911 -0.6737104 -0.8899938 - - -Streptococcus -0.4654526 -0.0000000 -0.4654526 - - - -Or if you are curious for the resistance within certain antibiotic -classes, use a antibiotic class selector such as -penicillins(), which automatically will include the columns -AMX and AMC of our data: - -data_1st %>% - # group by hospital - group_by(hospital) %>% - # / -> select all penicillins in the data for calculation - # | / -> use resistance() for all peni's per hospital - # | | / -> print as percentages - summarise(across(penicillins(), resistance, as_percent = TRUE)) %>% - # format the antibiotic column names, using so-called snake case, - # so 'Amoxicillin/clavulanic acid' becomes 'amoxicillin_clavulanic_acid' - rename_with(set_ab_names, penicillins()) - - -hospital -amoxicillin -amoxicillin_clavulanic_acid - - - -Hospital A -59.0% -38.0% - - -Hospital B -56.4% -35.0% - - -Hospital C -59.2% -37.3% - - -Hospital D -59.1% -36.1% - - - -To make a transition to the next part, let’s see how differences in -the previously calculated combination therapies could be plotted: - -data_1st %>% - group_by(genus) %>% - summarise( - "1. Amoxi/clav" = susceptibility(AMC), - "2. Gentamicin" = susceptibility(GEN), - "3. Amoxi/clav + genta" = susceptibility(AMC, GEN) - ) %>% - # pivot_longer() from the tidyr package "lengthens" data: - tidyr::pivot_longer(-genus, names_to = "antibiotic") %>% - ggplot(aes( - x = genus, - y = value, - fill = antibiotic - )) + - geom_col(position = "dodge2") - - - -Plots - -To show results in plots, most R users would nowadays use the -ggplot2 package. This package lets you create plots in -layers. You can read more about it on their website. A quick -example would look like these syntaxes: - -ggplot( - data = a_data_set, - mapping = aes( - x = year, - y = value - ) -) + - geom_col() + - labs( - title = "A title", - subtitle = "A subtitle", - x = "My X axis", - y = "My Y axis" - ) - -# or as short as: -ggplot(a_data_set) + - geom_bar(aes(year)) -The AMR package contains functions to extend this -ggplot2 package, for example geom_sir(). It -automatically transforms data with count_df() or -proportion_df() and show results in stacked bars. Its -simplest and shortest example: - -ggplot(data_1st) + - geom_sir(translate_ab = FALSE) - -Omit the translate_ab = FALSE to have the antibiotic -codes (AMX, AMC, CIP, GEN) translated to official WHO names -(amoxicillin, amoxicillin/clavulanic acid, ciprofloxacin, -gentamicin). -If we group on e.g. the genus column and add some -additional functions from our package, we can create this: - -# group the data on `genus` -ggplot(data_1st %>% group_by(genus)) + - # create bars with genus on x axis - # it looks for variables with class `sir`, - # of which we have 4 (earlier created with `as.sir`) - geom_sir(x = "genus") + - # split plots on antibiotic - facet_sir(facet = "antibiotic") + - # set colours to the SIR interpretations (colour-blind friendly) - scale_sir_colours() + - # show percentages on y axis - scale_y_percent(breaks = 0:4 * 25) + - # turn 90 degrees, to make it bars instead of columns - coord_flip() + - # add labels - labs( - title = "Resistance per genus and antibiotic", - subtitle = "(this is fake data)" - ) + - # and print genus in italic to follow our convention - # (is now y axis because we turned the plot) - theme(axis.text.y = element_text(face = "italic")) - -To simplify this, we also created the ggplot_sir() -function, which combines almost all above functions: - -data_1st %>% - group_by(genus) %>% - ggplot_sir( - x = "genus", - facet = "antibiotic", - breaks = 0:4 * 25, - datalabels = FALSE - ) + - coord_flip() - - -Plotting MIC and disk diffusion values - -The AMR package also extends the plot() and -ggplot2::autoplot() functions for plotting minimum -inhibitory concentrations (MIC, created with as.mic()) and -disk diffusion diameters (created with as.disk()). -With the random_mic() and random_disk() -functions, we can generate sampled values for the new data types (S3 -classes) <mic> and <disk>: - -mic_values <- random_mic(size = 100) -mic_values -#> Class 'mic' -#> [1] 16 128 0.01 0.002 4 16 0.01 0.01 128 0.5 -#> [11] 1 4 0.01 16 64 0.25 16 4 0.0625 0.002 -#> [21] 4 0.5 0.5 0.001 0.125 32 64 0.005 0.025 0.0625 -#> [31] 0.125 32 0.25 0.25 0.5 0.001 32 0.01 8 256 -#> [41] 0.5 64 256 0.025 1 0.025 2 32 8 0.01 -#> [51] 0.025 0.01 0.001 0.0625 1 1 16 0.001 64 0.125 -#> [61] 0.0625 0.025 8 128 4 4 0.025 16 8 1 -#> [71] 4 2 0.5 0.5 1 0.002 128 256 0.025 0.005 -#> [81] 64 2 0.25 64 0.002 0.005 1 256 64 0.001 -#> [91] 8 64 0.25 4 1 32 16 2 32 8 - -# base R: -plot(mic_values) - - -# ggplot2: -autoplot(mic_values) - -But we could also be more specific, by generating MICs that are -likely to be found in E. coli for ciprofloxacin: - -mic_values <- random_mic(size = 100, mo = "E. coli", ab = "cipro") -For the plot() and autoplot() function, we -can define the microorganism and an antimicrobial agent the same way. -This will add the interpretation of those values according to a chosen -guidelines (defaults to the latest EUCAST guideline). -Default colours are colour-blind friendly, while maintaining the -convention that e.g. ‘susceptible’ should be green and ‘resistant’ -should be red: - -# base R: -plot(mic_values, mo = "E. coli", ab = "cipro") - - -# ggplot2: -autoplot(mic_values, mo = "E. coli", ab = "cipro") - -For disk diffusion values, there is not much of a difference in -plotting: - -disk_values <- random_disk(size = 100, mo = "E. coli", ab = "cipro") -disk_values -#> Class 'disk' -#> [1] 23 30 22 17 17 18 24 30 24 17 19 28 19 31 28 17 28 18 20 31 27 25 22 22 18 -#> [26] 30 29 25 26 25 20 18 26 26 28 21 22 25 19 27 22 31 19 18 25 30 24 28 30 19 -#> [51] 22 18 26 19 20 20 31 26 30 22 17 17 27 23 19 30 27 28 25 28 20 23 28 27 19 -#> [76] 25 30 25 27 28 24 24 21 22 22 31 21 21 27 17 19 29 29 29 26 28 21 28 29 31 - -# base R: -plot(disk_values, mo = "E. coli", ab = "cipro") - -And when using the ggplot2 package, but now choosing the -latest implemented CLSI guideline (notice that the EUCAST-specific term -“Susceptible, incr. exp.” has changed to “Intermediate”): - -autoplot( - disk_values, - mo = "E. coli", - ab = "cipro", - guideline = "CLSI" -) - - - - -Independence test - -The next example uses the example_isolates data set. -This is a data set included with this package and contains 2,000 -microbial isolates with their full antibiograms. It reflects reality and -can be used to practise AMR data analysis. -We will compare the resistance to amoxicillin/clavulanic acid (column -AMC) between an ICU and other clinical wards. The input for -the fisher.test() can be retrieved with a transformation -like this: - -# use package 'tidyr' to pivot data: -library(tidyr) - -check_AMC <- example_isolates %>% - filter(ward %in% c("ICU", "Clinical")) %>% # filter on only these wards - select(ward, AMC) %>% # select the wards and amoxi/clav - group_by(ward) %>% # group on the wards - count_df(combine_SI = TRUE) %>% # count all isolates per group (ward) - pivot_wider( - names_from = ward, # transform output so "ICU" and "Clinical" are columns - values_from = value - ) %>% - select(ICU, Clinical) %>% # and only select these columns - as.matrix() # transform to a good old matrix for fisher.test() - -check_AMC -#> ICU Clinical -#> [1,] 396 942 -#> [2,] 184 240 -We can apply the test now with: - -# do Fisher's Exact Test -fisher.test(check_AMC) -#> -#> Fisher's Exact Test for Count Data -#> -#> data: check_AMC -#> p-value = 2.263e-07 -#> alternative hypothesis: true odds ratio is not equal to 1 -#> 95 percent confidence interval: -#> 0.435261 0.691614 -#> sample estimates: -#> odds ratio -#> 0.5485079 -As can be seen, the p value is practically zero (0.0000002263247), -which means that the amoxicillin/clavulanic acid resistance found in -isolates between patients in ICUs and other clinical wards are really -different. + summarise(amoxicillin = resistance(AMX)) +#> # A tibble: 3 × 2 +#> hospital amoxicillin +#> <chr> <dbl> +#> 1 A 0.343 +#> 2 B 0.569 +#> 3 C 0.375 -Author: Dr. Matthijs Berends +Author: Dr. Matthijs Berends, 26th Feb 2023
library(dplyr) library(ggplot2) library(AMR) -library(cleaner) # (if not yet installed, install with:) -# install.packages(c("dplyr", "ggplot2", "AMR", "cleaner"))
We will create some fake example data to use for analysis. For AMR -data analysis, we need at least: a patient ID, name or code of a -microorganism, a date and antimicrobial results (an antibiogram). It -could also include a specimen type (e.g. to filter on blood or urine), -the ward type (e.g. to filter on ICUs).
With additional columns (like a hospital name, the patients gender of -even [well-defined] clinical properties) you can do a comparative -analysis, as this tutorial will demonstrate too.
To start with patients, we need a unique list of patients.
The AMR package contains a data set +example_isolates_unclean, which might look data that users +have extracted from their laboratory systems:
AMR
example_isolates_unclean
-patients <- unlist(lapply(LETTERS, paste0, 1:10))
patients <- unlist(lapply(LETTERS, paste0, 1:10))
The LETTERS object is available in R - it’s a vector -with 26 characters: A to Z. The -patients object we just created is now a vector of length -260, with values (patient IDs) varying from A1 to -Z10. Now we we also set the gender of our patients, by -putting the ID and the gender in a table:
LETTERS
A
Z
patients
A1
Z10
-patients_table <- data.frame( - patient_id = patients, - gender = c( - rep("M", 135), - rep("F", 125) - ) -)
patients_table <- data.frame( - patient_id = patients, - gender = c( - rep("M", 135), - rep("F", 125) - ) -)
The first 135 patient IDs are now male, the other 125 are female.
Let’s pretend that our data consists of blood cultures isolates from -between 1 January 2010 and 1 January 2018.
-dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day")
dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day")
This dates object now contains all days in our date -range.
dates
For this tutorial, we will uses four different microorganisms: -Escherichia coli, Staphylococcus aureus, -Streptococcus pneumoniae, and Klebsiella -pneumoniae:
-bacteria <- c( - "Escherichia coli", "Staphylococcus aureus", - "Streptococcus pneumoniae", "Klebsiella pneumoniae" -)
bacteria <- c( - "Escherichia coli", "Staphylococcus aureus", - "Streptococcus pneumoniae", "Klebsiella pneumoniae" -)
Using the sample() function, we can randomly select -items from all objects we defined earlier. To let our fake data reflect -reality a bit, we will also approximately define the probabilities of -bacteria and the antibiotic results, using the random_sir() -function.
sample()
random_sir()
-sample_size <- 20000 -data <- data.frame( - date = sample(dates, size = sample_size, replace = TRUE), - patient_id = sample(patients, size = sample_size, replace = TRUE), - hospital = sample( - c( - "Hospital A", - "Hospital B", - "Hospital C", - "Hospital D" - ), - size = sample_size, replace = TRUE, - prob = c(0.30, 0.35, 0.15, 0.20) - ), - bacteria = sample(bacteria, - size = sample_size, replace = TRUE, - prob = c(0.50, 0.25, 0.15, 0.10) - ), - AMX = random_sir(sample_size, prob_sir = c(0.35, 0.60, 0.05)), - AMC = random_sir(sample_size, prob_sir = c(0.15, 0.75, 0.10)), - CIP = random_sir(sample_size, prob_sir = c(0.20, 0.80, 0.00)), - GEN = random_sir(sample_size, prob_sir = c(0.08, 0.92, 0.00)) -)
sample_size <- 20000 -data <- data.frame( - date = sample(dates, size = sample_size, replace = TRUE), - patient_id = sample(patients, size = sample_size, replace = TRUE), - hospital = sample( - c( - "Hospital A", - "Hospital B", - "Hospital C", - "Hospital D" - ), - size = sample_size, replace = TRUE, - prob = c(0.30, 0.35, 0.15, 0.20) - ), - bacteria = sample(bacteria, - size = sample_size, replace = TRUE, - prob = c(0.50, 0.25, 0.15, 0.10) - ), - AMX = random_sir(sample_size, prob_sir = c(0.35, 0.60, 0.05)), - AMC = random_sir(sample_size, prob_sir = c(0.15, 0.75, 0.10)), - CIP = random_sir(sample_size, prob_sir = c(0.20, 0.80, 0.00)), - GEN = random_sir(sample_size, prob_sir = c(0.08, 0.92, 0.00)) -)
Using the left_join() function from the -dplyr package, we can ‘map’ the gender to the patient ID -using the patients_table object we created earlier:
left_join()
dplyr
patients_table
-data <- data %>% left_join(patients_table)
data <- data %>% left_join(patients_table)
The resulting data set contains 20 000 blood culture isolates. With -the head() function we can preview the first 6 rows of this -data set:
head()
-head(data)
head(data)
Now, let’s start the cleaning and the analysis!
We also created a package dedicated to data cleaning and checking, -called the cleaner package. It freq() function -can be used to create frequency tables.
cleaner
freq()
For example, for the gender variable:
gender
-data %>% freq(gender)
data %>% freq(gender)
Frequency table
Class: character -Length: 20,000 -Available: 20,000 (100%, NA: 0 = 0%) -Unique: 2
Shortest: 1 -Longest: 1
So, we can draw at least two conclusions immediately. From a data -scientists perspective, the data looks clean: only values M -and F. From a researchers perspective: there are slightly -more men. Nothing we didn’t already know.
M
F
The data is already quite clean, but we still need to transform some -variables. The bacteria column now consists of text, and we -want to add more variables based on microbial IDs later on. So, we will -transform this column to valid IDs. The mutate() function -of the dplyr package makes this really easy:
bacteria
mutate()
-data <- data %>% - mutate(bacteria = as.mo(bacteria))
data <- data %>% - mutate(bacteria = as.mo(bacteria))
We also want to transform the antibiotics, because in real life data -we don’t know if they are really clean. The as.sir() -function ensures reliability and reproducibility in these kind of -variables. The is_sir_eligible() can check which columns -are probably columns with SIR test results. Using mutate() -and across(), we can apply the transformation to the formal -<rsi> class:
as.sir()
is_sir_eligible()
across()
<rsi>
-is_sir_eligible(data) -#> [1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE -colnames(data)[is_sir_eligible(data)] -#> [1] "AMX" "AMC" "CIP" "GEN" +example_isolates_unclean +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <chr> <chr> <chr> <chr> <chr> +#> 1 J3 A 2012-11-21 E. coli R I S S +#> 2 R7 A 2018-04-03 K. pneumoniae R I S S +#> 3 P3 A 2014-09-19 E. coli R S S S +#> 4 P10 A 2015-12-10 E. coli S I S S +#> 5 B7 A 2015-03-02 E. coli S S S S +#> 6 W3 A 2018-03-31 S. aureus R S R S +#> 7 J8 A 2016-06-14 E. coli R S S S +#> 8 M3 A 2015-10-25 E. coli R S S S +#> 9 J3 A 2019-06-19 E. coli S S S S +#> 10 G6 A 2015-04-27 S. aureus S S S S +#> # … with 2,990 more rows -data <- data %>% - mutate(across(where(is_sir_eligible), as.sir))
is_sir_eligible(data) -#> [1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE -colnames(data)[is_sir_eligible(data)] -#> [1] "AMX" "AMC" "CIP" "GEN" +example_isolates_unclean +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <chr> <chr> <chr> <chr> <chr> +#> 1 J3 A 2012-11-21 E. coli R I S S +#> 2 R7 A 2018-04-03 K. pneumoniae R I S S +#> 3 P3 A 2014-09-19 E. coli R S S S +#> 4 P10 A 2015-12-10 E. coli S I S S +#> 5 B7 A 2015-03-02 E. coli S S S S +#> 6 W3 A 2018-03-31 S. aureus R S R S +#> 7 J8 A 2016-06-14 E. coli R S S S +#> 8 M3 A 2015-10-25 E. coli R S S S +#> 9 J3 A 2019-06-19 E. coli S S S S +#> 10 G6 A 2015-04-27 S. aureus S S S S +#> # … with 2,990 more rows -data <- data %>% - mutate(across(where(is_sir_eligible), as.sir))
example_isolates_unclean +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <chr> <chr> <chr> <chr> <chr> +#> 1 J3 A 2012-11-21 E. coli R I S S +#> 2 R7 A 2018-04-03 K. pneumoniae R I S S +#> 3 P3 A 2014-09-19 E. coli R S S S +#> 4 P10 A 2015-12-10 E. coli S I S S +#> 5 B7 A 2015-03-02 E. coli S S S S +#> 6 W3 A 2018-03-31 S. aureus R S R S +#> 7 J8 A 2016-06-14 E. coli R S S S +#> 8 M3 A 2015-10-25 E. coli R S S S +#> 9 J3 A 2019-06-19 E. coli S S S S +#> 10 G6 A 2015-04-27 S. aureus S S S S +#> # … with 2,990 more rows -data <- data %>% - mutate(across(where(is_sir_eligible), as.sir))
Finally, we will apply EUCAST -rules on our antimicrobial results. In Europe, most medical -microbiological laboratories already apply these rules. Our package -features their latest insights on intrinsic resistance and exceptional -phenotypes. Moreover, the eucast_rules() function can also -apply additional rules, like forcing -ampicillin = R when -amoxicillin/clavulanic acid = R.
eucast_rules()
Because the amoxicillin (column AMX) and -amoxicillin/clavulanic acid (column AMC) in our data were -generated randomly, some rows will undoubtedly contain AMX = S and AMC = -R, which is technically impossible. The eucast_rules() -fixes this:
AMX
AMC
-data <- eucast_rules(data, col_mo = "bacteria", rules = "all")
data <- eucast_rules(data, col_mo = "bacteria", rules = "all")
For AMR data analysis, we would like the microorganism column to +contain valid, up-to-date taxonomy, and the antibiotic columns to be +cleaned as SIR values as well.
With as.mo(), users can transform arbitrary +microorganism names or codes to current taxonomy. The AMR +package contains up-to-date taxonomic data. To be specific, currently +included data were retrieved on 11 Dec 2022.
as.mo()
The codes of the AMR packages that come from as.mo() are +short, but still human readable. More importantly, as.mo() +supports all kinds of input:
+as.mo("Klebsiella pneumoniae") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("K. pneumoniae") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("KLEPNE") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("KLPN") +#> Class 'mo' +#> [1] B_KLBSL_PNMN
as.mo("Klebsiella pneumoniae") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("K. pneumoniae") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("KLEPNE") +#> Class 'mo' +#> [1] B_KLBSL_PNMN +as.mo("KLPN") +#> Class 'mo' +#> [1] B_KLBSL_PNMN
The first character in above codes denote their taxonomic kingdom, +such as Bacteria (B), Fungi (F), and Protozoa (P).
The AMR package also contain functions to directly +retrieve taxonomic properties, such as the name, genus, species, family, +order, and even Gram-stain. They all start with mo_ and +they use as.mo() internally, so that still any arbitrary +user input can be used:
mo_
+mo_family("K. pneumoniae") +#> [1] "Enterobacteriaceae" +mo_genus("K. pneumoniae") +#> [1] "Klebsiella" +mo_species("K. pneumoniae") +#> [1] "pneumoniae" + +mo_gramstain("Klebsiella pneumoniae") +#> [1] "Gram-negative" + +mo_ref("K. pneumoniae") +#> [1] "Trevisan, 1887" + +mo_snomed("K. pneumoniae") +#> [[1]] +#> [1] "1098101000112102" "446870005" "1098201000112108" "409801009" +#> [5] "56415008" "714315002" "713926009"
mo_family("K. pneumoniae") +#> [1] "Enterobacteriaceae" +mo_genus("K. pneumoniae") +#> [1] "Klebsiella" +mo_species("K. pneumoniae") +#> [1] "pneumoniae" + +mo_gramstain("Klebsiella pneumoniae") +#> [1] "Gram-negative" + +mo_ref("K. pneumoniae") +#> [1] "Trevisan, 1887" + +mo_snomed("K. pneumoniae") +#> [[1]] +#> [1] "1098101000112102" "446870005" "1098201000112108" "409801009" +#> [5] "56415008" "714315002" "713926009"
Now we can thus clean our data:
+our_data$bacteria <- as.mo(our_data$bacteria, info = TRUE) +#> ℹ Microorganism translation was uncertain for four microorganisms. Run +#> mo_uncertainties() to review these uncertainties, or use +#> add_custom_microorganisms() to add custom entries.
our_data$bacteria <- as.mo(our_data$bacteria, info = TRUE) +#> ℹ Microorganism translation was uncertain for four microorganisms. Run +#> mo_uncertainties() to review these uncertainties, or use +#> add_custom_microorganisms() to add custom entries.
Apparently, there was some uncertainty about the translation to +taxonomic codes. Let’s check this:
+mo_uncertainties() +#> Matching scores are based on the resemblance between the input and the full +#> taxonomic name, and the pathogenicity in humans. See ?mo_matching_score. +#> +#> -------------------------------------------------------------------------------- +#> "E. coli" -> Escherichia coli (B_ESCHR_COLI, 0.688) +#> Based on input "E coli" +#> Also matched: Enterobacter cowanii (0.600), Eubacterium combesii +#> (0.600), Eggerthia catenaformis (0.591), Eubacterium callanderi +#> (0.591), Enterocloster citroniae (0.587), Eubacterium cylindroides +#> (0.583), Enterococcus casseliflavus (0.577), Enterobacter cloacae +#> cloacae (0.571), Ehrlichia canis (0.567), and Enterobacter cloacae +#> dissolvens (0.565) +#> -------------------------------------------------------------------------------- +#> "K. pneumoniae" -> Klebsiella pneumoniae (B_KLBSL_PNMN, 0.786) +#> Based on input "K pneumoniae" +#> Also matched: Klebsiella pneumoniae ozaenae (0.707), Klebsiella +#> pneumoniae pneumoniae (0.688), Klebsiella pneumoniae rhinoscleromatis +#> (0.658), Klebsiella pasteurii (0.500), Klebsiella planticola (0.500), +#> Kingella potus (0.400), Kosakonia pseudosacchari (0.361), Kaistella +#> palustris (0.333), Kocuria palustris (0.333), and Kocuria pelophila +#> (0.333) +#> -------------------------------------------------------------------------------- +#> "S. aureus" -> Staphylococcus aureus (B_STPHY_AURS, 0.690) +#> Based on input "S aureus" +#> Also matched: Staphylococcus aureus aureus (0.643), Staphylococcus +#> argenteus (0.625), Staphylococcus aureus anaerobius (0.625), +#> Streptomyces argenteolus (0.483), Streptomyces aureus (0.474), +#> Streptomyces azureus (0.467), Streptomyces aureorectus (0.444), +#> Streptomyces auratus (0.433), Streptomyces aurantiogriseus (0.429), and +#> Streptomyces aureocirculatus (0.429) +#> -------------------------------------------------------------------------------- +#> "S. pneumoniae" -> Streptococcus pneumoniae (B_STRPT_PNMN, 0.750) +#> Based on input "S pneumoniae" +#> Also matched: Streptococcus pseudopneumoniae (0.700), Serratia +#> proteamaculans quinovora (0.545), Streptococcus pseudoporcinus (0.536), +#> Staphylococcus pseudintermedius (0.532), Serratia proteamaculans +#> proteamaculans (0.526), Salmonella Portanigra (0.524), Sphingomonas +#> paucimobilis (0.520), Streptococcus pluranimalium (0.519), +#> Streptococcus constellatus pharyngis (0.514), and Salmonella Pakistan +#> (0.500) +#> +#> Only the first 10 other matches of each record are shown. Run +#> print(mo_uncertainties(), n = ...) to view more entries, or save +#> mo_uncertainties() to an object.
mo_uncertainties() +#> Matching scores are based on the resemblance between the input and the full +#> taxonomic name, and the pathogenicity in humans. See ?mo_matching_score. +#> +#> -------------------------------------------------------------------------------- +#> "E. coli" -> Escherichia coli (B_ESCHR_COLI, 0.688) +#> Based on input "E coli" +#> Also matched: Enterobacter cowanii (0.600), Eubacterium combesii +#> (0.600), Eggerthia catenaformis (0.591), Eubacterium callanderi +#> (0.591), Enterocloster citroniae (0.587), Eubacterium cylindroides +#> (0.583), Enterococcus casseliflavus (0.577), Enterobacter cloacae +#> cloacae (0.571), Ehrlichia canis (0.567), and Enterobacter cloacae +#> dissolvens (0.565) +#> -------------------------------------------------------------------------------- +#> "K. pneumoniae" -> Klebsiella pneumoniae (B_KLBSL_PNMN, 0.786) +#> Based on input "K pneumoniae" +#> Also matched: Klebsiella pneumoniae ozaenae (0.707), Klebsiella +#> pneumoniae pneumoniae (0.688), Klebsiella pneumoniae rhinoscleromatis +#> (0.658), Klebsiella pasteurii (0.500), Klebsiella planticola (0.500), +#> Kingella potus (0.400), Kosakonia pseudosacchari (0.361), Kaistella +#> palustris (0.333), Kocuria palustris (0.333), and Kocuria pelophila +#> (0.333) +#> -------------------------------------------------------------------------------- +#> "S. aureus" -> Staphylococcus aureus (B_STPHY_AURS, 0.690) +#> Based on input "S aureus" +#> Also matched: Staphylococcus aureus aureus (0.643), Staphylococcus +#> argenteus (0.625), Staphylococcus aureus anaerobius (0.625), +#> Streptomyces argenteolus (0.483), Streptomyces aureus (0.474), +#> Streptomyces azureus (0.467), Streptomyces aureorectus (0.444), +#> Streptomyces auratus (0.433), Streptomyces aurantiogriseus (0.429), and +#> Streptomyces aureocirculatus (0.429) +#> -------------------------------------------------------------------------------- +#> "S. pneumoniae" -> Streptococcus pneumoniae (B_STRPT_PNMN, 0.750) +#> Based on input "S pneumoniae" +#> Also matched: Streptococcus pseudopneumoniae (0.700), Serratia +#> proteamaculans quinovora (0.545), Streptococcus pseudoporcinus (0.536), +#> Staphylococcus pseudintermedius (0.532), Serratia proteamaculans +#> proteamaculans (0.526), Salmonella Portanigra (0.524), Sphingomonas +#> paucimobilis (0.520), Streptococcus pluranimalium (0.519), +#> Streptococcus constellatus pharyngis (0.514), and Salmonella Pakistan +#> (0.500) +#> +#> Only the first 10 other matches of each record are shown. Run +#> print(mo_uncertainties(), n = ...) to view more entries, or save +#> mo_uncertainties() to an object.
That’s all good.
The column with antibiotic test results must also be cleaned. The +AMR package comes with three new data types to work with +such test results: mic for minimal inhibitory +concentrations (MIC), disk for disk diffusion diameters, +and sir for SIR data that have been interpreted already. +This package can also determine SIR values based on MIC or disk +diffusion values, read more about that on the as.sir() +page.
mic
disk
sir
For now, we will just clean the SIR columns in our data using +dplyr:
+# method 1, be explicit about the columns: +our_data <- our_data %>% + mutate_at(vars(AMX:GEN), as.sir) + +# method 2, let the AMR package determine the eligible columns +our_data <- our_data %>% + mutate_if(is_sir_eligible, as.sir) + +# result: +our_data +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S +#> 3 P3 A 2014-09-19 B_ESCHR_COLI R S S S +#> 4 P10 A 2015-12-10 B_ESCHR_COLI S I S S +#> 5 B7 A 2015-03-02 B_ESCHR_COLI S S S S +#> 6 W3 A 2018-03-31 B_STPHY_AURS R S R S +#> 7 J8 A 2016-06-14 B_ESCHR_COLI R S S S +#> 8 M3 A 2015-10-25 B_ESCHR_COLI R S S S +#> 9 J3 A 2019-06-19 B_ESCHR_COLI S S S S +#> 10 G6 A 2015-04-27 B_STPHY_AURS S S S S +#> # … with 2,990 more rows
# method 1, be explicit about the columns: +our_data <- our_data %>% + mutate_at(vars(AMX:GEN), as.sir) + +# method 2, let the AMR package determine the eligible columns +our_data <- our_data %>% + mutate_if(is_sir_eligible, as.sir) + +# result: +our_data +#> # A tibble: 3,000 × 8 +#> patient_id hospital date bacteria AMX AMC CIP GEN +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S +#> 3 P3 A 2014-09-19 B_ESCHR_COLI R S S S +#> 4 P10 A 2015-12-10 B_ESCHR_COLI S I S S +#> 5 B7 A 2015-03-02 B_ESCHR_COLI S S S S +#> 6 W3 A 2018-03-31 B_STPHY_AURS R S R S +#> 7 J8 A 2016-06-14 B_ESCHR_COLI R S S S +#> 8 M3 A 2015-10-25 B_ESCHR_COLI R S S S +#> 9 J3 A 2019-06-19 B_ESCHR_COLI S S S S +#> 10 G6 A 2015-04-27 B_STPHY_AURS S S S S +#> # … with 2,990 more rows
This is basically it for the cleaning, time to start the data +inclusion.
Now that we have the microbial ID, we can add some taxonomic -properties:
-data <- data %>% - mutate( - gramstain = mo_gramstain(bacteria), - genus = mo_genus(bacteria), - species = mo_species(bacteria) - )
data <- data %>% - mutate( - gramstain = mo_gramstain(bacteria), - genus = mo_genus(bacteria), - species = mo_species(bacteria) - )
We also need to know which isolates we can actually use for -analysis.
We need to know which isolates we can actually use for +analysis without repetition bias.
To conduct an analysis of antimicrobial resistance, you must only include the first isolate of every patient per episode (Hindler et al., Clin Infect Dis. 2007). If you would not do this, you could easily get an @@ -614,13 +490,11 @@ different methods as defined by first_isolate() page.
first_isolate()
The outcome of the function can easily be added to our data:
-data <- data %>% + +our_data <- our_data %>% mutate(first = first_isolate(info = TRUE)) #> Determining first isolates using an episode length of 365 days #> ℹ Using column 'bacteria' as input for col_mo. @@ -629,496 +503,698 @@ takes into account the antimicrobial susceptibility test results using #> Basing inclusion on all antimicrobial results, using a points threshold of #> 2 #> Including isolates from ICU. -#> => Found 12,372 'phenotype-based' first isolates (61.9% of total where a -#> microbial ID was available) -So only 61.9% is suitable for resistance analysis! We can now filter -on it with the filter() function, also from the +#> => Found 2,626 'phenotype-based' first isolates (87.6% within scope and +#> 87.5% of total where a microbial ID was available)
data <- data %>% + +our_data <- our_data %>% mutate(first = first_isolate(info = TRUE)) #> Determining first isolates using an episode length of 365 days #> ℹ Using column 'bacteria' as input for col_mo. @@ -629,496 +503,698 @@ takes into account the antimicrobial susceptibility test results using #> Basing inclusion on all antimicrobial results, using a points threshold of #> 2 #> Including isolates from ICU. -#> => Found 12,372 'phenotype-based' first isolates (61.9% of total where a -#> microbial ID was available) -So only 61.9% is suitable for resistance analysis! We can now filter -on it with the filter() function, also from the +#> => Found 2,626 'phenotype-based' first isolates (87.6% within scope and +#> 87.5% of total where a microbial ID was available)
+our_data <- our_data %>% mutate(first = first_isolate(info = TRUE)) #> Determining first isolates using an episode length of 365 days #> ℹ Using column 'bacteria' as input for col_mo. @@ -629,496 +503,698 @@ takes into account the antimicrobial susceptibility test results using #> Basing inclusion on all antimicrobial results, using a points threshold of #> 2 #> Including isolates from ICU. -#> => Found 12,372 'phenotype-based' first isolates (61.9% of total where a -#> microbial ID was available)
our_data <- our_data %>% mutate(first = first_isolate(info = TRUE)) #> Determining first isolates using an episode length of 365 days #> ℹ Using column 'bacteria' as input for col_mo. @@ -629,496 +503,698 @@ takes into account the antimicrobial susceptibility test results using #> Basing inclusion on all antimicrobial results, using a points threshold of #> 2 #> Including isolates from ICU. -#> => Found 12,372 'phenotype-based' first isolates (61.9% of total where a -#> microbial ID was available)
So only 61.9% is suitable for resistance analysis! We can now filter -on it with the filter() function, also from the +#> => Found 2,626 'phenotype-based' first isolates (87.6% within scope and +#> 87.5% of total where a microbial ID was available)
filter()
So only 88% is suitable for resistance analysis! We can now filter on +it with the filter() function, also from the dplyr package:
-data_1st <- data %>% + +our_data_1st <- our_data %>% filter(first == TRUE) For future use, the above two syntaxes can be shortened: - -data_1st <- data %>% + +our_data_1st <- our_data %>% filter_first_isolate() -So we end up with 12 372 isolates for analysis. Now our data looks +So we end up with 2 626 isolates for analysis. Now our data looks like: - -head(data_1st) - - - - - - - - - - - - - - - - - - - -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender -gramstain -genus -species -first - - - -1 -2015-09-16 -Z5 -Hospital A -B_ESCHR_COLI -S -S -I -R -F -Gram-negative -Escherichia -coli -TRUE - - -2 -2017-09-08 -V1 -Hospital B -B_STRPT_PNMN -S -S -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - -3 -2016-08-05 -B1 -Hospital B -B_STRPT_PNMN -R -R -R -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -4 -2010-08-24 -E10 -Hospital C -B_ESCHR_COLI -S -S -I -S -M -Gram-negative -Escherichia -coli -TRUE - - -7 -2011-06-03 -D1 -Hospital A -B_STRPT_PNMN -R -R -S -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -8 -2016-04-15 -T10 -Hospital B -B_STRPT_PNMN -R -R -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - - -Time for the analysis! + +our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows +Time for the analysis.
data_1st <- data %>% + +our_data_1st <- our_data %>% filter(first == TRUE) For future use, the above two syntaxes can be shortened: - -data_1st <- data %>% + +our_data_1st <- our_data %>% filter_first_isolate() -So we end up with 12 372 isolates for analysis. Now our data looks +So we end up with 2 626 isolates for analysis. Now our data looks like: - -head(data_1st) - - - - - - - - - - - - - - - - - - - -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender -gramstain -genus -species -first - - - -1 -2015-09-16 -Z5 -Hospital A -B_ESCHR_COLI -S -S -I -R -F -Gram-negative -Escherichia -coli -TRUE - - -2 -2017-09-08 -V1 -Hospital B -B_STRPT_PNMN -S -S -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - -3 -2016-08-05 -B1 -Hospital B -B_STRPT_PNMN -R -R -R -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -4 -2010-08-24 -E10 -Hospital C -B_ESCHR_COLI -S -S -I -S -M -Gram-negative -Escherichia -coli -TRUE - - -7 -2011-06-03 -D1 -Hospital A -B_STRPT_PNMN -R -R -S -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -8 -2016-04-15 -T10 -Hospital B -B_STRPT_PNMN -R -R -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - - -Time for the analysis! + +our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows +Time for the analysis.
+our_data_1st <- our_data %>% filter(first == TRUE)
our_data_1st <- our_data %>% filter(first == TRUE)
For future use, the above two syntaxes can be shortened:
-data_1st <- data %>% + +our_data_1st <- our_data %>% filter_first_isolate() -So we end up with 12 372 isolates for analysis. Now our data looks +So we end up with 2 626 isolates for analysis. Now our data looks like: - -head(data_1st) - - - - - - - - - - - - - - - - - - - -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender -gramstain -genus -species -first - - - -1 -2015-09-16 -Z5 -Hospital A -B_ESCHR_COLI -S -S -I -R -F -Gram-negative -Escherichia -coli -TRUE - - -2 -2017-09-08 -V1 -Hospital B -B_STRPT_PNMN -S -S -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - -3 -2016-08-05 -B1 -Hospital B -B_STRPT_PNMN -R -R -R -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -4 -2010-08-24 -E10 -Hospital C -B_ESCHR_COLI -S -S -I -S -M -Gram-negative -Escherichia -coli -TRUE - - -7 -2011-06-03 -D1 -Hospital A -B_STRPT_PNMN -R -R -S -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -8 -2016-04-15 -T10 -Hospital B -B_STRPT_PNMN -R -R -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - - -Time for the analysis! + +our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows +Time for the analysis.
data_1st <- data %>% + +our_data_1st <- our_data %>% filter_first_isolate() -So we end up with 12 372 isolates for analysis. Now our data looks +So we end up with 2 626 isolates for analysis. Now our data looks like: - -head(data_1st) - - - - - - - - - - - - - - - - - - - -date -patient_id -hospital -bacteria -AMX -AMC -CIP -GEN -gender -gramstain -genus -species -first - - - -1 -2015-09-16 -Z5 -Hospital A -B_ESCHR_COLI -S -S -I -R -F -Gram-negative -Escherichia -coli -TRUE - - -2 -2017-09-08 -V1 -Hospital B -B_STRPT_PNMN -S -S -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - -3 -2016-08-05 -B1 -Hospital B -B_STRPT_PNMN -R -R -R -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -4 -2010-08-24 -E10 -Hospital C -B_ESCHR_COLI -S -S -I -S -M -Gram-negative -Escherichia -coli -TRUE - - -7 -2011-06-03 -D1 -Hospital A -B_STRPT_PNMN -R -R -S -R -M -Gram-positive -Streptococcus -pneumoniae -TRUE - - -8 -2016-04-15 -T10 -Hospital B -B_STRPT_PNMN -R -R -R -R -F -Gram-positive -Streptococcus -pneumoniae -TRUE - - - -Time for the analysis! + +our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows +Time for the analysis.
+our_data_1st <- our_data %>% filter_first_isolate()
our_data_1st <- our_data %>% filter_first_isolate()
So we end up with 12 372 isolates for analysis. Now our data looks +
So we end up with 2 626 isolates for analysis. Now our data looks like:
-head(data_1st)
head(data_1st)
Time for the analysis!
+our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows
our_data_1st +#> # A tibble: 2,626 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J3 A 2012-11-21 B_ESCHR_COLI R I S S TRUE +#> 2 R7 A 2018-04-03 B_KLBSL_PNMN R I S S TRUE +#> 3 P10 A 2015-12-10 B_ESCHR_COLI S I S S TRUE +#> 4 B7 A 2015-03-02 B_ESCHR_COLI S S S S TRUE +#> 5 W3 A 2018-03-31 B_STPHY_AURS R S R S TRUE +#> 6 J8 A 2016-06-14 B_ESCHR_COLI R S S S TRUE +#> 7 M3 A 2015-10-25 B_ESCHR_COLI R S S S TRUE +#> 8 J3 A 2019-06-19 B_ESCHR_COLI S S S S TRUE +#> 9 G6 A 2015-04-27 B_STPHY_AURS S S S S TRUE +#> 10 P4 A 2011-06-21 B_ESCHR_COLI S S S S TRUE +#> # … with 2,616 more rows
Time for the analysis.
You might want to start by getting an idea of how the data is -distributed. It’s an important start, because it also decides how you -will continue your analysis. Although this package contains a convenient -function to make frequency tables, exploratory data analysis (EDA) is -not the primary scope of this package. Use a package like DataExplorer -for that, or read the free online book Exploratory Data Analysis -with R by Roger D. Peng.
DataExplorer
The base R summary() function gives a good first +impression, as it comes with support for the new mo and +sir classes that we now have in our data set:
summary()
mo
+summary(our_data_1st) +#> patient_id hospital date +#> Length:2626 Length:2626 Min. :2011-01-01 +#> Class :character Class :character 1st Qu.:2013-04-14 +#> Mode :character Mode :character Median :2015-06-05 +#> Mean :2015-06-15 +#> 3rd Qu.:2017-08-23 +#> Max. :2020-01-01 +#> bacteria AMX AMC +#> Class :mo Class:sir Class:sir +#> <NA> :0 %R :43.2% (n=1134) %R :36.1% (n=947) +#> Unique:4 %SI :56.8% (n=1492) %SI :63.9% (n=1679) +#> #1 :B_ESCHR_COLI - %S :41.1% (n=1080) - %S :52.7% (n=1383) +#> #2 :B_STPHY_AURS - %I :15.7% (n=412) - %I :11.3% (n=296) +#> #3 :B_STRPT_PNMN +#> CIP GEN first +#> Class:sir Class:sir Mode:logical +#> %R :42.0% (n=1102) %R :37.0% (n=971) TRUE:2626 +#> %SI :58.0% (n=1524) %SI :63.0% (n=1655) +#> - %S :51.9% (n=1362) - %S :59.9% (n=1574) +#> - %I : 6.2% (n=162) - %I : 3.1% (n=81) +#> + +glimpse(our_data_1st) +#> Rows: 2,626 +#> Columns: 9 +#> $ patient_id <chr> "J3", "R7", "P10", "B7", "W3", "J8", "M3", "J3", "G6", "P4"… +#> $ hospital <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",… +#> $ date <date> 2012-11-21, 2018-04-03, 2015-12-10, 2015-03-02, 2018-03-31… +#> $ bacteria <mo> "B_ESCHR_COLI", "B_KLBSL_PNMN", "B_ESCHR_COLI", "B_ESCHR_COL… +#> $ AMX <sir> R, R, S, S, R, R, R, S, S, S, S, R, S, S, R, R, R, R, I, S,… +#> $ AMC <sir> I, I, I, S, S, S, S, S, S, S, S, S, S, S, S, S, S, R, S, R,… +#> $ CIP <sir> S, S, S, S, R, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,… +#> $ GEN <sir> S, S, S, S, S, S, S, S, S, S, S, R, S, S, S, S, S, S, S, S,… +#> $ first <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,… + +# number of unique values per column: +sapply(our_data_1st, n_distinct) +#> patient_id hospital date bacteria AMX AMC CIP +#> 260 3 1808 4 3 3 3 +#> GEN first +#> 3 1
summary(our_data_1st) +#> patient_id hospital date +#> Length:2626 Length:2626 Min. :2011-01-01 +#> Class :character Class :character 1st Qu.:2013-04-14 +#> Mode :character Mode :character Median :2015-06-05 +#> Mean :2015-06-15 +#> 3rd Qu.:2017-08-23 +#> Max. :2020-01-01 +#> bacteria AMX AMC +#> Class :mo Class:sir Class:sir +#> <NA> :0 %R :43.2% (n=1134) %R :36.1% (n=947) +#> Unique:4 %SI :56.8% (n=1492) %SI :63.9% (n=1679) +#> #1 :B_ESCHR_COLI - %S :41.1% (n=1080) - %S :52.7% (n=1383) +#> #2 :B_STPHY_AURS - %I :15.7% (n=412) - %I :11.3% (n=296) +#> #3 :B_STRPT_PNMN +#> CIP GEN first +#> Class:sir Class:sir Mode:logical +#> %R :42.0% (n=1102) %R :37.0% (n=971) TRUE:2626 +#> %SI :58.0% (n=1524) %SI :63.0% (n=1655) +#> - %S :51.9% (n=1362) - %S :59.9% (n=1574) +#> - %I : 6.2% (n=162) - %I : 3.1% (n=81) +#> + +glimpse(our_data_1st) +#> Rows: 2,626 +#> Columns: 9 +#> $ patient_id <chr> "J3", "R7", "P10", "B7", "W3", "J8", "M3", "J3", "G6", "P4"… +#> $ hospital <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",… +#> $ date <date> 2012-11-21, 2018-04-03, 2015-12-10, 2015-03-02, 2018-03-31… +#> $ bacteria <mo> "B_ESCHR_COLI", "B_KLBSL_PNMN", "B_ESCHR_COLI", "B_ESCHR_COL… +#> $ AMX <sir> R, R, S, S, R, R, R, S, S, S, S, R, S, S, R, R, R, R, I, S,… +#> $ AMC <sir> I, I, I, S, S, S, S, S, S, S, S, S, S, S, S, S, S, R, S, R,… +#> $ CIP <sir> S, S, S, S, R, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,… +#> $ GEN <sir> S, S, S, S, S, S, S, S, S, S, S, R, S, S, S, S, S, S, S, S,… +#> $ first <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,… + +# number of unique values per column: +sapply(our_data_1st, n_distinct) +#> patient_id hospital date bacteria AMX AMC CIP +#> 260 3 1808 4 3 3 3 +#> GEN first +#> 3 1
To just get an idea how the species are distributed, create a -frequency table with our freq() function. We created the -genus and species column earlier based on the -microbial ID. With paste(), we can concatenate them -together.
genus
species
paste()
The freq() function can be used like the base R language -was intended:
-freq(paste(data_1st$genus, data_1st$species))
freq(paste(data_1st$genus, data_1st$species))
Or can be used like the dplyr way, which is easier -readable:
-data_1st %>% freq(genus, species)
data_1st %>% freq(genus, species)
Class: character -Length: 12,372 -Available: 12,372 (100%, NA: 0 = 0%) -Unique: 4
Shortest: 16 -Longest: 24
count()
+our_data %>% + count(mo_name(bacteria), sort = TRUE) +#> # A tibble: 4 × 2 +#> `mo_name(bacteria)` n +#> <chr> <int> +#> 1 Escherichia coli 1518 +#> 2 Staphylococcus aureus 730 +#> 3 Streptococcus pneumoniae 426 +#> 4 Klebsiella pneumoniae 326 + +our_data_1st %>% + count(mo_name(bacteria), sort = TRUE) +#> # A tibble: 4 × 2 +#> `mo_name(bacteria)` n +#> <chr> <int> +#> 1 Escherichia coli 1250 +#> 2 Staphylococcus aureus 661 +#> 3 Streptococcus pneumoniae 399 +#> 4 Klebsiella pneumoniae 316
our_data %>% + count(mo_name(bacteria), sort = TRUE) +#> # A tibble: 4 × 2 +#> `mo_name(bacteria)` n +#> <chr> <int> +#> 1 Escherichia coli 1518 +#> 2 Staphylococcus aureus 730 +#> 3 Streptococcus pneumoniae 426 +#> 4 Klebsiella pneumoniae 326 + +our_data_1st %>% + count(mo_name(bacteria), sort = TRUE) +#> # A tibble: 4 × 2 +#> `mo_name(bacteria)` n +#> <chr> <int> +#> 1 Escherichia coli 1250 +#> 2 Staphylococcus aureus 661 +#> 3 Streptococcus pneumoniae 399 +#> 4 Klebsiella pneumoniae 316
Using tidyverse -selections, you can also select or filter columns based on the -antibiotic class they are in:
-data_1st %>% - filter(any(aminoglycosides() == "R"))
data_1st %>% - filter(any(aminoglycosides() == "R"))
#> ℹ For aminoglycosides() using column 'GEN' (gentamicin)
Using so-called antibiotic class selectors, you can select or filter +columns based on the antibiotic class that your antibiotic results are +in:
+our_data_1st %>% + select(date, aminoglycosides()) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +#> # A tibble: 2,626 × 2 +#> date GEN +#> <date> <sir> +#> 1 2012-11-21 S +#> 2 2018-04-03 S +#> 3 2015-12-10 S +#> 4 2015-03-02 S +#> 5 2018-03-31 S +#> 6 2016-06-14 S +#> 7 2015-10-25 S +#> 8 2019-06-19 S +#> 9 2015-04-27 S +#> 10 2011-06-21 S +#> # … with 2,616 more rows + +our_data_1st %>% + select(bacteria, betalactams()) +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 2,626 × 3 +#> bacteria AMX AMC +#> <mo> <sir> <sir> +#> 1 B_ESCHR_COLI R I +#> 2 B_KLBSL_PNMN R I +#> 3 B_ESCHR_COLI S I +#> 4 B_ESCHR_COLI S S +#> 5 B_STPHY_AURS R S +#> 6 B_ESCHR_COLI R S +#> 7 B_ESCHR_COLI R S +#> 8 B_ESCHR_COLI S S +#> 9 B_STPHY_AURS S S +#> 10 B_ESCHR_COLI S S +#> # … with 2,616 more rows + +our_data_1st %>% + select(bacteria, where(is.sir)) +#> # A tibble: 2,626 × 5 +#> bacteria AMX AMC CIP GEN +#> <mo> <sir> <sir> <sir> <sir> +#> 1 B_ESCHR_COLI R I S S +#> 2 B_KLBSL_PNMN R I S S +#> 3 B_ESCHR_COLI S I S S +#> 4 B_ESCHR_COLI S S S S +#> 5 B_STPHY_AURS R S R S +#> 6 B_ESCHR_COLI R S S S +#> 7 B_ESCHR_COLI R S S S +#> 8 B_ESCHR_COLI S S S S +#> 9 B_STPHY_AURS S S S S +#> 10 B_ESCHR_COLI S S S S +#> # … with 2,616 more rows + +# filtering using AB selectors is also possible: +our_data_1st %>% + filter(any(aminoglycosides() == "R")) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +#> # A tibble: 971 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J5 A 2017-12-25 B_STRPT_PNMN R S S R TRUE +#> 2 X1 A 2017-07-04 B_STPHY_AURS R S S R TRUE +#> 3 B3 A 2016-07-24 B_ESCHR_COLI S S S R TRUE +#> 4 V7 A 2012-04-03 B_ESCHR_COLI S S S R TRUE +#> 5 C9 A 2017-03-23 B_ESCHR_COLI S S S R TRUE +#> 6 R1 A 2018-06-10 B_STPHY_AURS S S S R TRUE +#> 7 S2 A 2013-07-19 B_STRPT_PNMN S S S R TRUE +#> 8 P5 A 2019-03-09 B_STPHY_AURS S S S R TRUE +#> 9 Q8 A 2019-08-10 B_STPHY_AURS S S S R TRUE +#> 10 K5 A 2013-03-15 B_STRPT_PNMN S S S R TRUE +#> # … with 961 more rows + +our_data_1st %>% + filter(all(betalactams() == "R")) +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 471 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 M7 A 2013-07-22 B_STRPT_PNMN R R S S TRUE +#> 2 R10 A 2013-12-20 B_STPHY_AURS R R S S TRUE +#> 3 R7 A 2015-10-25 B_STPHY_AURS R R S S TRUE +#> 4 R8 A 2019-10-25 B_STPHY_AURS R R S S TRUE +#> 5 I7 A 2015-08-19 B_ESCHR_COLI R R S S TRUE +#> 6 N3 A 2014-12-29 B_STRPT_PNMN R R R S TRUE +#> 7 Q2 A 2019-09-22 B_ESCHR_COLI R R S S TRUE +#> 8 X7 A 2011-03-20 B_ESCHR_COLI R R S R TRUE +#> 9 C5 A 2015-08-30 B_KLBSL_PNMN R R S R TRUE +#> 10 W9 A 2013-10-02 B_ESCHR_COLI R R S S TRUE +#> # … with 461 more rows + +# even works in base R (since R 3.0): +our_data_1st[all(betalactams() == "R"), ] +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 471 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 M7 A 2013-07-22 B_STRPT_PNMN R R S S TRUE +#> 2 R10 A 2013-12-20 B_STPHY_AURS R R S S TRUE +#> 3 R7 A 2015-10-25 B_STPHY_AURS R R S S TRUE +#> 4 R8 A 2019-10-25 B_STPHY_AURS R R S S TRUE +#> 5 I7 A 2015-08-19 B_ESCHR_COLI R R S S TRUE +#> 6 N3 A 2014-12-29 B_STRPT_PNMN R R R S TRUE +#> 7 Q2 A 2019-09-22 B_ESCHR_COLI R R S S TRUE +#> 8 X7 A 2011-03-20 B_ESCHR_COLI R R S R TRUE +#> 9 C5 A 2015-08-30 B_KLBSL_PNMN R R S R TRUE +#> 10 W9 A 2013-10-02 B_ESCHR_COLI R R S S TRUE +#> # … with 461 more rows
our_data_1st %>% + select(date, aminoglycosides()) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +#> # A tibble: 2,626 × 2 +#> date GEN +#> <date> <sir> +#> 1 2012-11-21 S +#> 2 2018-04-03 S +#> 3 2015-12-10 S +#> 4 2015-03-02 S +#> 5 2018-03-31 S +#> 6 2016-06-14 S +#> 7 2015-10-25 S +#> 8 2019-06-19 S +#> 9 2015-04-27 S +#> 10 2011-06-21 S +#> # … with 2,616 more rows + +our_data_1st %>% + select(bacteria, betalactams()) +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 2,626 × 3 +#> bacteria AMX AMC +#> <mo> <sir> <sir> +#> 1 B_ESCHR_COLI R I +#> 2 B_KLBSL_PNMN R I +#> 3 B_ESCHR_COLI S I +#> 4 B_ESCHR_COLI S S +#> 5 B_STPHY_AURS R S +#> 6 B_ESCHR_COLI R S +#> 7 B_ESCHR_COLI R S +#> 8 B_ESCHR_COLI S S +#> 9 B_STPHY_AURS S S +#> 10 B_ESCHR_COLI S S +#> # … with 2,616 more rows + +our_data_1st %>% + select(bacteria, where(is.sir)) +#> # A tibble: 2,626 × 5 +#> bacteria AMX AMC CIP GEN +#> <mo> <sir> <sir> <sir> <sir> +#> 1 B_ESCHR_COLI R I S S +#> 2 B_KLBSL_PNMN R I S S +#> 3 B_ESCHR_COLI S I S S +#> 4 B_ESCHR_COLI S S S S +#> 5 B_STPHY_AURS R S R S +#> 6 B_ESCHR_COLI R S S S +#> 7 B_ESCHR_COLI R S S S +#> 8 B_ESCHR_COLI S S S S +#> 9 B_STPHY_AURS S S S S +#> 10 B_ESCHR_COLI S S S S +#> # … with 2,616 more rows + +# filtering using AB selectors is also possible: +our_data_1st %>% + filter(any(aminoglycosides() == "R")) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin) +#> # A tibble: 971 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 J5 A 2017-12-25 B_STRPT_PNMN R S S R TRUE +#> 2 X1 A 2017-07-04 B_STPHY_AURS R S S R TRUE +#> 3 B3 A 2016-07-24 B_ESCHR_COLI S S S R TRUE +#> 4 V7 A 2012-04-03 B_ESCHR_COLI S S S R TRUE +#> 5 C9 A 2017-03-23 B_ESCHR_COLI S S S R TRUE +#> 6 R1 A 2018-06-10 B_STPHY_AURS S S S R TRUE +#> 7 S2 A 2013-07-19 B_STRPT_PNMN S S S R TRUE +#> 8 P5 A 2019-03-09 B_STPHY_AURS S S S R TRUE +#> 9 Q8 A 2019-08-10 B_STPHY_AURS S S S R TRUE +#> 10 K5 A 2013-03-15 B_STRPT_PNMN S S S R TRUE +#> # … with 961 more rows + +our_data_1st %>% + filter(all(betalactams() == "R")) +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 471 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 M7 A 2013-07-22 B_STRPT_PNMN R R S S TRUE +#> 2 R10 A 2013-12-20 B_STPHY_AURS R R S S TRUE +#> 3 R7 A 2015-10-25 B_STPHY_AURS R R S S TRUE +#> 4 R8 A 2019-10-25 B_STPHY_AURS R R S S TRUE +#> 5 I7 A 2015-08-19 B_ESCHR_COLI R R S S TRUE +#> 6 N3 A 2014-12-29 B_STRPT_PNMN R R R S TRUE +#> 7 Q2 A 2019-09-22 B_ESCHR_COLI R R S S TRUE +#> 8 X7 A 2011-03-20 B_ESCHR_COLI R R S R TRUE +#> 9 C5 A 2015-08-30 B_KLBSL_PNMN R R S R TRUE +#> 10 W9 A 2013-10-02 B_ESCHR_COLI R R S S TRUE +#> # … with 461 more rows + +# even works in base R (since R 3.0): +our_data_1st[all(betalactams() == "R"), ] +#> ℹ For betalactams() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> # A tibble: 471 × 9 +#> patient_id hospital date bacteria AMX AMC CIP GEN first +#> <chr> <chr> <date> <mo> <sir> <sir> <sir> <sir> <lgl> +#> 1 M7 A 2013-07-22 B_STRPT_PNMN R R S S TRUE +#> 2 R10 A 2013-12-20 B_STPHY_AURS R R S S TRUE +#> 3 R7 A 2015-10-25 B_STPHY_AURS R R S S TRUE +#> 4 R8 A 2019-10-25 B_STPHY_AURS R R S S TRUE +#> 5 I7 A 2015-08-19 B_ESCHR_COLI R R S S TRUE +#> 6 N3 A 2014-12-29 B_STRPT_PNMN R R R S TRUE +#> 7 Q2 A 2019-09-22 B_ESCHR_COLI R R S S TRUE +#> 8 X7 A 2011-03-20 B_ESCHR_COLI R R S R TRUE +#> 9 C5 A 2015-08-30 B_KLBSL_PNMN R R S R TRUE +#> 10 W9 A 2013-10-02 B_ESCHR_COLI R R S S TRUE +#> # … with 461 more rows
This package comes with antibiogram(), a function that +automatically generates traditional, combined, syndromic, and even +weighted-incidence syndromic combination antibiograms (WISCA). For R +Markdown (such as this page) it automatically prints in the right table +format.
antibiogram()
Below are some suggestions for how to generate the different +antibiograms:
+# traditional: +antibiogram(our_data_1st)
# traditional: +antibiogram(our_data_1st)
+antibiogram(our_data_1st, + ab_transform = "name")
antibiogram(our_data_1st, + ab_transform = "name")
If you want to get a quick glance of the number of isolates in -different bug/drug combinations, you can use the -bug_drug_combinations() function:
bug_drug_combinations()
+antibiogram(our_data_1st, + ab_transform = "name", + language = "es") # support for 20 languages
antibiogram(our_data_1st, + ab_transform = "name", + language = "es") # support for 20 languages
+# combined: +antibiogram(our_data_1st, + antibiotics = c("AMC", "AMC+CIP", "AMC+GEN"))
# combined: +antibiogram(our_data_1st, + antibiotics = c("AMC", "AMC+CIP", "AMC+GEN"))
+# for a syndromic antibiogram, we must fake some clinical conditions: +our_data_1st$condition <- sample(c("Cardial", "Respiratory", "Rheumatic"), + size = nrow(our_data_1st), + replace = TRUE) + +# syndromic: +antibiogram(our_data_1st, + syndromic_group = "condition")
# for a syndromic antibiogram, we must fake some clinical conditions: +our_data_1st$condition <- sample(c("Cardial", "Respiratory", "Rheumatic"), + size = nrow(our_data_1st), + replace = TRUE) + +# syndromic: +antibiogram(our_data_1st, + syndromic_group = "condition")
+antibiogram(our_data_1st, + # you can use AB selectors here as well: + antibiotics = c(penicillins(), aminoglycosides()), + syndromic_group = "condition", + mo_transform = "gramstain") +#> ℹ For penicillins() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin)
antibiogram(our_data_1st, + # you can use AB selectors here as well: + antibiotics = c(penicillins(), aminoglycosides()), + syndromic_group = "condition", + mo_transform = "gramstain") +#> ℹ For penicillins() using columns 'AMX' (amoxicillin) and 'AMC' +#> (amoxicillin/clavulanic acid) +#> ℹ For aminoglycosides() using column 'GEN' (gentamicin)
+# WISCA: +# (we lack some details, but it could contain a filter on e.g. >65 year-old males) +wisca <- antibiogram(our_data_1st, + antibiotics = c("AMC", "AMC+CIP", "AMC+GEN"), + syndromic_group = "condition", + mo_transform = "gramstain") +wisca
# WISCA: +# (we lack some details, but it could contain a filter on e.g. >65 year-old males) +wisca <- antibiogram(our_data_1st, + antibiotics = c("AMC", "AMC+CIP", "AMC+GEN"), + syndromic_group = "condition", + mo_transform = "gramstain") +wisca
Antibiograms can be plotted using autoplot() from the +ggplot2 packages, since this package provides an extension +to that function:
autoplot()
ggplot2
-data_1st %>% - bug_drug_combinations() %>% - head() # show first 6 rows
data_1st %>% - bug_drug_combinations() %>% - head() # show first 6 rows
-data_1st %>% - select(bacteria, aminoglycosides()) %>% - bug_drug_combinations()
data_1st %>% - select(bacteria, aminoglycosides()) %>% - bug_drug_combinations()
This will only give you the crude numbers in the data. To calculate -antimicrobial resistance in a more sensible way, also by correcting for -too few results, we use the resistance() and +autoplot(wisca)
resistance()
autoplot(wisca)
To calculate antimicrobial resistance in a more sensible way, also by +correcting for too few results, we use the resistance() and susceptibility() functions.
susceptibility()
proportion_R() I (proportion_SI(), equal to susceptibility()). These functions can be used on their own: - -data_1st %>% resistance(AMX) -#> [1] 0.5813935 + +our_data_1st %>% resistance(AMX) +#> [1] 0.4318355 Or can be used in conjunction with group_by() and summarise(), both from the dplyr package: - -data_1st %>% + +our_data_1st %>% group_by(hospital) %>% - summarise(amoxicillin = resistance(AMX)) - - -hospital -amoxicillin - - - -Hospital A -0.5902375 - - -Hospital B -0.5636153 - - -Hospital C -0.5920537 - - -Hospital D -0.5911290 - - - -Of course it would be very convenient to know the number of isolates -responsible for the percentages. For that purpose the -n_sir() can be used, which works exactly like -n_distinct() from the dplyr package. It counts -all isolates available for every group (i.e. values S, I or R): - -data_1st %>% - group_by(hospital) %>% - summarise( - amoxicillin = resistance(AMX), - available = n_sir(AMX) - ) - - -hospital -amoxicillin -available - - - -Hospital A -0.5902375 -3790 - - -Hospital B -0.5636153 -4315 - - -Hospital C -0.5920537 -1787 - - -Hospital D -0.5911290 -2480 - - - -These functions can also be used to get the proportion of multiple -antibiotics, to calculate empiric susceptibility of combination -therapies very easily: - -data_1st %>% - group_by(genus) %>% - summarise( - amoxiclav = susceptibility(AMC), - gentamicin = susceptibility(GEN), - amoxiclav_genta = susceptibility(AMC, GEN) - ) - - -genus -amoxiclav -gentamicin -amoxiclav_genta - - - -Escherichia -0.6703444 -0.6482782 -0.8836510 - - -Klebsiella -0.6513410 -0.6636015 -0.8919540 - - -Staphylococcus -0.6615911 -0.6737104 -0.8899938 - - -Streptococcus -0.4654526 -0.0000000 -0.4654526 - - - -Or if you are curious for the resistance within certain antibiotic -classes, use a antibiotic class selector such as -penicillins(), which automatically will include the columns -AMX and AMC of our data: - -data_1st %>% - # group by hospital - group_by(hospital) %>% - # / -> select all penicillins in the data for calculation - # | / -> use resistance() for all peni's per hospital - # | | / -> print as percentages - summarise(across(penicillins(), resistance, as_percent = TRUE)) %>% - # format the antibiotic column names, using so-called snake case, - # so 'Amoxicillin/clavulanic acid' becomes 'amoxicillin_clavulanic_acid' - rename_with(set_ab_names, penicillins()) - - -hospital -amoxicillin -amoxicillin_clavulanic_acid - - - -Hospital A -59.0% -38.0% - - -Hospital B -56.4% -35.0% - - -Hospital C -59.2% -37.3% - - -Hospital D -59.1% -36.1% - - - -To make a transition to the next part, let’s see how differences in -the previously calculated combination therapies could be plotted: - -data_1st %>% - group_by(genus) %>% - summarise( - "1. Amoxi/clav" = susceptibility(AMC), - "2. Gentamicin" = susceptibility(GEN), - "3. Amoxi/clav + genta" = susceptibility(AMC, GEN) - ) %>% - # pivot_longer() from the tidyr package "lengthens" data: - tidyr::pivot_longer(-genus, names_to = "antibiotic") %>% - ggplot(aes( - x = genus, - y = value, - fill = antibiotic - )) + - geom_col(position = "dodge2") - - - -Plots - -To show results in plots, most R users would nowadays use the -ggplot2 package. This package lets you create plots in -layers. You can read more about it on their website. A quick -example would look like these syntaxes: - -ggplot( - data = a_data_set, - mapping = aes( - x = year, - y = value - ) -) + - geom_col() + - labs( - title = "A title", - subtitle = "A subtitle", - x = "My X axis", - y = "My Y axis" - ) - -# or as short as: -ggplot(a_data_set) + - geom_bar(aes(year)) -The AMR package contains functions to extend this -ggplot2 package, for example geom_sir(). It -automatically transforms data with count_df() or -proportion_df() and show results in stacked bars. Its -simplest and shortest example: - -ggplot(data_1st) + - geom_sir(translate_ab = FALSE) - -Omit the translate_ab = FALSE to have the antibiotic -codes (AMX, AMC, CIP, GEN) translated to official WHO names -(amoxicillin, amoxicillin/clavulanic acid, ciprofloxacin, -gentamicin). -If we group on e.g. the genus column and add some -additional functions from our package, we can create this: - -# group the data on `genus` -ggplot(data_1st %>% group_by(genus)) + - # create bars with genus on x axis - # it looks for variables with class `sir`, - # of which we have 4 (earlier created with `as.sir`) - geom_sir(x = "genus") + - # split plots on antibiotic - facet_sir(facet = "antibiotic") + - # set colours to the SIR interpretations (colour-blind friendly) - scale_sir_colours() + - # show percentages on y axis - scale_y_percent(breaks = 0:4 * 25) + - # turn 90 degrees, to make it bars instead of columns - coord_flip() + - # add labels - labs( - title = "Resistance per genus and antibiotic", - subtitle = "(this is fake data)" - ) + - # and print genus in italic to follow our convention - # (is now y axis because we turned the plot) - theme(axis.text.y = element_text(face = "italic")) - -To simplify this, we also created the ggplot_sir() -function, which combines almost all above functions: - -data_1st %>% - group_by(genus) %>% - ggplot_sir( - x = "genus", - facet = "antibiotic", - breaks = 0:4 * 25, - datalabels = FALSE - ) + - coord_flip() - - -Plotting MIC and disk diffusion values - -The AMR package also extends the plot() and -ggplot2::autoplot() functions for plotting minimum -inhibitory concentrations (MIC, created with as.mic()) and -disk diffusion diameters (created with as.disk()). -With the random_mic() and random_disk() -functions, we can generate sampled values for the new data types (S3 -classes) <mic> and <disk>: - -mic_values <- random_mic(size = 100) -mic_values -#> Class 'mic' -#> [1] 16 128 0.01 0.002 4 16 0.01 0.01 128 0.5 -#> [11] 1 4 0.01 16 64 0.25 16 4 0.0625 0.002 -#> [21] 4 0.5 0.5 0.001 0.125 32 64 0.005 0.025 0.0625 -#> [31] 0.125 32 0.25 0.25 0.5 0.001 32 0.01 8 256 -#> [41] 0.5 64 256 0.025 1 0.025 2 32 8 0.01 -#> [51] 0.025 0.01 0.001 0.0625 1 1 16 0.001 64 0.125 -#> [61] 0.0625 0.025 8 128 4 4 0.025 16 8 1 -#> [71] 4 2 0.5 0.5 1 0.002 128 256 0.025 0.005 -#> [81] 64 2 0.25 64 0.002 0.005 1 256 64 0.001 -#> [91] 8 64 0.25 4 1 32 16 2 32 8 - -# base R: -plot(mic_values) - - -# ggplot2: -autoplot(mic_values) - -But we could also be more specific, by generating MICs that are -likely to be found in E. coli for ciprofloxacin: - -mic_values <- random_mic(size = 100, mo = "E. coli", ab = "cipro") -For the plot() and autoplot() function, we -can define the microorganism and an antimicrobial agent the same way. -This will add the interpretation of those values according to a chosen -guidelines (defaults to the latest EUCAST guideline). -Default colours are colour-blind friendly, while maintaining the -convention that e.g. ‘susceptible’ should be green and ‘resistant’ -should be red: - -# base R: -plot(mic_values, mo = "E. coli", ab = "cipro") - - -# ggplot2: -autoplot(mic_values, mo = "E. coli", ab = "cipro") - -For disk diffusion values, there is not much of a difference in -plotting: - -disk_values <- random_disk(size = 100, mo = "E. coli", ab = "cipro") -disk_values -#> Class 'disk' -#> [1] 23 30 22 17 17 18 24 30 24 17 19 28 19 31 28 17 28 18 20 31 27 25 22 22 18 -#> [26] 30 29 25 26 25 20 18 26 26 28 21 22 25 19 27 22 31 19 18 25 30 24 28 30 19 -#> [51] 22 18 26 19 20 20 31 26 30 22 17 17 27 23 19 30 27 28 25 28 20 23 28 27 19 -#> [76] 25 30 25 27 28 24 24 21 22 22 31 21 21 27 17 19 29 29 29 26 28 21 28 29 31 - -# base R: -plot(disk_values, mo = "E. coli", ab = "cipro") - -And when using the ggplot2 package, but now choosing the -latest implemented CLSI guideline (notice that the EUCAST-specific term -“Susceptible, incr. exp.” has changed to “Intermediate”): - -autoplot( - disk_values, - mo = "E. coli", - ab = "cipro", - guideline = "CLSI" -) - - - - -Independence test - -The next example uses the example_isolates data set. -This is a data set included with this package and contains 2,000 -microbial isolates with their full antibiograms. It reflects reality and -can be used to practise AMR data analysis. -We will compare the resistance to amoxicillin/clavulanic acid (column -AMC) between an ICU and other clinical wards. The input for -the fisher.test() can be retrieved with a transformation -like this: - -# use package 'tidyr' to pivot data: -library(tidyr) - -check_AMC <- example_isolates %>% - filter(ward %in% c("ICU", "Clinical")) %>% # filter on only these wards - select(ward, AMC) %>% # select the wards and amoxi/clav - group_by(ward) %>% # group on the wards - count_df(combine_SI = TRUE) %>% # count all isolates per group (ward) - pivot_wider( - names_from = ward, # transform output so "ICU" and "Clinical" are columns - values_from = value - ) %>% - select(ICU, Clinical) %>% # and only select these columns - as.matrix() # transform to a good old matrix for fisher.test() - -check_AMC -#> ICU Clinical -#> [1,] 396 942 -#> [2,] 184 240 -We can apply the test now with: - -# do Fisher's Exact Test -fisher.test(check_AMC) -#> -#> Fisher's Exact Test for Count Data -#> -#> data: check_AMC -#> p-value = 2.263e-07 -#> alternative hypothesis: true odds ratio is not equal to 1 -#> 95 percent confidence interval: -#> 0.435261 0.691614 -#> sample estimates: -#> odds ratio -#> 0.5485079 -As can be seen, the p value is practically zero (0.0000002263247), -which means that the amoxicillin/clavulanic acid resistance found in -isolates between patients in ICUs and other clinical wards are really -different. + summarise(amoxicillin = resistance(AMX)) +#> # A tibble: 3 × 2 +#> hospital amoxicillin +#> <chr> <dbl> +#> 1 A 0.343 +#> 2 B 0.569 +#> 3 C 0.375 -Author: Dr. Matthijs Berends +Author: Dr. Matthijs Berends, 26th Feb 2023
proportion_SI()
-data_1st %>% resistance(AMX) -#> [1] 0.5813935
data_1st %>% resistance(AMX) -#> [1] 0.5813935
+our_data_1st %>% resistance(AMX) +#> [1] 0.4318355
our_data_1st %>% resistance(AMX) +#> [1] 0.4318355
Or can be used in conjunction with group_by() and summarise(), both from the dplyr package:
group_by()
summarise()
-data_1st %>% + +our_data_1st %>% group_by(hospital) %>% - summarise(amoxicillin = resistance(AMX)) - - -hospital -amoxicillin - - - -Hospital A -0.5902375 - - -Hospital B -0.5636153 - - -Hospital C -0.5920537 - - -Hospital D -0.5911290 - - - -Of course it would be very convenient to know the number of isolates -responsible for the percentages. For that purpose the -n_sir() can be used, which works exactly like -n_distinct() from the dplyr package. It counts -all isolates available for every group (i.e. values S, I or R): - -data_1st %>% - group_by(hospital) %>% - summarise( - amoxicillin = resistance(AMX), - available = n_sir(AMX) - ) - - -hospital -amoxicillin -available - - - -Hospital A -0.5902375 -3790 - - -Hospital B -0.5636153 -4315 - - -Hospital C -0.5920537 -1787 - - -Hospital D -0.5911290 -2480 - - - -These functions can also be used to get the proportion of multiple -antibiotics, to calculate empiric susceptibility of combination -therapies very easily: - -data_1st %>% - group_by(genus) %>% - summarise( - amoxiclav = susceptibility(AMC), - gentamicin = susceptibility(GEN), - amoxiclav_genta = susceptibility(AMC, GEN) - ) - - -genus -amoxiclav -gentamicin -amoxiclav_genta - - - -Escherichia -0.6703444 -0.6482782 -0.8836510 - - -Klebsiella -0.6513410 -0.6636015 -0.8919540 - - -Staphylococcus -0.6615911 -0.6737104 -0.8899938 - - -Streptococcus -0.4654526 -0.0000000 -0.4654526 - - - -Or if you are curious for the resistance within certain antibiotic -classes, use a antibiotic class selector such as -penicillins(), which automatically will include the columns -AMX and AMC of our data: - -data_1st %>% - # group by hospital - group_by(hospital) %>% - # / -> select all penicillins in the data for calculation - # | / -> use resistance() for all peni's per hospital - # | | / -> print as percentages - summarise(across(penicillins(), resistance, as_percent = TRUE)) %>% - # format the antibiotic column names, using so-called snake case, - # so 'Amoxicillin/clavulanic acid' becomes 'amoxicillin_clavulanic_acid' - rename_with(set_ab_names, penicillins()) - - -hospital -amoxicillin -amoxicillin_clavulanic_acid - - - -Hospital A -59.0% -38.0% - - -Hospital B -56.4% -35.0% - - -Hospital C -59.2% -37.3% - - -Hospital D -59.1% -36.1% - - - -To make a transition to the next part, let’s see how differences in -the previously calculated combination therapies could be plotted: - -data_1st %>% - group_by(genus) %>% - summarise( - "1. Amoxi/clav" = susceptibility(AMC), - "2. Gentamicin" = susceptibility(GEN), - "3. Amoxi/clav + genta" = susceptibility(AMC, GEN) - ) %>% - # pivot_longer() from the tidyr package "lengthens" data: - tidyr::pivot_longer(-genus, names_to = "antibiotic") %>% - ggplot(aes( - x = genus, - y = value, - fill = antibiotic - )) + - geom_col(position = "dodge2") - -
data_1st %>% + +our_data_1st %>% group_by(hospital) %>% - summarise(amoxicillin = resistance(AMX)) - - -hospital -amoxicillin - - - -Hospital A -0.5902375 - - -Hospital B -0.5636153 - - -Hospital C -0.5920537 - - -Hospital D -0.5911290 - - - -Of course it would be very convenient to know the number of isolates -responsible for the percentages. For that purpose the -n_sir() can be used, which works exactly like -n_distinct() from the dplyr package. It counts -all isolates available for every group (i.e. values S, I or R): - -data_1st %>% - group_by(hospital) %>% - summarise( - amoxicillin = resistance(AMX), - available = n_sir(AMX) - ) - - -hospital -amoxicillin -available - - - -Hospital A -0.5902375 -3790 - - -Hospital B -0.5636153 -4315 - - -Hospital C -0.5920537 -1787 - - -Hospital D -0.5911290 -2480 - - - -These functions can also be used to get the proportion of multiple -antibiotics, to calculate empiric susceptibility of combination -therapies very easily: - -data_1st %>% - group_by(genus) %>% - summarise( - amoxiclav = susceptibility(AMC), - gentamicin = susceptibility(GEN), - amoxiclav_genta = susceptibility(AMC, GEN) - ) - - -genus -amoxiclav -gentamicin -amoxiclav_genta - - - -Escherichia -0.6703444 -0.6482782 -0.8836510 - - -Klebsiella -0.6513410 -0.6636015 -0.8919540 - - -Staphylococcus -0.6615911 -0.6737104 -0.8899938 - - -Streptococcus -0.4654526 -0.0000000 -0.4654526 - - - -Or if you are curious for the resistance within certain antibiotic -classes, use a antibiotic class selector such as -penicillins(), which automatically will include the columns -AMX and AMC of our data: - -data_1st %>% - # group by hospital - group_by(hospital) %>% - # / -> select all penicillins in the data for calculation - # | / -> use resistance() for all peni's per hospital - # | | / -> print as percentages - summarise(across(penicillins(), resistance, as_percent = TRUE)) %>% - # format the antibiotic column names, using so-called snake case, - # so 'Amoxicillin/clavulanic acid' becomes 'amoxicillin_clavulanic_acid' - rename_with(set_ab_names, penicillins()) - - -hospital -amoxicillin -amoxicillin_clavulanic_acid - - - -Hospital A -59.0% -38.0% - - -Hospital B -56.4% -35.0% - - -Hospital C -59.2% -37.3% - - -Hospital D -59.1% -36.1% - - - -To make a transition to the next part, let’s see how differences in -the previously calculated combination therapies could be plotted: - -data_1st %>% - group_by(genus) %>% - summarise( - "1. Amoxi/clav" = susceptibility(AMC), - "2. Gentamicin" = susceptibility(GEN), - "3. Amoxi/clav + genta" = susceptibility(AMC, GEN) - ) %>% - # pivot_longer() from the tidyr package "lengthens" data: - tidyr::pivot_longer(-genus, names_to = "antibiotic") %>% - ggplot(aes( - x = genus, - y = value, - fill = antibiotic - )) + - geom_col(position = "dodge2") - -
+our_data_1st %>% group_by(hospital) %>% - summarise(amoxicillin = resistance(AMX))
our_data_1st %>% group_by(hospital) %>% - summarise(amoxicillin = resistance(AMX))
Of course it would be very convenient to know the number of isolates -responsible for the percentages. For that purpose the -n_sir() can be used, which works exactly like -n_distinct() from the dplyr package. It counts -all isolates available for every group (i.e. values S, I or R):
n_sir()
n_distinct()
-data_1st %>% - group_by(hospital) %>% - summarise( - amoxicillin = resistance(AMX), - available = n_sir(AMX) - )
data_1st %>% - group_by(hospital) %>% - summarise( - amoxicillin = resistance(AMX), - available = n_sir(AMX) - )
These functions can also be used to get the proportion of multiple -antibiotics, to calculate empiric susceptibility of combination -therapies very easily:
-data_1st %>% - group_by(genus) %>% - summarise( - amoxiclav = susceptibility(AMC), - gentamicin = susceptibility(GEN), - amoxiclav_genta = susceptibility(AMC, GEN) - )
data_1st %>% - group_by(genus) %>% - summarise( - amoxiclav = susceptibility(AMC), - gentamicin = susceptibility(GEN), - amoxiclav_genta = susceptibility(AMC, GEN) - )
Or if you are curious for the resistance within certain antibiotic -classes, use a antibiotic class selector such as -penicillins(), which automatically will include the columns -AMX and AMC of our data:
penicillins()
-data_1st %>% - # group by hospital - group_by(hospital) %>% - # / -> select all penicillins in the data for calculation - # | / -> use resistance() for all peni's per hospital - # | | / -> print as percentages - summarise(across(penicillins(), resistance, as_percent = TRUE)) %>% - # format the antibiotic column names, using so-called snake case, - # so 'Amoxicillin/clavulanic acid' becomes 'amoxicillin_clavulanic_acid' - rename_with(set_ab_names, penicillins())
data_1st %>% - # group by hospital - group_by(hospital) %>% - # / -> select all penicillins in the data for calculation - # | / -> use resistance() for all peni's per hospital - # | | / -> print as percentages - summarise(across(penicillins(), resistance, as_percent = TRUE)) %>% - # format the antibiotic column names, using so-called snake case, - # so 'Amoxicillin/clavulanic acid' becomes 'amoxicillin_clavulanic_acid' - rename_with(set_ab_names, penicillins())
To make a transition to the next part, let’s see how differences in -the previously calculated combination therapies could be plotted:
-data_1st %>% - group_by(genus) %>% - summarise( - "1. Amoxi/clav" = susceptibility(AMC), - "2. Gentamicin" = susceptibility(GEN), - "3. Amoxi/clav + genta" = susceptibility(AMC, GEN) - ) %>% - # pivot_longer() from the tidyr package "lengthens" data: - tidyr::pivot_longer(-genus, names_to = "antibiotic") %>% - ggplot(aes( - x = genus, - y = value, - fill = antibiotic - )) + - geom_col(position = "dodge2")
data_1st %>% - group_by(genus) %>% - summarise( - "1. Amoxi/clav" = susceptibility(AMC), - "2. Gentamicin" = susceptibility(GEN), - "3. Amoxi/clav + genta" = susceptibility(AMC, GEN) - ) %>% - # pivot_longer() from the tidyr package "lengthens" data: - tidyr::pivot_longer(-genus, names_to = "antibiotic") %>% - ggplot(aes( - x = genus, - y = value, - fill = antibiotic - )) + - geom_col(position = "dodge2")
To show results in plots, most R users would nowadays use the -ggplot2 package. This package lets you create plots in -layers. You can read more about it on their website. A quick -example would look like these syntaxes:
-ggplot( - data = a_data_set, - mapping = aes( - x = year, - y = value - ) -) + - geom_col() + - labs( - title = "A title", - subtitle = "A subtitle", - x = "My X axis", - y = "My Y axis" - ) - -# or as short as: -ggplot(a_data_set) + - geom_bar(aes(year))
ggplot( - data = a_data_set, - mapping = aes( - x = year, - y = value - ) -) + - geom_col() + - labs( - title = "A title", - subtitle = "A subtitle", - x = "My X axis", - y = "My Y axis" - ) - -# or as short as: -ggplot(a_data_set) + - geom_bar(aes(year))
The AMR package contains functions to extend this -ggplot2 package, for example geom_sir(). It -automatically transforms data with count_df() or -proportion_df() and show results in stacked bars. Its -simplest and shortest example:
geom_sir()
count_df()
proportion_df()
-ggplot(data_1st) + - geom_sir(translate_ab = FALSE)
ggplot(data_1st) + - geom_sir(translate_ab = FALSE)
Omit the translate_ab = FALSE to have the antibiotic -codes (AMX, AMC, CIP, GEN) translated to official WHO names -(amoxicillin, amoxicillin/clavulanic acid, ciprofloxacin, -gentamicin).
translate_ab = FALSE
If we group on e.g. the genus column and add some -additional functions from our package, we can create this:
-# group the data on `genus` -ggplot(data_1st %>% group_by(genus)) + - # create bars with genus on x axis - # it looks for variables with class `sir`, - # of which we have 4 (earlier created with `as.sir`) - geom_sir(x = "genus") + - # split plots on antibiotic - facet_sir(facet = "antibiotic") + - # set colours to the SIR interpretations (colour-blind friendly) - scale_sir_colours() + - # show percentages on y axis - scale_y_percent(breaks = 0:4 * 25) + - # turn 90 degrees, to make it bars instead of columns - coord_flip() + - # add labels - labs( - title = "Resistance per genus and antibiotic", - subtitle = "(this is fake data)" - ) + - # and print genus in italic to follow our convention - # (is now y axis because we turned the plot) - theme(axis.text.y = element_text(face = "italic"))
# group the data on `genus` -ggplot(data_1st %>% group_by(genus)) + - # create bars with genus on x axis - # it looks for variables with class `sir`, - # of which we have 4 (earlier created with `as.sir`) - geom_sir(x = "genus") + - # split plots on antibiotic - facet_sir(facet = "antibiotic") + - # set colours to the SIR interpretations (colour-blind friendly) - scale_sir_colours() + - # show percentages on y axis - scale_y_percent(breaks = 0:4 * 25) + - # turn 90 degrees, to make it bars instead of columns - coord_flip() + - # add labels - labs( - title = "Resistance per genus and antibiotic", - subtitle = "(this is fake data)" - ) + - # and print genus in italic to follow our convention - # (is now y axis because we turned the plot) - theme(axis.text.y = element_text(face = "italic"))
To simplify this, we also created the ggplot_sir() -function, which combines almost all above functions:
ggplot_sir()
-data_1st %>% - group_by(genus) %>% - ggplot_sir( - x = "genus", - facet = "antibiotic", - breaks = 0:4 * 25, - datalabels = FALSE - ) + - coord_flip()
data_1st %>% - group_by(genus) %>% - ggplot_sir( - x = "genus", - facet = "antibiotic", - breaks = 0:4 * 25, - datalabels = FALSE - ) + - coord_flip()
The AMR package also extends the plot() and -ggplot2::autoplot() functions for plotting minimum -inhibitory concentrations (MIC, created with as.mic()) and -disk diffusion diameters (created with as.disk()).
plot()
ggplot2::autoplot()
as.mic()
as.disk()
With the random_mic() and random_disk() -functions, we can generate sampled values for the new data types (S3 -classes) <mic> and <disk>:
random_mic()
random_disk()
<mic>
<disk>
-mic_values <- random_mic(size = 100) -mic_values -#> Class 'mic' -#> [1] 16 128 0.01 0.002 4 16 0.01 0.01 128 0.5 -#> [11] 1 4 0.01 16 64 0.25 16 4 0.0625 0.002 -#> [21] 4 0.5 0.5 0.001 0.125 32 64 0.005 0.025 0.0625 -#> [31] 0.125 32 0.25 0.25 0.5 0.001 32 0.01 8 256 -#> [41] 0.5 64 256 0.025 1 0.025 2 32 8 0.01 -#> [51] 0.025 0.01 0.001 0.0625 1 1 16 0.001 64 0.125 -#> [61] 0.0625 0.025 8 128 4 4 0.025 16 8 1 -#> [71] 4 2 0.5 0.5 1 0.002 128 256 0.025 0.005 -#> [81] 64 2 0.25 64 0.002 0.005 1 256 64 0.001 -#> [91] 8 64 0.25 4 1 32 16 2 32 8
mic_values <- random_mic(size = 100) -mic_values -#> Class 'mic' -#> [1] 16 128 0.01 0.002 4 16 0.01 0.01 128 0.5 -#> [11] 1 4 0.01 16 64 0.25 16 4 0.0625 0.002 -#> [21] 4 0.5 0.5 0.001 0.125 32 64 0.005 0.025 0.0625 -#> [31] 0.125 32 0.25 0.25 0.5 0.001 32 0.01 8 256 -#> [41] 0.5 64 256 0.025 1 0.025 2 32 8 0.01 -#> [51] 0.025 0.01 0.001 0.0625 1 1 16 0.001 64 0.125 -#> [61] 0.0625 0.025 8 128 4 4 0.025 16 8 1 -#> [71] 4 2 0.5 0.5 1 0.002 128 256 0.025 0.005 -#> [81] 64 2 0.25 64 0.002 0.005 1 256 64 0.001 -#> [91] 8 64 0.25 4 1 32 16 2 32 8
-# base R: -plot(mic_values)
# base R: -plot(mic_values)
-# ggplot2: -autoplot(mic_values)
# ggplot2: -autoplot(mic_values)
But we could also be more specific, by generating MICs that are -likely to be found in E. coli for ciprofloxacin:
-mic_values <- random_mic(size = 100, mo = "E. coli", ab = "cipro")
mic_values <- random_mic(size = 100, mo = "E. coli", ab = "cipro")
For the plot() and autoplot() function, we -can define the microorganism and an antimicrobial agent the same way. -This will add the interpretation of those values according to a chosen -guidelines (defaults to the latest EUCAST guideline).
Default colours are colour-blind friendly, while maintaining the -convention that e.g. ‘susceptible’ should be green and ‘resistant’ -should be red:
-# base R: -plot(mic_values, mo = "E. coli", ab = "cipro")
# base R: -plot(mic_values, mo = "E. coli", ab = "cipro")
-# ggplot2: -autoplot(mic_values, mo = "E. coli", ab = "cipro")
# ggplot2: -autoplot(mic_values, mo = "E. coli", ab = "cipro")
For disk diffusion values, there is not much of a difference in -plotting:
-disk_values <- random_disk(size = 100, mo = "E. coli", ab = "cipro") -disk_values -#> Class 'disk' -#> [1] 23 30 22 17 17 18 24 30 24 17 19 28 19 31 28 17 28 18 20 31 27 25 22 22 18 -#> [26] 30 29 25 26 25 20 18 26 26 28 21 22 25 19 27 22 31 19 18 25 30 24 28 30 19 -#> [51] 22 18 26 19 20 20 31 26 30 22 17 17 27 23 19 30 27 28 25 28 20 23 28 27 19 -#> [76] 25 30 25 27 28 24 24 21 22 22 31 21 21 27 17 19 29 29 29 26 28 21 28 29 31
disk_values <- random_disk(size = 100, mo = "E. coli", ab = "cipro") -disk_values -#> Class 'disk' -#> [1] 23 30 22 17 17 18 24 30 24 17 19 28 19 31 28 17 28 18 20 31 27 25 22 22 18 -#> [26] 30 29 25 26 25 20 18 26 26 28 21 22 25 19 27 22 31 19 18 25 30 24 28 30 19 -#> [51] 22 18 26 19 20 20 31 26 30 22 17 17 27 23 19 30 27 28 25 28 20 23 28 27 19 -#> [76] 25 30 25 27 28 24 24 21 22 22 31 21 21 27 17 19 29 29 29 26 28 21 28 29 31
-# base R: -plot(disk_values, mo = "E. coli", ab = "cipro")
# base R: -plot(disk_values, mo = "E. coli", ab = "cipro")
And when using the ggplot2 package, but now choosing the -latest implemented CLSI guideline (notice that the EUCAST-specific term -“Susceptible, incr. exp.” has changed to “Intermediate”):
-autoplot( - disk_values, - mo = "E. coli", - ab = "cipro", - guideline = "CLSI" -)
autoplot( - disk_values, - mo = "E. coli", - ab = "cipro", - guideline = "CLSI" -)
The next example uses the example_isolates data set. -This is a data set included with this package and contains 2,000 -microbial isolates with their full antibiograms. It reflects reality and -can be used to practise AMR data analysis.
example_isolates
We will compare the resistance to amoxicillin/clavulanic acid (column -AMC) between an ICU and other clinical wards. The input for -the fisher.test() can be retrieved with a transformation -like this:
fisher.test()
-# use package 'tidyr' to pivot data: -library(tidyr) - -check_AMC <- example_isolates %>% - filter(ward %in% c("ICU", "Clinical")) %>% # filter on only these wards - select(ward, AMC) %>% # select the wards and amoxi/clav - group_by(ward) %>% # group on the wards - count_df(combine_SI = TRUE) %>% # count all isolates per group (ward) - pivot_wider( - names_from = ward, # transform output so "ICU" and "Clinical" are columns - values_from = value - ) %>% - select(ICU, Clinical) %>% # and only select these columns - as.matrix() # transform to a good old matrix for fisher.test() - -check_AMC -#> ICU Clinical -#> [1,] 396 942 -#> [2,] 184 240
# use package 'tidyr' to pivot data: -library(tidyr) - -check_AMC <- example_isolates %>% - filter(ward %in% c("ICU", "Clinical")) %>% # filter on only these wards - select(ward, AMC) %>% # select the wards and amoxi/clav - group_by(ward) %>% # group on the wards - count_df(combine_SI = TRUE) %>% # count all isolates per group (ward) - pivot_wider( - names_from = ward, # transform output so "ICU" and "Clinical" are columns - values_from = value - ) %>% - select(ICU, Clinical) %>% # and only select these columns - as.matrix() # transform to a good old matrix for fisher.test() - -check_AMC -#> ICU Clinical -#> [1,] 396 942 -#> [2,] 184 240
We can apply the test now with:
-# do Fisher's Exact Test -fisher.test(check_AMC) -#> -#> Fisher's Exact Test for Count Data -#> -#> data: check_AMC -#> p-value = 2.263e-07 -#> alternative hypothesis: true odds ratio is not equal to 1 -#> 95 percent confidence interval: -#> 0.435261 0.691614 -#> sample estimates: -#> odds ratio -#> 0.5485079
# do Fisher's Exact Test -fisher.test(check_AMC) -#> -#> Fisher's Exact Test for Count Data -#> -#> data: check_AMC -#> p-value = 2.263e-07 -#> alternative hypothesis: true odds ratio is not equal to 1 -#> 95 percent confidence interval: -#> 0.435261 0.691614 -#> sample estimates: -#> odds ratio -#> 0.5485079
As can be seen, the p value is practically zero (0.0000002263247), -which means that the amoxicillin/clavulanic acid resistance found in -isolates between patients in ICUs and other clinical wards are really -different.
Author: Dr. Matthijs Berends
Author: Dr. Matthijs Berends, 26th Feb 2023