diff --git a/DESCRIPTION b/DESCRIPTION index 696a98db..ac2e70e5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 2.1.1.9121 -Date: 2024-12-19 +Version: 2.1.1.9122 +Date: 2024-12-20 Title: Antimicrobial Resistance Data Analysis Description: Functions to simplify and standardise antimicrobial resistance (AMR) data analysis and to work with microbial and antimicrobial properties by diff --git a/NEWS.md b/NEWS.md index 0d8ce2f6..6442079c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# AMR 2.1.1.9121 +# AMR 2.1.1.9122 *(this beta version will eventually become v3.0. We're happy to reach a new major milestone soon, which will be all about the new One Health support! Install this beta using [the instructions here](https://msberends.github.io/AMR/#latest-development-version).)* diff --git a/PythonPackage/AMR/AMR.egg-info/PKG-INFO b/PythonPackage/AMR/AMR.egg-info/PKG-INFO index 347e5180..17b97b6c 100644 --- a/PythonPackage/AMR/AMR.egg-info/PKG-INFO +++ b/PythonPackage/AMR/AMR.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: AMR -Version: 2.1.1.9121 +Version: 2.1.1.9122 Summary: A Python wrapper for the AMR R package Home-page: https://github.com/msberends/AMR Author: Matthijs Berends diff --git a/PythonPackage/AMR/dist/AMR-2.1.1.9121-py3-none-any.whl b/PythonPackage/AMR/dist/AMR-2.1.1.9122-py3-none-any.whl similarity index 84% rename from PythonPackage/AMR/dist/AMR-2.1.1.9121-py3-none-any.whl rename to PythonPackage/AMR/dist/AMR-2.1.1.9122-py3-none-any.whl index 29d89122..7e659733 100644 Binary files a/PythonPackage/AMR/dist/AMR-2.1.1.9121-py3-none-any.whl and b/PythonPackage/AMR/dist/AMR-2.1.1.9122-py3-none-any.whl differ diff --git a/PythonPackage/AMR/dist/amr-2.1.1.9121.tar.gz b/PythonPackage/AMR/dist/amr-2.1.1.9121.tar.gz deleted file mode 100644 index e6ad65a7..00000000 Binary files a/PythonPackage/AMR/dist/amr-2.1.1.9121.tar.gz and /dev/null differ diff --git a/PythonPackage/AMR/dist/amr-2.1.1.9122.tar.gz 
b/PythonPackage/AMR/dist/amr-2.1.1.9122.tar.gz new file mode 100644 index 00000000..f57e1433 Binary files /dev/null and b/PythonPackage/AMR/dist/amr-2.1.1.9122.tar.gz differ diff --git a/PythonPackage/AMR/setup.py b/PythonPackage/AMR/setup.py index 8ea2c7a8..b83dc2c5 100644 --- a/PythonPackage/AMR/setup.py +++ b/PythonPackage/AMR/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup( name='AMR', - version='2.1.1.9121', + version='2.1.1.9122', packages=find_packages(), install_requires=[ 'rpy2', diff --git a/R/ggplot_sir.R b/R/ggplot_sir.R index 36fc3fb2..59207cf4 100755 --- a/R/ggplot_sir.R +++ b/R/ggplot_sir.R @@ -40,7 +40,6 @@ #' @inheritParams proportion #' @param nrow (when using `facet`) number of rows #' @param colours a named vactor with colour to be used for filling. The default colours are colour-blind friendly. -#' @param aesthetics aesthetics to apply the colours to - the default is "fill" but can also be (a combination of) "alpha", "colour", "fill", "linetype", "shape" or "size" #' @param datalabels show datalabels using [labels_sir_count()] #' @param datalabels.size size of the datalabels #' @param datalabels.colour colour of the datalabels diff --git a/R/plotting.R b/R/plotting.R index e0067178..29135e16 100755 --- a/R/plotting.R +++ b/R/plotting.R @@ -42,7 +42,10 @@ #' @param colours_SIR colours to use for filling in the bars, must be a vector of three values (in the order S, I and R). The default colours are colour-blind friendly. #' @param language language to be used to translate 'Susceptible', 'Increased exposure'/'Intermediate' and 'Resistant' - the default is system language (see [get_AMR_locale()]) and can be overwritten by setting the package option [`AMR_locale`][AMR-options], e.g. `options(AMR_locale = "de")`, see [translate]. Use `language = NULL` or `language = ""` to prevent translation. #' @param expand a [logical] to indicate whether the range on the x axis should be expanded between the lowest and highest value. 
For MIC values, intermediate values will be factors of 2 starting from the highest MIC value. For disk diameters, the whole diameter range will be filled. +#' @param aesthetics aesthetics to apply the colours to - the default is "fill" but can also be (a combination of) "alpha", "colour", "fill", "linetype", "shape" or "size" #' @inheritParams as.sir +#' @inheritParams ggplot_sir +#' @inheritParams proportion #' @details #' The interpretation of "I" will be named "Increased exposure" for all EUCAST guidelines since 2019, and will be named "Intermediate" in all other cases. #' @@ -80,7 +83,7 @@ #' plot(some_disk_values, mo = "Escherichia coli", ab = "cipro", language = "nl") #' #' -#' # Plotting using scale_x_mic() +#' # Plotting using scale_x_mic() --------------------------------------------- #' \donttest{ #' if (require("ggplot2")) { #' mic_plot <- ggplot(data.frame(mics = as.mic(c(0.25, "<=4", 4, 8, 32, ">=32")), @@ -120,6 +123,25 @@ #' if (require("ggplot2")) { #' autoplot(some_sir_values) #' } +#' +#' # Plotting using scale_y_percent() ----------------------------------------- +#' if (require("ggplot2")) { +#' p <- ggplot(data.frame(mics = as.mic(c(0.25, "<=4", 4, 8, 32, ">=32")), +#' counts = c(1, 1, 2, 2, 3, 3)), +#' aes(mics, counts / sum(counts))) + +#' geom_col() +#' print(p) +#' +#' p2 <- p + +#' scale_y_percent() + +#' theme_sir() +#' print(p2) +#' +#' p + +#' scale_y_percent(breaks = seq(from = 0, to = 1, by = 0.1), +#' limits = c(0, 1)) + +#' theme_sir() +#' } #' } NULL @@ -954,7 +976,7 @@ facet_sir <- function(facet = c("interpretation", "antibiotic"), nrow = NULL) { #' @rdname plot #' @export -scale_y_percent <- function(breaks = function(x) seq(0, max(x, na.rm = TRUE), 0.1), limits = NULL) { +scale_y_percent <- function(breaks = function(x) seq(0, max(x, na.rm = TRUE), 0.1), limits = c(0, NA)) { stop_ifnot_installed("ggplot2") meet_criteria(breaks, allow_class = c("numeric", "integer", "function")) meet_criteria(limits, allow_class = c("numeric", 
"integer"), has_length = 2, allow_NULL = TRUE, allow_NA = TRUE) diff --git a/data-raw/gpt_training_text_v2.1.1.9121.txt b/data-raw/gpt_training_text_v2.1.1.9122.txt similarity index 99% rename from data-raw/gpt_training_text_v2.1.1.9121.txt rename to data-raw/gpt_training_text_v2.1.1.9122.txt index 17f9b3e3..4073905b 100644 --- a/data-raw/gpt_training_text_v2.1.1.9121.txt +++ b/data-raw/gpt_training_text_v2.1.1.9122.txt @@ -1,5 +1,5 @@ This files contains all context you must know about the AMR package for R. -First and foremost, you are trained on version 2.1.1.9121. Remember this whenever someone asks which AMR package version you’re at. +First and foremost, you are trained on version 2.1.1.9122. Remember this whenever someone asks which AMR package version you’re at. -------------------------------- THE PART HEREAFTER CONTAINS CONTENTS FROM FILE 'NAMESPACE': @@ -5448,8 +5448,6 @@ geom_sir( \item{y.title}{text to show as y axis description} \item{...}{other arguments passed on to \code{\link[=geom_sir]{geom_sir()}} or, in case of \code{\link[=scale_sir_colours]{scale_sir_colours()}}, named values to set colours. The default colours are colour-blind friendly, while maintaining the convention that e.g. 'susceptible' should be green and 'resistant' should be red. See \emph{Examples}.} - -\item{aesthetics}{aesthetics to apply the colours to - the default is "fill" but can also be (a combination of) "alpha", "colour", "fill", "linetype", "shape" or "size"} } \description{ Use these functions to create bar plots for AMR data analysis. All functions rely on \link[ggplot2:ggplot]{ggplot2} functions. 
@@ -7545,7 +7543,7 @@ facet_sir(facet = c("interpretation", "antibiotic"), nrow = NULL) scale_y_percent( breaks = function(x) seq(0, max(x, na.rm = TRUE), 0.1), - limits = NULL + limits = c(0, NA) ) scale_sir_colours( @@ -7597,6 +7595,28 @@ labels_sir_count( \item{include_PKPD}{a \link{logical} to indicate that PK/PD clinical breakpoints must be applied as a last resort - the default is \code{TRUE}. Can also be set with the package option \code{\link[=AMR-options]{AMR_include_PKPD}}.} \item{breakpoint_type}{the type of breakpoints to use, either "ECOFF", "animal", or "human". ECOFF stands for Epidemiological Cut-Off values. The default is \code{"human"}, which can also be set with the package option \code{\link[=AMR-options]{AMR_breakpoint_type}}. If \code{host} is set to values of veterinary species, this will automatically be set to \code{"animal"}.} + +\item{facet}{variable to split plots by, either \code{"interpretation"} (default) or \code{"antibiotic"} or a grouping variable} + +\item{nrow}{(when using \code{facet}) number of rows} + +\item{breaks}{a \link{numeric} vector of positions} + +\item{limits}{a \link{numeric} vector of length two providing limits of the scale, use \code{NA} to refer to the existing minimum or maximum} + +\item{aesthetics}{aesthetics to apply the colours to - the default is "fill" but can also be (a combination of) "alpha", "colour", "fill", "linetype", "shape" or "size"} + +\item{position}{position adjustment of bars, either \code{"fill"}, \code{"stack"} or \code{"dodge"}} + +\item{translate_ab}{a column name of the \link{antibiotics} data set to translate the antibiotic abbreviations to, using \code{\link[=ab_property]{ab_property()}}} + +\item{minimum}{the minimum allowed number of available (tested) isolates. Any isolate count lower than \code{minimum} will return \code{NA} with a warning. 
The default number of \code{30} isolates is advised by the Clinical and Laboratory Standards Institute (CLSI) as best practice, see \emph{Source}.} + +\item{combine_SI}{a \link{logical} to indicate whether all values of S, SDD, and I must be merged into one, so the output only consists of S+SDD+I vs. R (susceptible vs. resistant) - the default is \code{TRUE}} + +\item{datalabels.size}{size of the datalabels} + +\item{datalabels.colour}{colour of the datalabels} } \value{ The \code{autoplot()} functions return a \code{\link[ggplot2:ggplot]{ggplot}} model that is extendible with any \code{ggplot2} function. @@ -7641,7 +7661,7 @@ plot(some_disk_values, mo = "Escherichia coli", ab = "cipro") plot(some_disk_values, mo = "Escherichia coli", ab = "cipro", language = "nl") -# Plotting using scale_x_mic() +# Plotting using scale_x_mic() --------------------------------------------- \donttest{ if (require("ggplot2")) { mic_plot <- ggplot(data.frame(mics = as.mic(c(0.25, "<=4", 4, 8, 32, ">=32")), @@ -7681,6 +7701,25 @@ if (require("ggplot2")) { if (require("ggplot2")) { autoplot(some_sir_values) } + +# Plotting using scale_y_percent() ----------------------------------------- +if (require("ggplot2")) { + p <- ggplot(data.frame(mics = as.mic(c(0.25, "<=4", 4, 8, 32, ">=32")), + counts = c(1, 1, 2, 2, 3, 3)), + aes(mics, counts / sum(counts))) + + geom_col() + print(p) + + p2 <- p + + scale_y_percent() + + theme_sir() + print(p2) + + p + + scale_y_percent(breaks = seq(from = 0, to = 1, by = 0.1), + limits = c(0, 1)) + + theme_sir() +} } } @@ -8912,13 +8951,13 @@ THE PART HEREAFTER CONTAINS CONTENTS FROM FILE 'vignettes/AMR_with_tidymodels.Rm --- -title: "`AMR` with `tidymodels`" +title: "AMR with tidymodels" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > - %\VignetteIndexEntry{`AMR` with `tidymodels`} + %\VignetteIndexEntry{AMR with tidymodels} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: @@ -8935,22 +8974,20 @@ 
knitr::opts_chunk$set( ) ``` +> This page was entirely written by our [AMR for R Assistant](https://chatgpt.com/g/g-M4UNLwFi5-amr-for-r-assistant), a ChatGPT manually-trained model able to answer any question about the AMR package. + Antimicrobial resistance (AMR) is a global health crisis, and understanding resistance patterns is crucial for managing effective treatments. The `AMR` R package provides robust tools for analysing AMR data, including convenient antibiotic selector functions like `aminoglycosides()` and `betalactams()`. In this post, we will explore how to use the `tidymodels` framework to predict resistance patterns in the `example_isolates` dataset. -By leveraging the power of `tidymodels` and the `AMR` package, we’ll build a reproducible machine learning workflow to predict resistance to two important antibiotic classes: aminoglycosides and beta-lactams. - ---- +By leveraging the power of `tidymodels` and the `AMR` package, we’ll build a reproducible machine learning workflow to predict the Gramstain of the microorganism to two important antibiotic classes: aminoglycosides and beta-lactams. ### **Objective** -Our goal is to build a predictive model using the `tidymodels` framework to determine resistance patterns based on microbial data. We will: +Our goal is to build a predictive model using the `tidymodels` framework to determine the Gramstain of the microorganism based on microbial data. We will: 1. Preprocess data using the selector functions `aminoglycosides()` and `betalactams()`. 2. Define a logistic regression model for prediction. 3. Use a structured `tidymodels` workflow to preprocess, train, and evaluate the model. ---- - ### **Data Preparation** We begin by loading the required libraries and preparing the `example_isolates` dataset from the `AMR` package. @@ -8976,26 +9013,21 @@ data <- example_isolates %>% # get Gramstain of microorganisms mo = as.factor(mo_gramstain(mo))) %>% # drop NAs - the ones without a Gramstain (fungi, etc.) 
- drop_na() # %>% - # Cefepime is not reliable - #select(-FEP) + drop_na() ``` **Explanation:** + - `aminoglycosides()` and `betalactams()` dynamically select columns for antibiotics in these classes. - `drop_na()` ensures the model receives complete cases for training. ---- - ### **Defining the Workflow** We now define the `tidymodels` workflow, which consists of three steps: preprocessing, model specification, and fitting. #### 1. Preprocessing with a Recipe -We create a recipe to preprocess the data for modelling. This includes: -- Encoding resistance results (`S`, `I`, `R`) as binary (resistant or not resistant). -- Converting microbial organism names (`mo`) into numerical features using one-hot encoding. +We create a recipe to preprocess the data for modelling. ```{r} # Define the recipe for data preprocessing @@ -9005,8 +9037,11 @@ resistance_recipe ``` **Explanation:** -- `step_mutate()` transforms resistance results (`R`) into binary variables (TRUE/FALSE). -- `step_dummy()` converts categorical organism (`mo`) names into one-hot encoded numerical features, making them compatible with the model. + +- `recipe(mo ~ ., data = data)` will take the `mo` column as outcome and all other columns as predictors. +- `step_corr()` removes predictors (i.e., antibiotic columns) that have a higher correlation than 90%. + +Notice how the recipe contains just the antibiotic selector functions - no need to define the columns specifically. #### 2. Specifying the Model @@ -9020,6 +9055,7 @@ logistic_model ``` **Explanation:** + - `logistic_reg()` sets up a logistic regression model. - `set_engine("glm")` specifies the use of R's built-in GLM engine. 
@@ -9032,11 +9068,8 @@ We bundle the recipe and model together into a `workflow`, which organizes the e resistance_workflow <- workflow() %>% add_recipe(resistance_recipe) %>% # Add the preprocessing recipe add_model(logistic_model) # Add the logistic regression model -resistance_workflow ``` ---- - ### **Training and Evaluating the Model** To train the model, we split the data into training and testing sets. Then, we fit the workflow on the training set and evaluate its performance. @@ -9051,14 +9084,15 @@ testing_data <- testing(data_split) # Testing set # Fit the workflow to the training data fitted_workflow <- resistance_workflow %>% fit(training_data) # Train the model - -fitted_workflow ``` **Explanation:** + - `initial_split()` splits the data into training and testing sets. - `fit()` trains the workflow on the training set. +Notice how in `fit()`, the antibiotic selector functions are internally called again. For training, these functions are called since they are stored in the recipe. + Next, we evaluate the model on the testing data. ```{r} @@ -9082,10 +9116,11 @@ metrics ``` **Explanation:** -- `predict()` generates predictions on the testing set. -- `metrics()` computes evaluation metrics like accuracy and AUC. -It appears we can predict the Gram based on AMR results with a `r round(metrics$.estimate[1], 3)` accuracy. The ROC curve looks like: +- `predict()` generates predictions on the testing set. +- `metrics()` computes evaluation metrics like accuracy and kappa. + +It appears we can predict the Gram based on AMR results with a `r round(metrics$.estimate[1], 3)` accuracy based on AMR results of aminoglycosides and beta-lactam antibiotics. The ROC curve looks like this: ```{r} predictions %>% @@ -9093,16 +9128,12 @@ predictions %>% autoplot() ``` ---- - ### **Conclusion** In this post, we demonstrated how to build a machine learning pipeline with the `tidymodels` framework and the `AMR` package. 
By combining selector functions like `aminoglycosides()` and `betalactams()` with `tidymodels`, we efficiently prepared data, trained a model, and evaluated its performance. This workflow is extensible to other antibiotic classes and resistance patterns, empowering users to analyse AMR data systematically and reproducibly. ---- - THE PART HEREAFTER CONTAINS CONTENTS FROM FILE 'vignettes/EUCAST.Rmd': diff --git a/man/ggplot_sir.Rd b/man/ggplot_sir.Rd index efb99b54..058033df 100644 --- a/man/ggplot_sir.Rd +++ b/man/ggplot_sir.Rd @@ -86,8 +86,6 @@ geom_sir( \item{y.title}{text to show as y axis description} \item{...}{other arguments passed on to \code{\link[=geom_sir]{geom_sir()}} or, in case of \code{\link[=scale_sir_colours]{scale_sir_colours()}}, named values to set colours. The default colours are colour-blind friendly, while maintaining the convention that e.g. 'susceptible' should be green and 'resistant' should be red. See \emph{Examples}.} - -\item{aesthetics}{aesthetics to apply the colours to - the default is "fill" but can also be (a combination of) "alpha", "colour", "fill", "linetype", "shape" or "size"} } \description{ Use these functions to create bar plots for AMR data analysis. All functions rely on \link[ggplot2:ggplot]{ggplot2} functions. diff --git a/man/plot.Rd b/man/plot.Rd index 685b669a..4e09485e 100644 --- a/man/plot.Rd +++ b/man/plot.Rd @@ -123,7 +123,7 @@ facet_sir(facet = c("interpretation", "antibiotic"), nrow = NULL) scale_y_percent( breaks = function(x) seq(0, max(x, na.rm = TRUE), 0.1), - limits = NULL + limits = c(0, NA) ) scale_sir_colours( @@ -175,6 +175,28 @@ labels_sir_count( \item{include_PKPD}{a \link{logical} to indicate that PK/PD clinical breakpoints must be applied as a last resort - the default is \code{TRUE}. Can also be set with the package option \code{\link[=AMR-options]{AMR_include_PKPD}}.} \item{breakpoint_type}{the type of breakpoints to use, either "ECOFF", "animal", or "human". 
ECOFF stands for Epidemiological Cut-Off values. The default is \code{"human"}, which can also be set with the package option \code{\link[=AMR-options]{AMR_breakpoint_type}}. If \code{host} is set to values of veterinary species, this will automatically be set to \code{"animal"}.} + +\item{facet}{variable to split plots by, either \code{"interpretation"} (default) or \code{"antibiotic"} or a grouping variable} + +\item{nrow}{(when using \code{facet}) number of rows} + +\item{breaks}{a \link{numeric} vector of positions} + +\item{limits}{a \link{numeric} vector of length two providing limits of the scale, use \code{NA} to refer to the existing minimum or maximum} + +\item{aesthetics}{aesthetics to apply the colours to - the default is "fill" but can also be (a combination of) "alpha", "colour", "fill", "linetype", "shape" or "size"} + +\item{position}{position adjustment of bars, either \code{"fill"}, \code{"stack"} or \code{"dodge"}} + +\item{translate_ab}{a column name of the \link{antibiotics} data set to translate the antibiotic abbreviations to, using \code{\link[=ab_property]{ab_property()}}} + +\item{minimum}{the minimum allowed number of available (tested) isolates. Any isolate count lower than \code{minimum} will return \code{NA} with a warning. The default number of \code{30} isolates is advised by the Clinical and Laboratory Standards Institute (CLSI) as best practice, see \emph{Source}.} + +\item{combine_SI}{a \link{logical} to indicate whether all values of S, SDD, and I must be merged into one, so the output only consists of S+SDD+I vs. R (susceptible vs. resistant) - the default is \code{TRUE}} + +\item{datalabels.size}{size of the datalabels} + +\item{datalabels.colour}{colour of the datalabels} } \value{ The \code{autoplot()} functions return a \code{\link[ggplot2:ggplot]{ggplot}} model that is extendible with any \code{ggplot2} function. 
@@ -219,7 +241,7 @@ plot(some_disk_values, mo = "Escherichia coli", ab = "cipro") plot(some_disk_values, mo = "Escherichia coli", ab = "cipro", language = "nl") -# Plotting using scale_x_mic() +# Plotting using scale_x_mic() --------------------------------------------- \donttest{ if (require("ggplot2")) { mic_plot <- ggplot(data.frame(mics = as.mic(c(0.25, "<=4", 4, 8, 32, ">=32")), @@ -259,5 +281,24 @@ if (require("ggplot2")) { if (require("ggplot2")) { autoplot(some_sir_values) } + +# Plotting using scale_y_percent() ----------------------------------------- +if (require("ggplot2")) { + p <- ggplot(data.frame(mics = as.mic(c(0.25, "<=4", 4, 8, 32, ">=32")), + counts = c(1, 1, 2, 2, 3, 3)), + aes(mics, counts / sum(counts))) + + geom_col() + print(p) + + p2 <- p + + scale_y_percent() + + theme_sir() + print(p2) + + p + + scale_y_percent(breaks = seq(from = 0, to = 1, by = 0.1), + limits = c(0, 1)) + + theme_sir() +} } } diff --git a/vignettes/AMR_with_tidymodels.Rmd b/vignettes/AMR_with_tidymodels.Rmd index e3edd17e..09b88171 100644 --- a/vignettes/AMR_with_tidymodels.Rmd +++ b/vignettes/AMR_with_tidymodels.Rmd @@ -1,11 +1,11 @@ --- -title: "`AMR` with `tidymodels`" +title: "AMR with tidymodels" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > - %\VignetteIndexEntry{`AMR` with `tidymodels`} + %\VignetteIndexEntry{AMR with tidymodels} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: @@ -22,22 +22,20 @@ knitr::opts_chunk$set( ) ``` +> This page was entirely written by our [AMR for R Assistant](https://chatgpt.com/g/g-M4UNLwFi5-amr-for-r-assistant), a ChatGPT manually-trained model able to answer any question about the AMR package. + Antimicrobial resistance (AMR) is a global health crisis, and understanding resistance patterns is crucial for managing effective treatments. 
The `AMR` R package provides robust tools for analysing AMR data, including convenient antibiotic selector functions like `aminoglycosides()` and `betalactams()`. In this post, we will explore how to use the `tidymodels` framework to predict resistance patterns in the `example_isolates` dataset. -By leveraging the power of `tidymodels` and the `AMR` package, we’ll build a reproducible machine learning workflow to predict resistance to two important antibiotic classes: aminoglycosides and beta-lactams. - ---- +By leveraging the power of `tidymodels` and the `AMR` package, we’ll build a reproducible machine learning workflow to predict the Gramstain of the microorganism based on resistance to two important antibiotic classes: aminoglycosides and beta-lactams. ### **Objective** -Our goal is to build a predictive model using the `tidymodels` framework to determine resistance patterns based on microbial data. We will: +Our goal is to build a predictive model using the `tidymodels` framework to determine the Gramstain of the microorganism based on microbial data. We will: 1. Preprocess data using the selector functions `aminoglycosides()` and `betalactams()`. 2. Define a logistic regression model for prediction. 3. Use a structured `tidymodels` workflow to preprocess, train, and evaluate the model. ---- - ### **Data Preparation** We begin by loading the required libraries and preparing the `example_isolates` dataset from the `AMR` package. @@ -63,26 +61,21 @@ data <- example_isolates %>% # get Gramstain of microorganisms mo = as.factor(mo_gramstain(mo))) %>% # drop NAs - the ones without a Gramstain (fungi, etc.) - drop_na() # %>% - # Cefepime is not reliable - #select(-FEP) + drop_na() ``` **Explanation:** + - `aminoglycosides()` and `betalactams()` dynamically select columns for antibiotics in these classes. - `drop_na()` ensures the model receives complete cases for training. 
---- - ### **Defining the Workflow** We now define the `tidymodels` workflow, which consists of three steps: preprocessing, model specification, and fitting. #### 1. Preprocessing with a Recipe -We create a recipe to preprocess the data for modelling. This includes: -- Encoding resistance results (`S`, `I`, `R`) as binary (resistant or not resistant). -- Converting microbial organism names (`mo`) into numerical features using one-hot encoding. +We create a recipe to preprocess the data for modelling. ```{r} # Define the recipe for data preprocessing @@ -92,8 +85,11 @@ resistance_recipe ``` **Explanation:** -- `step_mutate()` transforms resistance results (`R`) into binary variables (TRUE/FALSE). -- `step_dummy()` converts categorical organism (`mo`) names into one-hot encoded numerical features, making them compatible with the model. + +- `recipe(mo ~ ., data = data)` will take the `mo` column as outcome and all other columns as predictors. +- `step_corr()` removes predictors (i.e., antibiotic columns) that have a higher correlation than 90%. + +Notice how the recipe contains just the antibiotic selector functions - no need to define the columns specifically. #### 2. Specifying the Model @@ -107,6 +103,7 @@ logistic_model ``` **Explanation:** + - `logistic_reg()` sets up a logistic regression model. - `set_engine("glm")` specifies the use of R's built-in GLM engine. @@ -119,11 +116,8 @@ We bundle the recipe and model together into a `workflow`, which organizes the e resistance_workflow <- workflow() %>% add_recipe(resistance_recipe) %>% # Add the preprocessing recipe add_model(logistic_model) # Add the logistic regression model -resistance_workflow ``` ---- - ### **Training and Evaluating the Model** To train the model, we split the data into training and testing sets. Then, we fit the workflow on the training set and evaluate its performance. 
@@ -138,14 +132,15 @@ testing_data <- testing(data_split) # Testing set # Fit the workflow to the training data fitted_workflow <- resistance_workflow %>% fit(training_data) # Train the model - -fitted_workflow ``` **Explanation:** + - `initial_split()` splits the data into training and testing sets. - `fit()` trains the workflow on the training set. +Notice how in `fit()`, the antibiotic selector functions are internally called again. For training, these functions are called since they are stored in the recipe. + Next, we evaluate the model on the testing data. ```{r} @@ -169,10 +164,11 @@ metrics ``` **Explanation:** -- `predict()` generates predictions on the testing set. -- `metrics()` computes evaluation metrics like accuracy and AUC. -It appears we can predict the Gram based on AMR results with a `r round(metrics$.estimate[1], 3)` accuracy. The ROC curve looks like: +- `predict()` generates predictions on the testing set. +- `metrics()` computes evaluation metrics like accuracy and kappa. + +It appears we can predict the Gram stain with a `r round(metrics$.estimate[1], 3)` accuracy based on AMR results of aminoglycosides and beta-lactam antibiotics. The ROC curve looks like this: ```{r} predictions %>% @@ -180,12 +176,8 @@ predictions %>% autoplot() ``` ---- - ### **Conclusion** In this post, we demonstrated how to build a machine learning pipeline with the `tidymodels` framework and the `AMR` package. By combining selector functions like `aminoglycosides()` and `betalactams()` with `tidymodels`, we efficiently prepared data, trained a model, and evaluated its performance. This workflow is extensible to other antibiotic classes and resistance patterns, empowering users to analyse AMR data systematically and reproducibly. - ----