1
0
mirror of https://github.com/msberends/AMR.git synced 2025-12-22 00:20:26 +01:00

(v3.0.1.9005) re-add tidymodels implementation

This commit is contained in:
2025-12-21 12:19:43 +01:00
parent 225c73f7e7
commit 151af21f38
16 changed files with 502 additions and 61 deletions

View File

@@ -22,9 +22,9 @@ body:
label: Minimal Reproducible Example (optional)
description: Please include a short R code snippet that reproduces the problem, if possible.
placeholder:
e.g.
```r
ab_name("amoxicillin/clavulanic acid", language = "es")
e.g.\n\n
```r<br>
ab_name("amoxicillin/clavulanic acid", language = "es")\n
```
validations:
required: false
@@ -42,7 +42,7 @@ body:
multiple: false
options:
- ''
- Latest CRAN version (3.0.0)
- One of the latest GitHub versions (3.0.0.9xxx)
- Latest CRAN version (3.0.1)
- One of the latest GitHub versions (3.0.1.9xxx)
validations:
required: true

View File

@@ -1,6 +1,6 @@
Package: AMR
Version: 3.0.1.9004
Date: 2025-12-15
Version: 3.0.1.9005
Date: 2025-12-21
Title: Antimicrobial Resistance Data Analysis
Description: Functions to simplify and standardise antimicrobial resistance (AMR)
data analysis and to work with microbial and antimicrobial properties by

View File

@@ -106,6 +106,8 @@ S3method(print,mo_uncertainties)
S3method(print,pca)
S3method(print,sir)
S3method(print,sir_log)
S3method(print,step_mic_log2)
S3method(print,step_sir_numeric)
S3method(quantile,mic)
S3method(rep,ab)
S3method(rep,av)
@@ -159,6 +161,12 @@ export(administrable_per_os)
export(age)
export(age_groups)
export(all_antimicrobials)
export(all_disk)
export(all_disk_predictors)
export(all_mic)
export(all_mic_predictors)
export(all_sir)
export(all_sir_predictors)
export(aminoglycosides)
export(aminopenicillins)
export(amr_class)
@@ -352,6 +360,8 @@ export(sir_df)
export(sir_interpretation_history)
export(sir_predict)
export(skewness)
export(step_mic_log2)
export(step_sir_numeric)
export(streptogramins)
export(sulfonamides)
export(susceptibility)
@@ -390,6 +400,12 @@ if(getRversion() >= "3.0.0") S3method(pillar::type_sum, av)
if(getRversion() >= "3.0.0") S3method(pillar::type_sum, mic)
if(getRversion() >= "3.0.0") S3method(pillar::type_sum, mo)
if(getRversion() >= "3.0.0") S3method(pillar::type_sum, sir)
if(getRversion() >= "3.0.0") S3method(recipes::bake, step_mic_log2)
if(getRversion() >= "3.0.0") S3method(recipes::bake, step_sir_numeric)
if(getRversion() >= "3.0.0") S3method(recipes::prep, step_mic_log2)
if(getRversion() >= "3.0.0") S3method(recipes::prep, step_sir_numeric)
if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_mic_log2)
if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_sir_numeric)
if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, ab)
if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, disk)
if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, mic)

View File

@@ -1,4 +1,10 @@
# AMR 3.0.1.9004
# AMR 3.0.1.9005
### New
* Integration with the **tidymodels** framework to allow seamless use of SIR, MIC and disk data in modelling pipelines via `recipes`
- `step_mic_log2()` to transform `<mic>` columns with log2, and `step_sir_numeric()` to convert `<sir>` columns to numeric
- New `tidyselect` helpers: `all_sir()`, `all_sir_predictors()`, `all_mic()`, `all_mic_predictors()`, `all_disk()`, `all_disk_predictors()`
* Data set `esbl_isolates` to practise with AMR modelling
### Changed
* Fixed a bug in `antibiogram()` for when no antimicrobials are set

View File

@@ -966,8 +966,13 @@ get_current_data <- function(arg_name, call) {
# an element `.data` will be in the environment when using dplyr::select()
return(env$`.data`)
} else if (valid_df(env$training)) {
# an element `training` will be in the environment when using some tidymodels functions such as `prep()`
if (!is.null(env$x) && valid_df(env$x$template)) {
# an element `x$template` will be in the environment when using some tidymodels functions such as `prep()`
return(env$x$template)
} else {
# this is a fallback for some tidymodels functions such as `prep()`
return(env$training)
}
} else if (valid_df(env$data)) {
# an element `data` will be in the environment when using older dplyr versions, or some tidymodels functions such as `fit()`
return(env$data)

View File

@@ -99,7 +99,8 @@ atc_online_property <- function(atc_code,
read_html <- import_fn("read_html", "xml2")
if (!all(atc_code %in% unlist(AMR::antimicrobials$atc))) {
atc_code <- as.character(ab_atc(atc_code, only_first = TRUE))
missing <- atc_code %unlike% "[A-Z][0-9][0-9][A-Z][A-Z][0-9][0-9]"
atc_code[missing] <- as.character(ab_atc(atc_code[missing], only_first = TRUE))
}
if (!has_internet()) {

View File

@@ -282,7 +282,7 @@
#' Data Set with Clinical Breakpoints for SIR Interpretation
#'
#' @description Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This dataset contain breakpoints for humans, `r length(unique(clinical_breakpoints$host[!clinical_breakpoints$host %in% clinical_breakpoints$type]))` different animal groups, and ECOFFs.
#' @description Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This data set contains breakpoints for humans, `r length(unique(clinical_breakpoints$host[!clinical_breakpoints$host %in% clinical_breakpoints$type]))` different animal groups, and ECOFFs.
#'
#' These breakpoints are currently implemented:
#' - For **clinical microbiology**: EUCAST `r min(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "EUCAST" & type == "human")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "EUCAST" & type == "human")$guideline)))` and CLSI `r min(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "CLSI" & type == "human")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "CLSI" & type == "human")$guideline)))`;
@@ -362,14 +362,14 @@
#' dosage
"dosage"
# TODO #' Data Set with `r format(nrow(esbl_isolates), big.mark = " ")` ESBL Isolates
# TODO #'
# TODO #' A data set containing `r format(nrow(esbl_isolates), big.mark = " ")` microbial isolates with MIC values of common antibiotics and a binary `esbl` column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with [tidymodels](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
# TODO #' @format A [tibble][tibble::tibble] with `r format(nrow(esbl_isolates), big.mark = " ")` observations and `r ncol(esbl_isolates)` variables:
# TODO #' - `esbl`\cr Logical indicator if the isolate is ESBL-producing
# TODO #' - `genus`\cr Genus of the microorganism
# TODO #' - `AMC:COL`\cr MIC values for 17 antimicrobial agents, transformed to class [`mic`] (see [as.mic()])
# TODO #' @details See our [tidymodels integration][amr-tidymodels] for an example using this data set.
# TODO #' @examples
# TODO #' esbl_isolates
# TODO "esbl_isolates"
#' Data Set with `r format(nrow(esbl_isolates), big.mark = " ")` ESBL Isolates
#'
#' A data set containing `r format(nrow(esbl_isolates), big.mark = " ")` microbial isolates with MIC values of common antibiotics and a binary `esbl` column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with [tidymodels](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
#' @format A [tibble][tibble::tibble] with `r format(nrow(esbl_isolates), big.mark = " ")` observations and `r ncol(esbl_isolates)` variables:
#' - `esbl`\cr Logical indicator if the isolate is ESBL-producing
#' - `genus`\cr Genus of the microorganism
#' - `AMC:COL`\cr MIC values for 17 antimicrobial agents, transformed to class [`mic`] (see [as.mic()])
#' @details See our [tidymodels integration][amr-tidymodels] for an example using this data set.
#' @examples
#' esbl_isolates
"esbl_isolates"

View File

@@ -1,20 +1,21 @@
#' AMR Extensions for Tidymodels
#'
#' This family of functions allows using AMR-specific data types such as `<mic>` and `<sir>` inside `tidymodels` pipelines.
#' This family of functions allows using AMR-specific data types such as `<sir>` and `<mic>` inside `tidymodels` pipelines.
#' @inheritParams recipes::step_center
#' @details
#' You can read more in our online [AMR with tidymodels introduction](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
#'
#' Tidyselect helpers include:
#' - [all_mic()] and [all_mic_predictors()] to select `<mic>` columns
#' - [all_sir()] and [all_sir_predictors()] to select `<sir>` columns
#' - [all_sir()] and [all_sir_predictors()] to select [`<sir>`][as.sir()] columns
#' - [all_mic()] and [all_mic_predictors()] to select [`<mic>`][as.mic()] columns
#' - [all_disk()] and [all_disk_predictors()] to select [`<disk>`][as.disk()] columns
#'
#' Pre-processing pipeline steps include:
#' - [step_mic_log2()] to convert MIC columns to numeric (via `as.numeric()`) and apply a log2 transform, to be used with [all_mic_predictors()]
#' - [step_sir_numeric()] to convert SIR columns to numeric (via `as.numeric()`), to be used with [all_sir_predictors()]: `"S"` = 1, `"I"`/`"SDD"` = 2, `"R"` = 3. All other values are rendered `NA`. Keep this in mind for further processing, especially if the model does not allow for `NA` values.
#' - [step_mic_log2()] to convert MIC columns to numeric (via `as.numeric()`) and apply a log2 transform, to be used with [all_mic_predictors()]
#'
#' These steps integrate with `recipes::recipe()` and work like standard preprocessing steps. They are useful for preparing data for modelling, especially with classification models.
#' @seealso [recipes::recipe()], [as.mic()], [as.sir()]
#' @seealso [recipes::recipe()], [as.sir()], [as.mic()], [as.disk()]
#' @name amr-tidymodels
#' @keywords internal
#' @export
@@ -66,35 +67,55 @@
#' bind_cols(out_testing)
#'
#' # Evaluate predictions using standard classification metrics
#' our_metrics <- metric_set(accuracy, kap, ppv, npv)
#' our_metrics <- metric_set(accuracy,
#' recall,
#' precision,
#' sensitivity,
#' specificity,
#' ppv,
#' npv)
#' metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
#'
#' # Show performance
#' metrics
#' }
all_mic <- function() {
x <- tidymodels_amr_select(levels(NA_mic_))
names(x)
}
#' @rdname amr-tidymodels
#' @export
all_mic_predictors <- function() {
x <- tidymodels_amr_select(levels(NA_mic_))
intersect(x, recipes::has_role("predictor"))
}
#' @rdname amr-tidymodels
#' @export
all_sir <- function() {
x <- tidymodels_amr_select(levels(NA_sir_))
x <- tidymodels_amr_select(class = "sir")
names(x)
}
#' @rdname amr-tidymodels
#' @export
all_sir_predictors <- function() {
x <- tidymodels_amr_select(levels(NA_sir_))
x <- tidymodels_amr_select(class = "sir")
intersect(x, recipes::has_role("predictor"))
}
#' @rdname amr-tidymodels
#' @export
all_mic <- function() {
x <- tidymodels_amr_select(class = "mic")
names(x)
}
#' @rdname amr-tidymodels
#' @export
all_mic_predictors <- function() {
x <- tidymodels_amr_select(class = "mic")
intersect(x, recipes::has_role("predictor"))
}
#' @rdname amr-tidymodels
#' @export
all_disk <- function() {
x <- tidymodels_amr_select(class = "disk")
names(x)
}
#' @rdname amr-tidymodels
#' @export
all_disk_predictors <- function() {
x <- tidymodels_amr_select(class = "disk")
intersect(x, recipes::has_role("predictor"))
}
@@ -160,7 +181,6 @@ bake.step_mic_log2 <- function(object, new_data, ...) {
print.step_mic_log2 <- function(x, width = max(20, options()$width - 35), ...) {
title <- "Log2 transformation of MIC columns"
recipes::print_step(x$columns, x$terms, x$trained, title, width)
invisible(x)
}
#' @rawNamespace if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_mic_log2)
@@ -236,7 +256,6 @@ bake.step_sir_numeric <- function(object, new_data, ...) {
print.step_sir_numeric <- function(x, width = max(20, options()$width - 35), ...) {
title <- "Numeric transformation of SIR columns"
recipes::print_step(x$columns, x$terms, x$trained, title, width)
invisible(x)
}
#' @rawNamespace if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_sir_numeric)
@@ -250,13 +269,13 @@ tidy.step_sir_numeric <- function(x, ...) {
res
}
tidymodels_amr_select <- function(check_vector) {
tidymodels_amr_select <- function(class) {
df <- get_current_data()
ind <- which(
vapply(
FUN.VALUE = logical(1),
df,
function(x) all(x %in% c(check_vector, NA), na.rm = TRUE) & any(x %in% check_vector),
function(x) inherits(x, class),
USE.NAMES = TRUE
),
useNames = TRUE

138
man/amr-tidymodels.Rd Normal file
View File

@@ -0,0 +1,138 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tidymodels.R
\name{amr-tidymodels}
\alias{amr-tidymodels}
\alias{all_sir}
\alias{all_sir_predictors}
\alias{all_mic}
\alias{all_mic_predictors}
\alias{all_disk}
\alias{all_disk_predictors}
\alias{step_mic_log2}
\alias{step_sir_numeric}
\title{AMR Extensions for Tidymodels}
\usage{
all_sir()
all_sir_predictors()
all_mic()
all_mic_predictors()
all_disk()
all_disk_predictors()
step_mic_log2(recipe, ..., role = NA, trained = FALSE, columns = NULL,
skip = FALSE, id = recipes::rand_id("mic_log2"))
step_sir_numeric(recipe, ..., role = NA, trained = FALSE, columns = NULL,
skip = FALSE, id = recipes::rand_id("sir_numeric"))
}
\arguments{
\item{recipe}{A recipe object. The step will be added to the sequence of
operations for this recipe.}
\item{...}{One or more selector functions to choose variables for this step.
See \code{\link[recipes:selections]{selections()}} for more details.}
\item{role}{Not used by this step since no new variables are created.}
\item{trained}{A logical to indicate if the quantities for preprocessing have
been estimated.}
\item{skip}{A logical. Should the step be skipped when the recipe is baked by
\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some
operations may not be able to be conducted on new data (e.g. processing the
outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it
may affect the computations for subsequent operations.}
\item{id}{A character string that is unique to this step to identify it.}
}
\description{
This family of functions allows using AMR-specific data types such as \verb{<sir>} and \verb{<mic>} inside \code{tidymodels} pipelines.
}
\details{
You can read more in our online \href{https://amr-for-r.org/articles/AMR_with_tidymodels.html}{AMR with tidymodels introduction}.
Tidyselect helpers include:
\itemize{
\item \code{\link[=all_sir]{all_sir()}} and \code{\link[=all_sir_predictors]{all_sir_predictors()}} to select \code{\link[=as.sir]{<sir>}} columns
\item \code{\link[=all_mic]{all_mic()}} and \code{\link[=all_mic_predictors]{all_mic_predictors()}} to select \code{\link[=as.mic]{<mic>}} columns
\item \code{\link[=all_disk]{all_disk()}} and \code{\link[=all_disk_predictors]{all_disk_predictors()}} to select \code{\link[=as.disk]{<disk>}} columns
}
Pre-processing pipeline steps include:
\itemize{
\item \code{\link[=step_sir_numeric]{step_sir_numeric()}} to convert SIR columns to numeric (via \code{as.numeric()}), to be used with \code{\link[=all_sir_predictors]{all_sir_predictors()}}: \code{"S"} = 1, \code{"I"}/\code{"SDD"} = 2, \code{"R"} = 3. All other values are rendered \code{NA}. Keep this in mind for further processing, especially if the model does not allow for \code{NA} values.
\item \code{\link[=step_mic_log2]{step_mic_log2()}} to convert MIC columns to numeric (via \code{as.numeric()}) and apply a log2 transform, to be used with \code{\link[=all_mic_predictors]{all_mic_predictors()}}
}
These steps integrate with \code{recipes::recipe()} and work like standard preprocessing steps. They are useful for preparing data for modelling, especially with classification models.
}
\examples{
if (require("tidymodels")) {
# The below approach formed the basis for this paper: DOI 10.3389/fmicb.2025.1582703
# Presence of ESBL genes was predicted based on raw MIC values.
# example data set in the AMR package
esbl_isolates
# Prepare a binary outcome and convert to ordered factor
data <- esbl_isolates \%>\%
mutate(esbl = factor(esbl, levels = c(FALSE, TRUE), ordered = TRUE))
# Split into training and testing sets
split <- initial_split(data)
training_data <- training(split)
testing_data <- testing(split)
# Create and prep a recipe with MIC log2 transformation
mic_recipe <- recipe(esbl ~ ., data = training_data) \%>\%
# Optionally remove non-predictive variables
remove_role(genus, old_role = "predictor") \%>\%
# Apply the log2 transformation to all MIC predictors
step_mic_log2(all_mic_predictors()) \%>\%
# And apply the preparation steps
prep()
# View prepped recipe
mic_recipe
# Apply the recipe to training and testing data
out_training <- bake(mic_recipe, new_data = NULL)
out_testing <- bake(mic_recipe, new_data = testing_data)
# Fit a logistic regression model
fitted <- logistic_reg(mode = "classification") \%>\%
set_engine("glm") \%>\%
fit(esbl ~ ., data = out_training)
# Generate predictions on the test set
predictions <- predict(fitted, out_testing) \%>\%
bind_cols(out_testing)
# Evaluate predictions using standard classification metrics
our_metrics <- metric_set(accuracy,
recall,
precision,
sensitivity,
specificity,
ppv,
npv)
metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
# Show performance
metrics
}
}
\seealso{
\code{\link[recipes:recipe]{recipes::recipe()}}, \code{\link[=as.sir]{as.sir()}}, \code{\link[=as.mic]{as.mic()}}, \code{\link[=as.disk]{as.disk()}}
}
\keyword{internal}

View File

@@ -27,7 +27,7 @@ A \link[tibble:tibble]{tibble} with 40 217 observations and 14 variables:
clinical_breakpoints
}
\description{
Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This dataset contain breakpoints for humans, 7 different animal groups, and ECOFFs.
Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This data set contains breakpoints for humans, 7 different animal groups, and ECOFFs.
These breakpoints are currently implemented:
\itemize{

27
man/esbl_isolates.Rd Normal file
View File

@@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{esbl_isolates}
\alias{esbl_isolates}
\title{Data Set with 500 ESBL Isolates}
\format{
A \link[tibble:tibble]{tibble} with 500 observations and 19 variables:
\itemize{
\item \code{esbl}\cr Logical indicator if the isolate is ESBL-producing
\item \code{genus}\cr Genus of the microorganism
\item \code{AMC:COL}\cr MIC values for 17 antimicrobial agents, transformed to class \code{\link{mic}} (see \code{\link[=as.mic]{as.mic()}})
}
}
\usage{
esbl_isolates
}
\description{
A data set containing 500 microbial isolates with MIC values of common antibiotics and a binary \code{esbl} column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with \href{https://amr-for-r.org/articles/AMR_with_tidymodels.html}{tidymodels}.
}
\details{
See our \link[=amr-tidymodels]{tidymodels integration} for an example using this data set.
}
\examples{
esbl_isolates
}
\keyword{datasets}

View File

@@ -0,0 +1,84 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
test_that("tidymodels.R", {
skip_on_cran()
if (AMR:::pkg_is_available("recipes", also_load = TRUE) && AMR:::pkg_is_available("dplyr", also_load = TRUE)) {
# SIR
df <- tibble(
sir1 = as.sir(c("S", "I", "R", "S", "R")),
sir2 = as.sir(c("I", "R", "S", "R", "I")),
not_sir = c("S", "R", "R", "S", "I")
)
rec <- recipe(~., data = df) %>% step_sir_numeric(all_sir())
prepped <- prep(rec)
baked <- bake(prepped, new_data = df)
expect_inherits(baked$sir1, "numeric")
expect_inherits(baked$sir2, "numeric")
expect_equal(baked$not_sir, as.factor(df$not_sir))
# MIC
df <- tibble(
mic_col1 = as.mic(c("<=0.002", "0.002", "0.004", "0.016", "32")),
mic_col2 = as.mic(c("0.5", "1", "2", "4", "8")),
non_mic = c(1, 2, 3, 4, 5)
)
rec <- recipe(~., data = df) %>% step_mic_log2(all_mic())
prepped <- prep(rec)
baked <- bake(prepped, new_data = df)
expect_inherits(baked$mic_col1, "numeric")
expect_inherits(baked$mic_col2, "numeric")
expect_equal(baked$non_mic, df$non_mic)
expect_equal(baked$mic_col2, log2(as.numeric(df$mic_col2)))
# disk
df <- tibble(
disk_col = as.disk(c(21, 22, 23, 24, 25)),
non_disk = c(21, 22, 23, 24, 25)
)
rec <- recipe(~., data = df) %>% step_rm(!all_disk())
prepped <- prep(rec)
baked <- bake(prepped, new_data = df)
expect_inherits(baked$disk_col, "disk")
# steps check
df <- tibble(x = as.mic(c("1", "2", "4")))
rec <- recipe(~x, data = df) %>% step_mic_log2(all_mic())
prepped <- prep(rec)
tidy_df <- tidy(prepped, number = 1)
expect_equal(unname(tidy_df$terms), "x")
df <- tibble(x = as.sir(c("S", "I", "R")))
rec <- recipe(~x, data = df) %>% step_sir_numeric(all_sir())
prepped <- prep(rec)
tidy_df <- tidy(prepped, number = 1)
expect_equal(unname(tidy_df$terms), "x")
}
})

View File

@@ -22,7 +22,7 @@ knitr::opts_chunk$set(
)
```
> This page was entirely written by our [AMR for R Assistant](https://chat.amr-for-r.org), a ChatGPT manually-trained model able to answer any question about the `AMR` package.
> This page was almost entirely written by our [AMR for R Assistant](https://chat.amr-for-r.org), a ChatGPT manually-trained model able to answer any question about the `AMR` package.
Antimicrobial resistance (AMR) is a global health crisis, and understanding resistance patterns is crucial for managing effective treatments. The `AMR` R package provides robust tools for analysing AMR data, including convenient antimicrobial selector functions like `aminoglycosides()` and `betalactams()`.
@@ -219,18 +219,163 @@ This workflow is extensible to other antimicrobial classes and resistance patter
In this second example, we demonstrate how to use `<mic>` columns directly in `tidymodels` workflows using AMR-specific recipe steps. This includes a transformation to `log2` scale using `step_mic_log2()`, which prepares MIC values for use in classification models.
This approach and idea formed the basis for the publication [DOI: 10.3389/fmicb.2025.1582703](https://doi.org/10.3389/fmicb.2025.1582703) to model the presence of extended-spectrum beta-lactamases (ESBL).
This approach and idea formed the basis for the publication [DOI: 10.3389/fmicb.2025.1582703](https://doi.org/10.3389/fmicb.2025.1582703) to model the presence of extended-spectrum beta-lactamases (ESBL) based on MIC values.
> NOTE: THIS EXAMPLE WILL BE AVAILABLE IN A NEXT VERSION (#TODO)
>
> The new AMR package version will contain new tidymodels selectors such as `step_mic_log2()`.
### **Objective**
<!-- TODO for AMR v3.1.0: add info from here: https://github.com/msberends/AMR/blob/2461631bcefa78ebdb37bdfad359be74cdd9165a/vignettes/AMR_with_tidymodels.Rmd#L212-L291 -->
Our goal is to:
1. Use raw MIC values to predict whether a bacterial isolate produces ESBL.
2. Apply AMR-aware preprocessing in a `tidymodels` recipe.
3. Train a classification model and evaluate its predictive performance.
### **Data Preparation**
We use the `esbl_isolates` dataset that comes with the AMR package.
```{r}
# Load required libraries
library(AMR)
library(tidymodels)
# View the esbl_isolates data set
esbl_isolates
# Prepare a binary outcome and convert to ordered factor
data <- esbl_isolates %>%
mutate(esbl = factor(esbl, levels = c(FALSE, TRUE), ordered = TRUE))
```
**Explanation:**
- `esbl_isolates`: Contains MIC test results and ESBL status for each isolate.
- `mutate(esbl = ...)`: Converts the target column to an ordered factor for classification.
### **Defining the Workflow**
#### 1. Preprocessing with a Recipe
We use our `step_mic_log2()` function to log2-transform MIC values, ensuring that MICs are numeric and properly scaled. All MIC predictors can easily and agnostically selected using the new `all_mic_predictors()`:
```{r}
# Split into training and testing sets
set.seed(123)
split <- initial_split(data)
training_data <- training(split)
testing_data <- testing(split)
# Define the recipe
mic_recipe <- recipe(esbl ~ ., data = training_data) %>%
remove_role(genus, old_role = "predictor") %>% # Remove non-informative variable
step_mic_log2(all_mic_predictors()) #%>% # Log2 transform all MIC predictors
# prep()
mic_recipe
```
**Explanation:**
- `remove_role()`: Removes irrelevant variables like genus.
- `step_mic_log2()`: Applies `log2(as.numeric(...))` to all MIC predictors in one go.
- `prep()`: Finalises the recipe based on training data.
#### 2. Specifying the Model
We use a simple logistic regression to model ESBL presence, though recent models such as xgboost ([link to `parsnip` manual](https://parsnip.tidymodels.org/reference/details_boost_tree_xgboost.html)) could be much more precise.
```{r}
# Define the model
model <- logistic_reg(mode = "classification") %>%
set_engine("glm")
model
```
**Explanation:**
- `logistic_reg()`: Specifies a binary classification model.
- `set_engine("glm")`: Uses the base R GLM engine.
#### 3. Building the Workflow
```{r}
# Create workflow
workflow_model <- workflow() %>%
add_recipe(mic_recipe) %>%
add_model(model)
workflow_model
```
### **Training and Evaluating the Model**
```{r}
# Fit the model
fitted <- fit(workflow_model, training_data)
# Generate predictions
predictions <- predict(fitted, testing_data) %>%
bind_cols(predict(fitted, out_testing, type = "prob")) %>% # add probabilities
bind_cols(testing_data)
# Evaluate model performance
our_metrics <- metric_set(accuracy, recall, precision, sensitivity, specificity, ppv, npv)
metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
metrics
```
**Explanation:**
- `fit()`: Trains the model on the processed training data.
- `predict()`: Produces predictions for unseen test data.
- `metric_set()`: Allows evaluating multiple classification metrics. This will make `our_metrics` to become a function that we can use to check the predictions with.
It appears we can predict ESBL gene presence with a positive predictive value (PPV) of `r round(metrics[metrics$.metric == "ppv", ]$.estimate, 3) * 100`% and a negative predictive value (NPV) of `r round(metrics[metrics$.metric == "npv", ]$.estimate, 3) * 100` using a simplistic logistic regression model.
### **Visualising Predictions**
We can visualise predictions by comparing predicted and actual ESBL status.
```{r}
library(ggplot2)
ggplot(predictions, aes(x = esbl, fill = .pred_class)) +
geom_bar(position = "stack") +
labs(title = "Predicted vs Actual ESBL Status",
x = "Actual ESBL",
y = "Count") +
theme_minimal()
```
And plot the certainties too - how certain were the actual predictions?
```{r}
predictions %>%
mutate(certainty = ifelse(.pred_class == "FALSE",
.pred_FALSE,
.pred_TRUE),
correct = ifelse(esbl == .pred_class, "Right", "Wrong")) %>%
ggplot(aes(x = seq_len(nrow(predictions)),
y = certainty,
colour = correct)) +
scale_colour_manual(values = c(Right = "green3", Wrong = "red2"),
name = "Correct?") +
geom_point() +
scale_y_continuous(labels = function(x) paste0(x * 100, "%"),
limits = c(0.5, 1)) +
theme_minimal()
```
### **Conclusion**
In this example, we showcased how the new `AMR`-specific recipe steps simplify working with `<mic>` columns in `tidymodels`. The `step_mic_log2()` transformation converts ordered MICs to log2-transformed numerics, improving compatibility with classification models.
This pipeline enables realistic, reproducible, and interpretable modelling of antimicrobial resistance data.
---
## Example 2: Predicting AMR Over Time
## Example 3: Predicting AMR Over Time
In this third example, we aim to predict antimicrobial resistance (AMR) trends over time using `tidymodels`. We will model resistance to three antibiotics (amoxicillin `AMX`, amoxicillin-clavulanic acid `AMC`, and ciprofloxacin `CIP`), based on historical data grouped by year and hospital ward.