(v3.0.1.9005) re-add tidymodels implementation

2026-07-13 02:30:55 +02:00 · 2025-12-21 12:19:43 +01:00
parent 225c73f7e7
commit 151af21f38
16 changed files with 502 additions and 61 deletions
--- a/.github/ISSUE_TEMPLATE/1-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -22,9 +22,9 @@ body:
      label: Minimal Reproducible Example (optional)
      description: Please include a short R code snippet that reproduces the problem, if possible.
      placeholder: 
-        e.g.
-        ```r
-        ab_name("amoxicillin/clavulanic acid", language = "es")
+        e.g.\n\n
+        ```r<br>
+        ab_name("amoxicillin/clavulanic acid", language = "es")\n
        ```
    validations:
      required: false
@@ -42,7 +42,7 @@ body:
      multiple: false
      options:
        - ''
-        - Latest CRAN version (3.0.0)
-        - One of the latest GitHub versions (3.0.0.9xxx)
+        - Latest CRAN version (3.0.1)
+        - One of the latest GitHub versions (3.0.1.9xxx)
    validations:
      required: true
--- a/4
+++ b/4
@@ -1,6 +1,6 @@
 Package: AMR
-Version: 3.0.1.9004
-Date: 2025-12-15
+Version: 3.0.1.9005
+Date: 2025-12-21
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
  data analysis and to work with microbial and antimicrobial properties by
--- a/16
+++ b/16
@@ -106,6 +106,8 @@ S3method(print,mo_uncertainties)
 S3method(print,pca)
 S3method(print,sir)
 S3method(print,sir_log)
+S3method(print,step_mic_log2)
+S3method(print,step_sir_numeric)
 S3method(quantile,mic)
 S3method(rep,ab)
 S3method(rep,av)
@@ -159,6 +161,12 @@ export(administrable_per_os)
 export(age)
 export(age_groups)
 export(all_antimicrobials)
+export(all_disk)
+export(all_disk_predictors)
+export(all_mic)
+export(all_mic_predictors)
+export(all_sir)
+export(all_sir_predictors)
 export(aminoglycosides)
 export(aminopenicillins)
 export(amr_class)
@@ -352,6 +360,8 @@ export(sir_df)
 export(sir_interpretation_history)
 export(sir_predict)
 export(skewness)
+export(step_mic_log2)
+export(step_sir_numeric)
 export(streptogramins)
 export(sulfonamides)
 export(susceptibility)
@@ -390,6 +400,12 @@ if(getRversion() >= "3.0.0") S3method(pillar::type_sum, av)
 if(getRversion() >= "3.0.0") S3method(pillar::type_sum, mic)
 if(getRversion() >= "3.0.0") S3method(pillar::type_sum, mo)
 if(getRversion() >= "3.0.0") S3method(pillar::type_sum, sir)
+if(getRversion() >= "3.0.0") S3method(recipes::bake, step_mic_log2)
+if(getRversion() >= "3.0.0") S3method(recipes::bake, step_sir_numeric)
+if(getRversion() >= "3.0.0") S3method(recipes::prep, step_mic_log2)
+if(getRversion() >= "3.0.0") S3method(recipes::prep, step_sir_numeric)
+if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_mic_log2)
+if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_sir_numeric)
 if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, ab)
 if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, disk)
 if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, mic)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,10 @@
-# AMR 3.0.1.9004
+# AMR 3.0.1.9005
+
+### New
+* Integration with the **tidymodels** framework to allow seamless use of SIR, MIC and disk data in modelling pipelines via `recipes`
+  - `step_mic_log2()` to transform `<mic>` columns with log2, and `step_sir_numeric()` to convert `<sir>` columns to numeric
+  - New `tidyselect` helpers: `all_sir()`, `all_sir_predictors()`, `all_mic()`, `all_mic_predictors()`, `all_disk()`, `all_disk_predictors()`
+* Data set `esbl_isolates` to practise with AMR modelling

 ### Changed
 * Fixed a bug in `antibiogram()` for when no antimicrobials are set
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -966,8 +966,13 @@ get_current_data <- function(arg_name, call) {
      # an element `.data` will be in the environment when using dplyr::select()
      return(env$`.data`)
    } else if (valid_df(env$training)) {
-      # an element `training` will be in the environment when using some tidymodels functions such as `prep()`
+      if (!is.null(env$x) && valid_df(env$x$template)) {
+        # an element `x$template` will be in the environment when using some tidymodels functions such as `prep()`
+        return(env$x$template)
+      } else {
+        # this is a fallback for some tidymodels functions such as `prep()`
        return(env$training)
+      }
    } else if (valid_df(env$data)) {
      # an element `data` will be in the environment when using older dplyr versions, or some tidymodels functions such as `fit()`
      return(env$data)
--- a/R/atc_online.R
+++ b/R/atc_online.R
@@ -99,7 +99,8 @@ atc_online_property <- function(atc_code,
  read_html <- import_fn("read_html", "xml2")

  if (!all(atc_code %in% unlist(AMR::antimicrobials$atc))) {
-    atc_code <- as.character(ab_atc(atc_code, only_first = TRUE))
+    missing <- atc_code %unlike% "[A-Z][0-9][0-9][A-Z][A-Z][0-9][0-9]"
+    atc_code[missing] <- as.character(ab_atc(atc_code[missing], only_first = TRUE))
  }

  if (!has_internet()) {
--- a/R/data.R
+++ b/R/data.R
@@ -282,7 +282,7 @@

 #' Data Set with Clinical Breakpoints for SIR Interpretation
 #'
-#' @description Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This dataset contain breakpoints for humans, `r length(unique(clinical_breakpoints$host[!clinical_breakpoints$host %in% clinical_breakpoints$type]))` different animal groups, and ECOFFs.
+#' @description Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This data set contains breakpoints for humans, `r length(unique(clinical_breakpoints$host[!clinical_breakpoints$host %in% clinical_breakpoints$type]))` different animal groups, and ECOFFs.
 #'
 #' These breakpoints are currently implemented:
 #' - For **clinical microbiology**: EUCAST `r min(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "EUCAST" & type == "human")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "EUCAST" & type == "human")$guideline)))` and CLSI `r min(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "CLSI" & type == "human")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "CLSI" & type == "human")$guideline)))`;
@@ -362,14 +362,14 @@
 #' dosage
 "dosage"

-# TODO #' Data Set with `r format(nrow(esbl_isolates), big.mark = " ")` ESBL Isolates
-# TODO #'
-# TODO #' A data set containing `r format(nrow(esbl_isolates), big.mark = " ")` microbial isolates with MIC values of common antibiotics and a binary `esbl` column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with [tidymodels](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
-# TODO #' @format A [tibble][tibble::tibble] with `r format(nrow(esbl_isolates), big.mark = " ")` observations and `r ncol(esbl_isolates)` variables:
-# TODO #' - `esbl`\cr Logical indicator if the isolate is ESBL-producing
-# TODO #' - `genus`\cr Genus of the microorganism
-# TODO #' - `AMC:COL`\cr MIC values for 17 antimicrobial agents, transformed to class [`mic`] (see [as.mic()])
-# TODO #' @details See our [tidymodels integration][amr-tidymodels] for an example using this data set.
-# TODO #' @examples
-# TODO #' esbl_isolates
-# TODO "esbl_isolates"
+#' Data Set with `r format(nrow(esbl_isolates), big.mark = " ")` ESBL Isolates
+#'
+#' A data set containing `r format(nrow(esbl_isolates), big.mark = " ")` microbial isolates with MIC values of common antibiotics and a binary `esbl` column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with [tidymodels](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
+#' @format A [tibble][tibble::tibble] with `r format(nrow(esbl_isolates), big.mark = " ")` observations and `r ncol(esbl_isolates)` variables:
+#' - `esbl`\cr Logical indicator if the isolate is ESBL-producing
+#' - `genus`\cr Genus of the microorganism
+#' - `AMC:COL`\cr MIC values for 17 antimicrobial agents, transformed to class [`mic`] (see [as.mic()])
+#' @details See our [tidymodels integration][amr-tidymodels] for an example using this data set.
+#' @examples
+#' esbl_isolates
+"esbl_isolates"
--- a/R/tidymodels.R.no_include
+++ b/R/tidymodels.R.no_include
@@ -1,20 +1,21 @@
 #' AMR Extensions for Tidymodels
 #'
-#' This family of functions allows using AMR-specific data types such as `<mic>` and `<sir>` inside `tidymodels` pipelines.
+#' This family of functions allows using AMR-specific data types such as `<sir>` and `<mic>` inside `tidymodels` pipelines.
 #' @inheritParams recipes::step_center
 #' @details
 #' You can read more in our online [AMR with tidymodels introduction](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
 #'
 #' Tidyselect helpers include:
-#' - [all_mic()] and [all_mic_predictors()] to select `<mic>` columns
-#' - [all_sir()] and [all_sir_predictors()] to select `<sir>` columns
+#' - [all_sir()] and [all_sir_predictors()] to select [`<sir>`][as.sir()] columns
+#' - [all_mic()] and [all_mic_predictors()] to select [`<mic>`][as.mic()] columns
+#' - [all_disk()] and [all_disk_predictors()] to select [`<disk>`][as.disk()] columns
 #'
 #' Pre-processing pipeline steps include:
-#' - [step_mic_log2()] to convert MIC columns to numeric (via `as.numeric()`) and apply a log2 transform, to be used with [all_mic_predictors()]
 #' - [step_sir_numeric()] to convert SIR columns to numeric (via `as.numeric()`), to be used with [all_sir_predictors()]: `"S"` = 1, `"I"`/`"SDD"` = 2, `"R"` = 3. All other values are rendered `NA`. Keep this in mind for further processing, especially if the model does not allow for `NA` values.
+#' - [step_mic_log2()] to convert MIC columns to numeric (via `as.numeric()`) and apply a log2 transform, to be used with [all_mic_predictors()]
 #'
 #' These steps integrate with `recipes::recipe()` and work like standard preprocessing steps. They are useful for preparing data for modelling, especially with classification models.
-#' @seealso [recipes::recipe()], [as.mic()], [as.sir()]
+#' @seealso [recipes::recipe()], [as.sir()], [as.mic()], [as.disk()]
 #' @name amr-tidymodels
 #' @keywords internal
 #' @export
@@ -66,35 +67,55 @@
 #'     bind_cols(out_testing)
 #'
 #'   # Evaluate predictions using standard classification metrics
-#'   our_metrics <- metric_set(accuracy, kap, ppv, npv)
+#'   our_metrics <- metric_set(accuracy,
+#'                             recall,
+#'                             precision,
+#'                             sensitivity,
+#'                             specificity,
+#'                             ppv,
+#'                             npv)
 #'   metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
 #'
 #'   # Show performance
 #'   metrics
 #' }
-all_mic <- function() {
-  x <- tidymodels_amr_select(levels(NA_mic_))
-  names(x)
-}
-
-#' @rdname amr-tidymodels
-#' @export
-all_mic_predictors <- function() {
-  x <- tidymodels_amr_select(levels(NA_mic_))
-  intersect(x, recipes::has_role("predictor"))
-}
-
-#' @rdname amr-tidymodels
-#' @export
 all_sir <- function() {
-  x <- tidymodels_amr_select(levels(NA_sir_))
+  x <- tidymodels_amr_select(class = "sir")
  names(x)
 }

 #' @rdname amr-tidymodels
 #' @export
 all_sir_predictors <- function() {
-  x <- tidymodels_amr_select(levels(NA_sir_))
+  x <- tidymodels_amr_select(class = "sir")
+  intersect(x, recipes::has_role("predictor"))
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_mic <- function() {
+  x <- tidymodels_amr_select(class = "mic")
+  names(x)
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_mic_predictors <- function() {
+  x <- tidymodels_amr_select(class = "mic")
+  intersect(x, recipes::has_role("predictor"))
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_disk <- function() {
+  x <- tidymodels_amr_select(class = "disk")
+  names(x)
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_disk_predictors <- function() {
+  x <- tidymodels_amr_select(class = "disk")
  intersect(x, recipes::has_role("predictor"))
 }

@@ -160,7 +181,6 @@ bake.step_mic_log2 <- function(object, new_data, ...) {
 print.step_mic_log2 <- function(x, width = max(20, options()$width - 35), ...) {
  title <- "Log2 transformation of MIC columns"
  recipes::print_step(x$columns, x$terms, x$trained, title, width)
-  invisible(x)
 }

 #' @rawNamespace if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_mic_log2)
@@ -236,7 +256,6 @@ bake.step_sir_numeric <- function(object, new_data, ...) {
 print.step_sir_numeric <- function(x, width = max(20, options()$width - 35), ...) {
  title <- "Numeric transformation of SIR columns"
  recipes::print_step(x$columns, x$terms, x$trained, title, width)
-  invisible(x)
 }

 #' @rawNamespace if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_sir_numeric)
@@ -250,13 +269,13 @@ tidy.step_sir_numeric <- function(x, ...) {
  res
 }

-tidymodels_amr_select <- function(check_vector) {
+tidymodels_amr_select <- function(class) {
  df <- get_current_data()
  ind <- which(
    vapply(
      FUN.VALUE = logical(1),
      df,
-      function(x) all(x %in% c(check_vector, NA), na.rm = TRUE) & any(x %in% check_vector),
+      function(x) inherits(x, class),
      USE.NAMES = TRUE
    ),
    useNames = TRUE
--- a/data/esbl_isolates.rda.no_include
+++ b/data/esbl_isolates.rda.no_include
--- a/man/amr-tidymodels.Rd
+++ b/man/amr-tidymodels.Rd
@@ -0,0 +1,138 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tidymodels.R
+\name{amr-tidymodels}
+\alias{amr-tidymodels}
+\alias{all_sir}
+\alias{all_sir_predictors}
+\alias{all_mic}
+\alias{all_mic_predictors}
+\alias{all_disk}
+\alias{all_disk_predictors}
+\alias{step_mic_log2}
+\alias{step_sir_numeric}
+\title{AMR Extensions for Tidymodels}
+\usage{
+all_sir()
+
+all_sir_predictors()
+
+all_mic()
+
+all_mic_predictors()
+
+all_disk()
+
+all_disk_predictors()
+
+step_mic_log2(recipe, ..., role = NA, trained = FALSE, columns = NULL,
+  skip = FALSE, id = recipes::rand_id("mic_log2"))
+
+step_sir_numeric(recipe, ..., role = NA, trained = FALSE, columns = NULL,
+  skip = FALSE, id = recipes::rand_id("sir_numeric"))
+}
+\arguments{
+\item{recipe}{A recipe object. The step will be added to the sequence of
+operations for this recipe.}
+
+\item{...}{One or more selector functions to choose variables for this step.
+See \code{\link[recipes:selections]{selections()}} for more details.}
+
+\item{role}{Not used by this step since no new variables are created.}
+
+\item{trained}{A logical to indicate if the quantities for preprocessing have
+been estimated.}
+
+\item{skip}{A logical. Should the step be skipped when the recipe is baked by
+\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some
+operations may not be able to be conducted on new data (e.g. processing the
+outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it
+may affect the computations for subsequent operations.}
+
+\item{id}{A character string that is unique to this step to identify it.}
+}
+\description{
+This family of functions allows using AMR-specific data types such as \verb{<sir>} and \verb{<mic>} inside \code{tidymodels} pipelines.
+}
+\details{
+You can read more in our online \href{https://amr-for-r.org/articles/AMR_with_tidymodels.html}{AMR with tidymodels introduction}.
+
+Tidyselect helpers include:
+\itemize{
+\item \code{\link[=all_sir]{all_sir()}} and \code{\link[=all_sir_predictors]{all_sir_predictors()}} to select \code{\link[=as.sir]{<sir>}} columns
+\item \code{\link[=all_mic]{all_mic()}} and \code{\link[=all_mic_predictors]{all_mic_predictors()}} to select \code{\link[=as.mic]{<mic>}} columns
+\item \code{\link[=all_disk]{all_disk()}} and \code{\link[=all_disk_predictors]{all_disk_predictors()}} to select \code{\link[=as.disk]{<disk>}} columns
+}
+
+Pre-processing pipeline steps include:
+\itemize{
+\item \code{\link[=step_sir_numeric]{step_sir_numeric()}} to convert SIR columns to numeric (via \code{as.numeric()}), to be used with \code{\link[=all_sir_predictors]{all_sir_predictors()}}: \code{"S"} = 1, \code{"I"}/\code{"SDD"} = 2, \code{"R"} = 3. All other values are rendered \code{NA}. Keep this in mind for further processing, especially if the model does not allow for \code{NA} values.
+\item \code{\link[=step_mic_log2]{step_mic_log2()}} to convert MIC columns to numeric (via \code{as.numeric()}) and apply a log2 transform, to be used with \code{\link[=all_mic_predictors]{all_mic_predictors()}}
+}
+
+These steps integrate with \code{recipes::recipe()} and work like standard preprocessing steps. They are useful for preparing data for modelling, especially with classification models.
+}
+\examples{
+if (require("tidymodels")) {
+
+  # The below approach formed the basis for this paper: DOI 10.3389/fmicb.2025.1582703
+  # Presence of ESBL genes was predicted based on raw MIC values.
+
+
+  # example data set in the AMR package
+  esbl_isolates
+
+  # Prepare a binary outcome and convert to ordered factor
+  data <- esbl_isolates \%>\%
+    mutate(esbl = factor(esbl, levels = c(FALSE, TRUE), ordered = TRUE))
+
+  # Split into training and testing sets
+  split <- initial_split(data)
+  training_data <- training(split)
+  testing_data <- testing(split)
+
+  # Create and prep a recipe with MIC log2 transformation
+  mic_recipe <- recipe(esbl ~ ., data = training_data) \%>\%
+
+    # Optionally remove non-predictive variables
+    remove_role(genus, old_role = "predictor") \%>\%
+
+    # Apply the log2 transformation to all MIC predictors
+    step_mic_log2(all_mic_predictors()) \%>\%
+
+    # And apply the preparation steps
+    prep()
+
+  # View prepped recipe
+  mic_recipe
+
+  # Apply the recipe to training and testing data
+  out_training <- bake(mic_recipe, new_data = NULL)
+  out_testing <- bake(mic_recipe, new_data = testing_data)
+
+  # Fit a logistic regression model
+  fitted <- logistic_reg(mode = "classification") \%>\%
+    set_engine("glm") \%>\%
+    fit(esbl ~ ., data = out_training)
+
+  # Generate predictions on the test set
+  predictions <- predict(fitted, out_testing) \%>\%
+    bind_cols(out_testing)
+
+  # Evaluate predictions using standard classification metrics
+  our_metrics <- metric_set(accuracy,
+                            recall,
+                            precision,
+                            sensitivity,
+                            specificity,
+                            ppv,
+                            npv)
+  metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
+
+  # Show performance
+  metrics
+}
+}
+\seealso{
+\code{\link[recipes:recipe]{recipes::recipe()}}, \code{\link[=as.sir]{as.sir()}}, \code{\link[=as.mic]{as.mic()}}, \code{\link[=as.disk]{as.disk()}}
+}
+\keyword{internal}
--- a/man/clinical_breakpoints.Rd
+++ b/man/clinical_breakpoints.Rd
@@ -27,7 +27,7 @@ A \link[tibble:tibble]{tibble} with 40 217 observations and 14 variables:
 clinical_breakpoints
 }
 \description{
-Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This dataset contain breakpoints for humans, 7 different animal groups, and ECOFFs.
+Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This data set contains breakpoints for humans, 7 different animal groups, and ECOFFs.

 These breakpoints are currently implemented:
 \itemize{
--- a/man/esbl_isolates.Rd
+++ b/man/esbl_isolates.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{esbl_isolates}
+\alias{esbl_isolates}
+\title{Data Set with 500 ESBL Isolates}
+\format{
+A \link[tibble:tibble]{tibble} with 500 observations and 19 variables:
+\itemize{
+\item \code{esbl}\cr Logical indicator if the isolate is ESBL-producing
+\item \code{genus}\cr Genus of the microorganism
+\item \code{AMC:COL}\cr MIC values for 17 antimicrobial agents, transformed to class \code{\link{mic}} (see \code{\link[=as.mic]{as.mic()}})
+}
+}
+\usage{
+esbl_isolates
+}
+\description{
+A data set containing 500 microbial isolates with MIC values of common antibiotics and a binary \code{esbl} column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with \href{https://amr-for-r.org/articles/AMR_with_tidymodels.html}{tidymodels}.
+}
+\details{
+See our \link[=amr-tidymodels]{tidymodels integration} for an example using this data set.
+}
+\examples{
+esbl_isolates
+}
+\keyword{datasets}
--- a/tests/testthat/test-tidymodels.R
+++ b/tests/testthat/test-tidymodels.R
@@ -0,0 +1,84 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+test_that("tidymodels.R", {
+  skip_on_cran()
+
+  if (AMR:::pkg_is_available("recipes", also_load = TRUE) && AMR:::pkg_is_available("dplyr", also_load = TRUE)) {
+    # SIR: step_sir_numeric() on all_sir() should turn <sir> columns into numeric; the character column is expected back as a factor
+    df <- tibble(
+      sir1 = as.sir(c("S", "I", "R", "S", "R")),
+      sir2 = as.sir(c("I", "R", "S", "R", "I")),
+      not_sir = c("S", "R", "R", "S", "I")
+    )
+    rec <- recipe(~., data = df) %>% step_sir_numeric(all_sir())
+    prepped <- prep(rec)
+    baked <- bake(prepped, new_data = df)
+    expect_inherits(baked$sir1, "numeric")
+    expect_inherits(baked$sir2, "numeric")
+    expect_equal(baked$not_sir, as.factor(df$not_sir))
+
+    # MIC: step_mic_log2() on all_mic() should yield log2(as.numeric()) for <mic> columns and leave the plain numeric column unchanged
+    df <- tibble(
+      mic_col1 = as.mic(c("<=0.002", "0.002", "0.004", "0.016", "32")),
+      mic_col2 = as.mic(c("0.5", "1", "2", "4", "8")),
+      non_mic = c(1, 2, 3, 4, 5)
+    )
+    rec <- recipe(~., data = df) %>% step_mic_log2(all_mic())
+    prepped <- prep(rec)
+    baked <- bake(prepped, new_data = df)
+    expect_inherits(baked$mic_col1, "numeric")
+    expect_inherits(baked$mic_col2, "numeric")
+    expect_equal(baked$non_mic, df$non_mic)
+    expect_equal(baked$mic_col2, log2(as.numeric(df$mic_col2)))
+
+    # disk: removing everything that is not selected by all_disk() should keep the <disk> column with its class intact
+    df <- tibble(
+      disk_col = as.disk(c(21, 22, 23, 24, 25)),
+      non_disk = c(21, 22, 23, 24, 25)
+    )
+    rec <- recipe(~., data = df) %>% step_rm(!all_disk())
+    prepped <- prep(rec)
+    baked <- bake(prepped, new_data = df)
+    expect_inherits(baked$disk_col, "disk")
+
+    # steps check: tidy() on a prepped recipe should report the transformed column under `terms` (first for MIC, then for SIR)
+    df <- tibble(x = as.mic(c("1", "2", "4")))
+    rec <- recipe(~x, data = df) %>% step_mic_log2(all_mic())
+    prepped <- prep(rec)
+    tidy_df <- tidy(prepped, number = 1)
+    expect_equal(unname(tidy_df$terms), "x")
+
+    df <- tibble(x = as.sir(c("S", "I", "R")))
+    rec <- recipe(~x, data = df) %>% step_sir_numeric(all_sir())
+    prepped <- prep(rec)
+    tidy_df <- tidy(prepped, number = 1)
+    expect_equal(unname(tidy_df$terms), "x")
+  }
+})
--- a/vignettes/AMR_with_tidymodels.Rmd
+++ b/vignettes/AMR_with_tidymodels.Rmd
@@ -22,7 +22,7 @@ knitr::opts_chunk$set(
 )
 ```

-> This page was entirely written by our [AMR for R Assistant](https://chat.amr-for-r.org), a ChatGPT manually-trained model able to answer any question about the `AMR` package.
+> This page was almost entirely written by our [AMR for R Assistant](https://chat.amr-for-r.org), a ChatGPT manually-trained model able to answer any question about the `AMR` package.

 Antimicrobial resistance (AMR) is a global health crisis, and understanding resistance patterns is crucial for managing effective treatments. The `AMR` R package provides robust tools for analysing AMR data, including convenient antimicrobial selector functions like `aminoglycosides()` and `betalactams()`. 

@@ -219,18 +219,163 @@ This workflow is extensible to other antimicrobial classes and resistance patter

 In this second example, we demonstrate how to use `<mic>` columns directly in `tidymodels` workflows using AMR-specific recipe steps. This includes a transformation to `log2` scale using `step_mic_log2()`, which prepares MIC values for use in classification models.

-This approach and idea formed the basis for the publication [DOI: 10.3389/fmicb.2025.1582703](https://doi.org/10.3389/fmicb.2025.1582703) to model the presence of extended-spectrum beta-lactamases (ESBL).
+This approach and idea formed the basis for the publication [DOI: 10.3389/fmicb.2025.1582703](https://doi.org/10.3389/fmicb.2025.1582703) to model the presence of extended-spectrum beta-lactamases (ESBL) based on MIC values.

-> NOTE: THIS EXAMPLE WILL BE AVAILABLE IN A NEXT VERSION (#TODO)
-> 
-> The new AMR package version will contain new tidymodels selectors such as `step_mic_log2()`.
+### **Objective**

-<!-- TODO for AMR v3.1.0: add info from here: https://github.com/msberends/AMR/blob/2461631bcefa78ebdb37bdfad359be74cdd9165a/vignettes/AMR_with_tidymodels.Rmd#L212-L291 -->
+Our goal is to:
+
+1. Use raw MIC values to predict whether a bacterial isolate produces ESBL.
+2. Apply AMR-aware preprocessing in a `tidymodels` recipe.
+3. Train a classification model and evaluate its predictive performance.
+
+### **Data Preparation**
+
+We use the `esbl_isolates` data set that comes with the AMR package.
+
+```{r}
+# Load required libraries
+library(AMR)
+library(tidymodels)
+
+# View the esbl_isolates data set
+esbl_isolates
+
+# Prepare a binary outcome and convert to ordered factor
+data <- esbl_isolates %>%
+  mutate(esbl = factor(esbl, levels = c(FALSE, TRUE), ordered = TRUE))
+```
+
+**Explanation:**
+
+- `esbl_isolates`: Contains MIC test results and ESBL status for each isolate.
+- `mutate(esbl = ...)`: Converts the target column to an ordered factor for classification.
+
+### **Defining the Workflow**
+
+#### 1. Preprocessing with a Recipe
+
+We use our `step_mic_log2()` function to log2-transform MIC values, ensuring that MICs are numeric and properly scaled. All MIC predictors can easily and agnostically be selected using the new `all_mic_predictors()`:
+
+```{r}
+# Split into training and testing sets
+set.seed(123)
+split <- initial_split(data)
+training_data <- training(split)
+testing_data <- testing(split)
+
+# Define the recipe
+mic_recipe <- recipe(esbl ~ ., data = training_data) %>%
+  remove_role(genus, old_role = "predictor") %>%  # Remove non-informative variable
+  step_mic_log2(all_mic_predictors()) #%>%         # Log2 transform all MIC predictors
+ # prep()
+
+mic_recipe
+```
+
+**Explanation:**
+
+- `remove_role()`: Removes irrelevant variables like genus.
+- `step_mic_log2()`: Applies `log2(as.numeric(...))` to all MIC predictors in one go.
+- `prep()`: Not called explicitly here; the recipe is prepped on the training data when the workflow below is fitted.
+
+#### 2. Specifying the Model
+
+We use a simple logistic regression to model ESBL presence, though recent models such as xgboost ([link to `parsnip` manual](https://parsnip.tidymodels.org/reference/details_boost_tree_xgboost.html)) could be much more precise.
+
+```{r}
+# Define the model
+model <- logistic_reg(mode = "classification") %>%
+  set_engine("glm")
+
+model
+```
+
+**Explanation:**
+
+- `logistic_reg()`: Specifies a binary classification model.
+- `set_engine("glm")`: Uses the base R GLM engine.
+
+#### 3. Building the Workflow
+
+```{r}
+# Create workflow
+workflow_model <- workflow() %>%
+  add_recipe(mic_recipe) %>%
+  add_model(model)
+
+workflow_model
+```
+
+### **Training and Evaluating the Model**
+
+```{r}
+# Fit the workflow (recipe + model) on the training data
+fitted <- fit(workflow_model, training_data)
+
+# Generate class predictions and class probabilities for the same held-out test set
+predictions <- predict(fitted, testing_data) %>%
+  bind_cols(predict(fitted, testing_data, type = "prob")) %>% # add probabilities
+  bind_cols(testing_data)
+
+# Evaluate model performance with a set of classification metrics
+our_metrics <- metric_set(accuracy, recall, precision, sensitivity, specificity, ppv, npv)
+metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
+
+metrics
+```
+
+**Explanation:**
+
+- `fit()`: Trains the model on the processed training data.
+- `predict()`: Produces predictions for unseen test data.
+- `metric_set()`: Allows evaluating multiple classification metrics. This turns `our_metrics` into a function that we can use to check the predictions.
+
+It appears we can predict ESBL gene presence with a positive predictive value (PPV) of `r round(metrics[metrics$.metric == "ppv", ]$.estimate, 3) * 100`% and a negative predictive value (NPV) of `r round(metrics[metrics$.metric == "npv", ]$.estimate, 3) * 100`% using a simplistic logistic regression model.
+
+### **Visualising Predictions**
+
+We can visualise predictions by comparing predicted and actual ESBL status.
+
+```{r}
+library(ggplot2)
+
+ggplot(predictions, aes(x = esbl, fill = .pred_class)) +
+  geom_bar(position = "stack") +
+  labs(title = "Predicted vs Actual ESBL Status",
+       x = "Actual ESBL",
+       y = "Count") +
+  theme_minimal()
+```
+
+And plot the certainties too - how certain were the actual predictions?
+
+```{r}
+predictions %>%
+  mutate(certainty = ifelse(.pred_class == "FALSE",
+                            .pred_FALSE,
+                            .pred_TRUE),
+         correct = ifelse(esbl == .pred_class, "Right", "Wrong")) %>%
+  ggplot(aes(x = seq_len(nrow(predictions)),
+             y = certainty,
+             colour = correct)) +
+  scale_colour_manual(values = c(Right = "green3", Wrong = "red2"),
+                      name = "Correct?") +
+  geom_point() + 
+  scale_y_continuous(labels = function(x) paste0(x * 100, "%"),
+                     limits = c(0.5, 1)) +
+  theme_minimal()
+```
+### **Conclusion**
+
+In this example, we showcased how the new `AMR`-specific recipe steps simplify working with `<mic>` columns in `tidymodels`. The `step_mic_log2()` transformation converts ordered MICs to log2-transformed numerics, improving compatibility with classification models.
+
+This pipeline enables realistic, reproducible, and interpretable modelling of antimicrobial resistance data.

 ---


-## Example 2: Predicting AMR Over Time
+## Example 3: Predicting AMR Over Time

 In this third example, we aim to predict antimicrobial resistance (AMR) trends over time using `tidymodels`. We will model resistance to three antibiotics (amoxicillin `AMX`, amoxicillin-clavulanic acid `AMC`, and ciprofloxacin `CIP`), based on historical data grouped by year and hospital ward.