(v3.0.1.9005) re-add tidymodels implementation

2026-02-09 15:12:52 +01:00 · 2025-12-21 12:19:43 +01:00
parent 225c73f7e7
commit 151af21f38
16 changed files with 502 additions and 61 deletions
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -966,8 +966,13 @@ get_current_data <- function(arg_name, call) {
      # an element `.data` will be in the environment when using dplyr::select()
      return(env$`.data`)
    } else if (valid_df(env$training)) {
-      # an element `training` will be in the environment when using some tidymodels functions such as `prep()`
-      return(env$training)
+      if (!is.null(env$x) && valid_df(env$x$template)) {
+        # an element `x$template` will be in the environment when using some tidymodels functions such as `prep()`
+        return(env$x$template)
+      } else {
+        # this is a fallback for some tidymodels functions such as `prep()`
+        return(env$training)
+      }
    } else if (valid_df(env$data)) {
      # an element `data` will be in the environment when using older dplyr versions, or some tidymodels functions such as `fit()`
      return(env$data)
--- a/R/antibiogram.R
+++ b/R/antibiogram.R
@@ -163,7 +163,7 @@
 #'          antimicrobials = c("TZP", "TZP+TOB", "TZP+GEN"))
 #'    ```
 #'
-#'    WISCA uses a sophisticated Bayesian decision model to combine both local and pooled antimicrobial resistance data. This approach not only evaluates local patterns but can also draw on multi-centre datasets to improve regimen accuracy, even in low-incidence infections like paediatric bloodstream infections (BSIs).
+#'    WISCA uses a sophisticated Bayesian decision model to combine both local and pooled antimicrobial resistance data. This approach not only evaluates local patterns but can also draw on multi-centre data sets to improve regimen accuracy, even in low-incidence infections like paediatric bloodstream infections (BSIs).
 #'
 #' ### Grouped tibbles
 #'
--- a/R/atc_online.R
+++ b/R/atc_online.R
@@ -99,7 +99,8 @@ atc_online_property <- function(atc_code,
  read_html <- import_fn("read_html", "xml2")

  if (!all(atc_code %in% unlist(AMR::antimicrobials$atc))) {
-    atc_code <- as.character(ab_atc(atc_code, only_first = TRUE))
+    missing <- atc_code %unlike% "[A-Z][0-9][0-9][A-Z][A-Z][0-9][0-9]"
+    atc_code[missing] <- as.character(ab_atc(atc_code[missing], only_first = TRUE))
  }

  if (!has_internet()) {
--- a/R/data.R
+++ b/R/data.R
@@ -282,7 +282,7 @@

 #' Data Set with Clinical Breakpoints for SIR Interpretation
 #'
-#' @description Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This dataset contain breakpoints for humans, `r length(unique(clinical_breakpoints$host[!clinical_breakpoints$host %in% clinical_breakpoints$type]))` different animal groups, and ECOFFs.
+#' @description Data set containing clinical breakpoints to interpret MIC and disk diffusion to SIR values, according to international guidelines. This data set contains breakpoints for humans, `r length(unique(clinical_breakpoints$host[!clinical_breakpoints$host %in% clinical_breakpoints$type]))` different animal groups, and ECOFFs.
 #'
 #' These breakpoints are currently implemented:
 #' - For **clinical microbiology**: EUCAST `r min(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "EUCAST" & type == "human")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "EUCAST" & type == "human")$guideline)))` and CLSI `r min(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "CLSI" & type == "human")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(AMR::clinical_breakpoints, guideline %like% "CLSI" & type == "human")$guideline)))`;
@@ -362,14 +362,14 @@
 #' dosage
 "dosage"

-# TODO #' Data Set with `r format(nrow(esbl_isolates), big.mark = " ")` ESBL Isolates
-# TODO #'
-# TODO #' A data set containing `r format(nrow(esbl_isolates), big.mark = " ")` microbial isolates with MIC values of common antibiotics and a binary `esbl` column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with [tidymodels](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
-# TODO #' @format A [tibble][tibble::tibble] with `r format(nrow(esbl_isolates), big.mark = " ")` observations and `r ncol(esbl_isolates)` variables:
-# TODO #' - `esbl`\cr Logical indicator if the isolate is ESBL-producing
-# TODO #' - `genus`\cr Genus of the microorganism
-# TODO #' - `AMC:COL`\cr MIC values for 17 antimicrobial agents, transformed to class [`mic`] (see [as.mic()])
-# TODO #' @details See our [tidymodels integration][amr-tidymodels] for an example using this data set.
-# TODO #' @examples
-# TODO #' esbl_isolates
-# TODO "esbl_isolates"
+#' Data Set with `r format(nrow(esbl_isolates), big.mark = " ")` ESBL Isolates
+#'
+#' A data set containing `r format(nrow(esbl_isolates), big.mark = " ")` microbial isolates with MIC values of common antibiotics and a binary `esbl` column for extended-spectrum beta-lactamase (ESBL) production. This data set contains randomised fictitious data but reflects reality and can be used to practise AMR-related machine learning, e.g., classification modelling with [tidymodels](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
+#' @format A [tibble][tibble::tibble] with `r format(nrow(esbl_isolates), big.mark = " ")` observations and `r ncol(esbl_isolates)` variables:
+#' - `esbl`\cr Logical indicator if the isolate is ESBL-producing
+#' - `genus`\cr Genus of the microorganism
+#' - `AMC:COL`\cr MIC values for 17 antimicrobial agents, transformed to class [`mic`] (see [as.mic()])
+#' @details See our [tidymodels integration][amr-tidymodels] for an example using this data set.
+#' @examples
+#' esbl_isolates
+"esbl_isolates"
--- a/R/tidymodels.R.no_include
+++ b/R/tidymodels.R.no_include
@@ -1,20 +1,21 @@
 #' AMR Extensions for Tidymodels
 #'
-#' This family of functions allows using AMR-specific data types such as `<mic>` and `<sir>` inside `tidymodels` pipelines.
+#' This family of functions allows using AMR-specific data types such as `<sir>` and `<mic>` inside `tidymodels` pipelines.
 #' @inheritParams recipes::step_center
 #' @details
 #' You can read more in our online [AMR with tidymodels introduction](https://amr-for-r.org/articles/AMR_with_tidymodels.html).
 #'
 #' Tidyselect helpers include:
-#' - [all_mic()] and [all_mic_predictors()] to select `<mic>` columns
-#' - [all_sir()] and [all_sir_predictors()] to select `<sir>` columns
+#' - [all_sir()] and [all_sir_predictors()] to select [`<sir>`][as.sir()] columns
+#' - [all_mic()] and [all_mic_predictors()] to select [`<mic>`][as.mic()] columns
+#' - [all_disk()] and [all_disk_predictors()] to select [`<disk>`][as.disk()] columns
 #'
 #' Pre-processing pipeline steps include:
-#' - [step_mic_log2()] to convert MIC columns to numeric (via `as.numeric()`) and apply a log2 transform, to be used with [all_mic_predictors()]
 #' - [step_sir_numeric()] to convert SIR columns to numeric (via `as.numeric()`), to be used with [all_sir_predictors()]: `"S"` = 1, `"I"`/`"SDD"` = 2, `"R"` = 3. All other values are rendered `NA`. Keep this in mind for further processing, especially if the model does not allow for `NA` values.
+#' - [step_mic_log2()] to convert MIC columns to numeric (via `as.numeric()`) and apply a log2 transform, to be used with [all_mic_predictors()]
 #'
 #' These steps integrate with `recipes::recipe()` and work like standard preprocessing steps. They are useful for preparing data for modelling, especially with classification models.
-#' @seealso [recipes::recipe()], [as.mic()], [as.sir()]
+#' @seealso [recipes::recipe()], [as.sir()], [as.mic()], [as.disk()]
 #' @name amr-tidymodels
 #' @keywords internal
 #' @export
@@ -66,35 +67,55 @@
 #'     bind_cols(out_testing)
 #'
 #'   # Evaluate predictions using standard classification metrics
-#'   our_metrics <- metric_set(accuracy, kap, ppv, npv)
+#'   our_metrics <- metric_set(accuracy,
+#'                             recall,
+#'                             precision,
+#'                             sensitivity,
+#'                             specificity,
+#'                             ppv,
+#'                             npv)
 #'   metrics <- our_metrics(predictions, truth = esbl, estimate = .pred_class)
 #'
 #'   # Show performance
 #'   metrics
 #' }
-all_mic <- function() {
-  x <- tidymodels_amr_select(levels(NA_mic_))
-  names(x)
-}
-
-#' @rdname amr-tidymodels
-#' @export
-all_mic_predictors <- function() {
-  x <- tidymodels_amr_select(levels(NA_mic_))
-  intersect(x, recipes::has_role("predictor"))
-}
-
-#' @rdname amr-tidymodels
-#' @export
 all_sir <- function() {
-  x <- tidymodels_amr_select(levels(NA_sir_))
+  x <- tidymodels_amr_select(class = "sir")
  names(x)
 }

 #' @rdname amr-tidymodels
 #' @export
 all_sir_predictors <- function() {
-  x <- tidymodels_amr_select(levels(NA_sir_))
+  x <- tidymodels_amr_select(class = "sir")
+  intersect(x, recipes::has_role("predictor"))
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_mic <- function() {
+  x <- tidymodels_amr_select(class = "mic")
+  names(x)
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_mic_predictors <- function() {
+  x <- tidymodels_amr_select(class = "mic")
+  intersect(x, recipes::has_role("predictor"))
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_disk <- function() {
+  x <- tidymodels_amr_select(class = "disk")
+  names(x)
+}
+
+#' @rdname amr-tidymodels
+#' @export
+all_disk_predictors <- function() {
+  x <- tidymodels_amr_select(class = "disk")
  intersect(x, recipes::has_role("predictor"))
 }

@@ -160,7 +181,6 @@ bake.step_mic_log2 <- function(object, new_data, ...) {
 print.step_mic_log2 <- function(x, width = max(20, options()$width - 35), ...) {
  title <- "Log2 transformation of MIC columns"
  recipes::print_step(x$columns, x$terms, x$trained, title, width)
-  invisible(x)
 }

 #' @rawNamespace if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_mic_log2)
@@ -236,7 +256,6 @@ bake.step_sir_numeric <- function(object, new_data, ...) {
 print.step_sir_numeric <- function(x, width = max(20, options()$width - 35), ...) {
  title <- "Numeric transformation of SIR columns"
  recipes::print_step(x$columns, x$terms, x$trained, title, width)
-  invisible(x)
 }

 #' @rawNamespace if(getRversion() >= "3.0.0") S3method(recipes::tidy, step_sir_numeric)
@@ -250,13 +269,13 @@ tidy.step_sir_numeric <- function(x, ...) {
  res
 }

-tidymodels_amr_select <- function(check_vector) {
+tidymodels_amr_select <- function(class) {
  df <- get_current_data()
  ind <- which(
    vapply(
      FUN.VALUE = logical(1),
      df,
-      function(x) all(x %in% c(check_vector, NA), na.rm = TRUE) & any(x %in% check_vector),
+      function(x) inherits(x, class),
      USE.NAMES = TRUE
    ),
    useNames = TRUE