Add add_if_missing parameter to control NA handling in interpretive rules (#264)

2026-06-24 19:36:23 +02:00 · 2026-04-21 21:53:43 +02:00
parent fb8758f36b
commit 8ff5d4472a
46 changed files with 1232 additions and 1016 deletions
--- a/vignettes/AMR_with_tidymodels.Rmd
+++ b/vignettes/AMR_with_tidymodels.Rmd
@@ -53,8 +53,8 @@ We begin by loading the required libraries and preparing the `example_isolates`

 ```{r lib packages, message = FALSE, warning = FALSE, results = 'asis'}
 # Load required libraries
-library(AMR)          # For AMR data analysis
-library(tidymodels)   # For machine learning workflows, and data manipulation (dplyr, tidyr, ...)
+library(AMR) # For AMR data analysis
+library(tidymodels) # For machine learning workflows, and data manipulation (dplyr, tidyr, ...)
 ```

 Prepare the data:
@@ -68,13 +68,19 @@ data <- example_isolates %>%
  # select AB results dynamically
  select(mo, aminoglycosides(), betalactams()) %>%
  # replace NAs with NI (not-interpretable)
-   mutate(across(where(is.sir),
-                 ~replace_na(.x, "NI")),
-          # make factors of SIR columns
-          across(where(is.sir),
-                 as.integer),
-          # get Gramstain of microorganisms
-          mo = as.factor(mo_gramstain(mo))) %>%
+  mutate(
+    across(
+      where(is.sir),
+      ~ replace_na(.x, "NI")
+    ),
+    # make factors of SIR columns
+    across(
+      where(is.sir),
+      as.integer
+    ),
+    # get Gramstain of microorganisms
+    mo = as.factor(mo_gramstain(mo))
+  ) %>%
  # drop NAs - the ones without a Gramstain (fungi, etc.)
  drop_na()
 ```
@@ -149,7 +155,7 @@ To train the model, we split the data into training and testing sets. Then, we f
 set.seed(123) # For reproducibility
 data_split <- initial_split(data, prop = 0.8) # 80% training, 20% testing
 training_data <- training(data_split) # Training set
-testing_data <- testing(data_split)   # Testing set
+testing_data <- testing(data_split) # Testing set

 # Fit the workflow to the training data
 fitted_workflow <- resistance_workflow %>%
@@ -168,7 +174,7 @@ Next, we evaluate the model on the testing data.
 ```{r}
 # Make predictions on the testing set
 predictions <- fitted_workflow %>%
-  predict(testing_data)                # Generate predictions
+  predict(testing_data) # Generate predictions
 probabilities <- fitted_workflow %>%
  predict(testing_data, type = "prob") # Generate probabilities

@@ -266,8 +272,8 @@ testing_data <- testing(split)

 # Define the recipe
 mic_recipe <- recipe(esbl ~ ., data = training_data) %>%
-  remove_role(genus, old_role = "predictor") %>%  # Remove non-informative variable
-  step_mic_log2(all_mic_predictors())             # Log2 transform all MIC predictors
+  remove_role(genus, old_role = "predictor") %>% # Remove non-informative variable
+  step_mic_log2(all_mic_predictors()) # Log2 transform all MIC predictors

 prep(mic_recipe)
 ```
@@ -341,9 +347,11 @@ library(ggplot2)

 ggplot(predictions, aes(x = esbl, fill = .pred_class)) +
  geom_bar(position = "stack") +
-  labs(title = "Predicted vs Actual ESBL Status",
-       x = "Actual ESBL",
-       y = "Count") +
+  labs(
+    title = "Predicted vs Actual ESBL Status",
+    x = "Actual ESBL",
+    y = "Count"
+  ) +
  theme_minimal()
 ```

@@ -351,18 +359,27 @@ And plot the certainties too - how certain were the actual predictions?

 ```{r}
 predictions %>%
-  mutate(certainty = ifelse(.pred_class == "FALSE",
-                            .pred_FALSE,
-                            .pred_TRUE),
-         correct = ifelse(esbl == .pred_class, "Right", "Wrong")) %>%
-  ggplot(aes(x = seq_len(nrow(predictions)),
-             y = certainty,
-             colour = correct)) +
-  scale_colour_manual(values = c(Right = "green3", Wrong = "red2"),
-                      name = "Correct?") +
+  mutate(
+    certainty = ifelse(.pred_class == "FALSE",
+      .pred_FALSE,
+      .pred_TRUE
+    ),
+    correct = ifelse(esbl == .pred_class, "Right", "Wrong")
+  ) %>%
+  ggplot(aes(
+    x = seq_len(nrow(predictions)),
+    y = certainty,
+    colour = correct
+  )) +
+  scale_colour_manual(
+    values = c(Right = "green3", Wrong = "red2"),
+    name = "Correct?"
+  ) +
  geom_point() +
-  scale_y_continuous(labels = function(x) paste0(x * 100, "%"),
-                     limits = c(0.5, 1)) +
+  scale_y_continuous(
+    labels = function(x) paste0(x * 100, "%"),
+    limits = c(0.5, 1)
+  ) +
  theme_minimal()
 ```

@@ -399,13 +416,18 @@ library(tidymodels)
 # Transform dataset
 data_time <- example_isolates %>%
  top_n_microorganisms(n = 10) %>% # Filter on the top #10 species
-  mutate(year = as.integer(format(date, "%Y")),  # Extract year from date
-         gramstain = mo_gramstain(mo)) %>% # Get taxonomic names
+  mutate(
+    year = as.integer(format(date, "%Y")), # Extract year from date
+    gramstain = mo_gramstain(mo)
+  ) %>% # Get taxonomic names
  group_by(year, gramstain) %>%
-  summarise(across(c(AMX, AMC, CIP), 
-                   function(x) resistance(x, minimum = 0),
-                   .names = "res_{.col}"), 
-            .groups = "drop") %>% 
+  summarise(
+    across(c(AMX, AMC, CIP),
+      function(x) resistance(x, minimum = 0),
+      .names = "res_{.col}"
+    ),
+    .groups = "drop"
+  ) %>%
  filter(!is.na(res_AMX) & !is.na(res_AMC) & !is.na(res_CIP)) # Drop missing values

 data_time
@@ -426,9 +448,9 @@ We now define the modelling workflow, which consists of a preprocessing step, a
 ```{r}
 # Define the recipe
 resistance_recipe_time <- recipe(res_AMX ~ year + gramstain, data = data_time) %>%
-  step_dummy(gramstain, one_hot = TRUE) %>%  # Convert categorical to numerical
-  step_normalize(year) %>%  # Normalise year for better model performance
-  step_nzv(all_predictors())  # Remove near-zero variance predictors
+  step_dummy(gramstain, one_hot = TRUE) %>% # Convert categorical to numerical
+  step_normalize(year) %>% # Normalise year for better model performance
+  step_nzv(all_predictors()) # Remove near-zero variance predictors

 resistance_recipe_time
 ```
@@ -514,9 +536,11 @@ library(ggplot2)
 ggplot(predictions_time, aes(x = year)) +
  geom_point(aes(y = res_AMX, color = "Actual")) +
  geom_line(aes(y = .pred, color = "Predicted")) +
-  labs(title = "Predicted vs Actual AMX Resistance Over Time",
-       x = "Year",
-       y = "Resistance Proportion") +
+  labs(
+    title = "Predicted vs Actual AMX Resistance Over Time",
+    x = "Year",
+    y = "Resistance Proportion"
+  ) +
  theme_minimal()
 ```

@@ -525,13 +549,17 @@ Additionally, we can visualise resistance trends in `ggplot2` and directly add l
 ```{r}
 ggplot(data_time, aes(x = year, y = res_AMX, color = gramstain)) +
  geom_line() +
-  labs(title = "AMX Resistance Trends",
-       x = "Year",
-       y = "Resistance Proportion") +
+  labs(
+    title = "AMX Resistance Trends",
+    x = "Year",
+    y = "Resistance Proportion"
+  ) +
  # add a linear model directly in ggplot2:
-  geom_smooth(method = "lm",
-              formula = y ~ x,
-              alpha = 0.25) +
+  geom_smooth(
+    method = "lm",
+    formula = y ~ x,
+    alpha = 0.25
+  ) +
  theme_minimal()
 ```