1
0
mirror of https://github.com/msberends/AMR.git synced 2026-04-28 13:43:56 +02:00

Add add_if_missing parameter to control NA handling in interpretive rules (#264)

This commit is contained in:
Matthijs Berends
2026-04-21 21:53:43 +02:00
committed by GitHub
parent fb8758f36b
commit 8ff5d4472a
46 changed files with 1232 additions and 1016 deletions

View File

@@ -53,8 +53,8 @@ We begin by loading the required libraries and preparing the `example_isolates`
```{r lib packages, message = FALSE, warning = FALSE, results = 'asis'}
# Load required libraries
library(AMR) # For AMR data analysis
library(tidymodels) # For machine learning workflows, and data manipulation (dplyr, tidyr, ...)
library(AMR) # For AMR data analysis
library(tidymodels) # For machine learning workflows, and data manipulation (dplyr, tidyr, ...)
```
Prepare the data:
@@ -68,13 +68,19 @@ data <- example_isolates %>%
# select AB results dynamically
select(mo, aminoglycosides(), betalactams()) %>%
# replace NAs with NI (not-interpretable)
mutate(across(where(is.sir),
~replace_na(.x, "NI")),
# make factors of SIR columns
across(where(is.sir),
as.integer),
# get Gramstain of microorganisms
mo = as.factor(mo_gramstain(mo))) %>%
mutate(
across(
where(is.sir),
~ replace_na(.x, "NI")
),
# make factors of SIR columns
across(
where(is.sir),
as.integer
),
# get Gramstain of microorganisms
mo = as.factor(mo_gramstain(mo))
) %>%
# drop NAs - the ones without a Gramstain (fungi, etc.)
drop_na()
```
@@ -149,7 +155,7 @@ To train the model, we split the data into training and testing sets. Then, we f
set.seed(123) # For reproducibility
data_split <- initial_split(data, prop = 0.8) # 80% training, 20% testing
training_data <- training(data_split) # Training set
testing_data <- testing(data_split) # Testing set
testing_data <- testing(data_split) # Testing set
# Fit the workflow to the training data
fitted_workflow <- resistance_workflow %>%
@@ -168,7 +174,7 @@ Next, we evaluate the model on the testing data.
```{r}
# Make predictions on the testing set
predictions <- fitted_workflow %>%
predict(testing_data) # Generate predictions
predict(testing_data) # Generate predictions
probabilities <- fitted_workflow %>%
predict(testing_data, type = "prob") # Generate probabilities
@@ -266,8 +272,8 @@ testing_data <- testing(split)
# Define the recipe
mic_recipe <- recipe(esbl ~ ., data = training_data) %>%
remove_role(genus, old_role = "predictor") %>% # Remove non-informative variable
step_mic_log2(all_mic_predictors()) # Log2 transform all MIC predictors
remove_role(genus, old_role = "predictor") %>% # Remove non-informative variable
step_mic_log2(all_mic_predictors()) # Log2 transform all MIC predictors
prep(mic_recipe)
```
@@ -341,9 +347,11 @@ library(ggplot2)
ggplot(predictions, aes(x = esbl, fill = .pred_class)) +
geom_bar(position = "stack") +
labs(title = "Predicted vs Actual ESBL Status",
x = "Actual ESBL",
y = "Count") +
labs(
title = "Predicted vs Actual ESBL Status",
x = "Actual ESBL",
y = "Count"
) +
theme_minimal()
```
@@ -351,18 +359,27 @@ And plot the certainties too - how certain were the actual predictions?
```{r}
predictions %>%
mutate(certainty = ifelse(.pred_class == "FALSE",
.pred_FALSE,
.pred_TRUE),
correct = ifelse(esbl == .pred_class, "Right", "Wrong")) %>%
ggplot(aes(x = seq_len(nrow(predictions)),
y = certainty,
colour = correct)) +
scale_colour_manual(values = c(Right = "green3", Wrong = "red2"),
name = "Correct?") +
mutate(
certainty = ifelse(.pred_class == "FALSE",
.pred_FALSE,
.pred_TRUE
),
correct = ifelse(esbl == .pred_class, "Right", "Wrong")
) %>%
ggplot(aes(
x = seq_len(nrow(predictions)),
y = certainty,
colour = correct
)) +
scale_colour_manual(
values = c(Right = "green3", Wrong = "red2"),
name = "Correct?"
) +
geom_point() +
scale_y_continuous(labels = function(x) paste0(x * 100, "%"),
limits = c(0.5, 1)) +
scale_y_continuous(
labels = function(x) paste0(x * 100, "%"),
limits = c(0.5, 1)
) +
theme_minimal()
```
@@ -399,13 +416,18 @@ library(tidymodels)
# Transform dataset
data_time <- example_isolates %>%
top_n_microorganisms(n = 10) %>% # Filter on the top #10 species
mutate(year = as.integer(format(date, "%Y")), # Extract year from date
gramstain = mo_gramstain(mo)) %>% # Get taxonomic names
mutate(
year = as.integer(format(date, "%Y")), # Extract year from date
gramstain = mo_gramstain(mo)
) %>% # Get taxonomic names
group_by(year, gramstain) %>%
summarise(across(c(AMX, AMC, CIP),
function(x) resistance(x, minimum = 0),
.names = "res_{.col}"),
.groups = "drop") %>%
summarise(
across(c(AMX, AMC, CIP),
function(x) resistance(x, minimum = 0),
.names = "res_{.col}"
),
.groups = "drop"
) %>%
filter(!is.na(res_AMX) & !is.na(res_AMC) & !is.na(res_CIP)) # Drop missing values
data_time
@@ -426,9 +448,9 @@ We now define the modelling workflow, which consists of a preprocessing step, a
```{r}
# Define the recipe
resistance_recipe_time <- recipe(res_AMX ~ year + gramstain, data = data_time) %>%
step_dummy(gramstain, one_hot = TRUE) %>% # Convert categorical to numerical
step_normalize(year) %>% # Normalise year for better model performance
step_nzv(all_predictors()) # Remove near-zero variance predictors
step_dummy(gramstain, one_hot = TRUE) %>% # Convert categorical to numerical
step_normalize(year) %>% # Normalise year for better model performance
step_nzv(all_predictors()) # Remove near-zero variance predictors
resistance_recipe_time
```
@@ -514,9 +536,11 @@ library(ggplot2)
ggplot(predictions_time, aes(x = year)) +
geom_point(aes(y = res_AMX, color = "Actual")) +
geom_line(aes(y = .pred, color = "Predicted")) +
labs(title = "Predicted vs Actual AMX Resistance Over Time",
x = "Year",
y = "Resistance Proportion") +
labs(
title = "Predicted vs Actual AMX Resistance Over Time",
x = "Year",
y = "Resistance Proportion"
) +
theme_minimal()
```
@@ -525,13 +549,17 @@ Additionally, we can visualise resistance trends in `ggplot2` and directly add l
```{r}
ggplot(data_time, aes(x = year, y = res_AMX, color = gramstain)) +
geom_line() +
labs(title = "AMX Resistance Trends",
x = "Year",
y = "Resistance Proportion") +
labs(
title = "AMX Resistance Trends",
x = "Year",
y = "Resistance Proportion"
) +
# add a linear model directly in ggplot2:
geom_smooth(method = "lm",
formula = y ~ x,
alpha = 0.25) +
geom_smooth(
method = "lm",
formula = y ~ x,
alpha = 0.25
) +
theme_minimal()
```