mirror of
https://github.com/msberends/AMR.git
synced 2026-04-28 13:43:56 +02:00
Add add_if_missing parameter to control NA handling in interpretive rules (#264)
This commit is contained in:
@@ -53,8 +53,8 @@ We begin by loading the required libraries and preparing the `example_isolates`
|
||||
|
||||
```{r lib packages, message = FALSE, warning = FALSE, results = 'asis'}
|
||||
# Load required libraries
|
||||
library(AMR) # For AMR data analysis
|
||||
library(tidymodels) # For machine learning workflows, and data manipulation (dplyr, tidyr, ...)
|
||||
library(AMR) # For AMR data analysis
|
||||
library(tidymodels) # For machine learning workflows, and data manipulation (dplyr, tidyr, ...)
|
||||
```
|
||||
|
||||
Prepare the data:
|
||||
@@ -68,13 +68,19 @@ data <- example_isolates %>%
|
||||
# select AB results dynamically
|
||||
select(mo, aminoglycosides(), betalactams()) %>%
|
||||
# replace NAs with NI (not-interpretable)
|
||||
mutate(across(where(is.sir),
|
||||
~replace_na(.x, "NI")),
|
||||
# make factors of SIR columns
|
||||
across(where(is.sir),
|
||||
as.integer),
|
||||
# get Gramstain of microorganisms
|
||||
mo = as.factor(mo_gramstain(mo))) %>%
|
||||
mutate(
|
||||
across(
|
||||
where(is.sir),
|
||||
~ replace_na(.x, "NI")
|
||||
),
|
||||
# make factors of SIR columns
|
||||
across(
|
||||
where(is.sir),
|
||||
as.integer
|
||||
),
|
||||
# get Gramstain of microorganisms
|
||||
mo = as.factor(mo_gramstain(mo))
|
||||
) %>%
|
||||
# drop NAs - the ones without a Gramstain (fungi, etc.)
|
||||
drop_na()
|
||||
```
|
||||
@@ -149,7 +155,7 @@ To train the model, we split the data into training and testing sets. Then, we f
|
||||
set.seed(123) # For reproducibility
|
||||
data_split <- initial_split(data, prop = 0.8) # 80% training, 20% testing
|
||||
training_data <- training(data_split) # Training set
|
||||
testing_data <- testing(data_split) # Testing set
|
||||
testing_data <- testing(data_split) # Testing set
|
||||
|
||||
# Fit the workflow to the training data
|
||||
fitted_workflow <- resistance_workflow %>%
|
||||
@@ -168,7 +174,7 @@ Next, we evaluate the model on the testing data.
|
||||
```{r}
|
||||
# Make predictions on the testing set
|
||||
predictions <- fitted_workflow %>%
|
||||
predict(testing_data) # Generate predictions
|
||||
predict(testing_data) # Generate predictions
|
||||
probabilities <- fitted_workflow %>%
|
||||
predict(testing_data, type = "prob") # Generate probabilities
|
||||
|
||||
@@ -266,8 +272,8 @@ testing_data <- testing(split)
|
||||
|
||||
# Define the recipe
|
||||
mic_recipe <- recipe(esbl ~ ., data = training_data) %>%
|
||||
remove_role(genus, old_role = "predictor") %>% # Remove non-informative variable
|
||||
step_mic_log2(all_mic_predictors()) # Log2 transform all MIC predictors
|
||||
remove_role(genus, old_role = "predictor") %>% # Remove non-informative variable
|
||||
step_mic_log2(all_mic_predictors()) # Log2 transform all MIC predictors
|
||||
|
||||
prep(mic_recipe)
|
||||
```
|
||||
@@ -341,9 +347,11 @@ library(ggplot2)
|
||||
|
||||
ggplot(predictions, aes(x = esbl, fill = .pred_class)) +
|
||||
geom_bar(position = "stack") +
|
||||
labs(title = "Predicted vs Actual ESBL Status",
|
||||
x = "Actual ESBL",
|
||||
y = "Count") +
|
||||
labs(
|
||||
title = "Predicted vs Actual ESBL Status",
|
||||
x = "Actual ESBL",
|
||||
y = "Count"
|
||||
) +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
@@ -351,18 +359,27 @@ And plot the certainties too - how certain were the actual predictions?
|
||||
|
||||
```{r}
|
||||
predictions %>%
|
||||
mutate(certainty = ifelse(.pred_class == "FALSE",
|
||||
.pred_FALSE,
|
||||
.pred_TRUE),
|
||||
correct = ifelse(esbl == .pred_class, "Right", "Wrong")) %>%
|
||||
ggplot(aes(x = seq_len(nrow(predictions)),
|
||||
y = certainty,
|
||||
colour = correct)) +
|
||||
scale_colour_manual(values = c(Right = "green3", Wrong = "red2"),
|
||||
name = "Correct?") +
|
||||
mutate(
|
||||
certainty = ifelse(.pred_class == "FALSE",
|
||||
.pred_FALSE,
|
||||
.pred_TRUE
|
||||
),
|
||||
correct = ifelse(esbl == .pred_class, "Right", "Wrong")
|
||||
) %>%
|
||||
ggplot(aes(
|
||||
x = seq_len(nrow(predictions)),
|
||||
y = certainty,
|
||||
colour = correct
|
||||
)) +
|
||||
scale_colour_manual(
|
||||
values = c(Right = "green3", Wrong = "red2"),
|
||||
name = "Correct?"
|
||||
) +
|
||||
geom_point() +
|
||||
scale_y_continuous(labels = function(x) paste0(x * 100, "%"),
|
||||
limits = c(0.5, 1)) +
|
||||
scale_y_continuous(
|
||||
labels = function(x) paste0(x * 100, "%"),
|
||||
limits = c(0.5, 1)
|
||||
) +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
@@ -399,13 +416,18 @@ library(tidymodels)
|
||||
# Transform dataset
|
||||
data_time <- example_isolates %>%
|
||||
top_n_microorganisms(n = 10) %>% # Filter on the top #10 species
|
||||
mutate(year = as.integer(format(date, "%Y")), # Extract year from date
|
||||
gramstain = mo_gramstain(mo)) %>% # Get taxonomic names
|
||||
mutate(
|
||||
year = as.integer(format(date, "%Y")), # Extract year from date
|
||||
gramstain = mo_gramstain(mo)
|
||||
) %>% # Get taxonomic names
|
||||
group_by(year, gramstain) %>%
|
||||
summarise(across(c(AMX, AMC, CIP),
|
||||
function(x) resistance(x, minimum = 0),
|
||||
.names = "res_{.col}"),
|
||||
.groups = "drop") %>%
|
||||
summarise(
|
||||
across(c(AMX, AMC, CIP),
|
||||
function(x) resistance(x, minimum = 0),
|
||||
.names = "res_{.col}"
|
||||
),
|
||||
.groups = "drop"
|
||||
) %>%
|
||||
filter(!is.na(res_AMX) & !is.na(res_AMC) & !is.na(res_CIP)) # Drop missing values
|
||||
|
||||
data_time
|
||||
@@ -426,9 +448,9 @@ We now define the modelling workflow, which consists of a preprocessing step, a
|
||||
```{r}
|
||||
# Define the recipe
|
||||
resistance_recipe_time <- recipe(res_AMX ~ year + gramstain, data = data_time) %>%
|
||||
step_dummy(gramstain, one_hot = TRUE) %>% # Convert categorical to numerical
|
||||
step_normalize(year) %>% # Normalise year for better model performance
|
||||
step_nzv(all_predictors()) # Remove near-zero variance predictors
|
||||
step_dummy(gramstain, one_hot = TRUE) %>% # Convert categorical to numerical
|
||||
step_normalize(year) %>% # Normalise year for better model performance
|
||||
step_nzv(all_predictors()) # Remove near-zero variance predictors
|
||||
|
||||
resistance_recipe_time
|
||||
```
|
||||
@@ -514,9 +536,11 @@ library(ggplot2)
|
||||
ggplot(predictions_time, aes(x = year)) +
|
||||
geom_point(aes(y = res_AMX, color = "Actual")) +
|
||||
geom_line(aes(y = .pred, color = "Predicted")) +
|
||||
labs(title = "Predicted vs Actual AMX Resistance Over Time",
|
||||
x = "Year",
|
||||
y = "Resistance Proportion") +
|
||||
labs(
|
||||
title = "Predicted vs Actual AMX Resistance Over Time",
|
||||
x = "Year",
|
||||
y = "Resistance Proportion"
|
||||
) +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
@@ -525,13 +549,17 @@ Additionally, we can visualise resistance trends in `ggplot2` and directly add l
|
||||
```{r}
|
||||
ggplot(data_time, aes(x = year, y = res_AMX, color = gramstain)) +
|
||||
geom_line() +
|
||||
labs(title = "AMX Resistance Trends",
|
||||
x = "Year",
|
||||
y = "Resistance Proportion") +
|
||||
labs(
|
||||
title = "AMX Resistance Trends",
|
||||
x = "Year",
|
||||
y = "Resistance Proportion"
|
||||
) +
|
||||
# add a linear model directly in ggplot2:
|
||||
geom_smooth(method = "lm",
|
||||
formula = y ~ x,
|
||||
alpha = 0.25) +
|
||||
geom_smooth(
|
||||
method = "lm",
|
||||
formula = y ~ x,
|
||||
alpha = 0.25
|
||||
) +
|
||||
theme_minimal()
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user