1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-12 16:11:37 +01:00
AMR/data-raw/read_EUCAST.R

309 lines
11 KiB
R
Raw Normal View History

2020-02-14 19:54:13 +01:00
# ==================================================================== #
# TITLE #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2020-02-14 19:54:13 +01:00
# #
# SOURCE #
# https://github.com/msberends/AMR #
2020-02-14 19:54:13 +01:00
# #
2022-10-05 09:12:22 +02:00
# CITE AS #
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
# doi:10.18637/jss.v104.i03 #
# #
2022-12-27 15:16:15 +01:00
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
2020-02-14 19:54:13 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2020-02-14 19:54:13 +01:00
# ==================================================================== #
library(openxlsx)
library(dplyr)
library(tidyr)
2020-02-14 19:54:13 +01:00
library(cleaner)
library(AMR)
2020-02-17 14:38:01 +01:00
# USE THIS FUNCTION TO READ THE EUCAST EXCEL FILE THAT CONTAINS THE BREAKPOINT TABLES
2022-08-28 10:31:50 +02:00
read_EUCAST <- function(sheet, file, guideline_name) {
message("\nGetting sheet: ", sheet)
2020-02-14 19:54:13 +01:00
sheet.bak <- sheet
2022-08-28 10:31:50 +02:00
uncertainties <- NULL
add_uncertainties <- function(old, new) {
if (is.null(old)) {
new
} else {
bind_rows(old, new)
}
}
2022-08-28 10:31:50 +02:00
raw_data <- read.xlsx(
xlsxFile = file,
sheet = sheet,
colNames = FALSE,
skipEmptyRows = FALSE,
skipEmptyCols = FALSE,
fillMergedCells = TRUE,
na.strings = c("", "-", "NA", "IE", "IP")
)
probable_rows <- suppressWarnings(raw_data %>% mutate_all(as.double) %>% summarise_all(~ sum(!is.na(.))) %>% unlist() %>% max())
if (probable_rows == 0) {
message("NO ROWS FOUND")
message("------------------------")
return(NULL)
}
2022-08-28 10:31:50 +02:00
2020-02-14 19:54:13 +01:00
# in the info header in the Excel file, EUCAST mentions which genera are targeted
if (sheet %like% "anaerob.*Gram.*posi") {
2023-01-23 15:01:21 +01:00
sheet <- paste0(
c(
"Actinomyces", "Bifidobacterium", "Clostridioides",
"Clostridium", "Cutibacterium", "Eggerthella",
"Eubacterium", "Lactobacillus", "Propionibacterium",
"Staphylococcus saccharolyticus"
),
collapse = "_"
2022-08-28 10:31:50 +02:00
)
2020-02-14 19:54:13 +01:00
} else if (sheet %like% "anaerob.*Gram.*nega") {
2023-01-23 15:01:21 +01:00
sheet <- paste0(
c(
"Bacteroides",
"Bilophila",
"Fusobacterium",
"Mobiluncus",
"Parabacteroides",
"Porphyromonas",
"Prevotella"
),
collapse = "_"
2022-08-28 10:31:50 +02:00
)
2020-02-14 19:54:13 +01:00
} else if (sheet == "Streptococcus A,B,C,G") {
2023-01-23 15:01:21 +01:00
sheet <- paste0(
microorganisms %>%
filter(genus == "Streptococcus") %>%
mutate(lancefield = mo_name(mo, Lancefield = TRUE)) %>%
filter(lancefield %like% "^Streptococcus group") %>%
pull(fullname),
collapse = "_"
2022-08-28 10:31:50 +02:00
)
2020-02-14 19:54:13 +01:00
} else if (sheet %like% "PK.*PD") {
sheet <- "UNKNOWN"
}
mo_sheet <- paste0(suppressMessages(as.mo(unlist(strsplit(sheet, "_")))), collapse = "|")
if (!is.null(mo_uncertainties())) uncertainties <- add_uncertainties(uncertainties, mo_uncertainties())
2022-08-28 10:31:50 +02:00
2020-02-14 19:54:13 +01:00
set_columns_names <- function(x, cols) {
colnames(x) <- cols[1:length(colnames(x))]
x
}
2022-08-28 10:31:50 +02:00
2020-02-14 19:54:13 +01:00
get_mo <- function(x) {
for (i in seq_len(length(x))) {
2022-10-05 09:12:22 +02:00
y <- trimws2(unlist(strsplit(x[i], "(,|and)")))
y <- trimws2(gsub("[(].*[)]", "", y))
y <- suppressWarnings(suppressMessages(as.mo(y)))
if (!is.null(mo_uncertainties())) uncertainties <<- add_uncertainties(uncertainties, mo_uncertainties())
2020-02-14 19:54:13 +01:00
y <- y[!is.na(y) & y != "UNKNOWN"]
x[i] <- paste(y, collapse = "|")
}
x
}
2022-08-28 10:31:50 +02:00
MICs_with_trailing_superscript <- c(
seq(from = 0.0011, to = 0.0019, by = 0.0001),
seq(from = 0.031, to = 0.039, by = 0.001),
seq(from = 0.061, to = 0.069, by = 0.001),
seq(from = 0.1251, to = 0.1259, by = 0.0001),
seq(from = 0.251, to = 0.259, by = 0.001),
seq(from = 0.51, to = 0.59, by = 0.01),
seq(from = 11, to = 19, by = 1),
seq(from = 161, to = 169, by = 01),
seq(from = 21, to = 29, by = 1),
seq(from = 321, to = 329, by = 1),
seq(from = 41, to = 49, by = 1),
seq(from = 81, to = 89, by = 1)
)
2020-02-20 13:19:23 +01:00
has_superscript <- function(x) {
2022-08-28 10:31:50 +02:00
# because due to floating point error, 0.1252 is not in:
2020-02-20 13:19:23 +01:00
# seq(from = 0.1251, to = 0.1259, by = 0.0001)
sapply(x, function(x) any(near(x, MICs_with_trailing_superscript)))
}
2022-08-28 10:31:50 +02:00
2020-02-14 19:54:13 +01:00
has_zone_diameters <- rep(any(unlist(raw_data) %like% "zone diameter"), nrow(raw_data))
2022-08-28 10:31:50 +02:00
cleaned <- raw_data %>%
as_tibble() %>%
set_columns_names(LETTERS) %>%
transmute(
drug = A,
MIC_S = B,
MIC_R = C,
disk_dose = ifelse(has_zone_diameters, E, NA_character_),
disk_S = ifelse(has_zone_diameters, `F`, NA_character_),
disk_R = ifelse(has_zone_diameters, G, NA_character_)
) %>%
filter(
!is.na(drug),
!(is.na(MIC_S) & is.na(MIC_R) & is.na(disk_S) & is.na(disk_R)),
MIC_S %unlike% "(MIC|S ≤|note)",
MIC_S %unlike% "^[-]",
drug != MIC_S,
) %>%
mutate(
administration = case_when(
drug %like% "[( ]oral" ~ "oral",
drug %like% "[( ]iv" ~ "iv",
TRUE ~ NA_character_
),
uti = ifelse(drug %like% "(UTI|urinary|urine)", TRUE, FALSE),
systemic = ifelse(drug %like% "(systemic|septic)", TRUE, FALSE),
mo = ifelse(drug %like% "([.]|spp)", get_mo(drug), mo_sheet)
) %>%
2020-02-14 19:54:13 +01:00
# clean disk doses
2022-08-28 10:31:50 +02:00
mutate(disk_dose = clean_character(disk_dose, remove = "[^0-9.-]")) %>%
2020-02-14 19:54:13 +01:00
# clean MIC and disk values
2022-08-28 10:31:50 +02:00
mutate(
MIC_S = gsub(".,.", "", MIC_S), # remove superscript notes with comma, like 0.5^2,3
MIC_R = gsub(".,.", "", MIC_R),
disk_S = gsub(".,.", "", disk_S),
disk_R = gsub(".,.", "", disk_R),
MIC_S = clean_double(MIC_S), # make them valid numeric values
MIC_R = clean_double(MIC_R),
disk_S = clean_integer(disk_S),
disk_R = clean_integer(disk_R),
# invalid MIC values have a superscript text, delete those
MIC_S = ifelse(has_superscript(MIC_S),
substr(MIC_S, 1, nchar(MIC_S) - 1),
MIC_S
),
MIC_R = ifelse(has_superscript(MIC_R),
substr(MIC_R, 1, nchar(MIC_R) - 1),
MIC_R
),
# and some are just awful
MIC_S = ifelse(MIC_S == 43.4, 4, MIC_S),
MIC_R = ifelse(MIC_R == 43.4, 4, MIC_R),
) %>%
2020-02-14 19:54:13 +01:00
# clean drug names
2022-08-28 10:31:50 +02:00
mutate(
drug = gsub(" ?[(, ].*$", "", drug),
drug = gsub("[1-9]+$", "", drug),
ab = as.ab(drug)
) %>%
select(ab, mo, everything(), -drug) %>%
as.data.frame(stringsAsFactors = FALSE)
2022-08-28 10:31:50 +02:00
2020-02-14 19:54:13 +01:00
# new row for every different MO mentioned
for (i in 1:nrow(cleaned)) {
mo <- cleaned[i, "mo", drop = TRUE]
if (grepl(pattern = "|", mo, fixed = TRUE)) {
mo_vect <- unlist(strsplit(mo, "|", fixed = TRUE))
cleaned[i, "mo"] <- mo_vect[1]
for (j in seq_len(length(mo_vect))) {
2022-08-28 10:31:50 +02:00
cleaned <- bind_rows(cleaned, cleaned[i, , drop = FALSE])
2020-02-14 19:54:13 +01:00
cleaned[nrow(cleaned), "mo"] <- mo_vect[j]
}
}
}
2022-08-28 10:31:50 +02:00
cleaned <- cleaned %>%
distinct(ab, mo, administration, uti, systemic, .keep_all = TRUE) %>%
arrange(ab, mo) %>%
2020-02-14 19:54:13 +01:00
mutate_at(c("MIC_S", "MIC_R", "disk_S", "disk_R"), as.double) %>%
2022-08-28 10:31:50 +02:00
pivot_longer(c("MIC_S", "MIC_R", "disk_S", "disk_R"), "type") %>%
mutate(
method = ifelse(type %like% "MIC", "MIC", "DISK"),
type = gsub("^.*_", "breakpoint_", type)
) %>%
pivot_wider(names_from = type, values_from = value) %>%
mutate(
guideline = guideline_name,
disk_dose = ifelse(method == "DISK", disk_dose, NA_character_),
mo = ifelse(mo == "", mo_sheet, mo)
) %>%
filter(!(is.na(breakpoint_S) & is.na(breakpoint_R))) %>%
2023-01-21 23:47:20 +01:00
# comply with clinical_breakpoints for now
2020-02-14 19:54:13 +01:00
transmute(guideline,
2022-08-28 10:31:50 +02:00
method,
site = case_when(
uti ~ "UTI",
systemic ~ "Systemic",
TRUE ~ administration
),
mo, ab,
ref_tbl = sheet.bak,
disk_dose = ifelse(!is.na(disk_dose), paste0(disk_dose, "ug"), NA_character_),
breakpoint_S,
breakpoint_R
) %>%
as.data.frame(stringsAsFactors = FALSE)
2022-08-28 10:31:50 +02:00
if (!is.null(uncertainties)) {
print(uncertainties %>% distinct(input, mo, .keep_all = TRUE))
}
message("Estimated: ", probable_rows, ", gained: ", cleaned %>% count(ab) %>% nrow())
message("------------------------")
2020-02-14 19:54:13 +01:00
cleaned
}
# Actual import -----------------------------------------------------------
file <- "data-raw/v_11.0_Breakpoint_Tables.xlsx"
sheets <- readxl::excel_sheets(file)
guideline_name <- "EUCAST 2021"
sheets_to_analyse <- sheets[!sheets %in% c("Content", "Changes", "Notes", "Guidance", "Dosages", "Technical uncertainty", "Topical agents")]
2020-04-14 14:12:31 +02:00
# takes the longest time:
2022-08-28 10:31:50 +02:00
new_EUCAST <- read_EUCAST(
sheet = sheets_to_analyse[1],
file = file,
guideline_name = guideline_name
)
2020-02-14 19:54:13 +01:00
for (i in 2:length(sheets_to_analyse)) {
2020-04-14 14:12:31 +02:00
tryCatch(
2022-08-28 10:31:50 +02:00
new_EUCAST <<- bind_rows(
new_EUCAST,
read_EUCAST(
sheet = sheets_to_analyse[i],
file = file,
guideline_name = guideline_name
)
),
error = function(e) message(e$message)
)
2020-02-14 19:54:13 +01:00
}
# 2021-07-12 fix for Morganellaceae (check other lines too next time)
2023-01-21 23:47:20 +01:00
morg <- clinical_breakpoints %>%
as_tibble() %>%
2022-08-28 10:31:50 +02:00
filter(
ab == "IPM",
guideline == "EUCAST 2021",
mo == as.mo("Enterobacterales")
) %>%
mutate(mo = as.mo("Morganellaceae"))
morg[which(morg$method == "MIC"), "breakpoint_S"] <- 0.001
morg[which(morg$method == "MIC"), "breakpoint_R"] <- 4
morg[which(morg$method == "DISK"), "breakpoint_S"] <- 50
morg[which(morg$method == "DISK"), "breakpoint_R"] <- 19
2023-01-21 23:47:20 +01:00
clinical_breakpoints <- clinical_breakpoints %>%
bind_rows(morg) %>%
bind_rows(morg %>%
2022-08-28 10:31:50 +02:00
mutate(guideline = "EUCAST 2020")) %>%
arrange(desc(guideline), ab, mo, method)