1
0
mirror of https://github.com/msberends/AMR.git synced 2025-08-27 01:02:20 +02:00
Files
.github
PythonPackage
R
data
data-raw
AMR intro.jpg
AMR new logo.txt
AMR_intro.ai
AMR_vet.html
AMR_vet.qmd
AMRforRGPT.pdf
AMRforRGPT.pptx
AMRforRGPT.svg
AMRforRGPT_python.pdf
AMRforRGPT_python.pptx
Dosages_v_11.0_Breakpoint_Tables.xlsx
Dosages_v_12.0_Breakpoint_Tables.pdf
Dosages_v_12.0_Breakpoint_Tables.xlsx
Dosages_v_13.0_Breakpoint_Tables.pdf
Dosages_v_13.0_Breakpoint_Tables.xlsx
Intrinsic_Resistance_and_Unusual_Phenotypes_Tables_v3.3_20211018.pdf
Loinc.csv
Preference animal species groups.xlsx
SNOMED_PHVS_Microorganism_CDC_V12.txt
Species groups.xlsx
_generate_GPT_knowledge_input.sh
_generate_python_wrapper.sh
_language_update.R
_pre_commit_checks.R
ab.md5
antibiotics.dta
antibiotics.feather
antibiotics.parquet
antibiotics.rds
antibiotics.sav
antibiotics.txt
antibiotics.xlsx
antivirals.dta
antivirals.feather
antivirals.parquet
antivirals.rds
antivirals.sav
antivirals.txt
antivirals.xlsx
av.md5
bacdive.csv
bartlett_et_al_2022_human_pathogens.xlsx
clin_break.md5
clinical_breakpoints.dta
clinical_breakpoints.feather
clinical_breakpoints.parquet
clinical_breakpoints.rds
clinical_breakpoints.sav
clinical_breakpoints.txt
clinical_breakpoints.xlsx
create_survey_page.R
dosage.dta
dosage.feather
dosage.md5
dosage.parquet
dosage.rds
dosage.sav
dosage.txt
dosage.xlsx
eucast_rules.tsv
extractATCs.R
gpt_training_text_v2.1.1.9163.txt
intrinsicR.md5
intrinsic_resistant.dta
intrinsic_resistant.feather
intrinsic_resistant.md5
intrinsic_resistant.parquet
intrinsic_resistant.rds
intrinsic_resistant.sav
intrinsic_resistant.txt
intrinsic_resistant.xlsx
logo.old.svg
logo.svg
loinc.R
microorganisms.codes.dta
microorganisms.codes.feather
microorganisms.codes.md5
microorganisms.codes.parquet
microorganisms.codes.rds
microorganisms.codes.sav
microorganisms.codes.txt
microorganisms.codes.xlsx
microorganisms.dta
microorganisms.feather
microorganisms.groups.dta
microorganisms.groups.feather
microorganisms.groups.md5
microorganisms.groups.parquet
microorganisms.groups.rds
microorganisms.groups.sav
microorganisms.groups.txt
microorganisms.groups.xlsx
microorganisms.md5
microorganisms.parquet
microorganisms.rds
microorganisms.sav
microorganisms.txt
microorganisms.xlsx
mo.md5
organisms.rds
poorman_prepend.R
read_EUCAST.R
reproduction_of_antibiotics.R
reproduction_of_antivirals.R
reproduction_of_clinical_breakpoints.R
reproduction_of_dosage.R
reproduction_of_example_isolates_unclean.R
reproduction_of_intrinsic_resistant.R
reproduction_of_microorganisms.R
reproduction_of_microorganisms.groups.R
reproduction_of_poorman.R
salmonellae.R
salmonellae.rds
salonella_fix.R
species groups.R
taxonomy0.rds
taxonomy1.rds
taxonomy1b.rds
taxonomy1c.rds
taxonomy2.rds
taxonomy2b.rds
taxonomy2c.rds
taxonomy3.rds
taxonomy3b.rds
taxonomy_gbif.rds
taxonomy_lpsn.rds
taxonomy_lpsn_missing.rds
taxonomy_mycobank.rds
toxoplasma.R
translations.tsv
v_10.0_Breakpoint_Tables.xlsx
v_11.0_Breakpoint_Tables.xlsx
v_12.0_Breakpoint_Tables.xlsx
v_13.0_Breakpoint_Tables.xlsx
whonet_bugfixes.R
wisca.xlsx
wisca_params.xlsx
inst
man
pkgdown
tests
vignettes
.Rbuildignore
.gitignore
AMR.Rproj
CRAN-SUBMISSION
DESCRIPTION
LICENSE
NAMESPACE
NEWS.md
README.md
_pkgdown.yml
codecov.yml
cran-comments.md
index.md
logo.svg
AMR/data-raw/read_EUCAST.R

309 lines
11 KiB
R

# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
# ==================================================================== #
library(openxlsx)
library(dplyr)
library(tidyr)
library(cleaner)
library(AMR)
# USE THIS FUNCTION TO READ THE EUCAST EXCEL FILE THAT CONTAINS THE BREAKPOINT TABLES
read_EUCAST <- function(sheet, file, guideline_name) {
message("\nGetting sheet: ", sheet)
sheet.bak <- sheet
uncertainties <- NULL
add_uncertainties <- function(old, new) {
if (is.null(old)) {
new
} else {
bind_rows(old, new)
}
}
raw_data <- read.xlsx(
xlsxFile = file,
sheet = sheet,
colNames = FALSE,
skipEmptyRows = FALSE,
skipEmptyCols = FALSE,
fillMergedCells = TRUE,
na.strings = c("", "-", "NA", "IE", "IP")
)
probable_rows <- suppressWarnings(raw_data %>% mutate_all(as.double) %>% summarise_all(~ sum(!is.na(.))) %>% unlist() %>% max())
if (probable_rows == 0) {
message("NO ROWS FOUND")
message("------------------------")
return(NULL)
}
# in the info header in the Excel file, EUCAST mentions which genera are targeted
if (sheet %like% "anaerob.*Gram.*posi") {
sheet <- paste0(
c(
"Actinomyces", "Bifidobacterium", "Clostridioides",
"Clostridium", "Cutibacterium", "Eggerthella",
"Eubacterium", "Lactobacillus", "Propionibacterium",
"Staphylococcus saccharolyticus"
),
collapse = "_"
)
} else if (sheet %like% "anaerob.*Gram.*nega") {
sheet <- paste0(
c(
"Bacteroides",
"Bilophila",
"Fusobacterium",
"Mobiluncus",
"Parabacteroides",
"Porphyromonas",
"Prevotella"
),
collapse = "_"
)
} else if (sheet == "Streptococcus A,B,C,G") {
sheet <- paste0(
microorganisms %>%
filter(genus == "Streptococcus") %>%
mutate(lancefield = mo_name(mo, Lancefield = TRUE)) %>%
filter(lancefield %like% "^Streptococcus group") %>%
pull(fullname),
collapse = "_"
)
} else if (sheet %like% "PK.*PD") {
sheet <- "UNKNOWN"
}
mo_sheet <- paste0(suppressMessages(as.mo(unlist(strsplit(sheet, "_")))), collapse = "|")
if (!is.null(mo_uncertainties())) uncertainties <- add_uncertainties(uncertainties, mo_uncertainties())
set_columns_names <- function(x, cols) {
colnames(x) <- cols[1:length(colnames(x))]
x
}
get_mo <- function(x) {
for (i in seq_len(length(x))) {
y <- trimws2(unlist(strsplit(x[i], "(,|and)")))
y <- trimws2(gsub("[(].*[)]", "", y))
y <- suppressWarnings(suppressMessages(as.mo(y)))
if (!is.null(mo_uncertainties())) uncertainties <<- add_uncertainties(uncertainties, mo_uncertainties())
y <- y[!is.na(y) & y != "UNKNOWN"]
x[i] <- paste(y, collapse = "|")
}
x
}
MICs_with_trailing_superscript <- c(
seq(from = 0.0011, to = 0.0019, by = 0.0001),
seq(from = 0.031, to = 0.039, by = 0.001),
seq(from = 0.061, to = 0.069, by = 0.001),
seq(from = 0.1251, to = 0.1259, by = 0.0001),
seq(from = 0.251, to = 0.259, by = 0.001),
seq(from = 0.51, to = 0.59, by = 0.01),
seq(from = 11, to = 19, by = 1),
seq(from = 161, to = 169, by = 01),
seq(from = 21, to = 29, by = 1),
seq(from = 321, to = 329, by = 1),
seq(from = 41, to = 49, by = 1),
seq(from = 81, to = 89, by = 1)
)
has_superscript <- function(x) {
# because due to floating point error, 0.1252 is not in:
# seq(from = 0.1251, to = 0.1259, by = 0.0001)
sapply(x, function(x) any(near(x, MICs_with_trailing_superscript)))
}
has_zone_diameters <- rep(any(unlist(raw_data) %like% "zone diameter"), nrow(raw_data))
cleaned <- raw_data %>%
as_tibble() %>%
set_columns_names(LETTERS) %>%
transmute(
drug = A,
MIC_S = B,
MIC_R = C,
disk_dose = ifelse(has_zone_diameters, E, NA_character_),
disk_S = ifelse(has_zone_diameters, `F`, NA_character_),
disk_R = ifelse(has_zone_diameters, G, NA_character_)
) %>%
filter(
!is.na(drug),
!(is.na(MIC_S) & is.na(MIC_R) & is.na(disk_S) & is.na(disk_R)),
MIC_S %unlike% "(MIC|S ≤|note)",
MIC_S %unlike% "^[-]",
drug != MIC_S,
) %>%
mutate(
administration = case_when(
drug %like% "[( ]oral" ~ "oral",
drug %like% "[( ]iv" ~ "iv",
TRUE ~ NA_character_
),
uti = ifelse(drug %like% "(UTI|urinary|urine)", TRUE, FALSE),
systemic = ifelse(drug %like% "(systemic|septic)", TRUE, FALSE),
mo = ifelse(drug %like% "([.]|spp)", get_mo(drug), mo_sheet)
) %>%
# clean disk doses
mutate(disk_dose = clean_character(disk_dose, remove = "[^0-9.-]")) %>%
# clean MIC and disk values
mutate(
MIC_S = gsub(".,.", "", MIC_S), # remove superscript notes with comma, like 0.5^2,3
MIC_R = gsub(".,.", "", MIC_R),
disk_S = gsub(".,.", "", disk_S),
disk_R = gsub(".,.", "", disk_R),
MIC_S = clean_double(MIC_S), # make them valid numeric values
MIC_R = clean_double(MIC_R),
disk_S = clean_integer(disk_S),
disk_R = clean_integer(disk_R),
# invalid MIC values have a superscript text, delete those
MIC_S = ifelse(has_superscript(MIC_S),
substr(MIC_S, 1, nchar(MIC_S) - 1),
MIC_S
),
MIC_R = ifelse(has_superscript(MIC_R),
substr(MIC_R, 1, nchar(MIC_R) - 1),
MIC_R
),
# and some are just awful
MIC_S = ifelse(MIC_S == 43.4, 4, MIC_S),
MIC_R = ifelse(MIC_R == 43.4, 4, MIC_R),
) %>%
# clean drug names
mutate(
drug = gsub(" ?[(, ].*$", "", drug),
drug = gsub("[1-9]+$", "", drug),
ab = as.ab(drug)
) %>%
select(ab, mo, everything(), -drug) %>%
as.data.frame(stringsAsFactors = FALSE)
# new row for every different MO mentioned
for (i in 1:nrow(cleaned)) {
mo <- cleaned[i, "mo", drop = TRUE]
if (grepl(pattern = "|", mo, fixed = TRUE)) {
mo_vect <- unlist(strsplit(mo, "|", fixed = TRUE))
cleaned[i, "mo"] <- mo_vect[1]
for (j in seq_len(length(mo_vect))) {
cleaned <- bind_rows(cleaned, cleaned[i, , drop = FALSE])
cleaned[nrow(cleaned), "mo"] <- mo_vect[j]
}
}
}
cleaned <- cleaned %>%
distinct(ab, mo, administration, uti, systemic, .keep_all = TRUE) %>%
arrange(ab, mo) %>%
mutate_at(c("MIC_S", "MIC_R", "disk_S", "disk_R"), as.double) %>%
pivot_longer(c("MIC_S", "MIC_R", "disk_S", "disk_R"), "type") %>%
mutate(
method = ifelse(type %like% "MIC", "MIC", "DISK"),
type = gsub("^.*_", "breakpoint_", type)
) %>%
pivot_wider(names_from = type, values_from = value) %>%
mutate(
guideline = guideline_name,
disk_dose = ifelse(method == "DISK", disk_dose, NA_character_),
mo = ifelse(mo == "", mo_sheet, mo)
) %>%
filter(!(is.na(breakpoint_S) & is.na(breakpoint_R))) %>%
# comply with clinical_breakpoints for now
transmute(guideline,
method,
site = case_when(
uti ~ "UTI",
systemic ~ "Systemic",
TRUE ~ administration
),
mo, ab,
ref_tbl = sheet.bak,
disk_dose = ifelse(!is.na(disk_dose), paste0(disk_dose, "ug"), NA_character_),
breakpoint_S,
breakpoint_R
) %>%
as.data.frame(stringsAsFactors = FALSE)
if (!is.null(uncertainties)) {
print(uncertainties %>% distinct(input, mo, .keep_all = TRUE))
}
message("Estimated: ", probable_rows, ", gained: ", cleaned %>% count(ab) %>% nrow())
message("------------------------")
cleaned
}
# Actual import -----------------------------------------------------------
file <- "data-raw/v_11.0_Breakpoint_Tables.xlsx"
sheets <- readxl::excel_sheets(file)
guideline_name <- "EUCAST 2021"
sheets_to_analyse <- sheets[!sheets %in% c("Content", "Changes", "Notes", "Guidance", "Dosages", "Technical uncertainty", "Topical agents")]
# takes the longest time:
new_EUCAST <- read_EUCAST(
sheet = sheets_to_analyse[1],
file = file,
guideline_name = guideline_name
)
for (i in 2:length(sheets_to_analyse)) {
tryCatch(
new_EUCAST <<- bind_rows(
new_EUCAST,
read_EUCAST(
sheet = sheets_to_analyse[i],
file = file,
guideline_name = guideline_name
)
),
error = function(e) message(e$message)
)
}
# 2021-07-12 fix for Morganellaceae (check other lines too next time)
morg <- clinical_breakpoints %>%
as_tibble() %>%
filter(
ab == "IPM",
guideline == "EUCAST 2021",
mo == as.mo("Enterobacterales")
) %>%
mutate(mo = as.mo("Morganellaceae"))
morg[which(morg$method == "MIC"), "breakpoint_S"] <- 0.001
morg[which(morg$method == "MIC"), "breakpoint_R"] <- 4
morg[which(morg$method == "DISK"), "breakpoint_S"] <- 50
morg[which(morg$method == "DISK"), "breakpoint_R"] <- 19
clinical_breakpoints <- clinical_breakpoints %>%
bind_rows(morg) %>%
bind_rows(morg %>%
mutate(guideline = "EUCAST 2020")) %>%
arrange(desc(guideline), ab, mo, method)