# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
# ==================================================================== #
library(dplyr)
library(readxl)
library(cleaner)
# Source of the dosage tables (PDF):
# https://www.eucast.org/fileadmin/src/media/PDFs/EUCAST_files/Breakpoint_tables/Dosages_v_13.0_Breakpoint_Tables.pdf
# Download the PDF file, open it in Adobe Acrobat and export it as an Excel
# workbook to data-raw/Dosages_v_<version>.0_Breakpoint_Tables.xlsx
breakpoints_version <- 13

# The workbook path is derived from `breakpoints_version`, so that the file
# that is read and the `eucast_version` label that is written further down in
# this script cannot diverge (the path used to be hard-coded to v12.0 while
# the version was already set to 13).
dosage_file <- sprintf(
  "data-raw/Dosages_v_%s.0_Breakpoint_Tables.xlsx",
  breakpoints_version
)
if (!file.exists(dosage_file)) {
  stop("Dosage workbook not found: ", dosage_file, call. = FALSE)
}

dosage_source <- read_excel(dosage_file, skip = 4, na = "None") %>%
  # snake_case column names; presumably this also renames the `penicillins`
  # column to `drug` (the steps below rely on a `drug` column) - TODO confirm
  format_names(snake_case = TRUE, penicillins = "drug") %>%
  # drop repeated header rows and entries without a usable dosage
  filter(!tolower(standard_dosage) %in% c("standard dosage", "standard dosage_source", "under review")) %>%
  filter(!is.na(standard_dosage)) %>%
  # keep only one drug in the table
  arrange(desc(drug)) %>%
  # cut the drug name at a trailing " (", " iv" or " oral" suffix
  mutate(drug = gsub("(.*) ([(]|iv|oral).*", "\\1", drug)) %>%
  # distinct(drug, .keep_all = TRUE) %>%
  arrange(drug) %>%
  mutate(
    ab = as.ab(drug),
    ab_name = ab_name(ab, language = NULL)
  )
# Split the source table into one set of rows per administration route and
# stack them again. The three filters are independent, so a row whose standard
# dosage mentions more than one route ends up in each matching set.
dosage_source <- bind_rows(
  # oral: drop everything that follows the word "oral"; a high dosage that
  # does not mention "oral" is set to NA
  dosage_source %>%
    filter(standard_dosage %like% " oral") %>%
    mutate(
      standard_dosage = gsub("oral.*", "oral", standard_dosage),
      high_dosage = if_else(high_dosage %like% "oral",
        gsub("oral.*", "oral", high_dosage),
        NA_character_
      )
    ),
  # iv: drop everything up to and including the last " or "; a high dosage
  # that contains neither " or " nor " iv" is set to NA
  dosage_source %>%
    filter(standard_dosage %like% " iv") %>%
    mutate(
      standard_dosage = gsub(".* or ", "", standard_dosage),
      high_dosage = if_else(high_dosage %like% "( or | iv)",
        gsub(".* or ", "", high_dosage),
        NA_character_
      )
    ),
  # im: rows are kept as they are
  dosage_source %>%
    filter(standard_dosage %like% " im")
) %>%
  arrange(drug)
# Parse a column of free-text dosages into structured elements.
#
# @param col_data Character vector with one dosage text per row (may contain
#   NA), e.g. "1 g x 3 iv or 2 g x 2 iv".
# @return A list with one element per entry of `col_data`. Each element is a
#   list with the character fields:
#   - `dose`: the text before " x " (NA if it mentions "under"),
#   - `dose_times`: the leading number of the text after " x ",
#   - `administration`: the text after " x ", passed through clean_character(),
#   - `notes`: the trailing remark that starts with or/with/loading/depending/
#     over, or "" if there is none,
#   - `original_txt`: the input text with new lines replaced by spaces.
get_dosage_lst <- function(col_data) {
  # remove new lines
  standard <- gsub(" ?(\n|\t)+ ?", " ", col_data)
  # keep only the first suggestion, replace all after 'or' and more informative texts
  standard <- gsub("(.*?) (or|with|loading|depending|over|by) .*", "\\1", standard)
  # remove (1 MU)
  standard <- gsub(" [(][0-9] [A-Z]+[)]", "", standard)
  # remove parentheses
  standard <- gsub("[)(]", "", standard)
  # remove drug names (lower-case words of 5+ letters); done twice because a
  # match consumes the space that the next adjacent word would need
  standard <- gsub(" [a-z]{5,99}( |$)", " ", standard)
  standard <- gsub(" [a-z]{5,99}( |$)", " ", standard)
  standard <- gsub(" (acid|dose)", "", standard)
  # keep lowest value only (25-30 mg -> 25 mg)
  # standard <- gsub("[-].*? ", " ", standard)

  dosage_lst <- lapply(
    strsplit(standard, " x "),
    function(x) {
      dose <- x[1]
      # isTRUE() keeps the scalar `if` safe when `dose` is NA
      if (isTRUE(dose %like% "under")) {
        dose <- NA_character_
      }
      admin <- x[2]
      list(
        dose = trimws(dose),
        dose_times = gsub("^([0-9.]+).*", "\\1", admin),
        administration = clean_character(admin),
        notes = "",
        original_txt = ""
      )
    }
  )

  # NOTE(review): the notes are taken from the raw text (before new lines are
  # replaced), the match is greedy (so it starts at the LAST keyword) and the
  # keyword list lacks "by", unlike the trimming step above - confirm whether
  # this is intended before changing it, as it affects the stored data.
  for (i in seq_along(col_data)) {
    dosage_lst[[i]]$original_txt <- gsub("\n", " ", col_data[i])
    if (isTRUE(col_data[i] %like% " (or|with|loading|depending|over) ")) {
      dosage_lst[[i]]$notes <- gsub("\n", " ", gsub(".* ((or|with|loading|depending|over) .*)", "\\1", col_data[i]))
    }
  }
  dosage_lst
}
# Parse the three dosage columns of the source table
standard <- get_dosage_lst(dosage_source$standard_dosage)
high <- get_dosage_lst(dosage_source$high_dosage)
uti <- get_dosage_lst(dosage_source$uncomplicated_uti)

# Turn one parsed dosage list (as returned by get_dosage_lst()) into a
# data.frame with one row per row of `source`.
#
# @param dosage_lst List of parsed dosages, one element per row of `source`.
# @param type Label to store in the `type` column.
# @param source Table that provides the `ab` and `ab_name` columns.
# @return A data.frame with the columns ab, name, type, dose, dose_times,
#   administration, notes and original_txt.
dosage_lst_to_df <- function(dosage_lst, type, source = dosage_source) {
  # vapply() always returns a character vector, also for empty input, whereas
  # the return type of sapply() depends on its input
  field <- function(name) {
    vapply(dosage_lst, function(x) x[[name]], FUN.VALUE = character(1))
  }
  data.frame(
    ab = source$ab,
    name = source$ab_name,
    type = type,
    dose = field("dose"),
    dose_times = field("dose_times"),
    administration = field("administration"),
    notes = field("notes"),
    original_txt = field("original_txt"),
    stringsAsFactors = FALSE
  )
}

dosage_new <- bind_rows(
  # standard dose
  dosage_lst_to_df(standard, "standard_dosage"),
  # high dose
  dosage_lst_to_df(high, "high_dosage"),
  # UTIs
  dosage_lst_to_df(uti, "uncomplicated_uti")
) %>%
  mutate(
    eucast_version = breakpoints_version,
    dose_times = as.integer(dose_times),
    # keep only the first word of the administration text
    administration = gsub("([a-z]+) .*", "\\1", administration)
  ) %>%
  arrange(name, administration, type) %>%
  # drop rows for which no dose could be parsed
  filter(!is.na(dose), dose != ".") %>%
  # this makes it a tibble as well:
  dataset_UTF8_to_ASCII()
# Combine the newly parsed dosages with the dosages already in the package.
# Rows that the existing data set already holds for this EUCAST version are
# dropped first, so that re-running this script does not duplicate them.
# `%in%` is used instead of `!=` so rows with a missing version are kept.
dosage_previous <- AMR::dosage
if ("eucast_version" %in% colnames(dosage_previous)) {
  dosage_previous <- dosage_previous %>%
    filter(!eucast_version %in% breakpoints_version)
}
dosage <- bind_rows(dosage_new, dosage_previous)
usethis::use_data(dosage, internal = FALSE, overwrite = TRUE, version = 2, compress = "xz")