1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-21 12:13:20 +02:00

update reproduction scripts links

This commit is contained in:
2025-04-13 10:02:47 +02:00
parent d31371613e
commit 7c3320b967
9 changed files with 0 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,160 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
library(dplyr)
library(tidyr)
library(rvest)
# get all data from the WHOCC website
get_atc_table <- function(atc_group) {
# give as input J0XXX, like atc_group = "J05AB"
downloaded <- read_html(paste0("https://atcddd.fhi.no/atc_ddd_index/?code=", atc_group, "&showdescription=no"))
table_title <- downloaded %>%
html_nodes(paste0('a[href^="./?code=', atc_group, '&"]')) %>%
html_text()
table_title <- table_title[tolower(table_title) != "show text from guidelines"][1]
table_content <- downloaded %>%
html_nodes("table") %>%
html_table(header = TRUE) %>%
# returns list, so make data.frame out of it
as.data.frame(stringsAsFactors = FALSE) %>%
# select right columns
select(atc = ATC.code, name = Name, ddd = DDD, unit = U, ddd_type = Adm.R) %>%
# fill empty rows
mutate(atc = ifelse(atc == "", lag(atc), atc), name = ifelse(name == "", lag(name), name)) %>%
pivot_wider(names_from = ddd_type, values_from = c(ddd, unit)) %>%
mutate(atc_group = table_title)
if (!"ddd_O" %in% colnames(table_content)) {
table_content <- table_content %>% mutate(ddd_O = NA_real_, unit_O = NA_character_)
}
if (!"ddd_P" %in% colnames(table_content)) {
table_content <- table_content %>% mutate(ddd_P = NA_real_, unit_P = NA_character_)
}
table_content %>% select(atc, name, atc_group,
oral_ddd = ddd_O, oral_units = unit_O,
iv_ddd = ddd_P, iv_units = unit_P
)
}
# these are the relevant groups for input: https://atcddd.fhi.no/atc_ddd_index/?code=J05A (J05 only contains J05A)
atc_groups <- c("J05AA", "J05AB", "J05AC", "J05AD", "J05AE", "J05AF", "J05AG", "J05AH", "J05AJ", "J05AP", "J05AR", "J05AX")
# get the first
antivirals <- get_atc_table(atc_groups[1])
# bind all others to it
for (i in 2:length(atc_groups)) {
message(atc_groups[i], "...")
antivirals <- rbind(antivirals, get_atc_table(atc_groups[i]))
}
# arrange on name, untibble it
antivirals <- antivirals %>%
arrange(name) %>%
as.data.frame(stringsAsFactors = FALSE)
# add PubChem Compound ID (cid) and their trade names
# see `data-raw/_reproduction_scripts/reproduction_of_antimicrobials.R` for get_CID() and get_synonyms()
CIDs <- get_CID(antivirals$name)
# these could not be found:
antivirals[is.na(CIDs), ] %>% View()
# get brand names from PubChem
synonyms <- get_synonyms(CIDs)
synonyms <- lapply(
synonyms,
function(x) {
if (length(x) == 0 | all(is.na(x))) {
""
} else {
x
}
}
)
antivirals <- antivirals %>%
transmute(atc,
cid = as.double(CIDs),
name,
atc_group,
synonyms = unname(synonyms),
oral_ddd,
oral_units,
iv_ddd,
iv_units
) %>%
AMR:::dataset_UTF8_to_ASCII()
av_codes <- tibble(name = antivirals$name %>%
strsplit("(, | and )") %>%
unlist() %>%
unique() %>%
sort()) %>%
mutate(av_1st = toupper(abbreviate(name, minlength = 3, use.classes = FALSE))) %>%
filter(!name %in% c("acid", "dipivoxil", "disoproxil", "marboxil", "alafenamide"))
replace_with_av_code <- function(name) {
unname(av_codes$av_1st[match(name, av_codes$name)])
}
names_codes <- antivirals %>%
separate(name,
into = paste0("name", c(1:7)),
sep = "(, | and )",
remove = FALSE,
fill = "right"
) %>%
# remove empty columns
select(!where(function(x) all(is.na(x)))) %>%
mutate_at(vars(matches("name[1-9]")), replace_with_av_code) %>%
unite(av, matches("name[1-9]"), sep = "+", na.rm = TRUE) %>%
mutate(name = gsub("(, | and )", "/", name))
substr(names_codes$name, 1, 1) <- toupper(substr(names_codes$name, 1, 1))
antivirals <- bind_cols(
names_codes %>% select(av, name),
antivirals %>% select(-name)
)
class(antivirals$av) <- c("av", "character")
antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()
# ! add loinc, run 'data-raw/loinc.R' !
# de-duplicate synonyms
for (i in 1:nrow(antivirals)) {
syn <- as.character(sort(unique(tolower(antivirals[i, "synonyms", drop = TRUE][[1]]))))
syn <- syn[!syn %in% tolower(antivirals[i, "name", drop = TRUE])]
antivirals[i, "synonyms"][[1]] <- ifelse(length(syn[!syn == ""]) == 0, list(""), list(syn))
}
antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()
# check it
antivirals
# save it
usethis::use_data(antivirals, overwrite = TRUE, internal = FALSE, compress = "xz", version = 2)

View File

@ -0,0 +1,418 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
# This script runs in 20-30 minutes and renews all guidelines of CLSI and EUCAST!
# Run it with source("data-raw/_reproduction_scripts/reproduction_of_clinical_breakpoints.R")
library(dplyr)
library(readr)
library(tidyr)
devtools::load_all()
# Install the WHONET software on Windows (http://www.whonet.org/software.html),
# and copy the folder C:\WHONET\Resources to the data-raw/WHONET/ folder
# (for ASIARS-Net update, also copy C:\WHONET\Codes to the data-raw/WHONET/ folder)
# BE SURE TO RUN data-raw/_reproduction_scripts/reproduction_of_microorganisms.groups.R FIRST TO GET THE GROUPS!
# READ DATA ----
whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
# remove old taxonomic names
filter(TAXONOMIC_STATUS == "C") %>%
mutate(ORGANISM_CODE = toupper(WHONET_ORG_CODE))
whonet_breakpoints <- read_tsv("data-raw/WHONET/Resources/Breakpoints.txt", na = c("", "NA", "-"),
show_col_types = FALSE, guess_max = Inf) %>%
filter(GUIDELINES %in% c("CLSI", "EUCAST"))
whonet_antibiotics <- read_tsv("data-raw/WHONET/Resources/Antibiotics.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
arrange(WHONET_ABX_CODE) %>%
distinct(WHONET_ABX_CODE, .keep_all = TRUE)
# MICROORGANISMS WHONET CODES ----
whonet_organisms <- whonet_organisms %>%
select(ORGANISM_CODE, ORGANISM, SPECIES_GROUP, GBIF_TAXON_ID) %>%
mutate(
# this one was called Issatchenkia orientalis, but it should be:
ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
) %>%
# try to match on GBIF identifier
left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>%
# remove duplicates
arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
distinct(ORGANISM_CODE, .keep_all = TRUE) %>%
# add Enterobacterales, which is a subkingdom code in their data
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>%
arrange(ORGANISM)
## Add new WHO codes to microorganisms.codes ----
matched <- whonet_organisms %>% filter(!is.na(mo))
unmatched <- whonet_organisms %>% filter(is.na(mo))
# generate the mo codes and add their names
message("Getting MO codes for WHONET input...")
unmatched <- unmatched %>%
mutate(mo = as.mo(gsub("(sero[a-z]*| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
minimum_matching_score = 0.55,
keep_synonyms = TRUE,
language = "en"),
mo = case_when(ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
TRUE ~ mo),
mo_name = mo_name(mo,
keep_synonyms = TRUE,
language = "en"))
# check if coercion at least resembles the first part (genus)
unmatched <- unmatched %>%
mutate(
first_part = sapply(ORGANISM, function(x) strsplit(gsub("[^a-zA-Z _-]+", "", x), " ")[[1]][1], USE.NAMES = FALSE),
keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic") %>%
arrange(keep)
unmatched %>%
View()
unmatched <- unmatched %>%
filter(keep == TRUE)
organisms <- matched %>% transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo) %>%
bind_rows(unmatched %>% transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo)) %>%
mutate(name = mo_name(mo, keep_synonyms = TRUE)) %>%
arrange(code)
# some subspecies exist, while their upper species do not, add them as the species level:
subspp <- organisms %>%
filter(mo_species(mo, keep_synonyms = TRUE) == mo_subspecies(mo, keep_synonyms = TRUE) &
mo_species(mo, keep_synonyms = TRUE) != "" &
mo_genus(mo, keep_synonyms = TRUE) != "Salmonella") %>%
mutate(mo = as.mo(paste(mo_genus(mo, keep_synonyms = TRUE),
mo_species(mo, keep_synonyms = TRUE)),
keep_synonyms = TRUE),
name = mo_name(mo, keep_synonyms = TRUE))
organisms <- organisms %>%
filter(!code %in% subspp$code) %>%
bind_rows(subspp) %>%
arrange(code)
# add the groups
organisms <- organisms %>%
bind_rows(tibble(code = organisms %>% filter(!is.na(group)) %>% pull(group) %>% unique(),
group = NA,
mo = organisms %>% filter(!is.na(group)) %>% pull(group) %>% unique() %>% as.mo(keep_synonyms = TRUE),
name = mo_name(mo, keep_synonyms = TRUE))) %>%
arrange(code, group) %>%
select(-group) %>%
distinct()
# 2023-07-08 SGM is also Strep gamma in WHONET, must only be Slowly-growing Mycobacterium
# 2024-06-14 still the case
organisms <- organisms %>%
filter(!(code == "SGM" & name %like% "Streptococcus"))
# this must be empty:
organisms$code[organisms$code %>% duplicated()]
saveRDS(organisms, "data-raw/organisms.rds", version = 2)
#---
# AT THIS POINT, `organisms` is clean and all entries have an mo code
#---
# update microorganisms.codes with the latest WHONET codes
microorganisms.codes2 <- microorganisms.codes %>%
# remove all old WHONET codes, whether we (in the end) keep them or not
filter(!toupper(code) %in% toupper(organisms$code)) %>%
# and add the new ones
bind_rows(organisms %>% select(code, mo)) %>%
arrange(code) %>%
distinct(code, .keep_all = TRUE)
# new codes:
microorganisms.codes2$code[which(!microorganisms.codes2$code %in% microorganisms.codes$code)]
mo_name(microorganisms.codes2$mo[which(!microorganisms.codes2$code %in% microorganisms.codes$code)], keep_synonyms = TRUE)
microorganisms.codes <- microorganisms.codes2
# Run this part to update ASIARS-Net:
# 2024-06-14: file not available anymore
# # start
# asiarsnet <- read_tsv("data-raw/WHONET/Codes/ASIARS_Net_Organisms_ForwardLookup.txt")
# asiarsnet <- asiarsnet %>%
# mutate(WHONET_Code = toupper(WHONET_Code)) %>%
# left_join(whonet_organisms %>% mutate(WHONET_Code = toupper(ORGANISM_CODE))) %>%
# mutate(
# mo1 = as.mo(ORGANISM_CODE),
# mo2 = as.mo(ORGANISM)
# ) %>%
# mutate(mo = if_else(mo2 == "UNKNOWN" | is.na(mo2), mo1, mo2)) %>%
# filter(!is.na(mo))
# insert1 <- asiarsnet %>% transmute(code = WHONET_Code, mo)
# insert2 <- asiarsnet %>% transmute(code = as.character(ASIARS_Net_Code), mo)
# # these will be updated
# bind_rows(insert1, insert2) %>%
# rename(mo_new = mo) %>%
# left_join(microorganisms.codes) %>%
# filter(mo != mo_new)
# microorganisms.codes <- microorganisms.codes %>%
# filter(!code %in% c(insert1$code, insert2$code)) %>%
# bind_rows(insert1, insert2) %>%
# arrange(code)
# # end
## Save to package ----
class(microorganisms.codes$mo) <- c("mo", "character")
usethis::use_data(microorganisms.codes, overwrite = TRUE, compress = "xz", version = 2)
rm(microorganisms.codes)
devtools::load_all()
# BREAKPOINTS ----
# now that we have the correct MO codes, get the breakpoints and convert them
whonet_breakpoints %>%
count(GUIDELINES, BREAKPOINT_TYPE) %>%
pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) %>%
janitor::adorn_totals(where = c("row", "col"))
# compared to current
AMR::clinical_breakpoints %>%
count(GUIDELINES = gsub("[^a-zA-Z]", "", guideline), type) %>%
arrange(tolower(type)) %>%
pivot_wider(names_from = type, values_from = n) %>%
as.data.frame() %>%
janitor::adorn_totals(where = c("row", "col"))
breakpoints <- whonet_breakpoints %>%
mutate(code = toupper(ORGANISM_CODE)) %>%
left_join(bind_rows(microorganisms.codes %>% filter(!code %in% c("ALL", "GEN")),
# GEN (Generic) and ALL (All) are PK/PD codes
data.frame(code = c("ALL", "GEN"),
mo = rep(as.mo("UNKNOWN"), 2))))
# these ones lack an MO name, they cannot be used:
unknown <- breakpoints %>%
filter(is.na(mo)) %>%
pull(code) %>%
unique()
breakpoints %>%
filter(code %in% unknown) %>%
count(GUIDELINES, YEAR, ORGANISM_CODE, BREAKPOINT_TYPE, sort = TRUE)
# 2025-03-11: these codes are currently: clu, kma, fso, tyi. No clue (are not in MO list of WHONET), and they are only ECOFFs, so remove them:
breakpoints <- breakpoints %>%
filter(!is.na(mo))
# and these ones have unknown antibiotics according to WHONET itself:
breakpoints %>%
filter(!WHONET_ABX_CODE %in% whonet_antibiotics$WHONET_ABX_CODE) %>%
count(YEAR, GUIDELINES, WHONET_ABX_CODE) %>%
arrange(desc(YEAR))
breakpoints %>%
filter(!WHONET_ABX_CODE %in% whonet_antibiotics$WHONET_ABX_CODE) %>%
pull(WHONET_ABX_CODE) %>%
unique()
# they are at the moment all old codes ("CFC", "ROX", "FIX") that have the right replacements in `antimicrobials`, so we can use as.ab()
## Build new breakpoints table ----
breakpoints_new <- breakpoints %>%
filter(!is.na(WHONET_ABX_CODE)) %>%
transmute(
guideline = paste(GUIDELINES, YEAR),
type = ifelse(BREAKPOINT_TYPE == "ECOFF", "ECOFF", tolower(BREAKPOINT_TYPE)),
host = ifelse(BREAKPOINT_TYPE == "ECOFF", "ECOFF", tolower(HOST)),
method = TEST_METHOD,
site = SITE_OF_INFECTION,
mo,
rank_index = case_when(
is.na(mo_rank(mo, keep_synonyms = TRUE)) ~ 6, # for UNKNOWN, B_GRAMN, B_ANAER, B_ANAER-NEG, etc.
mo_rank(mo, keep_synonyms = TRUE) %like% "(infra|sub)" ~ 1,
mo_rank(mo, keep_synonyms = TRUE) == "species" ~ 2,
mo_rank(mo, keep_synonyms = TRUE) == "species group" ~ 2.5,
mo_rank(mo, keep_synonyms = TRUE) == "genus" ~ 3,
mo_rank(mo, keep_synonyms = TRUE) == "family" ~ 4,
mo_rank(mo, keep_synonyms = TRUE) == "order" ~ 5,
TRUE ~ 6
),
ab = as.ab(WHONET_ABX_CODE),
ref_tbl = ifelse(type == "ECOFF" & is.na(REFERENCE_TABLE), "ECOFF", REFERENCE_TABLE),
disk_dose = POTENCY,
breakpoint_S = ifelse(type == "ECOFF" & is.na(S) & !is.na(ECV_ECOFF), ECV_ECOFF, S),
breakpoint_R = ifelse(type == "ECOFF" & is.na(R) & !is.na(ECV_ECOFF), ECV_ECOFF, R),
uti = ifelse(is.na(site), FALSE, gsub(".*(UTI|urinary|urine).*", "UTI", site) == "UTI"),
is_SDD = !is.na(SDD)
) %>%
# Greek symbols and EM dash symbols are not allowed by CRAN, so replace them with ASCII:
mutate(disk_dose = disk_dose %>%
gsub("μ", "mc", ., fixed = TRUE) %>% # this is 'mu', \u03bc
gsub("µ", "mc", ., fixed = TRUE) %>% # this is 'micro', \u00b5 (yes, they look the same)
gsub("", "-", ., fixed = TRUE) %>%
gsub("(?<=\\d)(?=[a-zA-Z])", " ", ., perl = TRUE)) %>% # make sure we keep a space after a number, e.g. "1mcg" to "1 mcg"
arrange(desc(guideline), mo, ab, type, method) %>%
filter(!(is.na(breakpoint_S) & is.na(breakpoint_R)) & !is.na(mo) & !is.na(ab)) %>%
distinct(guideline, type, host, ab, mo, method, site, breakpoint_S, .keep_all = TRUE)
# fix reference table names
breakpoints_new %>% filter(guideline %like% "EUCAST", is.na(ref_tbl)) %>% View()
breakpoints_new <- breakpoints_new %>%
mutate(ref_tbl = case_when(is.na(ref_tbl) & guideline %like% "EUCAST 202" ~ lead(ref_tbl),
is.na(ref_tbl) ~ "Unknown",
TRUE ~ ref_tbl))
# clean disk zones
breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S", drop = TRUE]))
breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R", drop = TRUE]))
# regarding animal breakpoints, CLSI has adults and foals for horses, but only for amikacin - only keep adult horses
breakpoints_new %>%
filter(host %like% "foal") %>%
count(guideline, host)
breakpoints_new <- breakpoints_new %>%
filter(host %unlike% "foal") %>%
mutate(host = ifelse(host %like% "horse", "horse", host))
# FIXES FOR WHONET ERRORS ----
m <- unique(as.double(as.mic(levels(as.mic(1)))))
# WHONET has no >1024 but instead uses 1025, 513, etc, so as.mic() cannot be used to clean.
# instead, raise these one higher valid MIC factor level:
breakpoints_new[which(breakpoints_new$breakpoint_R == 129), "breakpoint_R"] <- m[which(m == 128) + 1]
breakpoints_new[which(breakpoints_new$breakpoint_R == 257), "breakpoint_R"] <- m[which(m == 256) + 1]
breakpoints_new[which(breakpoints_new$breakpoint_R == 513), "breakpoint_R"] <- m[which(m == 512) + 1]
breakpoints_new[which(breakpoints_new$breakpoint_R == 1025), "breakpoint_R"] <- m[which(m == 1024) + 1]
# a lot of R breakpoints are missing, though none of the S breakpoints are missing:
anyNA(breakpoints_new$breakpoint_S)
breakpoints_new %>%
filter(is.na(breakpoint_R)) %>%
count(guideline, host) |>
pivot_wider(names_from = host,
values_from = n,
values_fill = list(n = 0)) |>
View()
# 2025-03-12 don't do this anymore - we now use as.sir(..., substitute_missing_r_breakpoint = TRUE/FALSE, ...)
# breakpoints_new[which(breakpoints_new$method == "MIC" &
# is.na(breakpoints_new$breakpoint_R)), "breakpoint_R"] <- max(m)
# fix streptococci in WHONET table of EUCAST: Strep A, B, C and G must only include these groups and not all streptococci:
breakpoints_new$mo[breakpoints_new$mo == "B_STRPT" & breakpoints_new$ref_tbl %like% "^strep.* a.* b.*c.*g"] <- as.mo("B_STRPT_ABCG")
# Haemophilus same error (must only be H. influenzae)
breakpoints_new$mo[breakpoints_new$mo == "B_HMPHL" & breakpoints_new$ref_tbl %like% "^h.* influenzae"] <- as.mo("B_HMPHL_INFL")
# EUCAST says that for H. parainfluenzae the H. influenza rules can be used, so add them
breakpoints_new <- breakpoints_new %>%
bind_rows(
breakpoints_new %>%
filter(guideline %like% "EUCAST", mo == "B_HMPHL_INFL") %>%
mutate(mo = as.mo("B_HMPHL_PRNF"))
) %>%
arrange(desc(guideline), mo, ab, type, host, method)
# Achromobacter denitrificans is in WHONET included in their A. xylosoxidans table, must be removed
breakpoints_new <- breakpoints_new %>% filter(mo != as.mo("Achromobacter denitrificans"))
# WHONET contains gentamicin breakpoints for viridans streptocci, which are intrinsic R - they meant genta-high, which is ALSO in their table, so we just remove gentamicin in viridans streptococci
breakpoints_new <- breakpoints_new %>% filter(!(mo == as.mo("Streptococcus viridans") & ab == "GEN"))
# Nitrofurantoin in Staph (EUCAST) only applies to S. saprophyticus, while WHONET has the DISK correct but the MIC on genus level
breakpoints_new$mo[breakpoints_new$mo == "B_STPHY" & breakpoints_new$ab == "NIT" & breakpoints_new$guideline %like% "EUCAST"] <- as.mo("B_STPHY_SPRP")
# WHONET sets the 2023 breakpoints for SAM to MIC of 16/32 for Enterobacterales, should be MIC 8/32 like AMC (see issue #123 on github.com/msberends/AMR)
# UPDATE 2024-02-22: fixed now
# There's a problem with C. diff in EUCAST where breakpoint_R is missing - they are listed as normal human breakpoints but are ECOFF
rows <- which(breakpoints_new$guideline %like% "EUCAST" & breakpoints_new$mo == "B_CRDDS_DFFC" & is.na(breakpoints_new$breakpoint_R) & !is.na(breakpoints_new$breakpoint_S))
breakpoints_new$type[rows] <- "ECOFF"
breakpoints_new$host[rows] <- "ECOFF"
breakpoints_new$ref_tbl[rows] <- "ECOFF"
breakpoints_new$breakpoint_R[rows] <- breakpoints_new$breakpoint_S[rows]
breakpoints_new <- distinct(breakpoints_new, .keep_all = TRUE)
# determine rank again now that some changes were made on taxonomic level (genus -> species)
breakpoints_new <- breakpoints_new %>%
mutate(rank_index = case_when(
mo_rank(mo, keep_synonyms = TRUE) %like% "(infra|sub)" ~ 1,
mo_rank(mo, keep_synonyms = TRUE) == "species" ~ 2,
mo_rank(mo, keep_synonyms = TRUE) == "species group" ~ 2.5,
mo_rank(mo, keep_synonyms = TRUE) == "genus" ~ 3,
mo_rank(mo, keep_synonyms = TRUE) == "family" ~ 4,
mo_rank(mo, keep_synonyms = TRUE) == "order" ~ 5,
mo != "UNKNOWN" ~ 6, # for B_ANAER, etc.
TRUE ~ 7
))
# WHONET adds one log2 level to the R breakpoint for their software, e.g. in AMC in Enterobacterales:
# EUCAST 2023 guideline: S <= 8 and R > 8
# WHONET file: S <= 8 and R >= 16
breakpoints_new %>% filter(guideline == "EUCAST 2023", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
# but this will make an MIC of 12 I, which should be R according to EUCAST, so:
breakpoints_new <- breakpoints_new %>%
mutate(breakpoint_R = ifelse(guideline %like% "EUCAST" & method == "MIC" & log2(breakpoint_R) - log2(breakpoint_S) != 0,
pmax(breakpoint_S, breakpoint_R / 2),
breakpoint_R
))
# fix disks as well
breakpoints_new %>% filter(guideline == "EUCAST 2023", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "DISK")
breakpoints_new <- breakpoints_new %>%
mutate(breakpoint_R = ifelse(guideline %like% "EUCAST" & method == "DISK" & breakpoint_S - breakpoint_R != 0,
breakpoint_R + 1,
breakpoint_R
))
# fill missing R breakpoint where there is an S breakpoint
# 2025-03-12 don't do this anymore - we now use as.sir(..., substitute_missing_r_breakpoint = TRUE/FALSE, ...)
# breakpoints_new[which(is.na(breakpoints_new$breakpoint_R)), "breakpoint_R"] <- breakpoints_new[which(is.na(breakpoints_new$breakpoint_R)), "breakpoint_S"]
# check the strange duplicates
breakpoints_new %>%
mutate(id = paste(guideline, type, host, method, site, mo, ab, uti)) %>%
filter(id %in% .$id[which(duplicated(id))]) %>%
arrange(desc(guideline))
# 2024-06-19 mostly ECOFFs, but there's no explanation in the whonet_breakpoints file, we have to remove duplicates
# remove duplicates
breakpoints_new <- breakpoints_new %>%
distinct(guideline, type, host, method, site, mo, ab, uti, .keep_all = TRUE)
# CHECKS AND SAVE TO PACKAGE ----
# check again
breakpoints_new %>% filter(guideline == "EUCAST 2024", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
# compare with current version
clinical_breakpoints %>% filter(guideline == "EUCAST 2024", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
# must have "human" and "ECOFF"
breakpoints_new %>% filter(mo == "B_STRPT_PNMN", ab == "AMP", guideline == "EUCAST 2020", method == "MIC")
# check dimensions
dim(breakpoints_new)
dim(clinical_breakpoints)
clinical_breakpoints <- breakpoints_new
clinical_breakpoints <- clinical_breakpoints %>% dataset_UTF8_to_ASCII()
usethis::use_data(clinical_breakpoints, overwrite = TRUE, compress = "xz", version = 2)
rm(clinical_breakpoints)
devtools::load_all(".")

View File

@ -0,0 +1,178 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
library(dplyr)
library(readxl)
library(cleaner)
# URL:
# https://www.eucast.org/fileadmin/src/media/PDFs/EUCAST_files/Breakpoint_tables/Dosages_v_13.0_Breakpoint_Tables.pdf
# download the PDF file, open in Adobe Acrobat and export as Excel workbook
breakpoints_version <- 13
dosage_source <- read_excel("data-raw/Dosages_v_12.0_Breakpoint_Tables.xlsx", skip = 4, na = "None") %>%
format_names(snake_case = TRUE, penicillins = "drug") %>%
filter(!tolower(standard_dosage) %in% c("standard dosage", "standard dosage_source", "under review")) %>%
filter(!is.na(standard_dosage)) %>%
# keep only one drug in the table
arrange(desc(drug)) %>%
mutate(drug = gsub("(.*) ([(]|iv|oral).*", "\\1", drug)) %>%
# distinct(drug, .keep_all = TRUE) %>%
arrange(drug) %>%
mutate(
ab = as.ab(drug),
ab_name = ab_name(ab, language = NULL)
)
dosage_source <- bind_rows(
# oral
dosage_source %>%
filter(standard_dosage %like% " oral") %>%
mutate(
standard_dosage = gsub("oral.*", "oral", standard_dosage),
high_dosage = if_else(high_dosage %like% "oral",
gsub("oral.*", "oral", high_dosage),
NA_character_
)
),
# iv
dosage_source %>%
filter(standard_dosage %like% " iv") %>%
mutate(
standard_dosage = gsub(".* or ", "", standard_dosage),
high_dosage = if_else(high_dosage %like% "( or | iv)",
gsub(".* or ", "", high_dosage),
NA_character_
)
),
# im
dosage_source %>%
filter(standard_dosage %like% " im")
) %>%
arrange(drug)
get_dosage_lst <- function(col_data) {
standard <- col_data %>%
# remove new lines
gsub(" ?(\n|\t)+ ?", " ", .) %>%
# keep only the first suggestion, replace all after 'or' and more informative texts
gsub("(.*?) (or|with|loading|depending|over|by) .*", "\\1", .) %>%
# remove (1 MU)
gsub(" [(][0-9] [A-Z]+[)]", "", .) %>%
# remove parentheses
gsub("[)(]", "", .) %>%
# remove drug names
gsub(" [a-z]{5,99}( |$)", " ", .) %>%
gsub(" [a-z]{5,99}( |$)", " ", .) %>%
gsub(" (acid|dose)", "", .) # %>%
# keep lowest value only (25-30 mg -> 25 mg)
# gsub("[-].*? ", " ", .)
dosage_lst <- lapply(
strsplit(standard, " x "),
function(x) {
dose <- x[1]
if (dose %like% "under") {
dose <- NA_character_
}
admin <- x[2]
list(
dose = trimws(dose),
dose_times = gsub("^([0-9.]+).*", "\\1", admin),
administration = clean_character(admin),
notes = "",
original_txt = ""
)
}
)
for (i in seq_len(length(col_data))) {
dosage_lst[[i]]$original_txt <- gsub("\n", " ", col_data[i])
if (col_data[i] %like% " (or|with|loading|depending|over) ") {
dosage_lst[[i]]$notes <- gsub("\n", " ", gsub(".* ((or|with|loading|depending|over) .*)", "\\1", col_data[i]))
}
}
dosage_lst
}
standard <- get_dosage_lst(dosage_source$standard_dosage)
high <- get_dosage_lst(dosage_source$high_dosage)
uti <- get_dosage_lst(dosage_source$uncomplicated_uti)
dosage_new <- bind_rows(
# standard dose
data.frame(
ab = dosage_source$ab,
name = dosage_source$ab_name,
type = "standard_dosage",
dose = sapply(standard, function(x) x$dose),
dose_times = sapply(standard, function(x) x$dose_times),
administration = sapply(standard, function(x) x$administration),
notes = sapply(standard, function(x) x$notes),
original_txt = sapply(standard, function(x) x$original_txt),
stringsAsFactors = FALSE
),
# high dose
data.frame(
ab = dosage_source$ab,
name = dosage_source$ab_name,
type = "high_dosage",
dose = sapply(high, function(x) x$dose),
dose_times = sapply(high, function(x) x$dose_times),
administration = sapply(high, function(x) x$administration),
notes = sapply(high, function(x) x$notes),
original_txt = sapply(high, function(x) x$original_txt),
stringsAsFactors = FALSE
),
# UTIs
data.frame(
ab = dosage_source$ab,
name = dosage_source$ab_name,
type = "uncomplicated_uti",
dose = sapply(uti, function(x) x$dose),
dose_times = sapply(uti, function(x) x$dose_times),
administration = sapply(uti, function(x) x$administration),
notes = sapply(uti, function(x) x$notes),
original_txt = sapply(uti, function(x) x$original_txt),
stringsAsFactors = FALSE
)
) %>%
mutate(
eucast_version = breakpoints_version,
dose_times = as.integer(dose_times),
administration = gsub("([a-z]+) .*", "\\1", administration)
) %>%
arrange(name, administration, type) %>%
filter(!is.na(dose), dose != ".") %>%
# this makes it a tibble as well:
dataset_UTF8_to_ASCII()
dosage <- bind_rows(dosage_new, AMR::dosage)
usethis::use_data(dosage, internal = FALSE, overwrite = TRUE, version = 2, compress = "xz")

View File

@ -0,0 +1,146 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
patients <- unlist(lapply(LETTERS, paste0, 1:10))
patients_table <- data.frame(
patient_id = patients,
gender = c(
rep("M", 135),
rep("F", 125)
)
)
dates <- seq(as.Date("2011-01-01"), as.Date("2020-01-01"), by = "day")
bacteria_a <- c(
"E. coli", "S. aureus",
"S. pneumoniae", "K. pneumoniae"
)
bacteria_b <- c("esccol", "staaur", "strpne", "klepne")
bacteria_c <- c(
"Escherichia coli", "Staphylococcus aureus",
"Streptococcus pneumoniae", "Klebsiella pneumoniae"
)
ab_interpretations <- c("S", "I", "R")
ab_interpretations_messy <- c("R", "< 0.5 S", "I")
sample_size <- 1000
data_a <- data.frame(
date = sample(dates, size = sample_size, replace = TRUE),
hospital = "A",
bacteria = sample(bacteria_a,
size = sample_size, replace = TRUE,
prob = c(0.50, 0.25, 0.15, 0.10)
),
AMX = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.60, 0.05, 0.35)
),
AMC = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.75, 0.10, 0.15)
),
CIP = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.80, 0.00, 0.20)
),
GEN = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.92, 0.00, 0.08)
)
)
data_b <- data.frame(
date = sample(dates, size = sample_size, replace = TRUE),
hospital = "B",
bacteria = sample(bacteria_b,
size = sample_size, replace = TRUE,
prob = c(0.50, 0.25, 0.15, 0.10)
),
AMX = sample(ab_interpretations_messy,
size = sample_size, replace = TRUE,
prob = c(0.60, 0.05, 0.35)
),
AMC = sample(ab_interpretations_messy,
size = sample_size, replace = TRUE,
prob = c(0.75, 0.10, 0.15)
),
CIP = sample(ab_interpretations_messy,
size = sample_size, replace = TRUE,
prob = c(0.80, 0.00, 0.20)
),
GEN = sample(ab_interpretations_messy,
size = sample_size, replace = TRUE,
prob = c(0.92, 0.00, 0.08)
)
)
data_c <- data.frame(
date = sample(dates, size = sample_size, replace = TRUE),
hospital = "C",
bacteria = sample(bacteria_c,
size = sample_size, replace = TRUE,
prob = c(0.50, 0.25, 0.15, 0.10)
),
AMX = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.60, 0.05, 0.35)
),
AMC = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.75, 0.10, 0.15)
),
CIP = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.80, 0.00, 0.20)
),
GEN = sample(ab_interpretations,
size = sample_size, replace = TRUE,
prob = c(0.92, 0.00, 0.08)
)
)
example_isolates_unclean <- data_a %>%
bind_rows(data_b, data_c)
example_isolates_unclean$patient_id <- sample(patients, size = nrow(example_isolates_unclean), replace = TRUE)
example_isolates_unclean <- example_isolates_unclean %>%
select(patient_id, hospital, date, bacteria, everything()) %>%
dataset_UTF8_to_ASCII()
usethis::use_data(example_isolates_unclean, overwrite = TRUE, internal = FALSE, version = 2, compress = "xz")

View File

@ -0,0 +1,68 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
library(AMR)
library(dplyr)
int_resis <- data.frame(mo = microorganisms$mo, stringsAsFactors = FALSE)
for (i in seq_len(nrow(antimicrobials))) {
int_resis$new <- as.sir("S")
colnames(int_resis)[ncol(int_resis)] <- antimicrobials$ab[i]
}
int_resis <- eucast_rules(int_resis,
eucast_rules_df = subset(
AMR:::EUCAST_RULES_DF,
is.na(have_these_values) & reference.rule_group == "Expected phenotypes" & reference.version == 1.2
),
overwrite = TRUE,
info = FALSE
)
int_resis2 <- int_resis[, sapply(int_resis, function(x) any(!is.sir(x) | x == "R")), drop = FALSE] %>%
tidyr::pivot_longer(-mo) %>%
filter(value == "R") %>%
select(mo, ab = name)
# remove lab drugs
untreatable <- antimicrobials[which(antimicrobials$name %like% "-high|EDTA|polysorbate|macromethod|screening|/nacubactam"), "ab", drop = TRUE]
# takes ages with filter()..., weird
int_resis3 <- int_resis2[which(!int_resis2$ab %in% untreatable), ]
int_resis3$ab <- as.ab(int_resis3$ab)
int_resis3
all(int_resis3$mo %in% microorganisms$mo)
all(int_resis3$ab %in% antimicrobials$ab)
intrinsic_resistant <- int_resis3
usethis::use_data(intrinsic_resistant, internal = FALSE, overwrite = TRUE, version = 2, compress = "xz")
rm(intrinsic_resistant)
# AFTER THIS:
# DO NOT FORGET TO UPDATE THE VERSION NUMBER IN mo_is_intrinsic_resistant() AND R/data.R

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,189 @@
# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
# This data set is being used in the clinical_breakpoints data set, and thus by as.sir().
# It prevents the breakpoints table from being extremely long for species that are part of a species group.
# Also used by eucast_rules() to expand group names.
library(dplyr)
library(readr)
library(tidyr)
devtools::load_all()
# Install the WHONET software on Windows (http://www.whonet.org/software.html),
# and copy the folder C:\WHONET\Resources to the data-raw/WHONET/ folder
# BACTERIAL COMPLEXES
# find all bacterial complex in the NCBI Taxonomy Browser here:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&id=2&lvl=6&lin=f&keep=1&srchmode=1&unlock
# and search the page for 'complex', then follow the link and add the missing ones in this R file
# READ DATA ----
whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
# remove old taxonomic names
filter(TAXONOMIC_STATUS == "C") %>%
mutate(ORGANISM_CODE = toupper(WHONET_ORG_CODE))
whonet_organisms <- whonet_organisms %>%
select(ORGANISM_CODE, ORGANISM, SPECIES_GROUP, GBIF_TAXON_ID) %>%
mutate(
# this one was called Issatchenkia orientalis, but it should be:
ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
) %>%
# try to match on GBIF identifier
left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>%
# remove duplicates
arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
distinct(ORGANISM_CODE, .keep_all = TRUE) %>%
# add Enterobacterales, which is a subkingdom code in their data
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>%
arrange(ORGANISM)
# check non-existing species groups in the microorganisms table
whonet_organisms %>%
filter(!is.na(SPECIES_GROUP)) %>%
group_by(SPECIES_GROUP) %>%
summarise(complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))) %>%
filter(!SPECIES_GROUP %in% microorganisms.codes$code)
# create the species group data set ----
microorganisms.groups <- whonet_organisms %>%
# these will not be translated well
filter(!ORGANISM %in% c("Trueperella pyogenes-like bacteria",
"Mycobacterium suricattae",
"Mycobacterium canetti")) %>%
filter(!is.na(SPECIES_GROUP), SPECIES_GROUP != ORGANISM_CODE) %>%
transmute(mo_group = as.mo(SPECIES_GROUP),
mo = ifelse(is.na(mo),
as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
mo)) %>%
# add our own CoNS and CoPS, WHONET does not strictly follow Becker et al (2014, 2019, 2020)
filter(mo_group != as.mo("CoNS")) %>%
bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>%
filter(mo_group != as.mo("CoPS")) %>%
bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>%
# at least all our Lancefield-grouped streptococci must be in the beta-haemolytic group:
bind_rows(tibble(mo_group = as.mo("Beta-haemolytic streptococcus"),
mo = c(MO_LANCEFIELD,
microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)))) %>%
# and per Streptococcus group as well:
# group A - S. pyogenes
bind_rows(tibble(mo_group = as.mo("Streptococcus Group A"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")])) %>%
# group B - S. agalactiae
bind_rows(tibble(mo_group = as.mo("Streptococcus Group B"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")])) %>%
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
bind_rows(tibble(mo_group = as.mo("Streptococcus Group C"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")])) %>%
# group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
bind_rows(tibble(mo_group = as.mo("Streptococcus Group F"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")])) %>%
# group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
bind_rows(tibble(mo_group = as.mo("Streptococcus Group G"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")])) %>%
# group H - S. sanguinis
bind_rows(tibble(mo_group = as.mo("Streptococcus Group H"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")])) %>%
# group K - S. salivarius, incl. S. salivarius salivariuss and S. salivarius thermophilus
bind_rows(tibble(mo_group = as.mo("Streptococcus Group K"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")])) %>%
# group L - only S. dysgalactiae
bind_rows(tibble(mo_group = as.mo("Streptococcus Group L"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")])) %>%
# and for EUCAST: Strep group A, B, C, G
bind_rows(tibble(mo_group = as.mo("Streptococcus Group A, B, C, G"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")])) %>%
# HACEK is:
# - Haemophilus species
# - Aggregatibacter species
# - Cardiobacterium hominis
# - Eikenella corrodens
# - Kingella species
# - and previously Actinobacillus actinomycetemcomitans
# see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3656887/
filter(mo_group != as.mo("HACEK")) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Cardiobacterium hominis", keep_synonyms = TRUE))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Eikenella corrodens", keep_synonyms = TRUE))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Kingella") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Actinobacillus actinomycetemcomitans", keep_synonyms = TRUE))) %>%
# Citrobacter freundii complex in the NCBI Taxonomy Browser:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1344959
filter(mo_group != "B_CTRBC_FRND-C") %>%
bind_rows(tibble(mo_group = as.mo("B_CTRBC_FRND-C"),
mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE))) %>%
# Klebsiella pneumoniae complex
filter(mo_group != "B_KLBSL_PNMN-C") %>%
bind_rows(tibble(mo_group = as.mo("B_KLBSL_PNMN-C"),
mo = paste("Klebsiella", c("africana", "pneumoniae", "quasipneumoniae", "quasivariicola", "variicola")) %>% as.mo(keep_synonyms = TRUE))) %>%
# Yersinia pseudotuberculosis complex in the NCBI Taxonomy Browser:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1649845
filter(mo_group != "B_YERSN_PSDT-C") %>%
bind_rows(tibble(mo_group = as.mo("B_YERSN_PSDT-C"),
mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE))) %>%
# RGM are Rapidly-grwoing Mycobacteria, see https://pubmed.ncbi.nlm.nih.gov/28084211/
filter(mo_group != "B_MYCBC_RGM") %>%
bind_rows(tibble(mo_group = as.mo("B_MYCBC_RGM"),
mo = paste("Mycobacterium", c( "abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)))
# add subspecies to all species
for (group in unique(microorganisms.groups$mo_group)) {
spp <- microorganisms.groups %>%
filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>%
pull(mo) %>%
paste0(collapse = "|") %>%
paste0("^(", ., ")")
mos <- microorganisms %>%
filter(mo %like% spp & rank == "subspecies") %>%
pull(mo)
# add them
microorganisms.groups <- microorganisms.groups %>% bind_rows(tibble(mo_group = as.mo(group), mo = mos))
}
# add full names, arrange and clean
microorganisms.groups <- microorganisms.groups %>%
mutate(mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)) %>%
arrange(mo_group_name, mo_name) %>%
filter(mo_group != mo) %>%
distinct() %>%
dataset_UTF8_to_ASCII()
mo_uncertainties()
class(microorganisms.groups$mo_group) <- c("mo", "character")
class(microorganisms.groups$mo) <- c("mo", "character")
usethis::use_data(microorganisms.groups, internal = FALSE, overwrite = TRUE, compress = "xz", version = 2)
rm(microorganisms.groups)
devtools::load_all()

View File

@ -0,0 +1,110 @@
# get complete filenames of all R files in the GitHub repository of nathaneastwood/poorman
library(magrittr)
`%like%` <- function(x, y) grepl(y, x, ignore.case = TRUE, perl = TRUE)
`%unlike%` <- function(x, y) !grepl(y, x, ignore.case = TRUE, perl = TRUE)
commit <- "3cc0a9920b1eb559dd166f548561244189586b3a"
files <- xml2::read_html(paste0("https://github.com/nathaneastwood/poorman/tree/", commit, "/R")) %>%
rvest::html_nodes("a") %>%
rvest::html_attr("href")
files <- files[files %like% "/blob/.*R$"]
# get full URLs of all raw R files
files <- sort(paste0("https://raw.githubusercontent.com", gsub("blob/", "", files[files %like% "/R/.*.R$"])))
# remove files with only pkg specific code
files <- files[files %unlike% "(zzz|init)[.]R$"]
# also, there's a lot of functions we don't use
files <- files[files %unlike% "/(between|coalesce|cumulative|fill|glimpse|group_cols|na_if|near|nest_by|check_filter|poorman-package|print|recode|reconstruct|replace_na|replace_with|rownames|slice|union_all|unite|window_rank|with_groups)[.]R$"]
# add our prepend file, containing info about the source of the data
intro <- readLines("data-raw/poorman_prepend.R") %>%
# add commit to intro part
gsub("{commit}", commit, ., fixed = TRUE) %>%
# add date to intro part
gsub("{date}", trimws(format(Sys.Date(), "%e %B %Y")), ., fixed = TRUE)
# copyright info:
copyright <- paste0("# ", readLines(paste0("https://raw.githubusercontent.com/nathaneastwood/poorman/", commit, "/LICENSE")))
# read all contents to a character vector
contents <- character(0)
sapply(files, function(file) {
message("reading ", basename(file))
contents <<- c(contents, readLines(file))
invisible()
})
# remove lines starting with "#'" and NULL and write to file
contents <- contents[!grepl("^(#'|NULL|\"_PACKAGE)", contents)]
contents.bak <- contents
# grouped attributes same as dplyr
contents <- gsub("grouped_data", "grouped_df", contents, fixed = TRUE)
# now make it independent on UseMethod, since we will not export these functions
has_usemethods <- gsub("^([a-z_]+).*", "\\1", contents[which(contents %like% "usemethod") - 1])
for (use in has_usemethods) {
relevant_row <- which(contents %like% paste0("^", use, " <- function")) + 1
function_call <- trimws(gsub(".*function(.*)\\{.*", "\\1", contents[relevant_row - 1]))
function_call1 <- trimws(gsub("[()]", "", strsplit(function_call, ",")[[1]][1]))
if (any(contents %like% paste0(use, ".grouped_df"))) {
# this function will have methods for data.frame and grouped_df
contents[relevant_row] <- paste0(" if (\"grouped_df\" %in% class(", function_call1, ")) ", use, ".grouped_df", function_call, " else ", use, ".data.frame", function_call)
} else {
# this function will only have data.frame as method
contents[relevant_row] <- paste0(" ", use, ".data.frame", function_call)
}
# add pm_ prefix
contents[relevant_row - 1] <- paste0("pm_", contents[relevant_row - 1])
}
# correct for NextMethod
contents <- gsub("NextMethod\\(\"(.*)\"\\)", "\\1.data.frame(...)", contents)
# correct for 'default' method
contents <- gsub(".default <-", ".data.frame <-", contents, fixed = TRUE)
contents <- gsub("pm_group_by_drop.data.frame", "pm_group_by_drop", contents, fixed = TRUE)
contents <- gsub("(stats::)?setNames", "stats::setNames", contents)
# now get all those pm_* functions to replace all untransformed function name calls as well
new_pm_names <- sort(gsub("pm_(.*?) <-.*", "\\1", contents[grepl("^pm_", contents)]))
for (i in seq_len(length(new_pm_names))) {
contents <- gsub(paste0("([^a-z._])", new_pm_names[i], "([^a-z._])"), paste0("\\1pm_", new_pm_names[i], "\\2"), contents)
# starting with a space or a straight bracket or an opening parenthesis, ending with nothing or a non-character or a closing parenthesis
contents <- gsub(paste0("( |\\[|\\()", new_pm_names[i], "($|[^a-z]|\\))"), paste0("\\1pm_", new_pm_names[i], "\\2"), contents)
}
# replace %>% with %pm>%
contents[which(contents %like% "^\\|\\|") - 1] <- paste0(contents[which(contents %like% "^\\|\\|") - 1], " ||")
contents[which(contents %like% "^\\|\\|")] <- gsub("^\\|\\|", "", contents[which(contents %like% "^\\|\\|")])
contents <- gsub("%>%", "%pm>%", contents, fixed = TRUE)
# fix for new lines, since n() also existed
contents <- gsub("\\pm_n", "\\n", contents, fixed = TRUE)
# prefix other functions also with "pm_"
contents <- gsub("^([a-z_]+)(\\$|)", "pm_\\1\\2", contents)
# prefix environmental objects and functions
contents <- gsub("(add_group_columns|add_tally|apply_grouped_function|as_function|as_symbols|build_data_frame|calculate_groups|check_filter|check_if_types|check_name|check_context|collapse_to_sentence|context|deparse_|dotdotdot|drop_dup_list|eval_call|eval_env|eval_expr|eval_select_pos|find_used|flatten|get_group_details|gluestick|group_|groups|groups_set|has_groups|have_name|insert_dot|is.grouped_df|is_df_or_vector|is_empty_list|is_formula|is_named|is_negated_colon|is_nested|is_string|is_wholenumber|join_message|join_worker|names_are_invalid|nth|peek_vars|reconstruct_attrs|replace_na|replace_with|select_|select_context|select_env|select_positions|setup_|split_into_groups|squash|tally|tally_n|validate_case_when_length)", "pm_\\1", contents)
# now a lot of items are overprefixed
contents <- gsub("(pm_)+", "pm_", contents)
contents <- gsub("_pm_", "_", contents)
contents <- gsub("pm_if (\"grouped_df", "if (\"grouped_df", contents, fixed = TRUE)
# remove comments and empty lines
contents <- gsub("#.*", "", contents)
contents <- contents[trimws(contents) != ""]
# fix for their relocate()
contents <- gsub("if (!missing(.before))", "if (!missing(.before) && !is.null(.before))", contents, fixed = TRUE)
contents <- gsub("if (!missing(.after))", "if (!missing(.after) && !is.null(.after))", contents, fixed = TRUE)
contents[which(contents %like% "reshape\\($") + 1] <- gsub("data", "as.data.frame(data, stringsAsFactors = FALSE)", contents[which(contents %like% "reshape\\($") + 1])
contents <- gsub("pm_relocate(.data = long, values_to, .after = -1)", 'pm_relocate(.data = long, "value", .after = -1)', contents, fixed = TRUE)
# who needs US spelling?
contents <- contents[contents %unlike% "summarize"]
# add intro
contents <- c(
intro,
copyright,
"",
contents
)
writeLines(contents, "R/aa_helper_pm_functions.R")
# note: pm_left_join() will be overwritten by aaa_helper_functions.R, which contains a faster implementation
# replace `res <- as.data.frame(res)` with `res <- as.data.frame(res, stringsAsFactors = FALSE)`