mirror of https://github.com/msberends/AMR.git
188 lines
12 KiB
R
188 lines
12 KiB
R
# ==================================================================== #
|
|
# TITLE: #
|
|
# AMR: An R Package for Working with Antimicrobial Resistance Data #
|
|
# #
|
|
# SOURCE CODE: #
|
|
# https://github.com/msberends/AMR #
|
|
# #
|
|
# PLEASE CITE THIS SOFTWARE AS: #
|
|
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
|
|
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
|
|
# Data. Journal of Statistical Software, 104(3), 1-31. #
|
|
# https://doi.org/10.18637/jss.v104.i03 #
|
|
# #
|
|
# Developed at the University of Groningen and the University Medical #
|
|
# Center Groningen in The Netherlands, in collaboration with many #
|
|
# colleagues from around the world, see our website. #
|
|
# #
|
|
# This R package is free software; you can freely use and distribute #
|
|
# it for both personal and commercial purposes under the terms of the #
|
|
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
|
|
# the Free Software Foundation. #
|
|
# We created this package for both routine data analysis and academic #
|
|
# research and it was publicly released in the hope that it will be #
|
|
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
|
|
# #
|
|
# Visit our website for the full manual and a complete tutorial about #
|
|
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
|
|
# ==================================================================== #
|
|
|
|
# this data set is being used in the clinical_breakpoints data set, and thus by as.sir().
|
|
# it prevents the breakpoints table from being extremely long for species that are part of a species group.
|
|
|
|
library(dplyr)
|
|
library(readr)
|
|
library(tidyr)
|
|
devtools::load_all()
|
|
|
|
# Install the WHONET software on Windows (http://www.whonet.org/software.html),
|
|
# and copy the folder C:\WHONET\Resources to the data-raw/WHONET/ folder
|
|
|
|
|
|
# BACTERIAL COMPLEXES
|
|
# find all bacterial complex in the NCBI Taxonomy Browser here:
|
|
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&id=2&lvl=6&lin=f&keep=1&srchmode=1&unlock
|
|
# and search the page for 'complex', then follow the link and add the missing ones in this R file
|
|
|
|
|
|
# READ DATA ----
|
|
|
|
whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
|
|
# remove old taxonomic names
|
|
filter(TAXONOMIC_STATUS == "C") %>%
|
|
mutate(ORGANISM_CODE = toupper(WHONET_ORG_CODE))
|
|
|
|
whonet_organisms <- whonet_organisms %>%
|
|
select(ORGANISM_CODE, ORGANISM, SPECIES_GROUP, GBIF_TAXON_ID) %>%
|
|
mutate(
|
|
# this one was called Issatchenkia orientalis, but it should be:
|
|
ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
|
|
) %>%
|
|
# try to match on GBIF identifier
|
|
left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>%
|
|
# remove duplicates
|
|
arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
|
|
distinct(ORGANISM_CODE, .keep_all = TRUE) %>%
|
|
# add Enterobacterales, which is a subkingdom code in their data
|
|
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>%
|
|
arrange(ORGANISM)
|
|
|
|
# check non-existing species groups in the microorganisms table
|
|
whonet_organisms %>%
|
|
filter(!is.na(SPECIES_GROUP)) %>%
|
|
group_by(SPECIES_GROUP) %>%
|
|
summarise(complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
|
|
organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))) %>%
|
|
filter(!SPECIES_GROUP %in% microorganisms.codes$code)
|
|
|
|
# create the species group data set ----
|
|
microorganisms.groups <- whonet_organisms %>%
|
|
# these will not be translated well
|
|
filter(!ORGANISM %in% c("Trueperella pyogenes-like bacteria",
|
|
"Mycobacterium suricattae",
|
|
"Mycobacterium canetti")) %>%
|
|
filter(!is.na(SPECIES_GROUP), SPECIES_GROUP != ORGANISM_CODE) %>%
|
|
transmute(mo_group = as.mo(SPECIES_GROUP),
|
|
mo = ifelse(is.na(mo),
|
|
as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
|
|
mo)) %>%
|
|
# add our own CoNS and CoPS, WHONET does not strictly follow Becker et al (2014, 2019, 2020)
|
|
filter(mo_group != as.mo("CoNS")) %>%
|
|
bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>%
|
|
filter(mo_group != as.mo("CoPS")) %>%
|
|
bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>%
|
|
# at least all our Lancefield-grouped streptococci must be in the beta-haemolytic group:
|
|
bind_rows(tibble(mo_group = as.mo("Beta-haemolytic streptococcus"),
|
|
mo = c(MO_LANCEFIELD,
|
|
microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)))) %>%
|
|
# and per Streptococcus group as well:
|
|
# group A - S. pyogenes
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group A"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")])) %>%
|
|
# group B - S. agalactiae
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group B"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")])) %>%
|
|
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group C"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")])) %>%
|
|
# group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group F"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")])) %>%
|
|
# group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group G"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")])) %>%
|
|
# group H - S. sanguinis
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group H"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")])) %>%
|
|
# group K - S. salivarius, incl. S. salivarius salivariuss and S. salivarius thermophilus
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group K"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")])) %>%
|
|
# group L - only S. dysgalactiae
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group L"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")])) %>%
|
|
# and for EUCAST: Strep group A, B, C, G
|
|
bind_rows(tibble(mo_group = as.mo("Streptococcus Group A, B, C, G"),
|
|
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")])) %>%
|
|
# HACEK is:
|
|
# - Haemophilus species
|
|
# - Aggregatibacter species
|
|
# - Cardiobacterium hominis
|
|
# - Eikenella corrodens
|
|
# - Kingella species
|
|
# - and previously Actinobacillus actinomycetemcomitans
|
|
# see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3656887/
|
|
filter(mo_group != as.mo("HACEK")) %>%
|
|
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>%
|
|
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>%
|
|
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Cardiobacterium hominis", keep_synonyms = TRUE))) %>%
|
|
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Eikenella corrodens", keep_synonyms = TRUE))) %>%
|
|
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Kingella") %>% pull(mo))) %>%
|
|
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Actinobacillus actinomycetemcomitans", keep_synonyms = TRUE))) %>%
|
|
# Citrobacter freundii complex in the NCBI Taxonomy Browser:
|
|
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1344959
|
|
filter(mo_group != "B_CTRBC_FRND-C") %>%
|
|
bind_rows(tibble(mo_group = as.mo("B_CTRBC_FRND-C"),
|
|
mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE))) %>%
|
|
# Yersinia pseudotuberculosis complex in the NCBI Taxonomy Browser:
|
|
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1649845
|
|
filter(mo_group != "B_YERSN_PSDT-C") %>%
|
|
bind_rows(tibble(mo_group = as.mo("B_YERSN_PSDT-C"),
|
|
mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE))) %>%
|
|
# RGM are Rapidly-grwoing Mycobacteria, see https://pubmed.ncbi.nlm.nih.gov/28084211/
|
|
filter(mo_group != "B_MYCBC_RGM") %>%
|
|
bind_rows(tibble(mo_group = as.mo("B_MYCBC_RGM"),
|
|
mo = paste("Mycobacterium", c( "abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)))
|
|
|
|
# add subspecies to all species
|
|
for (group in unique(microorganisms.groups$mo_group)) {
|
|
spp <- microorganisms.groups %>%
|
|
filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>%
|
|
pull(mo) %>%
|
|
paste0(collapse = "|") %>%
|
|
paste0("^(", ., ")")
|
|
mos <- microorganisms %>%
|
|
filter(mo %like% spp & rank == "subspecies") %>%
|
|
pull(mo)
|
|
# add them
|
|
microorganisms.groups <- microorganisms.groups %>% bind_rows(tibble(mo_group = group, mo = mos))
|
|
}
|
|
|
|
# add full names, arrange and clean
|
|
microorganisms.groups <- microorganisms.groups %>%
|
|
mutate(mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
|
|
mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)) %>%
|
|
arrange(mo_group_name, mo_name) %>%
|
|
filter(mo_group != mo) %>%
|
|
distinct() %>%
|
|
dataset_UTF8_to_ASCII()
|
|
mo_uncertainties()
|
|
|
|
# add subspecies to all species
|
|
|
|
|
|
class(microorganisms.groups$mo_group) <- c("mo", "character")
|
|
class(microorganisms.groups$mo) <- c("mo", "character")
|
|
usethis::use_data(microorganisms.groups, internal = FALSE, overwrite = TRUE, compress = "xz", version = 2)
|
|
rm(microorganisms.groups)
|
|
devtools::load_all()
|