mirror of https://github.com/msberends/AMR.git
481 lines
20 KiB
R
481 lines
20 KiB
R
# ==================================================================== #
|
|
# TITLE #
|
|
# Antimicrobial Resistance (AMR) Data Analysis for R #
|
|
# #
|
|
# SOURCE #
|
|
# https://github.com/msberends/AMR #
|
|
# #
|
|
# LICENCE #
|
|
# (c) 2018-2021 Berends MS, Luz CF et al. #
|
|
# Developed at the University of Groningen, the Netherlands, in #
|
|
# collaboration with non-profit organisations Certe Medical #
|
|
# Diagnostics & Advice, and University Medical Center Groningen. #
|
|
# #
|
|
# This R package is free software; you can freely use and distribute #
|
|
# it for both personal and commercial purposes under the terms of the #
|
|
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
|
|
# the Free Software Foundation. #
|
|
# We created this package for both routine data analysis and academic #
|
|
# research and it was publicly released in the hope that it will be #
|
|
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
|
|
# #
|
|
# Visit our website for the full manual and a complete tutorial about #
|
|
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
|
|
# ==================================================================== #
|
|
|
|
# Register at List of Prokaryotic names with Standing in Nomenclature (LPSN)
|
|
# then got to https://lpsn.dsmz.de/downloads and download the latest CSV file.
|
|
|
|
library(tidyverse)
|
|
library(AMR)
|
|
|
|
# these should still work after this update
|
|
test_fullname <- microorganisms$fullname
|
|
test_mo <- microorganisms$mo
|
|
|
|
|
|
# Helper functions --------------------------------------------------------
|
|
|
|
get_author_year <- function(ref) {
|
|
# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
|
|
|
|
authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT")
|
|
authors2 <- gsub(" ?\\(Approved Lists [0-9]+\\) ?", " () ", authors2)
|
|
authors2 <- gsub(" [)(]+ $", "", authors2)
|
|
# remove leading and trailing brackets
|
|
authors2 <- trimws(gsub("^[(](.*)[)]$", "\\1", authors2))
|
|
# only take part after brackets if there's a name
|
|
authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
|
|
gsub(".*[)] (.*)", "\\1", authors2),
|
|
authors2)
|
|
# get year from last 4 digits
|
|
lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2))
|
|
# can never be later than now
|
|
lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
|
|
NA,
|
|
lastyear)
|
|
# get authors without last year
|
|
authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2)
|
|
# remove nonsense characters from names
|
|
authors <- gsub("[^a-zA-Z,'& -]", "", authors)
|
|
# remove trailing and leading spaces
|
|
authors <- trimws(authors)
|
|
# only keep first author and replace all others by 'et al'
|
|
authors <- gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors)
|
|
# et al. always with ending dot
|
|
authors <- gsub(" et al\\.?", " et al.", authors)
|
|
authors <- gsub(" ?,$", "", authors)
|
|
# don't start with 'sensu' or 'ehrenb'
|
|
authors <- gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE)
|
|
# no initials, only surname
|
|
authors <- gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE)
|
|
# combine author and year if year is available
|
|
ref <- ifelse(!is.na(lastyear),
|
|
paste0(authors, ", ", lastyear),
|
|
authors)
|
|
# fix beginning and ending
|
|
ref <- gsub(", $", "", ref)
|
|
ref <- gsub("^, ", "", ref)
|
|
ref <- gsub("^(emend|et al.,?)", "", ref)
|
|
ref <- trimws(ref)
|
|
ref <- gsub("'", "", ref)
|
|
|
|
# a lot start with a lowercase character - fix that
|
|
ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE)
|
|
# specific one for the French that are named dOrbigny
|
|
ref[grepl("^d[A-Z]", ref)] <- gsub("^d", "d'", ref[grepl("^d[A-Z]", ref)])
|
|
ref <- gsub(" +", " ", ref)
|
|
ref
|
|
}
|
|
|
|
df_remove_nonASCII <- function(df) {
|
|
# Remove non-ASCII characters (these are not allowed by CRAN)
|
|
df %>%
|
|
mutate_if(is.character, iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
|
|
# also remove invalid characters
|
|
mutate_if(is.character, ~gsub("[\"'`]+", "", .)) %>%
|
|
AMR:::dataset_UTF8_to_ASCII()
|
|
}
|
|
|
|
abbreviate_mo <- function(x, minlength = 5, prefix = "", ...) {
|
|
# keep a starting Latin ae
|
|
suppressWarnings(
|
|
gsub("^ae", "\u00E6\u00E6", x, ignore.case = TRUE) %>%
|
|
abbreviate(minlength = minlength,
|
|
use.classes = TRUE,
|
|
method = "both.sides", ...) %>%
|
|
paste0(prefix, .) %>%
|
|
toupper() %>%
|
|
gsub("(\u00C6|\u00E6)+", "AE", .)
|
|
)
|
|
}
|
|
|
|
# Read data ---------------------------------------------------------------
|
|
|
|
taxonomy <- read_csv("~/Downloads/taxonomy.csv")
|
|
|
|
# Create synonyms ---------------------------------------------------------
|
|
|
|
new_synonyms <- taxonomy %>%
|
|
left_join(taxonomy,
|
|
by = c("record_lnk" = "record_no"),
|
|
suffix = c("", ".new")) %>%
|
|
filter(!is.na(record_lnk)) %>%
|
|
mutate_all(~ifelse(is.na(.), "", .)) %>%
|
|
transmute(fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)),
|
|
fullname_new = trimws(paste(genus_name.new, sp_epithet.new, subsp_epithet.new)),
|
|
ref = get_author_year(authors),
|
|
prevalence = 0) %>%
|
|
distinct(fullname, .keep_all = TRUE) %>%
|
|
filter(fullname != fullname_new) %>%
|
|
# this part joins this table to itself to correct for entries that had >1 renames,
|
|
# such as:
|
|
# Bacteroides tectum -> Bacteroides tectus
|
|
# Bacteroides tectus -> Bacteroides pyogenes
|
|
left_join(., .,
|
|
by = c("fullname_new" = "fullname"),
|
|
suffix = c("", ".2")) %>%
|
|
mutate(fullname_new = ifelse(!is.na(fullname_new.2), fullname_new.2, fullname_new),
|
|
ref = ifelse(!is.na(ref.2), ref.2, ref)) %>%
|
|
select(-ends_with(".2"))
|
|
|
|
mo_became_synonym <- microorganisms %>%
|
|
filter(fullname %in% new_synonyms$fullname)
|
|
|
|
updated_microorganisms <- taxonomy %>%
|
|
filter(is.na(record_lnk)) %>%
|
|
mutate_all(~ifelse(is.na(.), "", .)) %>%
|
|
transmute(mo = "",
|
|
fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)),
|
|
kingdom = "Bacteria",
|
|
phylum = "",
|
|
class = "",
|
|
order = "",
|
|
family = "",
|
|
genus = trimws(genus_name),
|
|
species = trimws(replace_na(sp_epithet, "")),
|
|
subspecies = trimws(replace_na(subsp_epithet, "")),
|
|
rank = case_when(subspecies == "" & species == "" ~ "genus",
|
|
subspecies == "" ~ "species",
|
|
TRUE ~ "subsp."),
|
|
ref = get_author_year(authors),
|
|
species_id = as.character(record_no),
|
|
source = "LPSN",
|
|
prevalence = 0,
|
|
snomed = NA)
|
|
|
|
new_microorganisms <- updated_microorganisms %>%
|
|
filter(!fullname %in% microorganisms$fullname)
|
|
|
|
genera_with_mo_code <- updated_microorganisms %>%
|
|
filter(genus %in% (microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% pull(genus))) %>%
|
|
distinct(genus) %>%
|
|
left_join(microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% select(mo, genus),
|
|
by = "genus")
|
|
|
|
genera_without_mo_code <- updated_microorganisms %>%
|
|
filter(!genus %in% genera_with_mo_code$genus) %>%
|
|
pull(genus) %>%
|
|
unique()
|
|
|
|
genera_without_mo_code_abbr <- genera_without_mo_code %>%
|
|
abbreviate_mo(5, prefix = "B_")
|
|
genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 6, prefix = "B_")
|
|
genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 7, prefix = "B_")
|
|
# all unique??
|
|
sum(genera_without_mo_code_abbr %in% microorganisms$mo) == 0
|
|
|
|
genus_abb <- tibble(genus = genera_without_mo_code,
|
|
abbr = genera_without_mo_code_abbr) %>%
|
|
bind_rows(microorganisms %>%
|
|
filter(kingdom == "Bacteria", rank == "genus", !genus %in% genera_without_mo_code) %>%
|
|
transmute(genus, abbr = as.character(mo))) %>%
|
|
arrange(genus)
|
|
|
|
|
|
# Update taxonomy ---------------------------------------------------------
|
|
|
|
# fill in the taxonomy of new genera
|
|
updated_taxonomy <- tibble(phylum = character(0),
|
|
class = character(0),
|
|
order = character(0),
|
|
family = character(0),
|
|
genus = character(0))
|
|
for (page in LETTERS) {
|
|
message("Downloading page ", page, "... ", appendLF = FALSE)
|
|
url <- paste0("https://lpsn.dsmz.de/genus?page=", page)
|
|
|
|
x <- xml2::read_html(url) %>%
|
|
rvest::html_node(".main-list") %>%
|
|
# evety list element with a set <id> attribute
|
|
rvest::html_nodes("li[id]")
|
|
for (i in seq_len(length(x))) {
|
|
txt <- x %>%
|
|
magrittr::extract2(i) %>%
|
|
rvest::html_text() %>%
|
|
gsub("\\[[A-Za-z]+, no [a-z]+\\]", "NA", .) %>%
|
|
gsub("Candidatus ", "", ., fixed = TRUE) %>%
|
|
gsub("[ \t\r\n\"]+", "|", .) %>%
|
|
gsub("\\|ShowHide.*", "", .) %>%
|
|
gsub("[\\[\\]]", "", ., fixed = TRUE) %>%
|
|
gsub("^\\|", "", .) %>%
|
|
strsplit("|", fixed = TRUE) %>%
|
|
unlist()
|
|
txt[txt == "NA"] <- ""
|
|
txt <- gsub("[^A-Za-z]+", "", txt)
|
|
updated_taxonomy <- updated_taxonomy %>%
|
|
bind_rows(tibble(phylum = txt[2],
|
|
class = txt[3],
|
|
order = txt[4],
|
|
family = txt[5],
|
|
genus = txt[6]))
|
|
}
|
|
message(length(x), " entries (total ", nrow(updated_taxonomy), ")")
|
|
}
|
|
|
|
# Create new microorganisms -----------------------------------------------
|
|
|
|
new_microorganisms <- new_microorganisms %>%
|
|
left_join(genus_abb, by = "genus") %>%
|
|
group_by(genus) %>%
|
|
mutate(species_abb = abbreviate_mo(species, 4)) %>%
|
|
group_by(genus, species) %>%
|
|
mutate(subspecies_abb = abbreviate_mo(subspecies, 4)) %>%
|
|
ungroup() %>%
|
|
mutate(mo = paste(abbr, species_abb, subspecies_abb, sep = "_"),
|
|
mo = gsub("_+$", "", mo)) %>%
|
|
select(-matches("abb"))
|
|
|
|
# add taxonomy new microorganisms
|
|
MOs <- microorganisms %>%
|
|
mutate(mo = as.character(mo)) %>%
|
|
bind_rows(new_microorganisms) %>%
|
|
arrange(fullname)
|
|
|
|
# unique MO codes
|
|
MOs$mo[which(duplicated(MOs$mo))] <- paste0(MOs$mo[which(duplicated(MOs$mo))], 1)
|
|
# all unique?
|
|
!any(duplicated(MOs$mo))
|
|
|
|
MOs <- MOs %>%
|
|
# remove entries that are now a synonym
|
|
filter(!fullname %in% new_synonyms$fullname) %>%
|
|
# update the taxonomy
|
|
left_join(updated_taxonomy, by = "genus", suffix = c("", ".new")) %>%
|
|
mutate(phylum = ifelse(!is.na(phylum.new), phylum.new, phylum),
|
|
class = ifelse(!is.na(class.new), class.new, class),
|
|
order = ifelse(!is.na(order.new), order.new, order),
|
|
family = ifelse(!is.na(family.new), family.new, family)) %>%
|
|
select(-ends_with(".new")) %>%
|
|
# update prevalence based on taxonomy (Berends et al., 2021)
|
|
mutate(prevalence = case_when(
|
|
class == "Gammaproteobacteria"
|
|
| genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
|
|
~ 1,
|
|
kingdom %in% c("Archaea", "Bacteria", "Chromista", "Fungi")
|
|
& (phylum %in% c("Proteobacteria",
|
|
"Firmicutes",
|
|
"Actinobacteria",
|
|
"Sarcomastigophora")
|
|
| genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Apophysomyces",
|
|
"Arachnia", "Aspergillus", "Aureobacterium", "Aureobasidium", "Bacteroides", "Basidiobolus",
|
|
"Beauveria", "Blastocystis", "Branhamella", "Calymmatobacterium", "Candida", "Capnocytophaga",
|
|
"Catabacter", "Chaetomium", "Chryseobacterium", "Chryseomonas", "Chrysonilia", "Cladophialophora",
|
|
"Cladosporium", "Conidiobolus", "Cryptococcus", "Curvularia", "Exophiala", "Exserohilum",
|
|
"Flavobacterium", "Fonsecaea", "Fusarium", "Fusobacterium", "Hendersonula", "Hypomyces",
|
|
"Koserella", "Lelliottia", "Leptosphaeria", "Leptotrichia", "Malassezia", "Malbranchea",
|
|
"Mortierella", "Mucor", "Mycocentrospora", "Mycoplasma", "Nectria", "Ochroconis",
|
|
"Oidiodendron", "Phoma", "Piedraia", "Pithomyces", "Pityrosporum", "Prevotella", "Pseudallescheria",
|
|
"Rhizomucor", "Rhizopus", "Rhodotorula", "Scolecobasidium", "Scopulariopsis", "Scytalidium",
|
|
"Sporobolomyces", "Stachybotrys", "Stomatococcus", "Treponema", "Trichoderma", "Trichophyton",
|
|
"Trichosporon", "Tritirachium", "Ureaplasma")
|
|
| rank %in% c("kingdom", "phylum", "class", "order", "family"))
|
|
~ 2,
|
|
TRUE ~ 3
|
|
))
|
|
|
|
# add all mssing genera, families and orders
|
|
MOs <- MOs %>%
|
|
bind_rows(
|
|
MOs %>%
|
|
arrange(genus, species) %>%
|
|
distinct(genus, .keep_all = TRUE) %>%
|
|
filter(rank == "species", source != "manually added") %>%
|
|
mutate(mo = gsub("^([A-Z]_[A-Z]+)_.*", "\\1", mo),
|
|
fullname = genus,
|
|
species = "",
|
|
subspecies = "",
|
|
rank = "genus",
|
|
species_id = "",
|
|
snomed = NA,
|
|
ref = NA_character_),
|
|
MOs %>%
|
|
group_by(family) %>%
|
|
filter(!any(rank == "family") & n() > 1) %>%
|
|
ungroup() %>%
|
|
arrange(family) %>%
|
|
distinct(family, .keep_all = TRUE) %>%
|
|
filter(!family %in% c("", NA), source != "manually added") %>%
|
|
mutate(mo = paste0(substr(kingdom, 1, 1), "_[FAM]_",
|
|
abbreviate(family,
|
|
minlength = 8,
|
|
use.classes = TRUE,
|
|
method = "both.sides",
|
|
strict = FALSE)),
|
|
mo = toupper(mo),
|
|
fullname = family,
|
|
genus = "",
|
|
species = "",
|
|
subspecies = "",
|
|
rank = "family",
|
|
species_id = "",
|
|
snomed = NA,
|
|
ref = NA_character_),
|
|
MOs %>%
|
|
group_by(order) %>%
|
|
filter(!any(rank == "order") & n() > 1) %>%
|
|
ungroup() %>%
|
|
arrange(order) %>%
|
|
distinct(order, .keep_all = TRUE) %>%
|
|
filter(!order %in% c("", NA), source != "manually added") %>%
|
|
mutate(mo = paste0(substr(kingdom, 1, 1), "_[ORD]_",
|
|
abbreviate(order,
|
|
minlength = 8,
|
|
use.classes = TRUE,
|
|
method = "both.sides",
|
|
strict = FALSE)),
|
|
mo = toupper(mo),
|
|
fullname = order,
|
|
family = "",
|
|
genus = "",
|
|
species = "",
|
|
subspecies = "",
|
|
rank = "order",
|
|
species_id = "",
|
|
snomed = NA,
|
|
ref = NA_character_)
|
|
) %>%
|
|
arrange(fullname)
|
|
|
|
# clean up
|
|
MOs <- MOs %>%
|
|
df_remove_nonASCII()
|
|
|
|
|
|
# Merge synonyms ----------------------------------------------------------
|
|
|
|
# remove synonyms that are now valid names
|
|
MOs.old <- microorganisms.old %>%
|
|
# add new synonyms
|
|
bind_rows(new_synonyms) %>%
|
|
filter(!fullname %in% MOs$fullname) %>%
|
|
arrange(fullname) %>%
|
|
distinct(fullname, fullname_new, .keep_all = TRUE) %>%
|
|
# add prevalence to old taxonomic names
|
|
select(-prevalence) %>%
|
|
left_join(MOs %>% select(fullname, prevalence), by = c("fullname_new" = "fullname")) %>%
|
|
# clean up
|
|
df_remove_nonASCII()
|
|
|
|
# Keep old codes for translation ------------------------------------------
|
|
|
|
# add removed microbial IDs to the internal translation table so old package versions keep working
|
|
MOs.translation <- microorganisms %>%
|
|
filter(!mo %in% MOs$mo) %>%
|
|
select(mo, fullname) %>%
|
|
left_join(new_synonyms) %>%
|
|
left_join(MOs %>% transmute(fullname_new = fullname, mo2 = as.character(mo))) %>%
|
|
select(mo_old = mo, mo_new = mo2) %>%
|
|
distinct()
|
|
MOs.translation <- AMR:::microorganisms.translation %>%
|
|
left_join(MOs.translation %>% select(mo_new_update = mo_new, mo_new = mo_old)) %>%
|
|
mutate(mo_new = as.character(ifelse(!is.na(mo_new_update), mo_new_update, mo_new))) %>%
|
|
select(-mo_new_update) %>%
|
|
bind_rows(
|
|
# old IDs used in microorganisms.codes must put in here as well
|
|
microorganisms.codes %>%
|
|
filter(!mo %in% MOs$mo) %>%
|
|
transmute(mo_old = mo, fullname = mo_name(mo)) %>%
|
|
left_join(MOs.old %>%
|
|
select(fullname, fullname_new)) %>%
|
|
left_join(MOs %>%
|
|
select(mo_new = mo, fullname_new = fullname)) %>%
|
|
transmute(mo_old = as.character(mo_old), mo_new)) %>%
|
|
arrange(mo_old) %>%
|
|
filter(mo_old != mo_new,
|
|
!mo_old %in% MOs$mo) %>%
|
|
left_join(., .,
|
|
by = c("mo_new" = "mo_old"),
|
|
suffix = c("", ".2")) %>%
|
|
mutate(mo_new = ifelse(!is.na(mo_new.2), mo_new.2, mo_new)) %>%
|
|
distinct(mo_old, mo_new) %>%
|
|
# clean up
|
|
df_remove_nonASCII()
|
|
|
|
message("microorganisms new: ", sum(!MOs$fullname %in% c(microorganisms$fullname, MOs.old$fullname)))
|
|
message("microorganisms renamed: ", sum(!MOs.old$fullname %in% microorganisms.old$fullname))
|
|
|
|
|
|
# Save --------------------------------------------------------------------
|
|
|
|
# class <mo>
|
|
class(MOs$mo) <- c("mo", "character")
|
|
class(MOs.translation$mo_new) <- c("mo", "character")
|
|
|
|
microorganisms <- MOs
|
|
microorganisms.old <- MOs.old
|
|
microorganisms.translation <- MOs.translation
|
|
|
|
# on the server, do:
|
|
usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")
|
|
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
|
|
saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2)
|
|
rm(microorganisms)
|
|
rm(microorganisms.old)
|
|
rm(microorganisms.translation)
|
|
# to save microorganisms.translation internally to the package
|
|
devtools::load_all(".")
|
|
source("data-raw/_internals.R")
|
|
|
|
# load new data sets
|
|
devtools::load_all(".")
|
|
|
|
# reset previously changed mo codes
|
|
rsi_translation$mo <- as.mo(rsi_translation$mo, language = NULL)
|
|
usethis::use_data(rsi_translation, overwrite = TRUE, version = 2)
|
|
rm(rsi_translation)
|
|
|
|
microorganisms.codes$mo <- as.mo(microorganisms.codes$mo, language = NULL)
|
|
usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2)
|
|
rm(microorganisms.codes)
|
|
|
|
example_isolates$mo <- as.mo(example_isolates$mo, language = NULL)
|
|
usethis::use_data(example_isolates, overwrite = TRUE, version = 2)
|
|
rm(example_isolates)
|
|
|
|
intrinsic_resistant$microorganism <- suppressMessages(mo_name(intrinsic_resistant$microorganism))
|
|
usethis::use_data(intrinsic_resistant, overwrite = TRUE, version = 2)
|
|
rm(intrinsic_resistant)
|
|
|
|
# load new data sets again
|
|
devtools::load_all(".")
|
|
source("data-raw/_internals.R")
|
|
devtools::load_all(".")
|
|
|
|
|
|
# Test updates ------------------------------------------------------------
|
|
|
|
# and check: these codes should not be missing (will otherwise throw a unit test error):
|
|
AMR::microorganisms.codes %>% filter(!mo %in% MOs$mo)
|
|
AMR::rsi_translation %>% filter(!mo %in% MOs$mo)
|
|
AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo)
|
|
AMR::example_isolates %>% filter(!mo %in% MOs$mo)
|
|
|
|
# Don't forget to add SNOMED codes! (data-raw/snomed.R)
|
|
|
|
# run the unit tests
|
|
Sys.setenv(NOT_CRAN = "true")
|
|
testthat::test_file("tests/testthat/test-data.R")
|
|
testthat::test_file("tests/testthat/test-mo.R")
|
|
testthat::test_file("tests/testthat/test-mo_property.R")
|