1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-09 01:22:25 +02:00

sort sir history

This commit is contained in:
2023-01-23 15:01:21 +01:00
parent af139a3c82
commit 19fd0ef121
57 changed files with 2864 additions and 2739 deletions

View File

@ -101,46 +101,48 @@ create_species_cons_cops <- function(type = c("CoNS", "CoPS")) {
MO_staph <- AMR::microorganisms
MO_staph <- MO_staph[which(MO_staph$genus == "Staphylococcus"), , drop = FALSE]
if (type == "CoNS") {
MO_staph[which(MO_staph$species %in% c(
"coagulase-negative", "argensis", "arlettae",
"auricularis", "borealis", "caeli", "capitis", "caprae",
"carnosus", "casei", "caseolyticus", "chromogenes", "cohnii", "condimenti",
"croceilyticus",
"debuckii", "devriesei", "edaphicus", "epidermidis",
"equorum", "felis", "fleurettii", "gallinarum",
"haemolyticus", "hominis", "jettensis", "kloosii",
"lentus", "lugdunensis", "massiliensis", "microti",
"muscae", "nepalensis", "pasteuri", "petrasii",
"pettenkoferi", "piscifermentans", "pragensis", "pseudoxylosus",
"pulvereri", "rostri", "saccharolyticus", "saprophyticus",
"sciuri", "simulans", "stepanovicii", "succinus",
"ureilyticus",
"vitulinus", "vitulus", "warneri", "xylosus",
"caledonicus", "canis",
"durrellii", "lloydii",
"ratti", "taiwanensis", "veratri", "urealyticus"
) |
# old, now renamed to S. schleiferi (but still as synonym in our data of course):
(MO_staph$species == "schleiferi" & MO_staph$subspecies %in% c("schleiferi", ""))),
"mo",
drop = TRUE
MO_staph[
which(MO_staph$species %in% c(
"coagulase-negative", "argensis", "arlettae",
"auricularis", "borealis", "caeli", "capitis", "caprae",
"carnosus", "casei", "caseolyticus", "chromogenes", "cohnii", "condimenti",
"croceilyticus",
"debuckii", "devriesei", "edaphicus", "epidermidis",
"equorum", "felis", "fleurettii", "gallinarum",
"haemolyticus", "hominis", "jettensis", "kloosii",
"lentus", "lugdunensis", "massiliensis", "microti",
"muscae", "nepalensis", "pasteuri", "petrasii",
"pettenkoferi", "piscifermentans", "pragensis", "pseudoxylosus",
"pulvereri", "rostri", "saccharolyticus", "saprophyticus",
"sciuri", "simulans", "stepanovicii", "succinus",
"ureilyticus",
"vitulinus", "vitulus", "warneri", "xylosus",
"caledonicus", "canis",
"durrellii", "lloydii",
"ratti", "taiwanensis", "veratri", "urealyticus"
) |
# old, now renamed to S. schleiferi (but still as synonym in our data of course):
(MO_staph$species == "schleiferi" & MO_staph$subspecies %in% c("schleiferi", ""))),
"mo",
drop = TRUE
]
} else if (type == "CoPS") {
MO_staph[which(MO_staph$species %in% c(
"coagulase-positive", "coagulans",
"agnetis", "argenteus",
"cornubiensis",
"delphini", "lutrae",
"hyicus", "intermedius",
"pseudintermedius", "pseudointermedius",
"schweitzeri", "simiae",
"roterodami",
"singaporensis"
) |
# old, now renamed to S. coagulans (but still as synonym in our data of course):
(MO_staph$species == "schleiferi" & MO_staph$subspecies == "coagulans")),
"mo",
drop = TRUE
MO_staph[
which(MO_staph$species %in% c(
"coagulase-positive", "coagulans",
"agnetis", "argenteus",
"cornubiensis",
"delphini", "lutrae",
"hyicus", "intermedius",
"pseudintermedius", "pseudointermedius",
"schweitzeri", "simiae",
"roterodami",
"singaporensis"
) |
# old, now renamed to S. coagulans (but still as synonym in our data of course):
(MO_staph$species == "schleiferi" & MO_staph$subspecies == "coagulans")),
"mo",
drop = TRUE
]
}
}
@ -254,14 +256,15 @@ create_AB_AV_lookup <- function(df) {
}
new_df$generalised_loinc <- lapply(new_df$loinc, generalise_antibiotic_name)
new_df$generalised_all <- unname(lapply(
as.list(as.data.frame(t(new_df[,
c(
colnames(new_df)[colnames(new_df) %in% c("ab", "av", "atc", "cid", "name")],
colnames(new_df)[colnames(new_df) %like% "generalised"]
),
drop = FALSE
]),
stringsAsFactors = FALSE
as.list(as.data.frame(
t(new_df[,
c(
colnames(new_df)[colnames(new_df) %in% c("ab", "av", "atc", "cid", "name")],
colnames(new_df)[colnames(new_df) %like% "generalised"]
),
drop = FALSE
]),
stringsAsFactors = FALSE
)),
function(x) {
x <- generalise_antibiotic_name(unname(unlist(x)))
@ -472,7 +475,7 @@ suppressMessages(devtools::document(quiet = TRUE))
if (!"styler" %in% rownames(utils::installed.packages())) {
message("Package 'styler' not installed!")
} else if (interactive()) {
# # only when sourcing this file ourselves
# only when sourcing this file ourselves
# usethis::ui_info("Styling package")
# styler::style_pkg(
# style = styler::tidyverse_style,

View File

@ -1,4 +1,3 @@
license_text <- readLines("docs/LICENSE-text.html")
license_text <- paste(license_text, collapse = "|||")
license_text <- gsub("licen(s|c)e", "Survey", license_text, ignore.case = TRUE)

View File

@ -66,33 +66,36 @@ read_EUCAST <- function(sheet, file, guideline_name) {
# in the info header in the Excel file, EUCAST mentions which genera are targeted
if (sheet %like% "anaerob.*Gram.*posi") {
sheet <- paste0(c(
"Actinomyces", "Bifidobacterium", "Clostridioides",
"Clostridium", "Cutibacterium", "Eggerthella",
"Eubacterium", "Lactobacillus", "Propionibacterium",
"Staphylococcus saccharolyticus"
),
collapse = "_"
sheet <- paste0(
c(
"Actinomyces", "Bifidobacterium", "Clostridioides",
"Clostridium", "Cutibacterium", "Eggerthella",
"Eubacterium", "Lactobacillus", "Propionibacterium",
"Staphylococcus saccharolyticus"
),
collapse = "_"
)
} else if (sheet %like% "anaerob.*Gram.*nega") {
sheet <- paste0(c(
"Bacteroides",
"Bilophila",
"Fusobacterium",
"Mobiluncus",
"Parabacteroides",
"Porphyromonas",
"Prevotella"
),
collapse = "_"
sheet <- paste0(
c(
"Bacteroides",
"Bilophila",
"Fusobacterium",
"Mobiluncus",
"Parabacteroides",
"Porphyromonas",
"Prevotella"
),
collapse = "_"
)
} else if (sheet == "Streptococcus A,B,C,G") {
sheet <- paste0(microorganisms %>%
filter(genus == "Streptococcus") %>%
mutate(lancefield = mo_name(mo, Lancefield = TRUE)) %>%
filter(lancefield %like% "^Streptococcus group") %>%
pull(fullname),
collapse = "_"
sheet <- paste0(
microorganisms %>%
filter(genus == "Streptococcus") %>%
mutate(lancefield = mo_name(mo, Lancefield = TRUE)) %>%
filter(lancefield %like% "^Streptococcus group") %>%
pull(fullname),
collapse = "_"
)
} else if (sheet %like% "PK.*PD") {
sheet <- "UNKNOWN"

View File

@ -142,14 +142,15 @@ abx2 <- bind_rows(abx_atc1, abx_atc2)
rm(abx_atc1)
rm(abx_atc2)
abx2$ab[is.na(abx2$ab)] <- toupper(abbreviate(gsub(
"[/0-9-]",
" ",
abx2$name[is.na(abx2$ab)]
),
minlength = 3,
method = "left.kept",
strict = TRUE
abx2$ab[is.na(abx2$ab)] <- toupper(abbreviate(
gsub(
"[/0-9-]",
" ",
abx2$name[is.na(abx2$ab)]
),
minlength = 3,
method = "left.kept",
strict = TRUE
))
n_distinct(abx2$ab)
@ -197,24 +198,26 @@ get_CID <- function(ab) {
p$tick()
CID[i] <- tryCatch(
data.table::fread(paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
URLencode(ab[i], reserved = TRUE),
"/cids/TXT?name_type=complete"
),
showProgress = FALSE
data.table::fread(
paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
URLencode(ab[i], reserved = TRUE),
"/cids/TXT?name_type=complete"
),
showProgress = FALSE
)[[1]][1],
error = function(e) NA_integer_
)
if (is.na(CID[i])) {
# try with removing the text in brackets
CID[i] <- tryCatch(
data.table::fread(paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
URLencode(trimws(gsub("[(].*[)]", "", ab[i])), reserved = TRUE),
"/cids/TXT?name_type=complete"
),
showProgress = FALSE
data.table::fread(
paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
URLencode(trimws(gsub("[(].*[)]", "", ab[i])), reserved = TRUE),
"/cids/TXT?name_type=complete"
),
showProgress = FALSE
)[[1]][1],
error = function(e) NA_integer_
)
@ -223,12 +226,13 @@ get_CID <- function(ab) {
# try match on word and take the lowest CID value (sorted)
ab[i] <- gsub("[^a-z0-9]+", " ", ab[i], ignore.case = TRUE)
CID[i] <- tryCatch(
data.table::fread(paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
URLencode(ab[i], reserved = TRUE),
"/cids/TXT?name_type=word"
),
showProgress = FALSE
data.table::fread(
paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
URLencode(ab[i], reserved = TRUE),
"/cids/TXT?name_type=word"
),
showProgress = FALSE
)[[1]][1],
error = function(e) NA_integer_
)
@ -260,13 +264,14 @@ get_synonyms <- function(CID, clean = TRUE) {
}
synonyms_txt <- tryCatch(
data.table::fread(paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastidentity/cid/",
CID[i],
"/synonyms/TXT"
),
sep = "\n",
showProgress = FALSE
data.table::fread(
paste0(
"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastidentity/cid/",
CID[i],
"/synonyms/TXT"
),
sep = "\n",
showProgress = FALSE
)[[1]],
error = function(e) NA_character_
)

View File

@ -106,31 +106,32 @@ antivirals <- antivirals %>%
oral_units,
iv_ddd,
iv_units
) %>%
) %>%
AMR:::dataset_UTF8_to_ASCII()
av_codes <- tibble(name = antivirals$name %>%
strsplit("(, | and )") %>%
unlist() %>%
unique() %>%
sort()) %>%
mutate(av_1st = toupper(abbreviate(name, minlength = 3, use.classes = FALSE))) %>%
strsplit("(, | and )") %>%
unlist() %>%
unique() %>%
sort()) %>%
mutate(av_1st = toupper(abbreviate(name, minlength = 3, use.classes = FALSE))) %>%
filter(!name %in% c("acid", "dipivoxil", "disoproxil", "marboxil", "alafenamide"))
replace_with_av_code <- function(name) {
unname(av_codes$av_1st[match(name, av_codes$name)])
}
names_codes <- antivirals %>%
names_codes <- antivirals %>%
separate(name,
into = paste0("name", c(1:7)),
sep = "(, | and )",
remove = FALSE,
fill = "right") %>%
into = paste0("name", c(1:7)),
sep = "(, | and )",
remove = FALSE,
fill = "right"
) %>%
# remove empty columns
select(!where(function(x) all(is.na(x)))) %>%
mutate_at(vars(matches("name[1-9]")), replace_with_av_code) %>%
unite(av, matches("name[1-9]"), sep = "+", na.rm = TRUE) %>%
select(!where(function(x) all(is.na(x)))) %>%
mutate_at(vars(matches("name[1-9]")), replace_with_av_code) %>%
unite(av, matches("name[1-9]"), sep = "+", na.rm = TRUE) %>%
mutate(name = gsub("(, | and )", "/", name))
substr(names_codes$name, 1, 1) <- toupper(substr(names_codes$name, 1, 1))
@ -143,8 +144,9 @@ antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()
# add loinc, see 'data-raw/loinc.R'
loinc_df <- read.csv("data-raw/Loinc.csv",
row.names = NULL,
stringsAsFactors = FALSE)
row.names = NULL,
stringsAsFactors = FALSE
)
loinc_df <- loinc_df %>% filter(CLASS == "DRUG/TOX")
av_names <- antivirals %>%

View File

@ -173,7 +173,7 @@ dosage_new <- bind_rows(
as.data.frame(stringsAsFactors = FALSE)
rownames(dosage_new) <- NULL
dosage <- bind_rows(dosage_new, AMR::dosage) %>%
dosage <- bind_rows(dosage_new, AMR::dosage) %>%
dataset_UTF8_to_ASCII()
usethis::use_data(dosage, internal = FALSE, overwrite = TRUE, version = 2)

View File

@ -37,10 +37,10 @@
# CSV file (~12.5 MB) as "taxonomy.csv". Their API unfortunately does
# not include the full taxonomy and is currently (2022) pretty worthless.
# 3. For data about human pathogens, we use Bartlett et al. (2022),
# https://doi.org/10.1099/mic.0.001269. Their latest supplementary material
# https://doi.org/10.1099/mic.0.001269. Their latest supplementary material
# can be found here: https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens.
#. Download their latest xlsx file in the `data` folder and save it to our
#. `data-raw` folder.
# Download their latest xlsx file in the `data` folder and save it to our
# `data-raw` folder.
# 4. Set this folder_location to the path where these two files are:
folder_location <- "~/Downloads/backbone/"
file_gbif <- paste0(folder_location, "Taxon.tsv")
@ -65,7 +65,7 @@ devtools::load_all(".") # load AMR package
get_author_year <- function(ref) {
# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT")
authors2 <- gsub(" ?\\(Approved Lists [0-9]+\\) ?", " () ", authors2)
authors2 <- gsub(" [)(]+ $", "", authors2)
@ -73,21 +73,21 @@ get_author_year <- function(ref) {
authors2 <- trimws(gsub("^[(](.*)[)]$", "\\1", authors2))
# only take part after brackets if there's a name
authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
gsub(".*[)] (.*)", "\\1", authors2),
authors2
gsub(".*[)] (.*)", "\\1", authors2),
authors2
)
# replace parentheses with emend. to get the latest authors
authors2 <- gsub("(", " emend. ", authors2, fixed = TRUE)
authors2 <- gsub(")", "", authors2, fixed = TRUE)
authors2 <- gsub(" +", " ", authors2)
authors2 <- trimws(authors2)
# get year from last 4 digits
lastyear <- as.integer(gsub(".*([0-9]{4})$", "\\1", authors2))
# can never be later than now
lastyear <- ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
NA,
lastyear
NA,
lastyear
)
# get authors without last year
authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2)
@ -119,8 +119,8 @@ get_author_year <- function(ref) {
authors[nchar(authors) <= 3] <- ""
# combine author and year if year is available
ref <- ifelse(!is.na(lastyear),
paste0(authors, ", ", lastyear),
authors
paste0(authors, ", ", lastyear),
authors
)
# fix beginning and ending
ref <- gsub(", $", "", ref)
@ -128,7 +128,7 @@ get_author_year <- function(ref) {
ref <- gsub("^(emend|et al.,?)", "", ref)
ref <- trimws(ref)
ref <- gsub("'", "", ref)
# a lot start with a lowercase character - fix that
ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE)
# specific one for the French that are named dOrbigny
@ -222,9 +222,9 @@ include_fungal_orders <- c(
# get latest taxonomic names of these fungal orders
include_fungal_orders_ids <- taxonomy_gbif.bak %>%
filter(order %in% include_fungal_orders)
include_fungal_orders <- taxonomy_gbif.bak %>%
filter(taxonID %in% c(include_fungal_orders_ids$taxonID, include_fungal_orders_ids$acceptedNameUsageID)) %>%
distinct(order) %>%
include_fungal_orders <- taxonomy_gbif.bak %>%
filter(taxonID %in% c(include_fungal_orders_ids$taxonID, include_fungal_orders_ids$acceptedNameUsageID)) %>%
distinct(order) %>%
pull(order)
# check some columns to validate below filters
@ -361,7 +361,7 @@ for (page in LETTERS) {
names <- names[ranks != "species"]
ranks <- ranks[ranks != "species"]
ranks[ranks == "domain"] <- "kingdom"
df <- names %>%
tibble() %>%
t() %>%
@ -369,7 +369,7 @@ for (page in LETTERS) {
setNames(ranks) %>%
# no candidates please
filter(genus %unlike% "^(Candidatus|\\[)")
taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>%
bind_rows(df)
}
@ -491,14 +491,14 @@ saveRDS(taxonomy_lpsn, "data-raw/taxonomy_lpsn.rds", version = 2)
taxonomy_gbif <- taxonomy_gbif %>%
# clean NAs and add fullname
mutate(across(kingdom:subspecies, function(x) ifelse(is.na(x), "", x)),
fullname = trimws(case_when(
rank == "family" ~ family,
rank == "order" ~ order,
rank == "class" ~ class,
rank == "phylum" ~ phylum,
rank == "kingdom" ~ kingdom,
TRUE ~ paste(genus, species, subspecies)
)), .before = 1
fullname = trimws(case_when(
rank == "family" ~ family,
rank == "order" ~ order,
rank == "class" ~ class,
rank == "phylum" ~ phylum,
rank == "kingdom" ~ kingdom,
TRUE ~ paste(genus, species, subspecies)
)), .before = 1
) %>%
# keep only one GBIF taxon ID per full name
arrange(fullname, gbif) %>%
@ -507,14 +507,14 @@ taxonomy_gbif <- taxonomy_gbif %>%
taxonomy_lpsn <- taxonomy_lpsn %>%
# clean NAs and add fullname
mutate(across(kingdom:subspecies, function(x) ifelse(is.na(x), "", x)),
fullname = trimws(case_when(
rank == "family" ~ family,
rank == "order" ~ order,
rank == "class" ~ class,
rank == "phylum" ~ phylum,
rank == "kingdom" ~ kingdom,
TRUE ~ paste(genus, species, subspecies)
)), .before = 1
fullname = trimws(case_when(
rank == "family" ~ family,
rank == "order" ~ order,
rank == "class" ~ class,
rank == "phylum" ~ phylum,
rank == "kingdom" ~ kingdom,
TRUE ~ paste(genus, species, subspecies)
)), .before = 1
) %>%
# keep only one LPSN record ID per full name
arrange(fullname, lpsn) %>%
@ -536,23 +536,25 @@ taxonomy_lpsn$lpsn_parent[taxonomy_lpsn$rank == "subspecies"] <- taxonomy_lpsn$l
taxonomy <- taxonomy_lpsn %>%
# join GBIF identifiers to them
left_join(taxonomy_gbif %>% select(kingdom, fullname, starts_with("gbif")),
by = c("kingdom", "fullname")
by = c("kingdom", "fullname")
)
# for everything else, add the GBIF data
taxonomy <- taxonomy %>%
bind_rows(taxonomy_gbif %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname))) %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname))) %>%
arrange(fullname) %>%
filter(fullname != "")
# get missing entries from existing microorganisms data set
taxonomy <- taxonomy %>%
bind_rows(AMR::microorganisms %>%
select(all_of(colnames(taxonomy))) %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
# these will be added later:
source != "manually added")) %>%
select(all_of(colnames(taxonomy))) %>%
filter(
!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
# these will be added later:
source != "manually added"
)) %>%
arrange(fullname) %>%
filter(fullname != "")
@ -602,9 +604,10 @@ taxonomy <- taxonomy %>%
source = "manually added"
) %>%
filter(!paste(kingdom, rank) %in% paste(taxonomy$kingdom, taxonomy$rank)) %>%
left_join(current_gbif %>%
select(kingdom, rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank")
left_join(
current_gbif %>%
select(kingdom, rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank")
) %>%
mutate(source = ifelse(!is.na(gbif), "GBIF", source))
)
@ -625,17 +628,18 @@ for (i in 2:6) {
source = "manually added"
) %>%
filter(!paste(kingdom, .[[ncol(.) - 4]], rank) %in% paste(taxonomy$kingdom, taxonomy[[i + 1]], taxonomy$rank)) %>%
# get GBIF identifier where available
left_join(current_gbif %>%
select(kingdom, all_of(i_name), rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank", i_name)
) %>%
mutate(source = ifelse(!is.na(gbif), "GBIF", source))
# get GBIF identifier where available
left_join(
current_gbif %>%
select(kingdom, all_of(i_name), rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank", i_name)
) %>%
mutate(source = ifelse(!is.na(gbif), "GBIF", source))
message("n = ", nrow(to_add))
if (is.null(taxonomy_all_missing)) {
taxonomy_all_missing <- to_add
} else {
taxonomy_all_missing <- taxonomy_all_missing %>%
taxonomy_all_missing <- taxonomy_all_missing %>%
bind_rows(to_add)
}
}
@ -645,20 +649,24 @@ taxonomy <- taxonomy %>%
bind_rows(taxonomy_all_missing)
# fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class)
taxonomy <- taxonomy %>%
mutate(rank_index = case_when(rank == "subspecies" ~ 1,
rank == "species" ~ 2,
rank == "genus" ~ 3,
rank == "family" ~ 4,
rank == "order" ~ 5,
rank == "class" ~ 6,
TRUE ~ 7),
fullname_rank = paste0(fullname, " {", rank, "}")) %>%
arrange(kingdom, fullname, rank_index) %>%
group_by(kingdom, fullname) %>%
mutate(fullname = if_else(row_number() > 1, fullname_rank, fullname)) %>%
ungroup() %>%
select(-fullname_rank, -rank_index) %>%
taxonomy <- taxonomy %>%
mutate(
rank_index = case_when(
rank == "subspecies" ~ 1,
rank == "species" ~ 2,
rank == "genus" ~ 3,
rank == "family" ~ 4,
rank == "order" ~ 5,
rank == "class" ~ 6,
TRUE ~ 7
),
fullname_rank = paste0(fullname, " {", rank, "}")
) %>%
arrange(kingdom, fullname, rank_index) %>%
group_by(kingdom, fullname) %>%
mutate(fullname = if_else(row_number() > 1, fullname_rank, fullname)) %>%
ungroup() %>%
select(-fullname_rank, -rank_index) %>%
arrange(fullname)
# now also add missing species (requires combination with genus)
@ -676,12 +684,13 @@ taxonomy <- taxonomy %>%
) %>%
filter(!paste(kingdom, genus, species, rank) %in% paste(taxonomy$kingdom, taxonomy$genus, taxonomy$species, taxonomy$rank)) %>%
# get GBIF identifier where available
left_join(current_gbif %>%
select(kingdom, genus, species = specificEpithet, rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank", "genus", "species")
left_join(
current_gbif %>%
select(kingdom, genus, species = specificEpithet, rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank", "genus", "species")
) %>%
mutate(source = ifelse(!is.na(gbif), "GBIF", source))
)
)
# remove NAs from taxonomy again, and keep unique full names
@ -702,7 +711,7 @@ manually_added <- AMR::microorganisms %>%
filter(source == "manually added", !paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>%
select(fullname:subspecies, ref, source, rank)
manually_added <- manually_added %>%
manually_added <- manually_added %>%
bind_rows(salmonellae)
# get latest taxonomy for those entries
@ -805,76 +814,83 @@ taxonomy <- taxonomy %>%
pathogens <- read_excel(file_bartlett, sheet = "Tab 6 Full List")
# get all established, both old and current taxonomic names
established <- pathogens %>%
filter(status == "established") %>%
established <- pathogens %>%
filter(status == "established") %>%
mutate(fullname = paste(genus, species)) %>%
pull(fullname) %>%
c(unlist(mo_current(.)),
unlist(mo_synonyms(., keep_synonyms = FALSE))) %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) ifelse(length(x) == 1, x, paste(x[1], x[2]))) %>%
sort() %>%
pull(fullname) %>%
c(
unlist(mo_current(.)),
unlist(mo_synonyms(., keep_synonyms = FALSE))
) %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) ifelse(length(x) == 1, x, paste(x[1], x[2]))) %>%
sort() %>%
unique()
# get all putative, both old and current taxonomic names
putative <- pathogens %>%
filter(status == "putative") %>%
putative <- pathogens %>%
filter(status == "putative") %>%
mutate(fullname = paste(genus, species)) %>%
pull(fullname) %>%
c(unlist(mo_current(.)),
unlist(mo_synonyms(., keep_synonyms = FALSE))) %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) ifelse(length(x) == 1, x, paste(x[1], x[2]))) %>%
sort() %>%
pull(fullname) %>%
c(
unlist(mo_current(.)),
unlist(mo_synonyms(., keep_synonyms = FALSE))
) %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) ifelse(length(x) == 1, x, paste(x[1], x[2]))) %>%
sort() %>%
unique()
established <- established[established %unlike% "unknown"]
putative <- putative[putative %unlike% "unknown"]
established_genera <- established %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) x[1]) %>%
sort() %>%
established_genera <- established %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) x[1]) %>%
sort() %>%
unique()
putative_genera <- putative %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) x[1]) %>%
sort() %>%
putative_genera <- putative %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) x[1]) %>%
sort() %>%
unique()
nonbacterial_genera <- AMR:::MO_PREVALENT_GENERA %>%
c(unlist(mo_current(.)),
unlist(mo_synonyms(., keep_synonyms = FALSE))) %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) x[1]) %>%
sort() %>%
nonbacterial_genera <- AMR:::MO_PREVALENT_GENERA %>%
c(
unlist(mo_current(.)),
unlist(mo_synonyms(., keep_synonyms = FALSE))
) %>%
strsplit(" ", fixed = TRUE) %>%
sapply(function(x) x[1]) %>%
sort() %>%
unique()
nonbacterial_genera <- nonbacterial_genera[nonbacterial_genera %unlike% "unknown"]
# update prevalence based on taxonomy (following the recent and thorough work of Bartlett et al., 2022)
# see https://doi.org/10.1099/mic.0.001269
taxonomy <- taxonomy %>%
taxonomy <- taxonomy %>%
mutate(prevalence = case_when(
# 'established' means 'have infected at least three persons in three or more references'
paste(genus, species) %in% established & rank %in% c("species", "subspecies") ~ 1.0,
# other genera in the 'established' group
genus %in% established_genera & rank == "genus" ~ 1.0,
# 'putative' means 'fewer than three known cases'
paste(genus, species) %in% putative & rank %in% c("species", "subspecies") ~ 1.25,
# other genera in the 'putative' group
genus %in% putative_genera & rank == "genus" ~ 1.25,
# species and subspecies in 'established' and 'putative' groups
genus %in% c(established_genera, putative_genera) & rank %in% c("species", "subspecies") ~ 1.5,
# other species from a genus in either group
genus %in% nonbacterial_genera & rank %in% c("genus", "species", "subspecies") ~ 1.5,
# we also keep track of prevalent genera of non-bacterial species
genus %in% AMR:::MO_PREVALENT_GENERA & kingdom != "Bacteria" & rank %in% c("genus", "species", "subspecies") ~ 1.5,
# all others
TRUE ~ 2.0))
TRUE ~ 2.0
))
table(taxonomy$prevalence, useNA = "always")
# (a lot will be removed further below)
@ -909,13 +925,14 @@ mo_kingdom <- taxonomy %>%
mo_phylum <- taxonomy %>%
filter(rank == "phylum") %>%
distinct(kingdom, phylum) %>%
left_join(AMR::microorganisms %>%
filter(rank == "phylum") %>%
transmute(kingdom,
phylum = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "phylum")
left_join(
AMR::microorganisms %>%
filter(rank == "phylum") %>%
transmute(kingdom,
phylum = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "phylum")
) %>%
group_by(kingdom) %>%
mutate(
@ -935,13 +952,14 @@ mo_phylum <- mo_phylum %>%
mo_class <- taxonomy %>%
filter(rank == "class") %>%
distinct(kingdom, class) %>%
left_join(AMR::microorganisms %>%
filter(rank == "class") %>%
transmute(kingdom,
class = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "class")
left_join(
AMR::microorganisms %>%
filter(rank == "class") %>%
transmute(kingdom,
class = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "class")
) %>%
group_by(kingdom) %>%
mutate(
@ -961,13 +979,14 @@ mo_class <- mo_class %>%
mo_order <- taxonomy %>%
filter(rank == "order") %>%
distinct(kingdom, order) %>%
left_join(AMR::microorganisms %>%
filter(rank == "order") %>%
transmute(kingdom,
order = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "order")
left_join(
AMR::microorganisms %>%
filter(rank == "order") %>%
transmute(kingdom,
order = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "order")
) %>%
group_by(kingdom) %>%
mutate(
@ -987,13 +1006,14 @@ mo_order <- mo_order %>%
mo_family <- taxonomy %>%
filter(rank == "family") %>%
distinct(kingdom, family) %>%
left_join(AMR::microorganisms %>%
filter(rank == "family") %>%
transmute(kingdom,
family = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "family")
left_join(
AMR::microorganisms %>%
filter(rank == "family") %>%
transmute(kingdom,
family = fullname,
mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo))
),
by = c("kingdom", "family")
) %>%
group_by(kingdom) %>%
mutate(
@ -1014,11 +1034,12 @@ mo_genus <- taxonomy %>%
filter(rank == "genus") %>%
distinct(kingdom, genus) %>%
# get available old MO codes
left_join(AMR::microorganisms %>%
filter(rank == "genus") %>%
transmute(mo_genus_old = gsub("^[A-Z]+_", "", as.character(mo)), kingdom, genus) %>%
distinct(kingdom, genus, .keep_all = TRUE),
by = c("kingdom", "genus")
left_join(
AMR::microorganisms %>%
filter(rank == "genus") %>%
transmute(mo_genus_old = gsub("^[A-Z]+_", "", as.character(mo)), kingdom, genus) %>%
distinct(kingdom, genus, .keep_all = TRUE),
by = c("kingdom", "genus")
) %>%
distinct(kingdom, genus, .keep_all = TRUE) %>%
# since kingdom is part of the code, genus abbreviations may be duplicated between kingdoms
@ -1060,12 +1081,13 @@ mo_genus <- mo_genus %>%
mo_species <- taxonomy %>%
filter(rank == "species") %>%
distinct(kingdom, genus, species) %>%
left_join(AMR::microorganisms %>%
filter(rank == "species") %>%
transmute(mo_species_old = gsub("^[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species) %>%
filter(mo_species_old %unlike% "-") %>%
distinct(kingdom, genus, species, .keep_all = TRUE),
by = c("kingdom", "genus", "species")
left_join(
AMR::microorganisms %>%
filter(rank == "species") %>%
transmute(mo_species_old = gsub("^[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species) %>%
filter(mo_species_old %unlike% "-") %>%
distinct(kingdom, genus, species, .keep_all = TRUE),
by = c("kingdom", "genus", "species")
) %>%
distinct(kingdom, genus, species, .keep_all = TRUE) %>%
group_by(kingdom, genus) %>%
@ -1108,12 +1130,13 @@ mo_species <- mo_species %>%
mo_subspecies <- taxonomy %>%
filter(rank == "subspecies") %>%
distinct(kingdom, genus, species, subspecies) %>%
left_join(AMR::microorganisms %>%
filter(rank %in% c("subspecies", "subsp.", "infraspecies")) %>%
transmute(mo_subspecies_old = gsub("^[A-Z]+_[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species, subspecies) %>%
filter(mo_subspecies_old %unlike% "-") %>%
distinct(kingdom, genus, species, subspecies, .keep_all = TRUE),
by = c("kingdom", "genus", "species", "subspecies")
left_join(
AMR::microorganisms %>%
filter(rank %in% c("subspecies", "subsp.", "infraspecies")) %>%
transmute(mo_subspecies_old = gsub("^[A-Z]+_[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species, subspecies) %>%
filter(mo_subspecies_old %unlike% "-") %>%
distinct(kingdom, genus, species, subspecies, .keep_all = TRUE),
by = c("kingdom", "genus", "species", "subspecies")
) %>%
distinct(kingdom, genus, species, subspecies, .keep_all = TRUE) %>%
group_by(kingdom, genus, species) %>%
@ -1187,20 +1210,26 @@ taxonomy <- taxonomy %>%
arrange(fullname)
# now check these - e.g. Nitrospira is the name of a genus AND its class
taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE]) %>% View()
taxonomy %>%
filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE]) %>%
View()
taxonomy <- taxonomy %>%
mutate(rank_index = case_when(kingdom == "Bacteria" ~ 1,
kingdom == "Fungi" ~ 2,
kingdom == "Protozoa" ~ 3,
kingdom == "Archaea" ~ 4,
TRUE ~ 5)) %>%
arrange(fullname, rank_index) %>%
distinct(fullname, .keep_all = TRUE) %>%
select(-rank_index) %>%
mutate(rank_index = case_when(
kingdom == "Bacteria" ~ 1,
kingdom == "Fungi" ~ 2,
kingdom == "Protozoa" ~ 3,
kingdom == "Archaea" ~ 4,
TRUE ~ 5
)) %>%
arrange(fullname, rank_index) %>%
distinct(fullname, .keep_all = TRUE) %>%
select(-rank_index) %>%
filter(mo != "")
# this must not exist:
taxonomy %>% filter(mo %like% "__") %>% View()
taxonomy %>%
filter(mo %like% "__") %>%
View()
taxonomy <- taxonomy %>% filter(mo %unlike% "__")
@ -1214,14 +1243,20 @@ taxonomy <- taxonomy %>% distinct(mo, .keep_all = TRUE)
taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE])
# are all GBIFs available?
taxonomy %>% filter(!gbif_parent %in% gbif) %>% count(rank)
taxonomy %>%
filter(!gbif_parent %in% gbif) %>%
count(rank)
# try to find the right gbif IDs
taxonomy$gbif_parent[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "species")] <- taxonomy$gbif[match(taxonomy$genus[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "species")], taxonomy$genus)]
taxonomy$gbif_parent[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "class")] <- taxonomy$gbif[match(taxonomy$phylum[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "class")], taxonomy$phylum)]
taxonomy %>% filter(!gbif_parent %in% gbif) %>% count(rank)
taxonomy %>%
filter(!gbif_parent %in% gbif) %>%
count(rank)
# are all LPSNs available?
taxonomy %>% filter(!lpsn_parent %in% lpsn) %>% count(rank)
taxonomy %>%
filter(!lpsn_parent %in% lpsn) %>%
count(rank)
# make GBIF refer to newest renaming according to LPSN
taxonomy$gbif_renamed_to[which(!is.na(taxonomy$gbif_renamed_to) & !is.na(taxonomy$lpsn_renamed_to))] <- taxonomy$gbif[match(taxonomy$lpsn_renamed_to[which(!is.na(taxonomy$gbif_renamed_to) & !is.na(taxonomy$lpsn_renamed_to))], taxonomy$lpsn)]
@ -1251,21 +1286,33 @@ taxonomy <- taxonomy %>%
# no ghost families, orders, classes, phyla
taxonomy <- taxonomy %>%
group_by(kingdom, family) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, order) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, class) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, phylum) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, family) %>%
filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, order) %>%
filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, class) %>%
filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
group_by(kingdom, phylum) %>%
filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>%
ungroup()
message("\nCongratulations! The new taxonomic table will contain ", format(nrow(taxonomy), big.mark = ","), " rows.\n",
"This was ", format(nrow(AMR::microorganisms), big.mark = ","), " rows.\n")
message(
"\nCongratulations! The new taxonomic table will contain ", format(nrow(taxonomy), big.mark = ","), " rows.\n",
"This was ", format(nrow(AMR::microorganisms), big.mark = ","), " rows.\n"
)
# these are the new ones:
taxonomy %>% filter(!paste(kingdom, fullname) %in% paste(AMR::microorganisms$kingdom, AMR::microorganisms$fullname)) %>% View()
taxonomy %>%
filter(!paste(kingdom, fullname) %in% paste(AMR::microorganisms$kingdom, AMR::microorganisms$fullname)) %>%
View()
# these were removed:
AMR::microorganisms %>% filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>% View()
AMR::microorganisms %>% filter(!fullname %in% taxonomy$fullname) %>% View()
AMR::microorganisms %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>%
View()
AMR::microorganisms %>%
filter(!fullname %in% taxonomy$fullname) %>%
View()
# Add SNOMED CT -----------------------------------------------------------

File diff suppressed because it is too large Load Diff