1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-13 06:42:04 +02:00

microorganisms update, added Salmonella groups

This commit is contained in:
2022-12-16 16:10:43 +01:00
parent 8da2467209
commit 5f3a7694aa
43 changed files with 38933 additions and 105358 deletions

View File

@ -50,8 +50,7 @@ library(dplyr)
library(vroom) # to import files
library(rvest) # to scape LPSN website
library(progress) # to show progress bars
library(AMR)
# also requires 'rvest' and 'progress' packages
devtools::load_all(".") # load AMR package
# Helper functions --------------------------------------------------------
@ -538,6 +537,16 @@ taxonomy <- taxonomy %>%
arrange(fullname) %>%
filter(fullname != "")
# get missing entries from existing microorganisms data set
taxonomy <- taxonomy %>%
bind_rows(AMR::microorganisms %>%
select(all_of(colnames(taxonomy))) %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
# these will be added later:
source != "manually added")) %>%
arrange(fullname) %>%
filter(fullname != "")
# fix rank
table(taxonomy$rank, useNA = "always")
taxonomy <- taxonomy %>%
@ -554,26 +563,13 @@ taxonomy <- taxonomy %>%
))
table(taxonomy$rank, useNA = "always")
# get the latest upper taxonomy from LPSN to update the GBIF data
# (e.g., phylum above class "Bacilli" was still "Firmicutes", should be "Bacillota" in 2022)
for (k in unique(taxonomy$kingdom[taxonomy$kingdom != ""])) {
message("Fixing GBIF taxonomy for kingdom ", k, "...")
for (g in unique(taxonomy$genus[taxonomy$genus != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k)] <- taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (f in unique(taxonomy$family[taxonomy$family != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k)] <- taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (o in unique(taxonomy$order[taxonomy$order != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k)] <- taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (cc in unique(taxonomy$class[taxonomy$class != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k)] <- taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
}
# Save intermediate results (0) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy0.rds")
# Add missing taxonomic entries -------------------------------------------
# Add missing and fix old taxonomic entries -------------------------------
# this part will make sure that the whole taxonomy of every included species exists, so no missing genera, classes, etc.
@ -639,19 +635,6 @@ taxonomy_all_missing %>% View()
taxonomy <- taxonomy %>%
bind_rows(taxonomy_all_missing)
# we need to fix parent GBIF identifiers
taxonomy$gbif_parent[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$kingdom[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "class" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$phylum[taxonomy$rank == "class" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "order" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$class[taxonomy$rank == "order" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "family" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$order[taxonomy$rank == "family" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$family[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "species" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$genus[taxonomy$rank == "species" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(paste(taxonomy$genus[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)], taxonomy$species[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)]), taxonomy$fullname)]
# these still have no record in our data set:
all(taxonomy$lpsn_parent %in% taxonomy$lpsn)
all(taxonomy$gbif_parent %in% taxonomy$gbif)
# fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class)
taxonomy <- taxonomy %>%
mutate(rank_index = case_when(rank == "subspecies" ~ 1,
@ -666,7 +649,7 @@ taxonomy <- taxonomy %>%
group_by(kingdom, fullname) %>%
mutate(fullname = if_else(row_number() > 1, fullname_rank, fullname)) %>%
ungroup() %>%
select(-fullname_rank) %>%
select(-fullname_rank, -rank_index) %>%
arrange(fullname)
# now also add missing species (requires combination with genus)
@ -699,7 +682,7 @@ taxonomy <- taxonomy %>%
filter(kingdom != "")
# Save intermediate results -----------------------------------------------
# Save intermediate results (1) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy1.rds")
@ -710,6 +693,9 @@ manually_added <- AMR::microorganisms %>%
filter(source == "manually added", !paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>%
select(fullname:subspecies, ref, source, rank)
manually_added <- manually_added %>%
bind_rows(salmonellae)
# get latest taxonomy for those entries
for (g in unique(manually_added$genus[manually_added$genus != "" & manually_added$genus %in% taxonomy$genus])) {
manually_added$family[which(manually_added$genus == g)] <- taxonomy$family[which(taxonomy$genus == g & is.na(taxonomy$lpsn))][1]
@ -723,14 +709,19 @@ for (o in unique(manually_added$order[manually_added$order != "" & manually_adde
for (cc in unique(manually_added$class[manually_added$class != "" & manually_added$class %in% taxonomy$class])) {
manually_added$phylum[which(manually_added$class == cc)] <- taxonomy$phylum[which(taxonomy$class == cc & is.na(taxonomy$lpsn))][1]
}
for (p in unique(manually_added$phylum[manually_added$phylum != "" & manually_added$phylum %in% taxonomy$phylum])) {
manually_added$kingdom[which(manually_added$phylum == p)] <- taxonomy$kingdom[which(taxonomy$phylum == p & is.na(taxonomy$lpsn))][1]
}
manually_added <- manually_added %>%
mutate(
status = "accepted",
rank = ifelse(fullname %like% "unknown", "(unknown rank)", rank)
)
manually_added
taxonomy <- taxonomy %>%
# here also the 'unknowns' are added, such as "(unknown fungus)"
bind_rows(manually_added) %>%
arrange(fullname)
@ -743,6 +734,49 @@ taxonomy <- taxonomy %>%
mutate(ref = get_author_year(ref))
# Get the latest upper taxonomy from LPSN for non-LPSN data ---------------
# (e.g., phylum above class "Bacilli" was still "Firmicutes", should be "Bacillota" in 2022)
for (k in unique(taxonomy$kingdom[taxonomy$kingdom != ""])) {
message("Fixing GBIF taxonomy for kingdom ", k, ".", appendLF = FALSE)
i <- 0
for (g in unique(taxonomy$genus[taxonomy$genus != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k)] <- taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (f in unique(taxonomy$family[taxonomy$family != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k)] <- taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (o in unique(taxonomy$order[taxonomy$order != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k)] <- taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (cc in unique(taxonomy$class[taxonomy$class != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k)] <- taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
message("OK.")
}
# we need to fix parent GBIF identifiers
taxonomy$gbif_parent[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$kingdom[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "class" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$phylum[taxonomy$rank == "class" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "order" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$class[taxonomy$rank == "order" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "family" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$order[taxonomy$rank == "family" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$family[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "species" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$genus[taxonomy$rank == "species" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(paste(taxonomy$genus[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)], taxonomy$species[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)]), taxonomy$fullname)]
# these still have no record in our data set:
all(taxonomy$lpsn_parent %in% taxonomy$lpsn)
all(taxonomy$gbif_parent %in% taxonomy$gbif)
# Add prevalence ----------------------------------------------------------
# update prevalence based on taxonomy (our own JSS paper: Berends MS et al. (2022), DOI 10.18637/jss.v104.i03)
@ -768,43 +802,6 @@ taxonomy <- taxonomy %>%
table(taxonomy$prevalence, useNA = "always")
# (a lot will be removed further below)
# Add old entries that must be kept ---------------------------------------
# these are bacteria now removed, but have an renamed-to identifier to a current record, so add them
old_to_keep <- microorganisms %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
kingdom == "Bacteria",
(gbif_renamed_to %in% taxonomy$gbif & !is.na(gbif_renamed_to)) | (lpsn_renamed_to %in% taxonomy$lpsn & !is.na(lpsn_renamed_to)),
rank %in% c("genus", "species", "subspecies")) %>%
select(-mo, -snomed)
taxonomy <- taxonomy %>%
bind_rows(old_to_keep) %>%
arrange(fullname)
# and these had prevalence = 1, why do they miss now?
old_to_keep2 <- microorganisms %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
prevalence == 1,
!is.na(gbif_parent) & gbif_parent %in% taxonomy$gbif,
rank %in% c("genus", "species", "subspecies")) %>%
select(-mo, -snomed)
taxonomy <- taxonomy %>%
bind_rows(old_to_keep2) %>%
arrange(fullname)
# strangly, Trichomonas is no longer in GBIF?
old_to_keep3 <- microorganisms %>%
filter(fullname %like% "^trichomona") %>%
select(-mo, -snomed)
taxonomy <- taxonomy %>%
filter(!fullname %in% old_to_keep3$fullname) %>%
bind_rows(old_to_keep3) %>%
arrange(fullname)
# fix rank
taxonomy <- taxonomy %>%
mutate(rank = case_when(
@ -820,6 +817,11 @@ taxonomy <- taxonomy %>%
))
# Save intermediate results (2) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy2.rds")
# Add microbial IDs -------------------------------------------------------
# MO codes in the AMR package have the form KINGDOM_GENUS_SPECIES_SUBSPECIES where all are abbreviated.
@ -995,7 +997,7 @@ mo_genus <- mo_genus %>%
mo_species <- taxonomy %>%
filter(rank == "species") %>%
distinct(kingdom, genus, species) %>%
left_join(microorganisms %>%
left_join(AMR::microorganisms %>%
filter(rank == "species") %>%
transmute(mo_species_old = gsub("^[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species) %>%
filter(mo_species_old %unlike% "-") %>%
@ -1043,7 +1045,7 @@ mo_species <- mo_species %>%
mo_subspecies <- taxonomy %>%
filter(rank == "subspecies") %>%
distinct(kingdom, genus, species, subspecies) %>%
left_join(microorganisms %>%
left_join(AMR::microorganisms %>%
filter(rank %in% c("subspecies", "subsp.", "infraspecies")) %>%
transmute(mo_subspecies_old = gsub("^[A-Z]+_[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species, subspecies) %>%
filter(mo_subspecies_old %unlike% "-") %>%
@ -1138,13 +1140,31 @@ taxonomy <- taxonomy %>%
taxonomy %>% filter(mo %like% "__") %>% View()
taxonomy <- taxonomy %>% filter(mo %unlike% "__")
# this must be empty of course
taxonomy %>% filter(mo %in% .[duplicated(mo), "mo", drop = TRUE]) %>% View()
# Some integrity checks ---------------------------------------------------
# are mo codes unique?
taxonomy %>% filter(mo %in% .[duplicated(mo), "mo", drop = TRUE])
taxonomy <- taxonomy %>% distinct(mo, .keep_all = TRUE)
# Save intermediate results -----------------------------------------------
# are fullnames unique?
taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE])
saveRDS(taxonomy, "data-raw/taxonomy2.rds")
# are all GBIFs available?
taxonomy %>% filter(!gbif_parent %in% gbif) %>% count(rank)
# try to find the right gbif IDs
taxonomy$gbif_parent[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "species")] <- taxonomy$gbif[match(taxonomy$genus[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "species")], taxonomy$genus)]
taxonomy$gbif_parent[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "class")] <- taxonomy$gbif[match(taxonomy$phylum[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "class")], taxonomy$phylum)]
taxonomy %>% filter(!gbif_parent %in% gbif) %>% count(rank)
# are all LPSNs available?
taxonomy %>% filter(!lpsn_parent %in% lpsn) %>% count(rank)
# make GBIF refer to newest renaming according to LPSN
taxonomy$gbif_renamed_to[which(!is.na(taxonomy$gbif_renamed_to) & !is.na(taxonomy$lpsn_renamed_to))] <- taxonomy$gbif[match(taxonomy$lpsn_renamed_to[which(!is.na(taxonomy$gbif_renamed_to) & !is.na(taxonomy$lpsn_renamed_to))], taxonomy$lpsn)]
# Save intermediate results (3) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy3.rds")
# Remove unwanted taxonomic entries from Protoza/Fungi --------------------
@ -1176,13 +1196,13 @@ taxonomy <- taxonomy %>%
message("\nCongratulations! The new taxonomic table will contain ", format(nrow(taxonomy), big.mark = ","), " rows.\n",
"This was ", format(nrow(microorganisms), big.mark = ","), " rows.\n")
"This was ", format(nrow(AMR::microorganisms), big.mark = ","), " rows.\n")
# these are the new ones:
taxonomy %>% filter(!paste(kingdom, fullname) %in% paste(microorganisms$kingdom, microorganisms$fullname)) %>% View()
taxonomy %>% filter(!paste(kingdom, fullname) %in% paste(AMR::microorganisms$kingdom, AMR::microorganisms$fullname)) %>% View()
# these were removed:
microorganisms %>% filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>% View()
microorganisms %>% filter(!fullname %in% taxonomy$fullname) %>% View()
AMR::microorganisms %>% filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>% View()
AMR::microorganisms %>% filter(!fullname %in% taxonomy$fullname) %>% View()
# Add SNOMED CT -----------------------------------------------------------