1
0
mirror of https://github.com/msberends/AMR.git synced 2025-08-24 11:52:11 +02:00

microorganisms update, added Salmonella groups

This commit is contained in:
2022-12-16 16:10:43 +01:00
parent 8da2467209
commit 5f3a7694aa
43 changed files with 38933 additions and 105358 deletions

File diff suppressed because one or more lines are too long

View File

@@ -161,8 +161,8 @@ MO_CONS <- create_species_cons_cops("CoNS")
MO_COPS <- create_species_cons_cops("CoPS")
MO_STREP_ABCG <- AMR_env$MO_lookup$mo[which(AMR_env$MO_lookup$genus == "Streptococcus" &
AMR_env$MO_lookup$species %in% c(
"pyogenes", "agalactiae", "dysgalactiae", "equi", "anginosus", "sanguinis", "salivarius",
"group A", "group B", "group C", "group D", "group F", "group G", "group H", "group K", "group L"
"pyogenes", "agalactiae", "dysgalactiae", "equi", "canis",
"group A", "group B", "group C", "group G"
))]
MO_FULLNAME_LOWER <- create_MO_fullname_lower()
MO_PREVALENT_GENERA <- c(

View File

@@ -1 +1 @@
33ff38f7d52ae39f951f60b73db29630
ddbee37dd1dfa56d90d9b70ad2ed6308

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1 +1 @@
9d0367aa37e7f7d6923caae506ea434d
88f3ce0e84bf8336f43cd6565215db1d

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@@ -50,8 +50,7 @@ library(dplyr)
library(vroom) # to import files
library(rvest) # to scape LPSN website
library(progress) # to show progress bars
library(AMR)
# also requires 'rvest' and 'progress' packages
devtools::load_all(".") # load AMR package
# Helper functions --------------------------------------------------------
@@ -538,6 +537,16 @@ taxonomy <- taxonomy %>%
arrange(fullname) %>%
filter(fullname != "")
# get missing entries from existing microorganisms data set
taxonomy <- taxonomy %>%
bind_rows(AMR::microorganisms %>%
select(all_of(colnames(taxonomy))) %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
# these will be added later:
source != "manually added")) %>%
arrange(fullname) %>%
filter(fullname != "")
# fix rank
table(taxonomy$rank, useNA = "always")
taxonomy <- taxonomy %>%
@@ -554,26 +563,13 @@ taxonomy <- taxonomy %>%
))
table(taxonomy$rank, useNA = "always")
# get the latest upper taxonomy from LPSN to update the GBIF data
# (e.g., phylum above class "Bacilli" was still "Firmicutes", should be "Bacillota" in 2022)
for (k in unique(taxonomy$kingdom[taxonomy$kingdom != ""])) {
message("Fixing GBIF taxonomy for kingdom ", k, "...")
for (g in unique(taxonomy$genus[taxonomy$genus != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k)] <- taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (f in unique(taxonomy$family[taxonomy$family != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k)] <- taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (o in unique(taxonomy$order[taxonomy$order != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k)] <- taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (cc in unique(taxonomy$class[taxonomy$class != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k)] <- taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
}
# Save intermediate results (0) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy0.rds")
# Add missing taxonomic entries -------------------------------------------
# Add missing and fix old taxonomic entries -------------------------------
# this part will make sure that the whole taxonomy of every included species exists, so no missing genera, classes, etc.
@@ -639,19 +635,6 @@ taxonomy_all_missing %>% View()
taxonomy <- taxonomy %>%
bind_rows(taxonomy_all_missing)
# we need to fix parent GBIF identifiers
taxonomy$gbif_parent[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$kingdom[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "class" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$phylum[taxonomy$rank == "class" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "order" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$class[taxonomy$rank == "order" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "family" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$order[taxonomy$rank == "family" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$family[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "species" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$genus[taxonomy$rank == "species" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(paste(taxonomy$genus[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)], taxonomy$species[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)]), taxonomy$fullname)]
# these still have no record in our data set:
all(taxonomy$lpsn_parent %in% taxonomy$lpsn)
all(taxonomy$gbif_parent %in% taxonomy$gbif)
# fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class)
taxonomy <- taxonomy %>%
mutate(rank_index = case_when(rank == "subspecies" ~ 1,
@@ -666,7 +649,7 @@ taxonomy <- taxonomy %>%
group_by(kingdom, fullname) %>%
mutate(fullname = if_else(row_number() > 1, fullname_rank, fullname)) %>%
ungroup() %>%
select(-fullname_rank) %>%
select(-fullname_rank, -rank_index) %>%
arrange(fullname)
# now also add missing species (requires combination with genus)
@@ -699,7 +682,7 @@ taxonomy <- taxonomy %>%
filter(kingdom != "")
# Save intermediate results -----------------------------------------------
# Save intermediate results (1) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy1.rds")
@@ -710,6 +693,9 @@ manually_added <- AMR::microorganisms %>%
filter(source == "manually added", !paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>%
select(fullname:subspecies, ref, source, rank)
manually_added <- manually_added %>%
bind_rows(salmonellae)
# get latest taxonomy for those entries
for (g in unique(manually_added$genus[manually_added$genus != "" & manually_added$genus %in% taxonomy$genus])) {
manually_added$family[which(manually_added$genus == g)] <- taxonomy$family[which(taxonomy$genus == g & is.na(taxonomy$lpsn))][1]
@@ -723,14 +709,19 @@ for (o in unique(manually_added$order[manually_added$order != "" & manually_adde
for (cc in unique(manually_added$class[manually_added$class != "" & manually_added$class %in% taxonomy$class])) {
manually_added$phylum[which(manually_added$class == cc)] <- taxonomy$phylum[which(taxonomy$class == cc & is.na(taxonomy$lpsn))][1]
}
for (p in unique(manually_added$phylum[manually_added$phylum != "" & manually_added$phylum %in% taxonomy$phylum])) {
manually_added$kingdom[which(manually_added$phylum == p)] <- taxonomy$kingdom[which(taxonomy$phylum == p & is.na(taxonomy$lpsn))][1]
}
manually_added <- manually_added %>%
mutate(
status = "accepted",
rank = ifelse(fullname %like% "unknown", "(unknown rank)", rank)
)
manually_added
taxonomy <- taxonomy %>%
# here also the 'unknowns' are added, such as "(unknown fungus)"
bind_rows(manually_added) %>%
arrange(fullname)
@@ -743,6 +734,49 @@ taxonomy <- taxonomy %>%
mutate(ref = get_author_year(ref))
# Get the latest upper taxonomy from LPSN for non-LPSN data ---------------
# (e.g., phylum above class "Bacilli" was still "Firmicutes", should be "Bacillota" in 2022)
for (k in unique(taxonomy$kingdom[taxonomy$kingdom != ""])) {
message("Fixing GBIF taxonomy for kingdom ", k, ".", appendLF = FALSE)
i <- 0
for (g in unique(taxonomy$genus[taxonomy$genus != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k)] <- taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (f in unique(taxonomy$family[taxonomy$family != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k)] <- taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (o in unique(taxonomy$order[taxonomy$order != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k)] <- taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
for (cc in unique(taxonomy$class[taxonomy$class != "" & taxonomy$kingdom == k & taxonomy$source == "LPSN"])) {
i <- i + 1
if (i %% 50 == 0) message(".", appendLF = FALSE)
taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k)] <- taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k & taxonomy$source == "LPSN")][1]
}
message("OK.")
}
# we need to fix parent GBIF identifiers
taxonomy$gbif_parent[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$kingdom[taxonomy$rank == "phylum" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "class" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$phylum[taxonomy$rank == "class" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "order" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$class[taxonomy$rank == "order" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "family" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$order[taxonomy$rank == "family" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$family[taxonomy$rank == "genus" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "species" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(taxonomy$genus[taxonomy$rank == "species" & !is.na(taxonomy$gbif)], taxonomy$fullname)]
taxonomy$gbif_parent[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)] <- taxonomy$gbif[match(paste(taxonomy$genus[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)], taxonomy$species[taxonomy$rank == "subspecies" & !is.na(taxonomy$gbif)]), taxonomy$fullname)]
# these still have no record in our data set:
all(taxonomy$lpsn_parent %in% taxonomy$lpsn)
all(taxonomy$gbif_parent %in% taxonomy$gbif)
# Add prevalence ----------------------------------------------------------
# update prevalence based on taxonomy (our own JSS paper: Berends MS et al. (2022), DOI 10.18637/jss.v104.i03)
@@ -768,43 +802,6 @@ taxonomy <- taxonomy %>%
table(taxonomy$prevalence, useNA = "always")
# (a lot will be removed further below)
# Add old entries that must be kept ---------------------------------------
# these are bacteria now removed, but have an renamed-to identifier to a current record, so add them
old_to_keep <- microorganisms %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
kingdom == "Bacteria",
(gbif_renamed_to %in% taxonomy$gbif & !is.na(gbif_renamed_to)) | (lpsn_renamed_to %in% taxonomy$lpsn & !is.na(lpsn_renamed_to)),
rank %in% c("genus", "species", "subspecies")) %>%
select(-mo, -snomed)
taxonomy <- taxonomy %>%
bind_rows(old_to_keep) %>%
arrange(fullname)
# and these had prevalence = 1, why do they miss now?
old_to_keep2 <- microorganisms %>%
filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname),
prevalence == 1,
!is.na(gbif_parent) & gbif_parent %in% taxonomy$gbif,
rank %in% c("genus", "species", "subspecies")) %>%
select(-mo, -snomed)
taxonomy <- taxonomy %>%
bind_rows(old_to_keep2) %>%
arrange(fullname)
# strangly, Trichomonas is no longer in GBIF?
old_to_keep3 <- microorganisms %>%
filter(fullname %like% "^trichomona") %>%
select(-mo, -snomed)
taxonomy <- taxonomy %>%
filter(!fullname %in% old_to_keep3$fullname) %>%
bind_rows(old_to_keep3) %>%
arrange(fullname)
# fix rank
taxonomy <- taxonomy %>%
mutate(rank = case_when(
@@ -820,6 +817,11 @@ taxonomy <- taxonomy %>%
))
# Save intermediate results (2) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy2.rds")
# Add microbial IDs -------------------------------------------------------
# MO codes in the AMR package have the form KINGDOM_GENUS_SPECIES_SUBSPECIES where all are abbreviated.
@@ -995,7 +997,7 @@ mo_genus <- mo_genus %>%
mo_species <- taxonomy %>%
filter(rank == "species") %>%
distinct(kingdom, genus, species) %>%
left_join(microorganisms %>%
left_join(AMR::microorganisms %>%
filter(rank == "species") %>%
transmute(mo_species_old = gsub("^[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species) %>%
filter(mo_species_old %unlike% "-") %>%
@@ -1043,7 +1045,7 @@ mo_species <- mo_species %>%
mo_subspecies <- taxonomy %>%
filter(rank == "subspecies") %>%
distinct(kingdom, genus, species, subspecies) %>%
left_join(microorganisms %>%
left_join(AMR::microorganisms %>%
filter(rank %in% c("subspecies", "subsp.", "infraspecies")) %>%
transmute(mo_subspecies_old = gsub("^[A-Z]+_[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species, subspecies) %>%
filter(mo_subspecies_old %unlike% "-") %>%
@@ -1138,13 +1140,31 @@ taxonomy <- taxonomy %>%
taxonomy %>% filter(mo %like% "__") %>% View()
taxonomy <- taxonomy %>% filter(mo %unlike% "__")
# this must be empty of course
taxonomy %>% filter(mo %in% .[duplicated(mo), "mo", drop = TRUE]) %>% View()
# Some integrity checks ---------------------------------------------------
# are mo codes unique?
taxonomy %>% filter(mo %in% .[duplicated(mo), "mo", drop = TRUE])
taxonomy <- taxonomy %>% distinct(mo, .keep_all = TRUE)
# Save intermediate results -----------------------------------------------
# are fullnames unique?
taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE])
saveRDS(taxonomy, "data-raw/taxonomy2.rds")
# are all GBIFs available?
taxonomy %>% filter(!gbif_parent %in% gbif) %>% count(rank)
# try to find the right gbif IDs
taxonomy$gbif_parent[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "species")] <- taxonomy$gbif[match(taxonomy$genus[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "species")], taxonomy$genus)]
taxonomy$gbif_parent[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "class")] <- taxonomy$gbif[match(taxonomy$phylum[which(!taxonomy$gbif_parent %in% taxonomy$gbif & taxonomy$rank == "class")], taxonomy$phylum)]
taxonomy %>% filter(!gbif_parent %in% gbif) %>% count(rank)
# are all LPSNs available?
taxonomy %>% filter(!lpsn_parent %in% lpsn) %>% count(rank)
# make GBIF refer to newest renaming according to LPSN
taxonomy$gbif_renamed_to[which(!is.na(taxonomy$gbif_renamed_to) & !is.na(taxonomy$lpsn_renamed_to))] <- taxonomy$gbif[match(taxonomy$lpsn_renamed_to[which(!is.na(taxonomy$gbif_renamed_to) & !is.na(taxonomy$lpsn_renamed_to))], taxonomy$lpsn)]
# Save intermediate results (3) -------------------------------------------
saveRDS(taxonomy, "data-raw/taxonomy3.rds")
# Remove unwanted taxonomic entries from Protoza/Fungi --------------------
@@ -1176,13 +1196,13 @@ taxonomy <- taxonomy %>%
message("\nCongratulations! The new taxonomic table will contain ", format(nrow(taxonomy), big.mark = ","), " rows.\n",
"This was ", format(nrow(microorganisms), big.mark = ","), " rows.\n")
"This was ", format(nrow(AMR::microorganisms), big.mark = ","), " rows.\n")
# these are the new ones:
taxonomy %>% filter(!paste(kingdom, fullname) %in% paste(microorganisms$kingdom, microorganisms$fullname)) %>% View()
taxonomy %>% filter(!paste(kingdom, fullname) %in% paste(AMR::microorganisms$kingdom, AMR::microorganisms$fullname)) %>% View()
# these were removed:
microorganisms %>% filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>% View()
microorganisms %>% filter(!fullname %in% taxonomy$fullname) %>% View()
AMR::microorganisms %>% filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>% View()
AMR::microorganisms %>% filter(!fullname %in% taxonomy$fullname) %>% View()
# Add SNOMED CT -----------------------------------------------------------

1584
data-raw/salmonellae.R Normal file

File diff suppressed because it is too large Load Diff

BIN
data-raw/taxonomy0.rds Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
data-raw/taxonomy3.rds Normal file

Binary file not shown.