# ==================================================================== # # TITLE: # # AMR: An R Package for Working with Antimicrobial Resistance Data # # # # SOURCE CODE: # # https://github.com/msberends/AMR # # # # PLEASE CITE THIS SOFTWARE AS: # # Berends MS, Luz CF, Friedrich AW, et al. (2022). # # AMR: An R Package for Working with Antimicrobial Resistance Data. # # Journal of Statistical Software, 104(3), 1-31. # # https://doi.org/10.18637/jss.v104.i03 # # # # Developed at the University of Groningen and the University Medical # # Center Groningen in The Netherlands, in collaboration with many # # colleagues from around the world, see our website. # # # # This R package is free software; you can freely use and distribute # # it for both personal and commercial purposes under the terms of the # # GNU General Public License version 2.0 (GNU GPL-2), as published by # # the Free Software Foundation. # # We created this package for both routine data analysis and academic # # research and it was publicly released in the hope that it will be # # useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # # # # Visit our website for the full manual and a complete tutorial about # # how to conduct AMR data analysis: https://msberends.github.io/AMR/ # # ==================================================================== # # ! THIS SCRIPT REQUIRES AT LEAST 16 GB RAM ! # (at least 12 GB will be used by the R session for the size of the files) # GBIF: # 1. Go to https://doi.org/10.15468/39omei and find the download link for the # latest GBIF backbone taxonony under "Endpoints" and unpack "Taxon.tsv" from it (~1.0 GB) # ALSO BE SURE to get the date of release and update R/aa_globals.R later! # LPSN: # 2. Go to https://lpsn.dsmz.de/downloads (register first) and download the latest # CSV file (~12,5 MB) and rename to "taxonomy.csv" # ALSO BE SURE to get the date of release and update R/aa_globals.R later! # MycoBank: # 3. Go to https://www.mycobank.org/ and find the download link of all entries # (last time https://www.mycobank.org/images/MBList.zip) and unpack # "MBList.xlsx" from it (~120 MB) # ALSO BE SURE to get the date of release and update R/aa_globals.R later! # Bartlett: # 4. For data about human pathogens, we use Bartlett et al. (2022), # https://doi.org/10.1099/mic.0.001269. Their latest supplementary material # can be found here: https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens. # Download their latest xlsx file in the `data` folder and save it to our # `data-raw` folder. # 5. Set these locations to the paths where the files are: folder_location <- "~/Downloads/" file_gbif <- paste0(folder_location, "/backbone/Taxon.tsv") file_lpsn <- paste0(folder_location, "taxonomy.csv") file_mycobank <- paste0(folder_location, "MBList.xlsx") file_bartlett <- "data-raw/bartlett_et_al_2022_human_pathogens.xlsx" # 4. Run the rest of this script line by line and check everything :) if (!file.exists(file_gbif)) stop("GBIF file not found") if (!file.exists(file_lpsn)) stop("LPSN file not found") if (!file.exists(file_mycobank)) stop("MB file not found") if (!file.exists(file_bartlett)) stop("Bartlett et al. Excel file not found") library(dplyr) library(tidyr) library(vroom) # to import files library(rvest) # to scape LPSN website library(progress) # to show progress bars library(readxl) # for reading the Bartlett Excel file devtools::load_all(".") # load AMR package # Helper functions -------------------------------------------------------- get_author_year <- function(ref) { # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011' authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT") authors2 <- gsub(" ?\\(Approved Lists [0-9]+\\) ?", " () ", authors2) authors2 <- gsub(" [)(]+ $", "", authors2) # remove leading and trailing brackets authors2 <- trimws(gsub("^[(](.*)[)]$", "\\1", authors2)) # only take part after brackets if there's a name authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), gsub(".*[)] (.*)", "\\1", authors2), authors2 ) # replace parentheses with emend. to get the latest authors authors2 <- gsub("(", " emend. ", authors2, fixed = TRUE) authors2 <- gsub(")", "", authors2, fixed = TRUE) authors2 <- gsub(" +", " ", authors2) authors2 <- trimws(authors2) # get year from last 4 digits lastyear <- as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)) # can never be later than now lastyear <- ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), NA, lastyear ) # get authors without last year authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2) # not sure what this is authors <- gsub("(Saito)", "", authors, fixed = TRUE) authors <- gsub("(Oudem.)", "", authors, fixed = TRUE) # remove nonsense characters from names authors <- gsub("[^a-zA-Z,'&. -]", "", authors) # no initials, only surname authors <- gsub("[A-Z-][.]", "", authors, ignore.case = FALSE) # remove trailing and leading spaces authors <- trimws(authors) # keep only the part after last 'emend.' to get the latest authors authors <- gsub(".*emend[.] ?", "", authors) # only keep first author and replace all others by 'et al' authors <- gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors) # et al. always with ending dot authors <- gsub(" et al\\.?", " et al.", authors) authors <- gsub(" ?,$", "", authors) # don't start with 'sensu' or 'ehrenb' authors <- gsub("^(sensu|Ehrenb.?|corrig.?) ", "", authors, ignore.case = TRUE) # no initials, only surname authors <- trimws(authors) authors <- gsub("^([A-Z-][.])+( & ?)?", "", authors, ignore.case = FALSE) authors <- gsub("^([A-Z-]+ )+", "", authors, ignore.case = FALSE) # remove dots authors <- gsub(".", "", authors, fixed = TRUE) authors <- gsub("et al", "et al.", authors, fixed = TRUE) authors[nchar(authors) <= 3] <- "" # combine author and year if year is available ref <- ifelse(!is.na(lastyear), paste0(authors, ", ", lastyear), authors ) # fix beginning and ending ref <- gsub(", $", "", ref) ref <- gsub("^, ", "", ref) ref <- gsub("^(emend|et al.,?)", "", ref) ref <- trimws(ref) ref <- gsub("'", "", ref) # a lot start with a lowercase character - fix that ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE) # specific one for the French that are named dOrbigny ref[grepl("^d[A-Z]", ref)] <- gsub("^d", "d'", ref[grepl("^d[A-Z]", ref)]) ref <- gsub(" +", " ", ref) ref[ref == ""] <- NA_character_ ref } df_remove_nonASCII <- function(df) { # Remove non-ASCII characters (these are not allowed by CRAN) df %>% mutate_if(is.character, iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% # also remove invalid characters mutate_if(is.character, ~ gsub("[\"'`]+", "", .)) %>% AMR:::dataset_UTF8_to_ASCII() } # to retrieve LPSN and authors from LPSN website # e.g., get_lpsn_and_author("genus", "Klebsiella") get_lpsn_and_author <- function(rank, name) { name <- gsub("^Candidatus ", "", name) url <- paste0("https://lpsn.dsmz.de/", tolower(rank), "/", tolower(name)) page_txt <- tryCatch(read_html(url), error = function(e) NULL) if (is.null(page_txt)) { warning("No LPSN found for ", tolower(rank), " '", name, "'") lpsn <- NA_character_ ref <- NA_character_ status <- "unknown" } else { page_txt <- page_txt %>% html_element("#detail-page") %>% html_text() lpsn <- gsub(".*Record number:[\r\n\t ]*([0-9]+).*", "\\1", page_txt, perl = FALSE) ref <- page_txt %>% gsub(".*?Name: (.*[0-9]{4}?).*", "\\1", ., perl = FALSE) %>% gsub(name, "", ., fixed = TRUE) %>% gsub("^\"?Candidatus ?\"?", "", .) %>% trimws() status <- trimws(gsub(".*Nomenclatural status:[\r\n\t ]*([a-zA-Z, ]+)[\r\n\t].*", "\\1", page_txt, perl = FALSE)) if ((status %like% "validly published" & status %unlike% "not valid") | status %like% "[\r\n\t]") { # we used to take "accepted" for every LPSN record, also candidates. Now only for missing values and explicit accepted ones. status <- "accepted" } else { status <- "not validly published" } } c("lpsn" = lpsn, "ref" = ref, "status" = status) } # this will e.g. take the family from the root genus record, and gives all species of that family get_top_lvl <- function(current, rank, source, rank_target, target) { current.bak <- current current <- current[target != ""] rank <- rank[target != ""] source <- source[target != ""] target <- target[target != ""] if (length(current) == 0) { current.bak } else if (!rank_target %in% rank) { current.bak[1] } else { if (n_distinct(source) > 1 && "GBIF" %in% source) { # prefer LPSN and MycoBank over GBIF, which is often not up-to-date at all current <- current[source != "GBIF"] rank <- rank[source != "GBIF"] source <- source[source != "GBIF"] } out <- current[rank == rank_target][1] if (out %in% c("", NA)) { out <- names(sort(table(current[which(!current %in% c("", NA))]), decreasing = TRUE)[1]) if (is.null(out)) { out <- "" } } out } } # MB/ June 2024: after years still useless, does not contain full taxonomy, e.g. LPSN::request(cred, category = "family") is empty. # get_from_lpsn <- function (user, pw) { # if (!"LPSN" %in% rownames(utils::installed.packages())) { # stop("Install the official LPSN package for R using: install.packages('LPSN', repos = 'https://r-forge.r-project.org')") # } # cred <- LPSN::open_lpsn(user, pw) # # lpsn_genus <- LPSN::request(cred, category = "genus") # message("Downloading genus data (n = ", lpsn_genus$count, ") from LPSN API...") # lpsn_genus <- as.data.frame(LPSN::retrieve(cred, category = "genus")) # # lpsn_species <- LPSN::request(cred, category = "species") # message("Downloading species data (n = ", lpsn_species$count, ") from LPSN API...") # lpsn_species <- as.data.frame(LPSN::retrieve(cred, category = "species")) # # lpsn_subspecies <- LPSN::request(cred, category = "subspecies") # message("Downloading subspecies data (n = ", lpsn_subspecies$count, ") from LPSN API...") # lpsn_subspecies <- as.data.frame(LPSN::retrieve(cred, category = "subspecies")) # # message("Binding rows...") # lpsn_total <- bind_rows(lpsn_genus, lpsn_species, lpsn_subspecies) # message("Done.") # lpsn_total # } # Read LPSN data ---------------------------------------------------------- taxonomy_lpsn.bak <- vroom(file_lpsn, guess_max = 1e5) taxonomy_lpsn <- taxonomy_lpsn.bak %>% transmute( genus = genus_name, species = sp_epithet, subspecies = subsp_epithet, rank = case_when( !is.na(subsp_epithet) ~ "subspecies", !is.na(sp_epithet) ~ "species", TRUE ~ "genus" ), status = ifelse(is.na(record_lnk), "accepted", "synonym"), ref = authors, lpsn = as.character(record_no), lpsn_parent = NA_character_, lpsn_renamed_to = as.character(record_lnk) ) %>% mutate(source = "LPSN") # integrity tests sort(table(taxonomy_lpsn$rank)) sort(table(taxonomy_lpsn$status)) taxonomy_lpsn # download additional taxonomy to the domain/kingdom level (their API is not sufficient...) taxonomy_lpsn_missing <- tibble( kingdom = character(0), phylum = character(0), class = character(0), order = character(0), family = character(0), genus = character(0) ) for (page in LETTERS) { # this will not alter `taxonomy_lpsn` yet message("Downloading page ", page, "...", appendLF = TRUE) url <- paste0("https://lpsn.dsmz.de/genus?page=", page) x <- tryCatch(read_html(url), error = function(e) { message("Waiting 10 seconds because of error: ", e$message) Sys.sleep(10) read_html(url) }) x <- x %>% # class "main-list" is the main table html_element(".main-list") %>% # get every list element with a set attribute html_elements("li[id]") pb <- progress_bar$new(total = length(x), format = "[:bar] :current/:total :eta") for (i in seq_len(length(x))) { pb$tick() elements <- x[[i]] %>% html_elements("a") hrefs <- elements %>% html_attr("href") ranks <- hrefs %>% gsub(".*/(.*?)/.*", "\\1", .) names <- elements %>% html_text() %>% gsub('"', "", ., fixed = TRUE) # no species, this must be until genus level hrefs <- hrefs[ranks != "species"] names <- names[ranks != "species"] ranks <- ranks[ranks != "species"] ranks[ranks == "domain"] <- "kingdom" suppressMessages( df <- names %>% tibble() %>% t() %>% as_tibble(.name_repair = "unique") %>% setNames(ranks) %>% # no candidates please filter(genus %unlike% "^(Candidatus|\\[)") ) taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>% bind_rows(df) } message(" => ", length(x), " entries incl. candidates (cleaned total: ", nrow(taxonomy_lpsn_missing), ")") } taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>% distinct() saveRDS(taxonomy_lpsn_missing, "data-raw/taxonomy_lpsn_missing.rds") # had to pick the right genus/family combination here: taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>% filter(!(genus == "Pusillimonas" & family == "Oscillospiraceae")) taxonomy_lpsn.bak2 <- taxonomy_lpsn.bak taxonomy_lpsn <- taxonomy_lpsn %>% left_join(taxonomy_lpsn_missing, by = "genus") %>% select(kingdom:family, everything()) %>% # remove entries like "[Bacteria, no family]" and "[Bacteria, no class]" mutate_all(function(x) ifelse(x %like_case% " no ", NA_character_, x)) taxonomy_lpsn.bak2 <- taxonomy_lpsn # download family directly from LPSN website using scraping, by using get_lpsn_and_author() # try it first: # get_lpsn_and_author("genus", "Escherichia") # get_lpsn_and_author("family", "Enterobacteriaceae") pb <- progress_bar$new(total = length(unique(taxonomy_lpsn$family)), format = "[:bar] :current/:total :eta") for (f in unique(taxonomy_lpsn$family)) { pb$tick() if (is.na(f)) next tax_info <- get_lpsn_and_author("Family", f) taxonomy_lpsn <- taxonomy_lpsn %>% bind_rows(tibble( kingdom = taxonomy_lpsn$kingdom[which(taxonomy_lpsn$family == f)[1]], phylum = taxonomy_lpsn$phylum[which(taxonomy_lpsn$family == f)[1]], class = taxonomy_lpsn$class[which(taxonomy_lpsn$family == f)[1]], order = taxonomy_lpsn$order[which(taxonomy_lpsn$family == f)[1]], family = f, rank = "family", status = unname(tax_info["status"]), source = "LPSN", lpsn = unname(tax_info["lpsn"]), ref = unname(tax_info["ref"]) )) } # download order directly from LPSN website using scraping # try it first: # get_lpsn_and_author("order", "Enterobacterales") pb <- progress_bar$new(total = length(unique(taxonomy_lpsn$order)), format = "[:bar] :current/:total :eta") for (o in unique(taxonomy_lpsn$order)) { pb$tick() if (is.na(o)) next tax_info <- get_lpsn_and_author("Order", o) taxonomy_lpsn <- taxonomy_lpsn %>% bind_rows(tibble( kingdom = taxonomy_lpsn$kingdom[which(taxonomy_lpsn$order == o)[1]], phylum = taxonomy_lpsn$phylum[which(taxonomy_lpsn$order == o)[1]], class = taxonomy_lpsn$class[which(taxonomy_lpsn$order == o)[1]], order = o, rank = "order", status = unname(tax_info["status"]), source = "LPSN", lpsn = unname(tax_info["lpsn"]), ref = unname(tax_info["ref"]) )) } # download class directly from LPSN website using scraping # try it first: # get_lpsn_and_author("class", "Gammaproteobacteria") pb <- progress_bar$new(total = length(unique(taxonomy_lpsn$class)), format = "[:bar] :current/:total :eta") for (cc in unique(taxonomy_lpsn$class)) { pb$tick() if (is.na(cc)) next tax_info <- get_lpsn_and_author("Class", cc) taxonomy_lpsn <- taxonomy_lpsn %>% bind_rows(tibble( kingdom = taxonomy_lpsn$kingdom[which(taxonomy_lpsn$class == cc)[1]], phylum = taxonomy_lpsn$phylum[which(taxonomy_lpsn$class == cc)[1]], class = cc, rank = "class", status = unname(tax_info["status"]), source = "LPSN", lpsn = unname(tax_info["lpsn"]), ref = unname(tax_info["ref"]) )) } # download phylum directly from LPSN website using scraping # try it first: # get_lpsn_and_author("phylum", "Pseudomonadota") pb <- progress_bar$new(total = length(unique(taxonomy_lpsn$phylum)), format = "[:bar] :current/:total :eta") for (p in unique(taxonomy_lpsn$phylum)) { pb$tick() if (is.na(p)) next tax_info <- get_lpsn_and_author("Phylum", p) taxonomy_lpsn <- taxonomy_lpsn %>% bind_rows(tibble( kingdom = taxonomy_lpsn$kingdom[which(taxonomy_lpsn$phylum == p)[1]], phylum = p, rank = "phylum", status = unname(tax_info["status"]), source = "LPSN", lpsn = unname(tax_info["lpsn"]), ref = unname(tax_info["ref"]) )) } # download kingdom directly from LPSN website using scraping # try it first: # get_lpsn_and_author("kingdom", "Bacteria") pb <- progress_bar$new(total = length(unique(taxonomy_lpsn$kingdom)), format = "[:bar] :current/:total :eta") for (k in unique(taxonomy_lpsn$kingdom)) { pb$tick() if (is.na(k)) next tax_info <- get_lpsn_and_author("Domain", k) taxonomy_lpsn <- taxonomy_lpsn %>% bind_rows(tibble( kingdom = k, rank = "kingdom", status = unname(tax_info["status"]), source = "LPSN", lpsn = unname(tax_info["lpsn"]), ref = unname(tax_info["ref"]) )) } taxonomy_lpsn <- taxonomy_lpsn |> filter(status != "not validly published") # integrity tests sort(table(taxonomy_lpsn$rank)) # should only be 'accepted' and 'synonym': sort(table(taxonomy_lpsn$status)) # Read MycoBank data ------------------------------------------------------ taxonomy_mycobank <- readxl::read_excel(file_mycobank, guess_max = 1e5) taxonomy_mycobank.bak <- taxonomy_mycobank taxonomy_mycobank <- taxonomy_mycobank %>% transmute(mycobank = `MycoBank #`, fullname = gsub(" +", " ", `Taxon name`), current = `Current name.Taxon name`, ref = paste0(Authors, ", ", `Year of effective publication`), rank = `Rank.Rank name`, status = `Name status`, mycobank_renamed_to = taxonomy_mycobank.bak$`MycoBank #`[match(`Current name.Taxon name`, taxonomy_mycobank.bak$`Taxon name`)], Classification ) %>% separate(Classification, sep = ", ", into = paste0("tax_", letters[1:20]), remove = TRUE) %>% mutate(rank = case_when( fullname %like_case% "^[A-Z][a-z]+ .+ .+" ~ "subsp.", rank == "-" & fullname %like_case% "^[A-Z][a-z]+ [a-z-]+$" ~ "sp.", rank == "-" & paste0(fullname, " ") %in% gsub("(^[A-Z][a-z]+ ).*", "\\1", trimws(taxonomy_mycobank.bak$`Taxon name`), perl = TRUE) ~ "gen.", # we take a leap here for the family and order rank == "-" & fullname %like% "ceae$" ~ "fam.", rank == "-" & fullname %like% "ales$" ~ "ordo", # tax_d and tax_e are the subdivision/class columns, kind of; prefer e over d rank == "-" & fullname %in% tax_e ~ "cl.", rank == "-" & fullname %in% tax_d ~ "cl.", TRUE ~ rank )) %>% filter(rank %unlike% "sub" & !tolower(rank) %in% c("var.", "sect.", "ser.", "tr.", "f.", "race", "stirps", "*", "-"), # we also remove orthographic variants here, because our algorithms will give valid results based on misspelling !tolower(status) %in% c("invalid", "deleted", "uncertain", "illegitimate", "orthographic variant", "unavailable")) %>% mutate(mycobank_renamed_to = if_else(mycobank_renamed_to == mycobank, NA_character_, mycobank_renamed_to)) %>% # only keep 1 entry for the kingdom (regnum) filter(fullname == "Fungi" | rank != "regn.") %>% # no other kingdoms than fungi filter(fullname == "Fungi" | tax_a == "Fungi") %>% select_if(function(x) !all(is.na(x))) taxonomy_mycobank %>% count(rank, sort = TRUE) unique(taxonomy_mycobank$status) taxonomy_mycobank <- taxonomy_mycobank %>% mutate(status = if_else(!is.na(mycobank_renamed_to), "synonym", "accepted")) taxonomy_mycobank2 <- taxonomy_mycobank taxonomy_mycobank <- taxonomy_mycobank %>% mutate(rank = case_match(rank, "sp." ~ "species", "gen." ~ "genus", "fam." ~ "family", "ordo" ~ "order", "cl." ~ "class", "div." ~ "phylum", "regn." ~ "kingdom", .default = paste0("#", rank))) taxonomy_mycobank %>% filter(rank %like% "#") taxonomy_mycobank %>% count(rank, sort = TRUE) taxonomy_mycobank3 <- taxonomy_mycobank # MycoBank just pasted their taxonomy together into 1 field, and now some classes are in the division (tax_b) column, it's horrible # so we decide based on the fullname and rank column per record # use this to determine how far to go: any(taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "class"] %in% taxonomy_mycobank$tax_e) # replace tax_e with tax_f, etc taxonomy_mycobank <- taxonomy_mycobank %>% mutate(kingdom = "Fungi", # we already filtered everything else, and MycoBank 90157 has '-' as current name... phylum = case_when(rank == "phylum" ~ fullname, tax_b %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "phylum"] ~ tax_b, tax_c %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "phylum"] ~ tax_c, TRUE ~ ""), class = case_when(rank == "class" ~ fullname, tax_c %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "class"] ~ tax_c, tax_d %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "class"] ~ tax_d, tax_e %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "class"] ~ tax_e, TRUE ~ ""), order = case_when(rank == "order" ~ fullname, tax_c %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "order"] ~ tax_c, tax_d %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "order"] ~ tax_d, tax_e %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "order"] ~ tax_e, tax_f %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "order"] ~ tax_f, tax_g %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "order"] ~ tax_g, TRUE ~ ""), family = case_when(rank == "family" ~ fullname, tax_c %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_c, tax_d %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_d, tax_e %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_e, tax_f %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_f, tax_g %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_g, tax_h %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_h, tax_i %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "family"] ~ tax_i, TRUE ~ ""), genus = case_when(rank == "genus" ~ fullname, tax_b %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_b, tax_c %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_c, tax_d %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_d, tax_e %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_e, tax_f %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_f, tax_g %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_g, tax_h %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_h, tax_i %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_i, tax_k %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_j, tax_k %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_k, TRUE ~ ""), species = case_when(rank == "species" & fullname %like% " " ~ gsub(".* (.*)", "\\1", fullname, perl = TRUE), TRUE ~ "") ) # FOR 2025: use this to get all the genera with updated names from MO_PREVALENT_GENERA: # AMR::microorganisms %>% filter(genus %in% MO_PREVALENT_GENERA) %>% pull(fullname) %>% mo_current() %>% mo_genus() %>% unique() %>% sort() # keep only the relevant ones taxonomy_mycobank <- taxonomy_mycobank %>% filter(order %in% include_fungal_orders | genus %in% AMR:::MO_PREVALENT_GENERA | !rank %in% c("genus", "species")) %>% filter(!(genus == "" & rank %in% c("genus", "species"))) # clean up authors and add last columns taxonomy_mycobank <- taxonomy_mycobank %>% mutate(subspecies = "", source = "MycoBank", mycobank_parent = NA_character_) # select final set taxonomy_mycobank <- taxonomy_mycobank %>% select(fullname, kingdom, phylum, class, order, family, genus, species, subspecies, rank, status, ref, mycobank, mycobank_parent, mycobank_renamed_to, source) # not all 'renamed to' records are available, some were even just orthographic variants (that have an invalid status) taxonomy_mycobank$status[!is.na(taxonomy_mycobank$mycobank_renamed_to) & !taxonomy_mycobank$mycobank_renamed_to %in% taxonomy_mycobank$mycobank] <- "accepted" taxonomy_mycobank$mycobank_renamed_to[taxonomy_mycobank$status == "accepted"] <- NA taxonomy_mycobank %>% count(status) # Read GBIF data ---------------------------------------------------------- taxonomy_gbif.bak <- vroom(file_gbif, guess_max = 1e5) include_fungal_orders <- unique(taxonomy_mycobank$order[!taxonomy_mycobank$order %in% c("", NA)]) # get latest taxonomic names of these fungal orders include_fungal_orders_ids <- taxonomy_gbif.bak %>% filter(order %in% include_fungal_orders) include_fungal_orders <- taxonomy_gbif.bak %>% filter(taxonID %in% c(include_fungal_orders_ids$taxonID, include_fungal_orders_ids$acceptedNameUsageID)) %>% distinct(order) %>% pull(order) %>% sort() # check some columns to validate below filters taxonomy_gbif.bak %>% count(taxonomicStatus, sort = TRUE) taxonomy_gbif.bak %>% count(taxonRank, sort = TRUE) taxonomy_gbif <- taxonomy_gbif.bak %>% # immediately filter rows we really never want filter( # never doubtful status, only accepted and all synonyms, and only ranked items taxonomicStatus != "doubtful", taxonRank != "unranked", # include these kingdoms (no Chromista) kingdom %in% c("Archaea", "Bacteria", "Protozoa") | # include all of these fungal orders order %in% include_fungal_orders | # and all of these important genera (see "data-raw/_pre_commit_checks.R") # (they also contain bacteria and protozoa, but these will get prevalence = 2 later on) genus %in% AMR:::MO_PREVALENT_GENERA ) %>% select( kingdom, phylum, class, order, family, genus, species = specificEpithet, subspecies = infraspecificEpithet, rank = taxonRank, status = taxonomicStatus, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID, gbif_renamed_to = acceptedNameUsageID ) %>% mutate( # do this mutate after the original selection/filtering, as it decreases computing time tremendously status = ifelse(status == "accepted", "accepted", "synonym"), # checked taxonRank - the "form" and "variety" always have a subspecies, so: rank = ifelse(rank %in% c("form", "variety"), "subspecies", rank), source = "GBIF" ) %>% filter( # their data is messy - keep only these: rank == "kingdom" & !is.na(kingdom) | rank == "phylum" & !is.na(phylum) | rank == "class" & !is.na(class) | rank == "order" & !is.na(order) | rank == "family" & !is.na(family) | rank == "genus" & !is.na(genus) | rank == "species" & !is.na(species) | rank == "subspecies" & !is.na(subspecies) ) %>% # some items end with _A or _B... why?? mutate_all(~ gsub("_[A-Z]$", "", .x, perl = TRUE)) %>% # now we have duplicates, remove these, but prioritise "accepted" status and highest taxon ID arrange(status, gbif) %>% distinct(kingdom, phylum, class, order, family, genus, species, subspecies, .keep_all = TRUE) %>% filter( kingdom %unlike% "[0-9]", phylum %unlike% "[0-9]", class %unlike% "[0-9]", order %unlike% "[0-9]", family %unlike% "[0-9]", genus %unlike% "[0-9]" ) # integrity tests sort(table(taxonomy_gbif$rank)) sort(table(taxonomy_gbif$status)) taxonomy_gbif # Save intermediate results ----------------------------------------------- saveRDS(taxonomy_gbif, "data-raw/taxonomy_gbif.rds", version = 2) saveRDS(taxonomy_lpsn, "data-raw/taxonomy_lpsn.rds", version = 2) saveRDS(taxonomy_mycobank, "data-raw/taxonomy_mycobank.rds", version = 2) # this allows to always get back to this point by simply loading the files from data-raw/. # Add full names ---------------------------------------------------------- taxonomy_gbif <- taxonomy_gbif %>% # clean NAs and add fullname mutate(across(kingdom:subspecies, function(x) ifelse(is.na(x), "", x)), fullname = trimws(case_when( rank == "family" ~ family, rank == "order" ~ order, rank == "class" ~ class, rank == "phylum" ~ phylum, rank == "kingdom" ~ kingdom, TRUE ~ paste(genus, species, subspecies) # already trimmed 6 lines up )), .before = 1 ) %>% # keep only one GBIF taxon ID per full name arrange(fullname, gbif) %>% distinct(kingdom, rank, fullname, .keep_all = TRUE) taxonomy_lpsn <- taxonomy_lpsn %>% # clean NAs and add fullname mutate(across(kingdom:subspecies, function(x) ifelse(is.na(x), "", x)), fullname = trimws(case_when( rank == "family" ~ family, rank == "order" ~ order, rank == "class" ~ class, rank == "phylum" ~ phylum, rank == "kingdom" ~ kingdom, TRUE ~ paste(genus, species, subspecies) # already trimmed 6 lines up )), .before = 1 ) %>% # keep only one LPSN record ID per full name arrange(fullname, lpsn) %>% distinct(kingdom, rank, fullname, .keep_all = TRUE) taxonomy_mycobank <- taxonomy_mycobank %>% # clean NAs and add fullname mutate(across(kingdom:subspecies, function(x) ifelse(is.na(x), "", x)), fullname = trimws(case_when( rank == "family" ~ family, rank == "order" ~ order, rank == "class" ~ class, rank == "phylum" ~ phylum, rank == "kingdom" ~ kingdom, TRUE ~ paste(genus, species, subspecies) # already trimmed 6 lines up )), .before = 1 ) %>% # keep only one MycoBank record ID per full name arrange(fullname, mycobank) %>% distinct(kingdom, rank, fullname, .keep_all = TRUE) # Combine the datasets ---------------------------------------------------- # TODO !! check why e.g. Clavispora lusitaniae is gotten from GBIF, not MycoBank !! taxonomy <- taxonomy_lpsn %>% # add fungi bind_rows(taxonomy_mycobank) %>% # start by adding GBIF to the bottom bind_rows(taxonomy_gbif) %>% # group on unique species group_by(kingdom, fullname) %>% # fill the NAs in LPSN/GBIF fields and ref with the other source (so LPSN: 123 and GBIF: NA will become LPSN: 123 and GBIF: 123) mutate(across(matches("^(lpsn|mycobank|gbif|ref)"), function(x) rep(x[!is.na(x)][1], length(x)))) %>% # ungroup again ungroup() %>% # only keep unique species per kingdom distinct(kingdom, fullname, .keep_all = TRUE) %>% arrange(fullname) # get missing entries from existing microorganisms data set taxonomy.old <- AMR::microorganisms %>% select(any_of(colnames(taxonomy))) %>% filter( !paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname), # these will be added later: source != "manually added") taxonomy <- taxonomy %>% bind_rows(taxonomy.old) %>% arrange(fullname) %>% filter(fullname != "") # fix rank taxonomy %>% count(rank, sort = TRUE) taxonomy <- taxonomy %>% mutate(rank = case_when( subspecies != "" ~ "subspecies", species != "" ~ "species", genus != "" ~ "genus", family != "" ~ "family", order != "" ~ "order", class != "" ~ "class", phylum != "" ~ "phylum", kingdom != "" ~ "kingdom", TRUE ~ NA_character_ )) taxonomy %>% count(rank, sort = TRUE) # at this point, it happens that some genera within kingdoms have multiple families / orders, etc., see here: taxonomy %>% filter(genus != "") %>% group_by(kingdom, genus) %>% filter(n_distinct(family) > 1) %>% View() # so make this universal taxonomy0 <- taxonomy taxonomy <- taxonomy0 taxonomy <- taxonomy %>% group_by(kingdom, genus) %>% mutate(family = get_top_lvl(family, rank, source, "genus", genus)) %>% group_by(kingdom, family) %>% mutate(order = get_top_lvl(order, rank, source, "family", family)) %>% group_by(kingdom, order) %>% mutate(class = get_top_lvl(class, rank, source, "order", order)) %>% group_by(kingdom, class) %>% mutate(phylum = get_top_lvl(phylum, rank, source, "class", class), .after = phylum) %>% ungroup() # and remove the taxonomy where it must remain empty taxonomy <- taxonomy %>% mutate(phylum = ifelse(rank %in% c("kingdom"), "", phylum), class = ifelse(rank %in% c("kingdom", "phylum"), "", class), order = ifelse(rank %in% c("kingdom", "phylum", "class"), "", order), family = ifelse(rank %in% c("kingdom", "phylum", "class", "order"), "", family), genus = ifelse(rank %in% c("kingdom", "phylum", "class", "order", "family"), "", genus), species = ifelse(rank %in% c("kingdom", "phylum", "class", "order", "family", "genus"), "", species), subspecies = ifelse(rank %in% c("kingdom", "phylum", "class", "order", "family", "genus", "species"), "", subspecies)) # Save intermediate results (0) ------------------------------------------- saveRDS(taxonomy, "data-raw/taxonomy0.rds") # Add missing and fix old taxonomic entries ------------------------------- # this part will make sure that the whole taxonomy of every included species exists, so no missing genera, classes, etc. current_gbif <- taxonomy_gbif.bak %>% filter(is.na(acceptedNameUsageID)) %>% mutate( taxonID = as.character(taxonID), parentNameUsageID = as.character(parentNameUsageID) ) # add missing kingdoms taxonomy_all_missing <- taxonomy %>% filter(kingdom != "") %>% distinct(kingdom) %>% mutate( fullname = kingdom, rank = "kingdom" ) %>% filter(!paste(kingdom, rank) %in% paste(taxonomy$kingdom, taxonomy$rank)) %>% left_join( current_gbif %>% select(kingdom, rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID), by = c("kingdom", "rank") ) %>% mutate(source = ifelse(!is.na(gbif), "GBIF", "manually added"), status = ifelse(!is.na(gbif), "accepted", "unknown")) # 2 = phylum ... 6 = genus for (i in 2:6) { i_name <- colnames(taxonomy)[i + 1] message("Adding missing: ", i_name, "... ", appendLF = FALSE) to_add <- taxonomy %>% filter(.[[i + 1]] != "") %>% distinct(kingdom, .[[i + 1]], .keep_all = TRUE) %>% select(kingdom:(i + 1)) %>% mutate( fullname = .[[ncol(.)]], rank = i_name ) %>% filter(!paste(kingdom, .[[ncol(.) - 2]], rank) %in% paste(taxonomy$kingdom, taxonomy[[i + 1]], taxonomy$rank)) %>% # get GBIF identifier where available left_join( current_gbif %>% select(kingdom, all_of(i_name), rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID), by = c("kingdom", "rank", i_name) ) %>% mutate(source = ifelse(!is.na(gbif), "GBIF", "manually added"), status = ifelse(!is.na(gbif), "accepted", "unknown")) message("n = ", nrow(to_add)) taxonomy_all_missing <- taxonomy_all_missing %>% bind_rows(to_add) } taxonomy_all_missing %>% View() taxonomy <- taxonomy %>% bind_rows(taxonomy_all_missing) # fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class) taxonomy <- taxonomy %>% mutate( rank_index = case_when( rank == "subspecies" ~ 1, rank == "species" ~ 2, rank == "genus" ~ 3, rank == "family" ~ 4, rank == "order" ~ 5, rank == "class" ~ 6, TRUE ~ 7 ), fullname_rank = paste0(fullname, " {", rank, "}") ) %>% arrange(kingdom, fullname, rank_index) %>% group_by(kingdom, fullname) %>% mutate(fullname = if_else(row_number() > 1, fullname_rank, fullname)) %>% ungroup() %>% select(-fullname_rank, -rank_index) %>% arrange(fullname) # now also add missing species that have subspecies (requires combination with genus) missing_species <- taxonomy %>% filter(species != "") %>% distinct(kingdom, genus, species, .keep_all = TRUE) %>% select(kingdom:species) %>% mutate( fullname = paste(genus, species), rank = "species" ) %>% filter(!paste(kingdom, genus, species, rank) %in% paste(taxonomy$kingdom, taxonomy$genus, taxonomy$species, taxonomy$rank)) %>% # get GBIF identifier where available left_join( current_gbif %>% select(kingdom, genus, species = specificEpithet, rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID), by = c("kingdom", "rank", "genus", "species") ) %>% mutate(source = ifelse(!is.na(gbif), "GBIF", "manually added"), status = ifelse(!is.na(gbif), "accepted", "unknown")) taxonomy <- taxonomy %>% bind_rows(missing_species) # remove NAs from taxonomy again, and keep unique full names taxonomy <- taxonomy %>% mutate(across(kingdom:subspecies, function(x) ifelse(is.na(x), "", x))) %>% arrange(kingdom, fullname, ref) %>% distinct(kingdom, fullname, .keep_all = TRUE) %>% filter(kingdom != "") # Save intermediate results (1) ------------------------------------------- saveRDS(taxonomy, "data-raw/taxonomy1.rds") # Get previously manually added entries ----------------------------------- manually_added <- AMR::microorganisms %>% filter(source == "manually added", !paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname), !rank %in% c("kingdom", "phylum", "class", "order", "family")) %>% select(fullname:subspecies, ref, source, rank) # get latest taxonomy for those entries for (g in unique(manually_added$genus[manually_added$genus != "" & manually_added$genus %in% taxonomy$genus])) { manually_added$family[which(manually_added$genus == g)] <- taxonomy$family[which(taxonomy$genus == g & is.na(taxonomy$lpsn))][1] } for (f in unique(manually_added$family[manually_added$family != "" & manually_added$family %in% taxonomy$family])) { manually_added$order[which(manually_added$family == f)] <- taxonomy$order[which(taxonomy$family == f & is.na(taxonomy$lpsn))][1] } for (o in unique(manually_added$order[manually_added$order != "" & manually_added$order %in% taxonomy$order])) { manually_added$class[which(manually_added$order == o)] <- taxonomy$class[which(taxonomy$order == o & is.na(taxonomy$lpsn))][1] } for (cc in unique(manually_added$class[manually_added$class != "" & manually_added$class %in% taxonomy$class])) { manually_added$phylum[which(manually_added$class == cc)] <- taxonomy$phylum[which(taxonomy$class == cc & is.na(taxonomy$lpsn))][1] } for (p in unique(manually_added$phylum[manually_added$phylum != "" & manually_added$phylum %in% taxonomy$phylum])) { manually_added$kingdom[which(manually_added$phylum == p)] <- taxonomy$kingdom[which(taxonomy$phylum == p & is.na(taxonomy$lpsn))][1] } manually_added <- manually_added %>% mutate( status = "unknown", rank = ifelse(fullname %like% "unknown", "(unknown rank)", rank) ) manually_added %>% View() # these are now included in the new taxonomy, check them manually_added %>% filter(fullname %in% taxonomy$fullname) taxonomy <- taxonomy %>% # here also the 'unknowns' are added, such as "(unknown fungus)" bind_rows(manually_added) %>% arrange(fullname) table(taxonomy$rank, useNA = "always") # Get LPSN data for records missing from `taxonomy_lpsn` ------------------ # Weirdly enough, some LPSN records were lacking from `taxonomy_lpsn`, # such as family Thiotrichaceae and its order Thiotrichales. When running # get_lpsn_and_author("family", "Thiotrichaceae") you do get a result, vs. taxonomy_lpsn %>% filter(family == "Thiotrichaceae"). # So check every non-LPSN records from the kingdom of Bacteria and add it gbif_bacteria <- which(taxonomy$kingdom == "Bacteria" & taxonomy$source == "GBIF" & taxonomy$rank %in% c("phylum", "class", "order", "family")) added <- 0 pb <- progress_bar$new(total = length(gbif_bacteria), format = "[:bar] :current/:total :eta") for (record in gbif_bacteria) { pb$tick() lpsn <- get_lpsn_and_author(rank = taxonomy$rank[record], name = taxonomy$fullname[record]) if (is.na(lpsn["lpsn"])) { next } else { added <- added + 1 taxonomy$source[record] <- "LPSN" taxonomy$lpsn[record] <- unname(lpsn["lpsn"]) taxonomy$ref[record] <- unname(lpsn["ref"]) taxonomy$status[record] <- unname(lpsn["status"]) } } warnings() message(added, " GBIF records altered to latest LPSN") taxbak <- taxonomy saveRDS(taxonomy, "data-raw/taxonomy0b.rds") # Clean scientific reference ---------------------------------------------- taxonomy <- taxonomy %>% mutate(ref = get_author_year(ref)) # Get the latest upper taxonomy from LPSN/MycoBank for GBIF data --------------- # (e.g., phylum above class "Bacilli" was still "Firmicutes" in 2023, should be "Bacillota") for (k in unique(taxonomy$kingdom[taxonomy$kingdom != ""])) { if (k == "Fungi") { src <- "MycoBank" } else { src <- "LPSN" } message("Fixing GBIF taxonomy for kingdom ", k, " based on ", src, ".", appendLF = FALSE) i <- 0 for (g in unique(taxonomy$genus[taxonomy$genus != "" & taxonomy$kingdom == k & taxonomy$source == src])) { i <- i + 1 if (i %% 50 == 0) message(".", appendLF = FALSE) taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k)] <- taxonomy$family[which(taxonomy$genus == g & taxonomy$kingdom == k & taxonomy$source == src)][1] } for (f in unique(taxonomy$family[taxonomy$family != "" & taxonomy$kingdom == k & taxonomy$source == src])) { i <- i + 1 if (i %% 50 == 0) message(".", appendLF = FALSE) taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k)] <- taxonomy$order[which(taxonomy$family == f & taxonomy$kingdom == k & taxonomy$source == src)][1] } for (o in unique(taxonomy$order[taxonomy$order != "" & taxonomy$kingdom == k & taxonomy$source == src])) { i <- i + 1 if (i %% 50 == 0) message(".", appendLF = FALSE) taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k)] <- taxonomy$class[which(taxonomy$order == o & taxonomy$kingdom == k & taxonomy$source == src)][1] } for (cc in unique(taxonomy$class[taxonomy$class != "" & taxonomy$kingdom == k & taxonomy$source == src])) { i <- i + 1 if (i %% 50 == 0) message(".", appendLF = FALSE) taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k)] <- taxonomy$phylum[which(taxonomy$class == cc & taxonomy$kingdom == k & taxonomy$source == src)][1] } message("OK.") } # fix rank taxonomy <- taxonomy %>% mutate(rank = case_when( subspecies != "" ~ "subspecies", species != "" ~ "species", genus != "" ~ "genus", family != "" ~ "family", order != "" ~ "order", class != "" ~ "class", phylum != "" ~ "phylum", kingdom != "" ~ "kingdom", TRUE ~ NA_character_ )) # Add parent identifiers -------------------------------------------------- # requires full name and full taxonomy taxonomybak <- taxonomy taxonomy <- taxonomy %>% mutate( lpsn_parent = case_when( rank == "phylum" ~ lpsn[match(kingdom, fullname)], # in class, take parent from phylum if available, otherwise kingdom rank == "class" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "class" ~ lpsn[match(kingdom, fullname)], # in order, take parent from class if available, otherwise phylum, otherwise kingdom rank == "order" & class != "" ~ lpsn[match(class, fullname)], rank == "order" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "order" ~ lpsn[match(kingdom, fullname)], # family rank == "family" & order != "" ~ lpsn[match(order, fullname)], rank == "family" & class != "" ~ lpsn[match(class, fullname)], rank == "family" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "family" ~ lpsn[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ lpsn[match(family, fullname)], rank == "genus" & order != "" ~ lpsn[match(order, fullname)], rank == "genus" & class != "" ~ lpsn[match(class, fullname)], rank == "genus" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "genus" ~ lpsn[match(kingdom, fullname)], # species, always has a genus rank == "species" ~ lpsn[match(genus, fullname)], # subspecies, always has a genus + species rank == "subspecies" ~ lpsn[match(paste(genus, species), fullname)], TRUE ~ NA_character_), mycobank_parent = case_when( rank == "phylum" ~ mycobank[match(kingdom, fullname)], # class rank == "class" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "class" ~ mycobank[match(kingdom, fullname)], # order rank == "order" & class != "" ~ mycobank[match(class, fullname)], rank == "order" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "order" ~ mycobank[match(kingdom, fullname)], # family rank == "family" & order != "" ~ mycobank[match(order, fullname)], rank == "family" & class != "" ~ mycobank[match(class, fullname)], rank == "family" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "family" ~ mycobank[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ mycobank[match(family, fullname)], rank == "genus" & order != "" ~ mycobank[match(order, fullname)], rank == "genus" & class != "" ~ mycobank[match(class, fullname)], rank == "genus" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "genus" ~ mycobank[match(kingdom, fullname)], # species rank == "species" ~ mycobank[match(genus, fullname)], # subspecies rank == "subspecies" ~ mycobank[match(paste(genus, species), fullname)], TRUE ~ NA_character_), gbif_parent = case_when( rank == "phylum" ~ gbif[match(kingdom, fullname)], # class rank == "class" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "class" ~ gbif[match(kingdom, fullname)], # order rank == "order" & class != "" ~ gbif[match(class, fullname)], rank == "order" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "order" ~ gbif[match(kingdom, fullname)], # family rank == "family" & order != "" ~ gbif[match(order, fullname)], rank == "family" & class != "" ~ gbif[match(class, fullname)], rank == "family" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "family" ~ gbif[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ gbif[match(family, fullname)], rank == "genus" & order != "" ~ gbif[match(order, fullname)], rank == "genus" & class != "" ~ gbif[match(class, fullname)], rank == "genus" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "genus" ~ gbif[match(kingdom, fullname)], # species rank == "species" ~ gbif[match(genus, fullname)], # subspecies rank == "subspecies" ~ gbif[match(paste(genus, species), fullname)], TRUE ~ NA_character_)) # these still have no record in our data set: which(!taxonomy$lpsn_parent %in% taxonomy$lpsn) which(!taxonomy$mycobank_parent %in% taxonomy$mycobank) which(!taxonomy$gbif_parent %in% taxonomy$gbif) # Add prevalence ---------------------------------------------------------- taxonomy_lpsn.bak3 <- taxonomy pathogens <- read_excel(file_bartlett, sheet = "Tab 6 Full List") # get all established, both old and current taxonomic names established <- pathogens %>% filter(status == "established") %>% mutate(fullname = paste(genus, species)) %>% pull(fullname) %>% c( unlist(mo_current(.)), unlist(mo_synonyms(., keep_synonyms = FALSE)) ) %>% strsplit(" ", fixed = TRUE) %>% sapply(function(x) ifelse(length(x) == 1, x, paste(x[1], x[2]))) %>% sort() %>% unique() # get all putative, both old and current taxonomic names putative <- pathogens %>% filter(status == "putative") %>% mutate(fullname = paste(genus, species)) %>% pull(fullname) %>% c( unlist(mo_current(.)), unlist(mo_synonyms(., keep_synonyms = FALSE)) ) %>% strsplit(" ", fixed = TRUE) %>% sapply(function(x) ifelse(length(x) == 1, x, paste(x[1], x[2]))) %>% sort() %>% unique() established <- established[established %unlike% "unknown"] putative <- putative[putative %unlike% "unknown"] established_genera <- established %>% strsplit(" ", fixed = TRUE) %>% sapply(function(x) x[1]) %>% sort() %>% unique() putative_genera <- putative %>% strsplit(" ", fixed = TRUE) %>% sapply(function(x) x[1]) %>% sort() %>% unique() nonbacterial_genera <- AMR:::MO_PREVALENT_GENERA %>% c( unlist(mo_current(.)), unlist(mo_synonyms(., keep_synonyms = FALSE)) ) %>% strsplit(" ", fixed = TRUE) %>% sapply(function(x) x[1]) %>% sort() %>% unique() nonbacterial_genera <- nonbacterial_genera[nonbacterial_genera %unlike% "unknown"] # update prevalence based on taxonomy (following the recent and thorough work of Bartlett et al., 2022) # see https://doi.org/10.1099/mic.0.001269 taxonomy <- taxonomy %>% mutate(prevalence = case_when( # 'established' means 'have infected at least three persons in three or more references' paste(genus, species) %in% established & rank %in% c("species", "subspecies") ~ 1.0, # other genera in the 'established' group genus %in% established_genera & rank == "genus" ~ 1.0, # 'putative' means 'fewer than three known cases' paste(genus, species) %in% putative & rank %in% c("species", "subspecies") ~ 1.25, # other genera in the 'putative' group genus %in% putative_genera & rank == "genus" ~ 1.25, # we keep track of prevalent genera too of non-bacterial species genus %in% AMR:::MO_PREVALENT_GENERA & kingdom != "Bacteria" & rank %in% c("genus", "species", "subspecies") ~ 1.25, # species and subspecies in 'established' and 'putative' groups genus %in% c(established_genera, putative_genera) & rank %in% c("species", "subspecies") ~ 1.5, # other species from a genus in either group genus %in% nonbacterial_genera & rank %in% c("genus", "species", "subspecies") ~ 1.5, # all others TRUE ~ 2.0 )) table(taxonomy$prevalence, useNA = "always") # (a lot will be removed further below) # Save intermediate results (2) ------------------------------------------- saveRDS(taxonomy, "data-raw/taxonomy2.rds") # taxonomy <- readRDS("data-raw/taxonomy2.rds") # Remove unwanted taxonomic entries --------------------------------------- part1 <- taxonomy %>% filter( # keep all we added ourselves: source == "manually added" | # keep all bacteria anyway, main focus of our package: kingdom == "Bacteria" | # and these kingdoms are very small, and also mainly microorganisms: kingdom == "Protozoa" | kingdom == "Archaea" | kingdom == "Chromista" | # keep everything from family up, 'ghost' entries will be removed later on rank %in% c("kingdom", "phylum", "class", "order", "family") | # other relevant genera to keep: genus %in% AMR:::MO_PREVALENT_GENERA | # relevant for biotechnology: genus %in% c("Archaeoglobus", "Desulfurococcus", "Ferroglobus", "Ferroplasma", "Halobacterium", "Halococcus", "Haloferax", "Naegleria", "Nanoarchaeum", "Nitrosopumilus", "Nosema", "Pleistophora", "Pyrobaculum", "Pyrococcus", "Spraguea", "Thelohania", "Thermococcus", "Thermoplasma", "Thermoproteus", "Tritrichomonas") | genus %like_case% "Methano" | # kingdom of Protozoa: (phylum %in% c("Choanozoa", "Mycetozoa") & prevalence < 2) | # Fungi: (kingdom == "Fungi" & (!rank %in% c("genus", "species", "subspecies") | prevalence < 2 | class == "Pichiomycetes")) | # Animalia: genus %in% c("Lucilia", "Lumbricus") | (class == "Insecta" & !rank %in% c("species", "subspecies")) | # keep only genus of insects, not all of their (sub)species (genus == "Amoeba" & kingdom != "Animalia") # keep only in the protozoa, not the animalia ) %>% # this kingdom only contained Curvularia and Hymenolepis, which have coincidental twin names with Fungi filter(kingdom != "Plantae", !(genus %in% c("Aedes", "Anopheles") & rank %in% c("species", "subspecies"))) # now get the parents and old names part2 <- taxonomy %>% filter(gbif %in% c(part1$gbif_parent[!is.na(part1$gbif_parent)], part1$gbif_renamed_to[!is.na(part1$gbif_renamed_to)]) | mycobank %in% c(part1$mycobank_parent[!is.na(part1$mycobank_parent)], part1$mycobank_renamed_to[!is.na(part1$mycobank_renamed_to)]) | lpsn %in% c(part1$lpsn_parent[!is.na(part1$lpsn_parent)], part1$lpsn_renamed_to[!is.na(part1$lpsn_renamed_to)])) parts <- bind_rows(part1, part2) part3 <- taxonomy %>% filter(gbif %in% c(parts$gbif_parent[!is.na(parts$gbif_parent)], parts$gbif_renamed_to[!is.na(parts$gbif_renamed_to)]) | mycobank %in% c(parts$mycobank_parent[!is.na(parts$mycobank_parent)], parts$mycobank_renamed_to[!is.na(parts$mycobank_renamed_to)]) | lpsn %in% c(parts$lpsn_parent[!is.na(parts$lpsn_parent)], parts$lpsn_renamed_to[!is.na(parts$lpsn_renamed_to)])) parts <- bind_rows(part1, part2, part3) part4 <- taxonomy %>% filter(gbif %in% c(parts$gbif_parent[!is.na(parts$gbif_parent)], parts$gbif_renamed_to[!is.na(parts$gbif_renamed_to)]) | mycobank %in% c(parts$mycobank_parent[!is.na(parts$mycobank_parent)], parts$mycobank_renamed_to[!is.na(parts$mycobank_renamed_to)]) | lpsn %in% c(parts$lpsn_parent[!is.na(parts$lpsn_parent)], parts$lpsn_renamed_to[!is.na(parts$lpsn_renamed_to)])) parts <- bind_rows(part1, part2, part3, part4) taxonomy <- bind_rows(part1, part2, part3, part4) %>% arrange(fullname) %>% distinct(fullname, .keep_all = TRUE) # no ghost families, orders classes, phyla taxonomy <- taxonomy %>% group_by(kingdom, family) %>% # (but keep the ghost families of bacteria) filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom" | kingdom == "Bacteria") %>% group_by(kingdom, order) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>% group_by(kingdom, class) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>% group_by(kingdom, phylum) %>% filter(n() > 1 | fullname %like% "unknown" | rank == "kingdom") %>% ungroup() # Add microbial IDs ------------------------------------------------------- # MO codes in the AMR package have the form: KINGDOM_GENUS_SPECIES_SUBSPECIES where all are abbreviated. # Kingdom is abbreviated with 1 character, with exceptions for Animalia and Plantae mo_kingdom <- taxonomy %>% filter(rank == "kingdom") %>% select(kingdom) %>% mutate(mo_kingdom = case_when( kingdom == "Animalia" ~ "AN", kingdom == "Archaea" ~ "A", kingdom == "Bacteria" ~ "B", kingdom == "Chromista" ~ "C", kingdom == "Fungi" ~ "F", kingdom == "Plantae" ~ "PL", kingdom == "Protozoa" ~ "P", TRUE ~ "" )) # phylum until family are abbreviated with 8 characters and prefixed with their rank # Phylum - keep old and fill up for new ones mo_phylum <- taxonomy %>% filter(rank == "phylum") %>% distinct(kingdom, phylum) %>% left_join( AMR::microorganisms %>% filter(rank == "phylum") %>% transmute(kingdom, phylum = fullname, mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo)) ), by = c("kingdom", "phylum") ) %>% group_by(kingdom) %>% mutate( mo_phylum8 = AMR:::abbreviate_mo(phylum, minlength = 8, prefix = "[PHL]_"), mo_phylum9 = AMR:::abbreviate_mo(phylum, minlength = 9, prefix = "[PHL]_"), mo_phylum = ifelse(!is.na(mo_old), mo_old, mo_phylum8), mo_duplicated = duplicated(mo_phylum), mo_phylum = ifelse(mo_duplicated, mo_phylum9, mo_phylum), mo_duplicated = duplicated(mo_phylum) ) %>% ungroup() if (any(mo_phylum$mo_duplicated, na.rm = TRUE)) stop("Duplicate MO codes for phylum!") mo_phylum <- mo_phylum %>% select(kingdom, phylum, mo_phylum) # Class - keep old and fill up for new ones mo_class <- taxonomy %>% filter(rank == "class") %>% distinct(kingdom, class) %>% left_join( AMR::microorganisms %>% filter(rank == "class") %>% transmute(kingdom, class = fullname, mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo)) ), by = c("kingdom", "class") ) %>% group_by(kingdom) %>% mutate( mo_class8 = AMR:::abbreviate_mo(class, minlength = 8, prefix = "[CLS]_"), mo_class9 = AMR:::abbreviate_mo(class, minlength = 9, prefix = "[CLS]_"), mo_class = ifelse(!is.na(mo_old), mo_old, mo_class8), mo_duplicated = duplicated(mo_class), mo_class = ifelse(mo_duplicated, mo_class9, mo_class), mo_duplicated = duplicated(mo_class) ) %>% ungroup() if (any(mo_class$mo_duplicated, na.rm = TRUE)) stop("Duplicate MO codes for class!") mo_class <- mo_class %>% select(kingdom, class, mo_class) # Order - keep old and fill up for new ones mo_order <- taxonomy %>% filter(rank == "order") %>% distinct(kingdom, order) %>% left_join( AMR::microorganisms %>% filter(rank == "order") %>% transmute(kingdom, order = fullname, mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo)) ), by = c("kingdom", "order") ) %>% group_by(kingdom) %>% mutate( mo_order8 = AMR:::abbreviate_mo(order, minlength = 8, prefix = "[ORD]_"), mo_order9 = AMR:::abbreviate_mo(order, minlength = 9, prefix = "[ORD]_"), mo_order = ifelse(!is.na(mo_old), mo_old, mo_order8), mo_duplicated = duplicated(mo_order), mo_order = ifelse(mo_duplicated, mo_order9, mo_order), mo_duplicated = duplicated(mo_order) ) %>% ungroup() if (any(mo_order$mo_duplicated, na.rm = TRUE)) stop("Duplicate MO codes for order!") mo_order <- mo_order %>% select(kingdom, order, mo_order) # Family - keep old and fill up for new ones mo_family <- taxonomy %>% filter(rank == "family") %>% distinct(kingdom, family) %>% left_join( AMR::microorganisms %>% filter(rank == "family") %>% transmute(kingdom, family = fullname, mo_old = gsub("[A-Z]{1,2}_", "", as.character(mo)) ), by = c("kingdom", "family") ) %>% group_by(kingdom) %>% mutate( mo_family8 = AMR:::abbreviate_mo(family, minlength = 8, prefix = "[FAM]_"), mo_family9 = AMR:::abbreviate_mo(family, minlength = 9, prefix = "[FAM]_"), mo_family = ifelse(!is.na(mo_old), mo_old, mo_family8), mo_duplicated = duplicated(mo_family), mo_family = ifelse(mo_duplicated, mo_family9, mo_family), mo_duplicated = duplicated(mo_family) ) %>% ungroup() if (any(mo_family$mo_duplicated, na.rm = TRUE)) stop("Duplicate MO codes for family!") mo_family <- mo_family %>% select(kingdom, family, mo_family) # construct code part for genus - keep old code where available and generate new ones where needed mo_genus <- taxonomy %>% filter(rank == "genus") %>% distinct(kingdom, genus) %>% # get available old MO codes left_join( AMR::microorganisms %>% filter(rank == "genus") %>% transmute(mo_genus_old = gsub("^[A-Z]+_", "", as.character(mo)), kingdom, genus) %>% distinct(kingdom, genus, .keep_all = TRUE), by = c("kingdom", "genus") ) %>% distinct(kingdom, genus, .keep_all = TRUE) %>% # since kingdom is part of the code, genus abbreviations may be duplicated between kingdoms group_by(kingdom) %>% # generate new MO codes for genus and set the right one mutate( mo_genus_new5 = AMR:::abbreviate_mo(genus, 5), mo_genus_new5b = paste0(AMR:::abbreviate_mo(genus, 5), 1), mo_genus_new5c = paste0(AMR:::abbreviate_mo(genus, 5), 2), mo_genus_new6 = AMR:::abbreviate_mo(genus, 6), mo_genus_new7 = AMR:::abbreviate_mo(genus, 7), mo_genus_new8 = AMR:::abbreviate_mo(genus, 8), mo_genus_new = case_when( !is.na(mo_genus_old) ~ mo_genus_old, !mo_genus_new5 %in% mo_genus_old ~ mo_genus_new5, !mo_genus_new6 %in% mo_genus_old ~ mo_genus_new6, !mo_genus_new7 %in% mo_genus_old ~ mo_genus_new7, !mo_genus_new8 %in% mo_genus_old ~ mo_genus_new8, !mo_genus_new5b %in% mo_genus_old ~ mo_genus_new5b, TRUE ~ mo_genus_old ), mo_duplicated = duplicated(mo_genus_new), mo_genus_new = case_when( !mo_duplicated ~ mo_genus_new, mo_duplicated & mo_genus_new == mo_genus_old ~ mo_genus_new6, mo_duplicated & mo_genus_new == mo_genus_new5 ~ mo_genus_new6, mo_duplicated & mo_genus_new == mo_genus_new6 ~ mo_genus_new7, mo_duplicated & mo_genus_new == mo_genus_new7 ~ mo_genus_new8, mo_duplicated & mo_genus_new == mo_genus_new8 ~ mo_genus_new5b, TRUE ~ NA_character_ ), mo_duplicated = duplicated(mo_genus_new), mo_genus_new = case_when( !mo_duplicated ~ mo_genus_new, mo_duplicated & mo_genus_new == mo_genus_new6 ~ mo_genus_new7, mo_duplicated & mo_genus_new == mo_genus_new7 ~ mo_genus_new8, mo_duplicated & mo_genus_new == mo_genus_new8 ~ mo_genus_new5b, mo_duplicated ~ mo_genus_new5c, TRUE ~ NA_character_ ), mo_duplicated = duplicated(mo_genus_new) ) %>% ungroup() if (any(mo_genus$mo_duplicated, na.rm = TRUE) | anyNA(mo_genus$mo_genus_new)) stop("Duplicate MO codes for genus!") # mo_genus %>% filter(mo_duplicated) # no duplicates *within kingdoms*, so keep the right columns for left joining later mo_genus <- mo_genus %>% select(kingdom, genus, mo_genus = mo_genus_new) # same for species - keep old where available and create new per kingdom-genus where needed: mo_species <- taxonomy %>% filter(rank == "species") %>% distinct(kingdom, genus, species) %>% left_join( AMR::microorganisms %>% filter(rank == "species") %>% transmute(mo_species_old = gsub("^[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species) %>% filter(mo_species_old %unlike% "-") %>% distinct(kingdom, genus, species, .keep_all = TRUE), by = c("kingdom", "genus", "species") ) %>% distinct(kingdom, genus, species, .keep_all = TRUE) %>% group_by(kingdom, genus) %>% mutate( mo_species_new4 = AMR:::abbreviate_mo(species, 4, hyphen_as_space = TRUE), mo_species_new5 = AMR:::abbreviate_mo(species, 5, hyphen_as_space = TRUE), mo_species_new5b = paste0(AMR:::abbreviate_mo(species, 5, hyphen_as_space = TRUE), 1), mo_species_new6 = AMR:::abbreviate_mo(species, 6, hyphen_as_space = TRUE), mo_species_new7 = AMR:::abbreviate_mo(species, 7, hyphen_as_space = TRUE), mo_species_new8 = AMR:::abbreviate_mo(species, 8, hyphen_as_space = TRUE), mo_species_new = case_when( !is.na(mo_species_old) ~ mo_species_old, !mo_species_new4 %in% mo_species_old ~ mo_species_new4, !mo_species_new5 %in% mo_species_old ~ mo_species_new5, !mo_species_new6 %in% mo_species_old ~ mo_species_new6, !mo_species_new7 %in% mo_species_old ~ mo_species_new7, !mo_species_new8 %in% mo_species_old ~ mo_species_new8, !mo_species_new5b %in% mo_species_old ~ mo_species_new5b, TRUE ~ mo_species_old ), mo_duplicated = duplicated(mo_species_new), mo_species_new = case_when( !mo_duplicated ~ mo_species_new, mo_duplicated & mo_species_new == mo_species_new4 ~ mo_species_new5, mo_duplicated & mo_species_new == mo_species_new5 ~ mo_species_new6, mo_duplicated & mo_species_new == mo_species_new6 ~ mo_species_new7, mo_duplicated & mo_species_new == mo_species_new7 ~ mo_species_new8, mo_duplicated & mo_species_new == mo_species_new8 ~ mo_species_new5b, TRUE ~ NA_character_ ), mo_duplicated = duplicated(mo_species_new) ) %>% ungroup() if (any(mo_species$mo_duplicated, na.rm = TRUE) | anyNA(mo_species$mo_species_new)) stop("Duplicate MO codes for species!") # no duplicates *within kingdoms*, so keep the right columns for left joining later mo_species <- mo_species %>% select(kingdom, genus, species, mo_species = mo_species_new) # same for subspecies - keep old where available and create new per kingdom-genus-species where needed: mo_subspecies <- taxonomy %>% filter(rank == "subspecies") %>% distinct(kingdom, genus, species, subspecies) %>% left_join( AMR::microorganisms %>% filter(rank %in% c("subspecies", "subsp.", "infraspecies")) %>% transmute(mo_subspecies_old = gsub("^[A-Z]+_[A-Z]+_[A-Z]+_", "", as.character(mo)), kingdom, genus, species, subspecies) %>% filter(mo_subspecies_old %unlike% "-") %>% distinct(kingdom, genus, species, subspecies, .keep_all = TRUE), by = c("kingdom", "genus", "species", "subspecies") ) %>% distinct(kingdom, genus, species, subspecies, .keep_all = TRUE) %>% group_by(kingdom, genus, species) %>% mutate( mo_subspecies_new4 = AMR:::abbreviate_mo(subspecies, 4, hyphen_as_space = TRUE), mo_subspecies_new5 = AMR:::abbreviate_mo(subspecies, 5, hyphen_as_space = TRUE), mo_subspecies_new5b = paste0(AMR:::abbreviate_mo(subspecies, 5, hyphen_as_space = TRUE), 1), mo_subspecies_new6 = AMR:::abbreviate_mo(subspecies, 6, hyphen_as_space = TRUE), mo_subspecies_new7 = AMR:::abbreviate_mo(subspecies, 7, hyphen_as_space = TRUE), mo_subspecies_new8 = AMR:::abbreviate_mo(subspecies, 8, hyphen_as_space = TRUE), mo_subspecies_new = case_when( !is.na(mo_subspecies_old) ~ mo_subspecies_old, !mo_subspecies_new4 %in% mo_subspecies_old ~ mo_subspecies_new4, !mo_subspecies_new5 %in% mo_subspecies_old ~ mo_subspecies_new5, !mo_subspecies_new6 %in% mo_subspecies_old ~ mo_subspecies_new6, !mo_subspecies_new7 %in% mo_subspecies_old ~ mo_subspecies_new7, !mo_subspecies_new8 %in% mo_subspecies_old ~ mo_subspecies_new8, !mo_subspecies_new5b %in% mo_subspecies_old ~ mo_subspecies_new5b, TRUE ~ mo_subspecies_old ), mo_duplicated = duplicated(mo_subspecies_new), mo_subspecies_new = case_when( !mo_duplicated ~ mo_subspecies_new, mo_duplicated & mo_subspecies_new == mo_subspecies_new4 ~ mo_subspecies_new5, mo_duplicated & mo_subspecies_new == mo_subspecies_new5 ~ mo_subspecies_new6, mo_duplicated & mo_subspecies_new == mo_subspecies_new6 ~ mo_subspecies_new7, mo_duplicated & mo_subspecies_new == mo_subspecies_new7 ~ mo_subspecies_new8, mo_duplicated & mo_subspecies_new == mo_subspecies_new8 ~ mo_subspecies_new5b, TRUE ~ NA_character_ ), mo_duplicated = duplicated(mo_subspecies_new) ) %>% ungroup() if (any(mo_subspecies$mo_duplicated, na.rm = TRUE) | anyNA(mo_subspecies$mo_subspecies_new)) stop("Duplicate MO codes for subspecies!") # no duplicates *within kingdoms*, so keep the right columns for left joining later mo_subspecies <- mo_subspecies %>% select(kingdom, genus, species, subspecies, mo_subspecies = mo_subspecies_new) # unknowns - manually added mo_unknown <- AMR::microorganisms %>% filter(fullname %like% "unknown") %>% transmute(fullname, mo_unknown = as.character(mo)) # apply the new codes! taxonomybak2 <- taxonomy taxonomy <- taxonomy %>% left_join(mo_kingdom, by = "kingdom") %>% left_join(mo_phylum, by = c("kingdom", "phylum")) %>% left_join(mo_class, by = c("kingdom", "class")) %>% left_join(mo_order, by = c("kingdom", "order")) %>% left_join(mo_family, by = c("kingdom", "family")) %>% left_join(mo_genus, by = c("kingdom", "genus")) %>% left_join(mo_species, by = c("kingdom", "genus", "species")) %>% left_join(mo_subspecies, by = c("kingdom", "genus", "species", "subspecies")) %>% left_join(mo_unknown, by = "fullname") %>% mutate(across(starts_with("mo_"), function(x) ifelse(is.na(x), "", x))) %>% mutate( mo = case_when( fullname %like% "unknown" ~ mo_unknown, # add special cases for taxons higher than genus rank == "kingdom" ~ paste(mo_kingdom, "[KNG]", toupper(kingdom), sep = "_"), rank == "phylum" ~ paste(mo_kingdom, mo_phylum, sep = "_"), rank == "class" ~ paste(mo_kingdom, mo_class, sep = "_"), rank == "order" ~ paste(mo_kingdom, mo_order, sep = "_"), rank == "family" ~ paste(mo_kingdom, mo_family, sep = "_"), TRUE ~ paste(mo_kingdom, mo_genus, mo_species, mo_subspecies, sep = "_") ), mo = trimws(gsub("_+$", "", mo)), .before = 1 ) %>% select(!starts_with("mo_")) %>% arrange(fullname) # now check these duplicate fullnames taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE]) %>% View() # we will prefer kingdoms as Bacteria > Fungi > Protozoa > Archaea > Animalia, so check the ones within 1 kingdom taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE]) %>% group_by(fullname) %>% filter(n_distinct(kingdom) == 1) # fullnames must be unique, we'll keep the most relevant ones only taxonomy <- taxonomy %>% mutate(rank_index = case_when( kingdom == "Bacteria" ~ 1, kingdom == "Fungi" ~ 2, kingdom == "Protozoa" ~ 3, kingdom == "Archaea" ~ 4, kingdom == "Animalia" ~ 5, TRUE ~ 6 )) %>% arrange(fullname, rank_index) %>% distinct(fullname, .keep_all = TRUE) %>% select(-rank_index) %>% filter(mo != "") # keep the codes from manually added ones manual_mos <- as.character(AMR::microorganisms$mo)[match(taxonomy$fullname[taxonomy$source == "manually added"], AMR::microorganisms$fullname)] taxonomy$mo[taxonomy$source == "manually added"][!is.na(manual_mos)] <- manual_mos[!is.na(manual_mos)] # remove the manually added ones that are now ghosts taxonomy <- taxonomy %>% filter(!(mo %like% "__" & source == "manually added")) # this must not exist (2024: GBIF mess with missing genera - remove them): taxonomy %>% filter(mo %like% "__") %>% View() taxonomy <- taxonomy %>% filter(mo %unlike% "__") taxonomy_lpsn.bak4 <- taxonomy # Some integrity checks --------------------------------------------------- # are mo codes unique? taxonomy %>% filter(mo %in% .[duplicated(mo), "mo", drop = TRUE]) %>% arrange(mo) %>% View() # no, there is one weird entry from MycoBank, this is recorded as class while it's a phylum fix <- which(taxonomy$fullname == "Chytridiopsidomycota") taxonomy$class[fix] <- "" taxonomy$rank[fix] <- "phylum" taxonomy$mo[fix] <- paste0("F_", AMR:::abbreviate_mo("Chytridiopsidomycota", minlength = 8, prefix = "[PHL]_")) # check again taxonomy %>% filter(mo %in% .[duplicated(mo), "mo", drop = TRUE]) %>% arrange(mo) %>% View() # keep the firsts taxonomy <- taxonomy %>% arrange(mo) %>% distinct(mo, .keep_all = TRUE) # are fullnames unique? taxonomy %>% arrange(fullname) %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE]) %>% View() # # we can now solve it like this, but check every year if this is okay # taxonomy <- taxonomy %>% # group_by(fullname) %>% # mutate(fullname = if_else(duplicated(fullname), paste0(fullname, " {", rank, "}"), fullname)) %>% # ungroup() # are all GBIFs available? taxonomy %>% filter((!gbif_parent %in% gbif) | (!lpsn_parent %in% lpsn) | (!mycobank_parent %in% mycobank)) %>% count(source = case_when(!gbif_parent %in% gbif ~ "GBIF", !lpsn_parent %in% lpsn ~ "LPSN", !mycobank_parent %in% mycobank ~ "MycoBank", TRUE ~ "?"), rank) # so fix again all parent identifiers taxonomybak5 <- taxonomy taxonomy <- taxonomy %>% mutate( lpsn_parent = case_when( rank == "phylum" ~ lpsn[match(kingdom, fullname)], # in class, take parent from phylum if available, otherwise kingdom rank == "class" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "class" ~ lpsn[match(kingdom, fullname)], # in order, take parent from class if available, otherwise phylum, otherwise kingdom rank == "order" & class != "" ~ lpsn[match(class, fullname)], rank == "order" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "order" ~ lpsn[match(kingdom, fullname)], # family rank == "family" & order != "" ~ lpsn[match(order, fullname)], rank == "family" & class != "" ~ lpsn[match(class, fullname)], rank == "family" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "family" ~ lpsn[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ lpsn[match(family, fullname)], rank == "genus" & order != "" ~ lpsn[match(order, fullname)], rank == "genus" & class != "" ~ lpsn[match(class, fullname)], rank == "genus" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "genus" ~ lpsn[match(kingdom, fullname)], # species, always has a genus rank == "species" ~ lpsn[match(genus, fullname)], # subspecies, always has a genus + species rank == "subspecies" ~ lpsn[match(paste(genus, species), fullname)], TRUE ~ NA_character_), mycobank_parent = case_when( rank == "phylum" ~ mycobank[match(kingdom, fullname)], # class rank == "class" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "class" ~ mycobank[match(kingdom, fullname)], # order rank == "order" & class != "" ~ mycobank[match(class, fullname)], rank == "order" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "order" ~ mycobank[match(kingdom, fullname)], # family rank == "family" & order != "" ~ mycobank[match(order, fullname)], rank == "family" & class != "" ~ mycobank[match(class, fullname)], rank == "family" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "family" ~ mycobank[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ mycobank[match(family, fullname)], rank == "genus" & order != "" ~ mycobank[match(order, fullname)], rank == "genus" & class != "" ~ mycobank[match(class, fullname)], rank == "genus" & phylum != "" ~ mycobank[match(phylum, fullname)], rank == "genus" ~ mycobank[match(kingdom, fullname)], # species rank == "species" ~ mycobank[match(genus, fullname)], # subspecies rank == "subspecies" ~ mycobank[match(paste(genus, species), fullname)], TRUE ~ NA_character_), gbif_parent = case_when( rank == "phylum" ~ gbif[match(kingdom, fullname)], # class rank == "class" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "class" ~ gbif[match(kingdom, fullname)], # order rank == "order" & class != "" ~ gbif[match(class, fullname)], rank == "order" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "order" ~ gbif[match(kingdom, fullname)], # family rank == "family" & order != "" ~ gbif[match(order, fullname)], rank == "family" & class != "" ~ gbif[match(class, fullname)], rank == "family" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "family" ~ gbif[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ gbif[match(family, fullname)], rank == "genus" & order != "" ~ gbif[match(order, fullname)], rank == "genus" & class != "" ~ gbif[match(class, fullname)], rank == "genus" & phylum != "" ~ gbif[match(phylum, fullname)], rank == "genus" ~ gbif[match(kingdom, fullname)], # species rank == "species" ~ gbif[match(genus, fullname)], # subspecies rank == "subspecies" ~ gbif[match(paste(genus, species), fullname)], TRUE ~ NA_character_)) # check again taxonomy %>% filter((!gbif_parent %in% gbif) | (!lpsn_parent %in% lpsn) | (!mycobank_parent %in% mycobank)) %>% count(source = case_when(!gbif_parent %in% gbif ~ "GBIF", !lpsn_parent %in% lpsn ~ "LPSN", !mycobank_parent %in% mycobank ~ "MycoBank", TRUE ~ "?"), rank) # Save intermediate results (3) ------------------------------------------- saveRDS(taxonomy, "data-raw/taxonomy3.rds") # Redo LPSN missings and parents ------------------------------------------ gbif_bacteria_second_run <- which(taxonomy$kingdom == "Bacteria" & taxonomy$source == "GBIF" & taxonomy$rank %in% c("phylum", "class", "order", "family")) added <- 0 pb <- progress_bar$new(total = length(gbif_bacteria_second_run), format = "[:bar] :current/:total :eta") for (record in gbif_bacteria_second_run) { pb$tick() suppressWarnings( lpsn <- get_lpsn_and_author(rank = taxonomy$rank[record], name = taxonomy$fullname[record]) ) if (is.na(lpsn["lpsn"])) { next } else { added <- added + 1 taxonomy$source[record] <- "LPSN" taxonomy$lpsn[record] <- unname(lpsn["lpsn"]) taxonomy$ref[record] <- unname(lpsn["ref"]) taxonomy$status[record] <- unname(lpsn["status"]) } } message(added, " GBIF records altered to latest LPSN") taxbak <- taxonomy # update parent LPSN identifier again taxonomy <- taxonomy %>% mutate( lpsn_parent = case_when( rank == "phylum" ~ lpsn[match(kingdom, fullname)], # in class, take parent from phylum if available, otherwise kingdom rank == "class" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "class" ~ lpsn[match(kingdom, fullname)], # in order, take parent from class if available, otherwise phylum, otherwise kingdom rank == "order" & class != "" ~ lpsn[match(class, fullname)], rank == "order" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "order" ~ lpsn[match(kingdom, fullname)], # family rank == "family" & order != "" ~ lpsn[match(order, fullname)], rank == "family" & class != "" ~ lpsn[match(class, fullname)], rank == "family" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "family" ~ lpsn[match(kingdom, fullname)], # genus rank == "genus" & family != "" ~ lpsn[match(family, fullname)], rank == "genus" & order != "" ~ lpsn[match(order, fullname)], rank == "genus" & class != "" ~ lpsn[match(class, fullname)], rank == "genus" & phylum != "" ~ lpsn[match(phylum, fullname)], rank == "genus" ~ lpsn[match(kingdom, fullname)], # species, always has a genus rank == "species" ~ lpsn[match(genus, fullname)], # subspecies, always has a genus + species rank == "subspecies" ~ lpsn[match(paste(genus, species), fullname)], TRUE ~ NA_character_)) message( "\nCongratulations! The new taxonomic table will contain ", format(nrow(taxonomy), big.mark = " "), " rows.\n", "This was ", format(nrow(AMR::microorganisms), big.mark = " "), " rows.\n" ) # these are the new ones: taxonomy %>% # filter(!paste(kingdom, fullname) %in% paste(AMR::microorganisms$kingdom, AMR::microorganisms$fullname)) %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% View() # these were removed: AMR::microorganisms %>% # filter(!paste(kingdom, fullname) %in% paste(taxonomy$kingdom, taxonomy$fullname)) %>% filter(!fullname %in% taxonomy$fullname) %>% View() # Add SNOMED CT ----------------------------------------------------------- # we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS) # as a source, which copies directly from the latest US SNOMED CT version # - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009 # - check that current online version is higher than TAXONOMY_VERSION$SNOMED # - if so, click on 'Download Value Set', choose 'TXT' snomed <- vroom("data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt", skip = 3) %>% select(1:2) %>% setNames(c("snomed", "mo")) %>% mutate(snomed = as.character(snomed)) # try to get name of MO snomed <- snomed %>% mutate(mo = gsub("ss. ", "", mo, fixed = TRUE)) %>% mutate(fullname = case_when( mo %like_case% "[A-Z][a-z]+ [a-z]+ [a-z]{4,} " ~ gsub("(^|.*)([A-Z][a-z]+ [a-z]+ [a-z]{4,}) .*", "\\2", mo), mo %like_case% "[A-Z][a-z]+ [a-z]{4,} " ~ gsub("(^|.*)([A-Z][a-z]+ [a-z]{4,}) .*", "\\2", mo), mo %like_case% "[A-Z][a-z]+" ~ gsub("(^|.*)([A-Z][a-z]+) .*", "\\2", mo), TRUE ~ NA_character_ )) snomed <- snomed %>% filter(fullname %in% taxonomy$fullname) message(nrow(snomed), " SNOMED codes will be added to ", n_distinct(snomed$fullname), " microorganisms") snomed <- snomed %>% group_by(fullname) %>% summarise(snomed = list(snomed)) taxonomy <- taxonomy %>% left_join(snomed, by = "fullname") # Add oxygen tolerance (aerobe/anaerobe) ---------------------------------- # We will use the BacDive data base for this: # - go to https://bacdive.dsmz.de/advsearch a # - filter 'Oxygen tolerance' on "*" and click Submit # - click on the 'Download tabel as CSV' button bacdive <- vroom::vroom("data-raw/bacdive.csv", skip = 2) %>% select(species, oxygen = `Oxygen tolerance`) bacdive <- bacdive %>% # fill in missing species from previous rows mutate(fullname = if_else(is.na(species), lag(species), species)) %>% filter(!is.na(species), !is.na(oxygen), oxygen %unlike% "tolerant", species %unlike% "unclassified") %>% select(-species) bacdive <- bacdive %>% # now determine type per species group_by(fullname) %>% summarise(fullname = first(fullname), oxygen_tolerance = case_when(any(oxygen %like% "facultative") ~ "facultative anaerobe", all(oxygen == "microaerophile") ~ "microaerophile", all(oxygen %in% c("anaerobe", "obligate anaerobe")) ~ "anaerobe", all(oxygen %in% c("anaerobe", "obligate anaerobe", "microaerophile")) ~ "anaerobe/microaerophile", all(oxygen %in% c("aerobe", "obligate aerobe")) ~ "aerobe", all(!oxygen %in% c("anaerobe", "obligate anaerobe")) ~ "aerobe", all(c("aerobe", "anaerobe") %in% oxygen) ~ "facultative anaerobe", TRUE ~ NA_character_)) # now find all synonyms and copy them from their current taxonomic names synonyms <- taxonomy %>% filter(status == "synonym") %>% transmute(mo, fullname_old = fullname, current = synonym_mo_to_accepted_mo(mo, fill_in_accepted = FALSE, dataset = taxonomy)) %>% filter(!is.na(current)) %>% mutate(fullname = taxonomy$fullname[match(current, taxonomy$mo)]) %>% left_join(bacdive, by = "fullname") %>% filter(!is.na(oxygen_tolerance)) %>% select(fullname, oxygen_tolerance) bacdive <- bacdive %>% bind_rows(synonyms) %>% distinct() bacdive_genus <- bacdive %>% mutate(oxygen = oxygen_tolerance, genus = taxonomy$genus[match(fullname, taxonomy$fullname)]) %>% group_by(fullname = genus) %>% summarise(oxygen_tolerance = case_when(any(oxygen == "facultative anaerobe") ~ "facultative anaerobe", any(oxygen == "anaerobe/microaerophile") ~ "anaerobe/microaerophile", all(oxygen == "microaerophile") ~ "microaerophile", all(oxygen == "anaerobe") ~ "anaerobe", all(oxygen == "aerobe") ~ "aerobe", TRUE ~ "facultative anaerobe")) bacdive <- bacdive %>% bind_rows(bacdive_genus) %>% arrange(fullname) bacdive_other <- taxonomy %>% filter(kingdom == "Bacteria", rank == "species", !fullname %in% bacdive$fullname, genus %in% bacdive$fullname) %>% select(fullname, genus) %>% left_join(bacdive, by = c("genus" = "fullname")) %>% mutate(oxygen_tolerance = ifelse(oxygen_tolerance %in% c("aerobe", "anaerobe", "microaerophile", "anaerobe/microaerophile"), oxygen_tolerance, paste("likely", oxygen_tolerance))) %>% select(fullname, oxygen_tolerance) %>% distinct(fullname, .keep_all = TRUE) bacdive <- bacdive %>% bind_rows(bacdive_other) %>% arrange(fullname) %>% distinct(fullname, .keep_all = TRUE) taxonomy <- taxonomy %>% left_join(bacdive, by = "fullname") %>% relocate(oxygen_tolerance, .after = ref) # Restore 'synonym' microorganisms to 'accepted' -------------------------- # If there are some synonyms that need to be corrected to 'accepted', you can do that here. # Before 2024, we encountered this (but currently, this is all good, no action needed: # according to LPSN: Stenotrophomonas maltophilia is the correct name if this species is regarded as a separate species (i.e., if its nomenclatural type is not assigned to another species whose name is validly published, legitimate and not rejected and has priority) within a separate genus Stenotrophomonas. # https://lpsn.dsmz.de/species/stenotrophomonas-maltophilia # # all MO's to keep as 'accepted', not as 'synonym': # to_restore <- c( # "Stenotrophomonas maltophilia", # "Moraxella catarrhalis" # ) # taxonomy %>% filter(fullname %in% to_restore) %>% View() # all(to_restore %in% taxonomy$fullname) # for (nm in to_restore) { # taxonomy$lpsn_renamed_to[which(taxonomy$fullname == nm)] <- NA # taxonomy$gbif_renamed_to[which(taxonomy$fullname == nm)] <- NA # taxonomy$status[which(taxonomy$fullname == nm)] <- "accepted" # } # Add species groups ------------------------------------------------------ # just before the end, make sure we get the species group from the previous data set old_groups <- microorganisms %>% filter(rank == "species group") # use current taxonomy groups <- taxonomy[match(old_groups$genus, taxonomy$genus), ] groups <- groups %>% mutate(mo = as.character(old_groups$mo), fullname = old_groups$fullname, species = old_groups$species, subspecies = old_groups$subspecies, rank = old_groups$rank, ref = old_groups$ref, status = "accepted", source = "manually added", lpsn = NA_character_, lpsn_renamed_to = NA_character_, mycobank = NA_character_, mycobank_renamed_to = NA_character_, gbif = NA_character_, gbif_renamed_to = NA_character_) %>% select(-c(lpsn, lpsn_renamed_to, mycobank, mycobank_renamed_to, gbif, gbif_renamed_to, snomed)) taxonomy <- taxonomy %>% filter(!mo %in% groups$mo) %>% bind_rows(groups) # we added an MO code, so make sure everything is still unique any(duplicated(taxonomy$mo)) any(duplicated(taxonomy$fullname)) # Some final checks ------------------------------------------------------- # and check: these codes should not be missing (will otherwise throw a unit test error): AMR::microorganisms.codes %>% filter(!mo %in% taxonomy$mo) AMR::clinical_breakpoints %>% filter(!mo %in% taxonomy$mo) AMR::example_isolates %>% filter(!mo %in% taxonomy$mo) AMR::intrinsic_resistant %>% filter(!mo %in% taxonomy$mo) # all our previously manually added names should be in it all(microorganisms$fullname[microorganisms$source == "manually added"] %in% taxonomy$fullname) microorganisms$fullname[!microorganisms$fullname[microorganisms$source == "manually added"] %in% taxonomy$fullname] # put this one back taxonomy <- taxonomy %>% bind_rows(microorganisms %>% filter(fullname == "Blastocystis hominis") %>% mutate(mo = as.character(mo))) # we added an MO code, so make sure everything is still unique any(duplicated(taxonomy$mo)) any(duplicated(taxonomy$fullname)) # Save to package --------------------------------------------------------- # format to tibble and remove non-ASCII characters taxonomy.bak6 <- taxonomy taxonomy <- taxonomy %>% arrange(fullname) %>% select(mo, fullname, status, kingdom:subspecies, rank, ref, oxygen_tolerance, source, starts_with("lpsn"), starts_with("mycobank"), starts_with("gbif"), prevalence, snomed) %>% df_remove_nonASCII() microorganisms <- taxonomy # set class class(microorganisms$mo) <- c("mo", "character") microorganisms <- microorganisms %>% arrange(fullname) usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz") rm(microorganisms) # DON'T FORGET TO UPDATE R/_globals.R! # load new data set devtools::load_all(".") # Reset previously changed mo codes --------------------------------------- if (!identical(clinical_breakpoints$mo, as.mo(clinical_breakpoints$mo, language = NULL))) { clinical_breakpoints$mo <- as.mo(clinical_breakpoints$mo, language = NULL) usethis::use_data(clinical_breakpoints, overwrite = TRUE, version = 2, compress = "xz") rm(clinical_breakpoints) } if (!identical(microorganisms.codes$mo, as.mo(microorganisms.codes$mo, language = NULL))) { microorganisms.codes <- microorganisms.codes %>% filter(mo %in% microorganisms$mo) microorganisms.codes$mo <- as.mo(microorganisms.codes$mo, language = NULL) usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2, compress = "xz") rm(microorganisms.codes) } if (!identical(example_isolates$mo, as.mo(example_isolates$mo, language = NULL))) { example_isolates$mo <- as.mo(example_isolates$mo, language = NULL) usethis::use_data(example_isolates, overwrite = TRUE, version = 2) rm(example_isolates) } anyNA(microorganisms$mo) anyNA(microorganisms.codes$mo) anyNA(intrinsic_resistant$mo) anyNA(clinical_breakpoints$mo) # load new data sets again devtools::load_all(".") source("data-raw/_pre_commit_checks.R") devtools::load_all(".") if (!identical(intrinsic_resistant$mo, as.mo(intrinsic_resistant$mo, language = NULL))) { stop("Run data-raw/reproduction_of_intrinsic_resistant.R again") } # run the unit tests Sys.setenv(NOT_CRAN = "true") testthat::test_file("inst/tinytest/test-data.R") testthat::test_file("inst/tinytest/test-mo.R") testthat::test_file("inst/tinytest/test-mo_property.R")