2020-01-05 17:22:09 +01:00
# ==================================================================== #
2023-06-26 13:52:02 +02:00
# TITLE: #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2020-01-05 17:22:09 +01:00
# #
2023-06-26 13:52:02 +02:00
# SOURCE CODE: #
2020-07-09 20:07:39 +02:00
# https://github.com/msberends/AMR #
2020-01-05 17:22:09 +01:00
# #
2023-06-26 13:52:02 +02:00
# PLEASE CITE THIS SOFTWARE AS: #
2024-07-16 14:51:57 +02:00
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
2023-05-27 10:39:22 +02:00
# https://doi.org/10.18637/jss.v104.i03 #
2022-10-05 09:12:22 +02:00
# #
2022-12-27 15:16:15 +01:00
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
2020-01-05 17:22:09 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2020-01-05 17:22:09 +01:00
# ==================================================================== #
2024-07-16 14:51:57 +02:00
# ! THIS SCRIPT REQUIRES AT LEAST 16 GB RAM !
# (at least 12 GB will be used by the R session for the size of the files)
2019-03-18 14:29:41 +01:00
2024-07-16 14:51:57 +02:00
# GBIF:
2022-10-05 09:12:22 +02:00
# 1. Go to https://doi.org/10.15468/39omei and find the download link for the
2024-07-16 14:51:57 +02:00
# latest GBIF backbone taxonony under "Endpoints" and unpack "Taxon.tsv" from it (~1.0 GB)
2022-10-05 09:12:22 +02:00
# ALSO BE SURE to get the date of release and update R/aa_globals.R later!
2024-07-16 14:51:57 +02:00
# LPSN:
2022-10-05 09:12:22 +02:00
# 2. Go to https://lpsn.dsmz.de/downloads (register first) and download the latest
2024-07-16 14:51:57 +02:00
# CSV file (~12,5 MB) and rename to "taxonomy.csv"
# ALSO BE SURE to get the date of release and update R/aa_globals.R later!
# MycoBank:
# 3. Go to https://www.mycobank.org/ and find the download link of all entries
# (last time https://www.mycobank.org/images/MBList.zip) and unpack
# "MBList.xlsx" from it (~120 MB)
# ALSO BE SURE to get the date of release and update R/aa_globals.R later!
# Bartlett:
# 4. For data about human pathogens, we use Bartlett et al. (2022),
2023-01-23 15:01:21 +01:00
# https://doi.org/10.1099/mic.0.001269. Their latest supplementary material
2022-12-19 15:32:41 +01:00
# can be found here: https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens.
2024-07-16 14:51:57 +02:00
# Download their latest xlsx file in the `data` folder and save it to our
# `data-raw` folder.
# 5. Set these locations to the paths where the files are:
folder_location <- " ~/Downloads/"
file_gbif <- paste0 ( folder_location , " /backbone/Taxon.tsv" )
2022-10-05 09:12:22 +02:00
file_lpsn <- paste0 ( folder_location , " taxonomy.csv" )
2024-07-16 14:51:57 +02:00
file_mycobank <- paste0 ( folder_location , " MBList.xlsx" )
2019-02-20 00:04:48 +01:00
2022-12-19 15:32:41 +01:00
file_bartlett <- " data-raw/bartlett_et_al_2022_human_pathogens.xlsx"
2022-10-05 09:12:22 +02:00
# 4. Run the rest of this script line by line and check everything :)
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
if ( ! file.exists ( file_gbif ) ) stop ( " GBIF file not found" )
if ( ! file.exists ( file_lpsn ) ) stop ( " LPSN file not found" )
2024-07-16 14:51:57 +02:00
if ( ! file.exists ( file_mycobank ) ) stop ( " MB file not found" )
2022-12-19 15:32:41 +01:00
if ( ! file.exists ( file_bartlett ) ) stop ( " Bartlett et al. Excel file not found" )
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
library ( dplyr )
2024-07-16 14:51:57 +02:00
library ( tidyr )
2022-12-12 00:14:56 +01:00
library ( vroom ) # to import files
2024-09-29 22:17:56 +02:00
library ( rvest ) # to scrape LPSN website
2022-12-12 00:14:56 +01:00
library ( progress ) # to show progress bars
2024-09-29 22:17:56 +02:00
library ( readxl ) # to read the MycoBank and Bartlett Excel files
devtools :: load_all ( " ." ) # to load the AMR package
2019-03-18 14:29:41 +01:00
2024-09-29 22:17:56 +02:00
# Helper functions --------------------------------------------------------------------------------
2019-04-05 18:47:39 +02:00
2020-05-27 16:37:49 +02:00
get_author_year <- function ( ref ) {
# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
2024-09-29 22:17:56 +02:00
2020-05-27 16:37:49 +02:00
authors2 <- iconv ( ref , from = " UTF-8" , to = " ASCII//TRANSLIT" )
2022-10-05 09:12:22 +02:00
authors2 <- gsub ( " ?\\(Approved Lists [0-9]+\\) ?" , " () " , authors2 )
authors2 <- gsub ( " [)(]+ $" , " " , authors2 )
2020-05-27 16:37:49 +02:00
# remove leading and trailing brackets
2022-10-05 09:12:22 +02:00
authors2 <- trimws ( gsub ( " ^[(](.*)[)]$" , " \\1" , authors2 ) )
2020-05-27 16:37:49 +02:00
# only take part after brackets if there's a name
2024-09-29 22:17:56 +02:00
authors2 <- if_else ( grepl ( " .*[)] [a-zA-Z]+.*" , authors2 ) ,
gsub ( " .*[)] (.*)" , " \\1" , authors2 ) ,
authors2
2022-08-28 10:31:50 +02:00
)
2022-10-05 09:12:22 +02:00
# replace parentheses with emend. to get the latest authors
authors2 <- gsub ( " (" , " emend. " , authors2 , fixed = TRUE )
authors2 <- gsub ( " )" , " " , authors2 , fixed = TRUE )
authors2 <- gsub ( " +" , " " , authors2 )
authors2 <- trimws ( authors2 )
2024-09-29 22:17:56 +02:00
2020-05-27 16:37:49 +02:00
# get year from last 4 digits
2022-08-28 10:31:50 +02:00
lastyear <- as.integer ( gsub ( " .*([0-9]{4})$" , " \\1" , authors2 ) )
2020-05-27 16:37:49 +02:00
# can never be later than now
2024-09-29 22:17:56 +02:00
lastyear <- if_else ( lastyear > as.integer ( format ( Sys.Date ( ) , " %Y" ) ) ,
NA ,
lastyear
2022-08-28 10:31:50 +02:00
)
2020-05-27 16:37:49 +02:00
# get authors without last year
authors <- gsub ( " (.*)[0-9]{4}$" , " \\1" , authors2 )
2022-10-05 09:12:22 +02:00
# not sure what this is
authors <- gsub ( " (Saito)" , " " , authors , fixed = TRUE )
authors <- gsub ( " (Oudem.)" , " " , authors , fixed = TRUE )
2020-05-27 16:37:49 +02:00
# remove nonsense characters from names
2022-10-05 09:12:22 +02:00
authors <- gsub ( " [^a-zA-Z,'&. -]" , " " , authors )
# no initials, only surname
2024-09-29 22:17:56 +02:00
authors <- gsub ( " [A-Z-][a-z-]?[.]" , " " , authors , ignore.case = FALSE )
2020-05-27 16:37:49 +02:00
# remove trailing and leading spaces
authors <- trimws ( authors )
2022-10-05 09:12:22 +02:00
# keep only the part after last 'emend.' to get the latest authors
authors <- gsub ( " .*emend[.] ?" , " " , authors )
2020-05-27 16:37:49 +02:00
# only keep first author and replace all others by 'et al'
authors <- gsub ( " (,| and| et| &| ex| emend\\.?) .*" , " et al." , authors )
# et al. always with ending dot
authors <- gsub ( " et al\\.?" , " et al." , authors )
authors <- gsub ( " ?,$" , " " , authors )
# don't start with 'sensu' or 'ehrenb'
2022-10-05 09:12:22 +02:00
authors <- gsub ( " ^(sensu|Ehrenb.?|corrig.?) " , " " , authors , ignore.case = TRUE )
2020-05-27 16:37:49 +02:00
# no initials, only surname
2022-10-05 09:12:22 +02:00
authors <- trimws ( authors )
2024-07-16 14:51:57 +02:00
authors <- gsub ( " ^([A-Z-][.])+( & ?)?" , " " , authors , ignore.case = FALSE )
authors <- gsub ( " ^([A-Z-]+ )+" , " " , authors , ignore.case = FALSE )
2022-10-05 09:12:22 +02:00
# remove dots
authors <- gsub ( " ." , " " , authors , fixed = TRUE )
authors <- gsub ( " et al" , " et al." , authors , fixed = TRUE )
authors [nchar ( authors ) <= 3 ] <- " "
2020-05-27 16:37:49 +02:00
# combine author and year if year is available
2024-09-29 22:17:56 +02:00
ref <- if_else ( ! is.na ( lastyear ) ,
paste0 ( authors , " , " , lastyear ) ,
authors
2022-08-28 10:31:50 +02:00
)
2020-05-27 16:37:49 +02:00
# fix beginning and ending
ref <- gsub ( " , $" , " " , ref )
ref <- gsub ( " ^, " , " " , ref )
ref <- gsub ( " ^(emend|et al.,?)" , " " , ref )
ref <- trimws ( ref )
2022-10-05 09:12:22 +02:00
ref <- gsub ( " '" , " " , ref )
2024-09-29 22:17:56 +02:00
2020-05-27 16:37:49 +02:00
# a lot start with a lowercase character - fix that
ref [ ! grepl ( " ^d[A-Z]" , ref ) ] <- gsub ( " ^([a-z])" , " \\U\\1" , ref [ ! grepl ( " ^d[A-Z]" , ref ) ] , perl = TRUE )
2022-08-28 10:31:50 +02:00
# specific one for the French that are named dOrbigny
2020-05-27 16:37:49 +02:00
ref [grepl ( " ^d[A-Z]" , ref ) ] <- gsub ( " ^d" , " d'" , ref [grepl ( " ^d[A-Z]" , ref ) ] )
ref <- gsub ( " +" , " " , ref )
2022-10-05 09:12:22 +02:00
ref [ref == " " ] <- NA_character_
2020-05-27 16:37:49 +02:00
ref
}
2022-10-05 09:12:22 +02:00
df_remove_nonASCII <- function ( df ) {
# Remove non-ASCII characters (these are not allowed by CRAN)
df %>%
mutate_if ( is.character , iconv , from = " UTF-8" , to = " ASCII//TRANSLIT" ) %>%
# also remove invalid characters
mutate_if ( is.character , ~ gsub ( " [\"'`]+" , " " , .) ) %>%
AMR ::: dataset_UTF8_to_ASCII ( )
}
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# to retrieve LPSN and authors from LPSN website
2024-07-16 14:51:57 +02:00
# e.g., get_lpsn_and_author("genus", "Klebsiella")
2022-10-05 09:12:22 +02:00
get_lpsn_and_author <- function ( rank , name ) {
2024-02-24 15:16:52 +01:00
name <- gsub ( " ^Candidatus " , " " , name )
2022-10-05 09:12:22 +02:00
url <- paste0 ( " https://lpsn.dsmz.de/" , tolower ( rank ) , " /" , tolower ( name ) )
2022-12-12 00:14:56 +01:00
page_txt <- tryCatch ( read_html ( url ) , error = function ( e ) NULL )
2022-10-05 09:12:22 +02:00
if ( is.null ( page_txt ) ) {
warning ( " No LPSN found for " , tolower ( rank ) , " '" , name , " '" )
lpsn <- NA_character_
ref <- NA_character_
2024-02-24 15:16:52 +01:00
status <- " unknown"
2022-10-05 09:12:22 +02:00
} else {
page_txt <- page_txt %>%
2022-12-12 00:14:56 +01:00
html_element ( " #detail-page" ) %>%
html_text ( )
2022-10-05 09:12:22 +02:00
lpsn <- gsub ( " .*Record number:[\r\n\t ]*([0-9]+).*" , " \\1" , page_txt , perl = FALSE )
ref <- page_txt %>%
gsub ( " .*?Name: (.*[0-9]{4}?).*" , " \\1" , ., perl = FALSE ) %>%
gsub ( name , " " , ., fixed = TRUE ) %>%
2024-02-24 15:16:52 +01:00
gsub ( " ^\"?Candidatus ?\"?" , " " , .) %>%
2022-10-05 09:12:22 +02:00
trimws ( )
2024-02-24 15:16:52 +01:00
status <- trimws ( gsub ( " .*Nomenclatural status:[\r\n\t ]*([a-zA-Z, ]+)[\r\n\t].*" , " \\1" , page_txt , perl = FALSE ) )
if ( ( status %like% " validly published" & status %unlike% " not valid" ) | status %like% " [\r\n\t]" ) {
# we used to take "accepted" for every LPSN record, also candidates. Now only for missing values and explicit accepted ones.
status <- " accepted"
} else {
status <- " not validly published"
}
}
c ( " lpsn" = lpsn , " ref" = ref , " status" = status )
}
2024-07-16 14:51:57 +02:00
# this will e.g. take the family from the root genus record, and gives all species of that family
get_top_lvl <- function ( current , rank , source , rank_target , target ) {
current.bak <- current
current <- current [target != " " ]
rank <- rank [target != " " ]
source <- source [target != " " ]
target <- target [target != " " ]
if ( length ( current ) == 0 ) {
current.bak
} else if ( ! rank_target %in% rank ) {
current.bak [1 ]
2024-02-24 15:16:52 +01:00
} else {
2024-07-16 14:51:57 +02:00
if ( n_distinct ( source ) > 1 && " GBIF" %in% source ) {
# prefer LPSN and MycoBank over GBIF, which is often not up-to-date at all
current <- current [source != " GBIF" ]
rank <- rank [source != " GBIF" ]
source <- source [source != " GBIF" ]
}
2024-02-24 15:16:52 +01:00
out <- current [rank == rank_target ] [1 ]
if ( out %in% c ( " " , NA ) ) {
out <- names ( sort ( table ( current [which ( ! current %in% c ( " " , NA ) ) ] ) , decreasing = TRUE ) [1 ] )
if ( is.null ( out ) ) {
out <- " "
}
}
out
2022-10-05 09:12:22 +02:00
}
}
2024-07-16 14:51:57 +02:00
# MB/ June 2024: after years still useless, does not contain full taxonomy, e.g. LPSN::request(cred, category = "family") is empty.
2022-10-05 09:12:22 +02:00
# get_from_lpsn <- function (user, pw) {
# if (!"LPSN" %in% rownames(utils::installed.packages())) {
# stop("Install the official LPSN package for R using: install.packages('LPSN', repos = 'https://r-forge.r-project.org')")
# }
# cred <- LPSN::open_lpsn(user, pw)
#
# lpsn_genus <- LPSN::request(cred, category = "genus")
# message("Downloading genus data (n = ", lpsn_genus$count, ") from LPSN API...")
# lpsn_genus <- as.data.frame(LPSN::retrieve(cred, category = "genus"))
#
# lpsn_species <- LPSN::request(cred, category = "species")
# message("Downloading species data (n = ", lpsn_species$count, ") from LPSN API...")
# lpsn_species <- as.data.frame(LPSN::retrieve(cred, category = "species"))
#
# lpsn_subspecies <- LPSN::request(cred, category = "subspecies")
# message("Downloading subspecies data (n = ", lpsn_subspecies$count, ") from LPSN API...")
# lpsn_subspecies <- as.data.frame(LPSN::retrieve(cred, category = "subspecies"))
#
# message("Binding rows...")
# lpsn_total <- bind_rows(lpsn_genus, lpsn_species, lpsn_subspecies)
# message("Done.")
# lpsn_total
# }
2019-02-20 00:04:48 +01:00
2024-09-29 22:17:56 +02:00
# Read LPSN data ----------------------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
2024-07-16 14:51:57 +02:00
taxonomy_lpsn.bak <- vroom ( file_lpsn , guess_max = 1e5 )
2022-12-12 00:14:56 +01:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn.bak %>%
transmute (
genus = genus_name ,
species = sp_epithet ,
subspecies = subsp_epithet ,
rank = case_when (
! is.na ( subsp_epithet ) ~ " subspecies" ,
! is.na ( sp_epithet ) ~ " species" ,
TRUE ~ " genus"
) ,
2024-09-29 22:17:56 +02:00
status = if_else ( is.na ( record_lnk ) , " accepted" , " synonym" ) ,
2022-10-05 09:12:22 +02:00
ref = authors ,
lpsn = as.character ( record_no ) ,
lpsn_parent = NA_character_ ,
lpsn_renamed_to = as.character ( record_lnk )
) %>%
mutate ( source = " LPSN" )
2024-02-24 15:16:52 +01:00
# integrity tests
sort ( table ( taxonomy_lpsn $ rank ) )
sort ( table ( taxonomy_lpsn $ status ) )
2022-10-05 09:12:22 +02:00
taxonomy_lpsn
# download additional taxonomy to the domain/kingdom level (their API is not sufficient...)
taxonomy_lpsn_missing <- tibble (
kingdom = character ( 0 ) ,
phylum = character ( 0 ) ,
class = character ( 0 ) ,
order = character ( 0 ) ,
family = character ( 0 ) ,
genus = character ( 0 )
)
for ( page in LETTERS ) {
2022-12-12 00:14:56 +01:00
# this will not alter `taxonomy_lpsn` yet
2024-02-24 15:16:52 +01:00
message ( " Downloading page " , page , " ..." , appendLF = TRUE )
2022-10-05 09:12:22 +02:00
url <- paste0 ( " https://lpsn.dsmz.de/genus?page=" , page )
2024-02-24 15:16:52 +01:00
x <- tryCatch ( read_html ( url ) ,
error = function ( e ) {
message ( " Waiting 10 seconds because of error: " , e $ message )
Sys.sleep ( 10 )
read_html ( url )
} )
x <- x %>%
2022-10-05 09:12:22 +02:00
# class "main-list" is the main table
2022-12-12 00:14:56 +01:00
html_element ( " .main-list" ) %>%
2022-10-05 09:12:22 +02:00
# get every list element with a set <id> attribute
2022-12-12 00:14:56 +01:00
html_elements ( " li[id]" )
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( x ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( i in seq_len ( length ( x ) ) ) {
2024-02-24 15:16:52 +01:00
pb $ tick ( )
2022-12-12 00:14:56 +01:00
elements <- x [ [i ] ] %>% html_elements ( " a" )
hrefs <- elements %>% html_attr ( " href" )
2022-10-05 09:12:22 +02:00
ranks <- hrefs %>% gsub ( " .*/(.*?)/.*" , " \\1" , .)
names <- elements %>%
2022-12-12 00:14:56 +01:00
html_text ( ) %>%
2022-10-05 09:12:22 +02:00
gsub ( ' "' , " " , ., fixed = TRUE )
# no species, this must be until genus level
hrefs <- hrefs [ranks != " species" ]
names <- names [ranks != " species" ]
ranks <- ranks [ranks != " species" ]
ranks [ranks == " domain" ] <- " kingdom"
2024-02-24 15:16:52 +01:00
suppressMessages (
df <- names %>%
tibble ( ) %>%
t ( ) %>%
as_tibble ( .name_repair = " unique" ) %>%
setNames ( ranks ) %>%
# no candidates please
filter ( genus %unlike% " ^(Candidatus|\\[)" )
)
2024-09-29 22:17:56 +02:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>%
bind_rows ( df )
}
2024-02-24 15:16:52 +01:00
message ( " => " , length ( x ) , " entries incl. candidates (cleaned total: " , nrow ( taxonomy_lpsn_missing ) , " )" )
2022-10-05 09:12:22 +02:00
}
2024-07-16 14:51:57 +02:00
taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>% distinct ( )
saveRDS ( taxonomy_lpsn_missing , " data-raw/taxonomy_lpsn_missing.rds" )
2024-09-29 22:17:56 +02:00
# taxonomy_lpsn_missing <- readRDS("data-raw/taxonomy_lpsn_missing.rds")
2024-07-16 14:51:57 +02:00
2024-02-24 15:16:52 +01:00
# had to pick the right genus/family combination here:
2024-07-16 14:51:57 +02:00
taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>% filter ( ! ( genus == " Pusillimonas" & family == " Oscillospiraceae" ) )
2024-02-24 15:16:52 +01:00
taxonomy_lpsn.bak2 <- taxonomy_lpsn.bak
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
left_join ( taxonomy_lpsn_missing , by = " genus" ) %>%
select ( kingdom : family , everything ( ) ) %>%
# remove entries like "[Bacteria, no family]" and "[Bacteria, no class]"
2024-09-29 22:17:56 +02:00
mutate_all ( function ( x ) if_else ( x %like_case% " no " , NA_character_ , x ) )
2022-10-05 09:12:22 +02:00
taxonomy_lpsn.bak2 <- taxonomy_lpsn
2024-02-24 15:16:52 +01:00
# download family directly from LPSN website using scraping, by using get_lpsn_and_author()
# try it first:
# get_lpsn_and_author("genus", "Escherichia")
2024-07-16 14:51:57 +02:00
# get_lpsn_and_author("family", "Enterobacteriaceae")
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ family ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( f in unique ( taxonomy_lpsn $ family ) ) {
pb $ tick ( )
if ( is.na ( f ) ) next
tax_info <- get_lpsn_and_author ( " Family" , f )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
class = taxonomy_lpsn $ class [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
order = taxonomy_lpsn $ order [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
family = f ,
rank = " family" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download order directly from LPSN website using scraping
2024-07-16 14:51:57 +02:00
# try it first:
# get_lpsn_and_author("order", "Enterobacterales")
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ order ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( o in unique ( taxonomy_lpsn $ order ) ) {
pb $ tick ( )
if ( is.na ( o ) ) next
tax_info <- get_lpsn_and_author ( " Order" , o )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
class = taxonomy_lpsn $ class [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
order = o ,
rank = " order" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download class directly from LPSN website using scraping
2024-07-16 14:51:57 +02:00
# try it first:
# get_lpsn_and_author("class", "Gammaproteobacteria")
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ class ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( cc in unique ( taxonomy_lpsn $ class ) ) {
pb $ tick ( )
if ( is.na ( cc ) ) next
tax_info <- get_lpsn_and_author ( " Class" , cc )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ class == cc ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ class == cc ) [1 ] ] ,
class = cc ,
rank = " class" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download phylum directly from LPSN website using scraping
2024-07-16 14:51:57 +02:00
# try it first:
# get_lpsn_and_author("phylum", "Pseudomonadota")
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ phylum ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( p in unique ( taxonomy_lpsn $ phylum ) ) {
pb $ tick ( )
if ( is.na ( p ) ) next
tax_info <- get_lpsn_and_author ( " Phylum" , p )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ phylum == p ) [1 ] ] ,
phylum = p ,
rank = " phylum" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download kingdom directly from LPSN website using scraping
2024-07-16 14:51:57 +02:00
# try it first:
# get_lpsn_and_author("kingdom", "Bacteria")
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ kingdom ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( k in unique ( taxonomy_lpsn $ kingdom ) ) {
pb $ tick ( )
if ( is.na ( k ) ) next
tax_info <- get_lpsn_and_author ( " Domain" , k )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = k ,
rank = " kingdom" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2024-09-29 22:17:56 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
2024-07-16 14:51:57 +02:00
filter ( status != " not validly published" )
2022-10-05 09:12:22 +02:00
# integrity tests
sort ( table ( taxonomy_lpsn $ rank ) )
2024-07-16 14:51:57 +02:00
# should only be 'accepted' and 'synonym':
2022-10-05 09:12:22 +02:00
sort ( table ( taxonomy_lpsn $ status ) )
2024-09-29 22:17:56 +02:00
# Read MycoBank data ------------------------------------------------------------------------------
2024-07-16 14:51:57 +02:00
2024-09-30 18:46:55 +02:00
taxonomy_mycobank <- read_excel ( file_mycobank , guess_max = 1e5 )
2024-07-16 14:51:57 +02:00
taxonomy_mycobank.bak <- taxonomy_mycobank
taxonomy_mycobank <- taxonomy_mycobank %>%
transmute ( mycobank = `MycoBank #` ,
fullname = gsub ( " +" , " " , `Taxon name` ) ,
current = `Current name.Taxon name` ,
ref = paste0 ( Authors , " , " , `Year of effective publication` ) ,
rank = `Rank.Rank name` ,
status = `Name status` ,
mycobank_renamed_to = taxonomy_mycobank.bak $ `MycoBank #` [match ( `Current name.Taxon name` , taxonomy_mycobank.bak $ `Taxon name` ) ] ,
Classification
) %>%
separate ( Classification , sep = " , " , into = paste0 ( " tax_" , letters [1 : 20 ] ) , remove = TRUE ) %>%
mutate ( rank = case_when (
fullname %like_case% " ^[A-Z][a-z]+ .+ .+" ~ " subsp." ,
rank == " -" & fullname %like_case% " ^[A-Z][a-z]+ [a-z-]+$" ~ " sp." ,
rank == " -" & paste0 ( fullname , " " ) %in% gsub ( " (^[A-Z][a-z]+ ).*" , " \\1" , trimws ( taxonomy_mycobank.bak $ `Taxon name` ) , perl = TRUE ) ~ " gen." ,
# we take a leap here for the family and order
rank == " -" & fullname %like% " ceae$" ~ " fam." ,
rank == " -" & fullname %like% " ales$" ~ " ordo" ,
# tax_d and tax_e are the subdivision/class columns, kind of; prefer e over d
rank == " -" & fullname %in% tax_e ~ " cl." ,
rank == " -" & fullname %in% tax_d ~ " cl." ,
TRUE ~ rank
) ) %>%
filter ( rank %unlike% " sub" & ! tolower ( rank ) %in% c ( " var." , " sect." , " ser." , " tr." , " f." , " race" , " stirps" , " *" , " -" ) ,
# we also remove orthographic variants here, because our algorithms will give valid results based on misspelling
! tolower ( status ) %in% c ( " invalid" , " deleted" , " uncertain" , " illegitimate" , " orthographic variant" , " unavailable" ) ) %>%
mutate ( mycobank_renamed_to = if_else ( mycobank_renamed_to == mycobank , NA_character_ , mycobank_renamed_to ) ) %>%
# only keep 1 entry for the kingdom (regnum)
filter ( fullname == " Fungi" | rank != " regn." ) %>%
# no other kingdoms than fungi
filter ( fullname == " Fungi" | tax_a == " Fungi" ) %>%
select_if ( function ( x ) ! all ( is.na ( x ) ) )
taxonomy_mycobank %>% count ( rank , sort = TRUE )
2024-09-29 22:17:56 +02:00
table ( taxonomy_mycobank $ status )
2024-07-16 14:51:57 +02:00
taxonomy_mycobank <- taxonomy_mycobank %>%
mutate ( status = if_else ( ! is.na ( mycobank_renamed_to ) , " synonym" , " accepted" ) )
2024-09-29 22:17:56 +02:00
table ( taxonomy_mycobank $ status )
2024-07-16 14:51:57 +02:00
taxonomy_mycobank2 <- taxonomy_mycobank
taxonomy_mycobank <- taxonomy_mycobank %>%
mutate ( rank = case_match ( rank ,
" sp." ~ " species" ,
" gen." ~ " genus" ,
" fam." ~ " family" ,
" ordo" ~ " order" ,
" cl." ~ " class" ,
" div." ~ " phylum" ,
" regn." ~ " kingdom" ,
.default = paste0 ( " #" , rank ) ) )
taxonomy_mycobank %>% filter ( rank %like% " #" )
taxonomy_mycobank %>% count ( rank , sort = TRUE )
taxonomy_mycobank3 <- taxonomy_mycobank
# MycoBank just pasted their taxonomy together into 1 field, and now some classes are in the division (tax_b) column, it's horrible
# so we decide based on the fullname and rank column per record
# use this to determine how far to go:
any ( taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " class" ] %in% taxonomy_mycobank $ tax_e ) # replace tax_e with tax_f, etc
2024-09-29 22:17:56 +02:00
any ( taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " class" ] %in% taxonomy_mycobank $ tax_f )
2024-07-16 14:51:57 +02:00
taxonomy_mycobank <- taxonomy_mycobank %>%
mutate ( kingdom = " Fungi" , # we already filtered everything else, and MycoBank 90157 has '-' as current name...
phylum = case_when ( rank == " phylum" ~ fullname ,
tax_b %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " phylum" ] ~ tax_b ,
tax_c %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " phylum" ] ~ tax_c ,
TRUE ~ " " ) ,
class = case_when ( rank == " class" ~ fullname ,
tax_c %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " class" ] ~ tax_c ,
tax_d %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " class" ] ~ tax_d ,
tax_e %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " class" ] ~ tax_e ,
TRUE ~ " " ) ,
order = case_when ( rank == " order" ~ fullname ,
tax_c %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " order" ] ~ tax_c ,
tax_d %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " order" ] ~ tax_d ,
tax_e %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " order" ] ~ tax_e ,
tax_f %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " order" ] ~ tax_f ,
tax_g %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " order" ] ~ tax_g ,
TRUE ~ " " ) ,
family = case_when ( rank == " family" ~ fullname ,
tax_c %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_c ,
tax_d %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_d ,
tax_e %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_e ,
tax_f %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_f ,
tax_g %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_g ,
tax_h %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_h ,
tax_i %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " family" ] ~ tax_i ,
TRUE ~ " " ) ,
genus = case_when ( rank == " genus" ~ fullname ,
tax_b %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_b ,
tax_c %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_c ,
tax_d %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_d ,
tax_e %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_e ,
tax_f %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_f ,
tax_g %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_g ,
tax_h %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_h ,
tax_i %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_i ,
tax_k %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_j ,
tax_k %in% taxonomy_mycobank $ fullname [taxonomy_mycobank $ rank == " genus" ] ~ tax_k ,
TRUE ~ " " ) ,
species = case_when ( rank == " species" & fullname %like% " " ~ gsub ( " .* (.*)" , " \\1" , fullname , perl = TRUE ) ,
TRUE ~ " " )
)
2024-09-29 22:17:56 +02:00
# FOR 2025: use this to get all the genera with updated names from MO_RELEVANT_GENERA:
# AMR::microorganisms %>% filter(genus %in% MO_RELEVANT_GENERA) %>% pull(fullname) %>% mo_current() %>% mo_genus() %>% unique() %>% sort()
2024-07-17 14:29:55 +02:00
2024-07-16 14:51:57 +02:00
# keep only the relevant ones
taxonomy_mycobank <- taxonomy_mycobank %>%
2024-09-29 22:17:56 +02:00
filter ( genus %in% AMR ::: MO_RELEVANT_GENERA |
2024-07-16 14:51:57 +02:00
! rank %in% c ( " genus" , " species" ) ) %>%
filter ( ! ( genus == " " & rank %in% c ( " genus" , " species" ) ) )
# clean up authors and add last columns
taxonomy_mycobank <- taxonomy_mycobank %>%
mutate ( subspecies = " " ,
source = " MycoBank" ,
mycobank_parent = NA_character_ )
# select final set
taxonomy_mycobank <- taxonomy_mycobank %>%
select ( fullname , kingdom , phylum , class , order , family , genus , species , subspecies ,
rank , status , ref , mycobank , mycobank_parent , mycobank_renamed_to , source )
# not all 'renamed to' records are available, some were even just orthographic variants (that have an invalid status)
taxonomy_mycobank $ status [ ! is.na ( taxonomy_mycobank $ mycobank_renamed_to ) & ! taxonomy_mycobank $ mycobank_renamed_to %in% taxonomy_mycobank $ mycobank ] <- " accepted"
taxonomy_mycobank $ mycobank_renamed_to [taxonomy_mycobank $ status == " accepted" ] <- NA
taxonomy_mycobank %>% count ( status )
2024-09-29 22:17:56 +02:00
# Read GBIF data ----------------------------------------------------------------------------------
2024-07-16 14:51:57 +02:00
taxonomy_gbif.bak <- vroom ( file_gbif , guess_max = 1e5 )
2024-09-29 22:17:56 +02:00
# include all fungal orders from the mycobank db
2024-07-16 14:51:57 +02:00
include_fungal_orders <- unique ( taxonomy_mycobank $ order [ ! taxonomy_mycobank $ order %in% c ( " " , NA ) ] )
# check some columns to validate below filters
taxonomy_gbif.bak %>% count ( taxonomicStatus , sort = TRUE )
taxonomy_gbif.bak %>% count ( taxonRank , sort = TRUE )
taxonomy_gbif <- taxonomy_gbif.bak %>%
# immediately filter rows we really never want
filter (
# never doubtful status, only accepted and all synonyms, and only ranked items
taxonomicStatus != " doubtful" ,
taxonRank != " unranked" ,
# include these kingdoms (no Chromista)
kingdom %in% c ( " Archaea" , " Bacteria" , " Protozoa" ) |
# include all of these fungal orders
order %in% include_fungal_orders |
# and all of these important genera (see "data-raw/_pre_commit_checks.R")
2024-09-29 22:17:56 +02:00
# (they also contain bacteria and protozoa, but these will get higher prevalence scores later on)
genus %in% AMR ::: MO_RELEVANT_GENERA
2024-07-16 14:51:57 +02:00
) %>%
select (
kingdom ,
phylum ,
class ,
order ,
family ,
genus ,
species = specificEpithet ,
subspecies = infraspecificEpithet ,
rank = taxonRank ,
status = taxonomicStatus ,
ref = scientificNameAuthorship ,
gbif = taxonID ,
gbif_parent = parentNameUsageID ,
gbif_renamed_to = acceptedNameUsageID
) %>%
mutate (
# do this mutate after the original selection/filtering, as it decreases computing time tremendously
2024-09-29 22:17:56 +02:00
status = if_else ( status == " accepted" , " accepted" , " synonym" ) ,
2024-07-16 14:51:57 +02:00
# checked taxonRank - the "form" and "variety" always have a subspecies, so:
2024-09-29 22:17:56 +02:00
rank = if_else ( rank %in% c ( " form" , " variety" ) , " subspecies" , rank ) ,
2024-07-16 14:51:57 +02:00
source = " GBIF"
) %>%
filter (
# their data is messy - keep only these:
rank == " kingdom" & ! is.na ( kingdom ) |
rank == " phylum" & ! is.na ( phylum ) |
rank == " class" & ! is.na ( class ) |
rank == " order" & ! is.na ( order ) |
rank == " family" & ! is.na ( family ) |
rank == " genus" & ! is.na ( genus ) |
rank == " species" & ! is.na ( species ) |
rank == " subspecies" & ! is.na ( subspecies )
) %>%
# some items end with _A or _B... why??
mutate_all ( ~ gsub ( " _[A-Z]$" , " " , .x , perl = TRUE ) ) %>%
# now we have duplicates, remove these, but prioritise "accepted" status and highest taxon ID
arrange ( status , gbif ) %>%
distinct ( kingdom , phylum , class , order , family , genus , species , subspecies , .keep_all = TRUE ) %>%
filter (
kingdom %unlike% " [0-9]" ,
phylum %unlike% " [0-9]" ,
class %unlike% " [0-9]" ,
order %unlike% " [0-9]" ,
family %unlike% " [0-9]" ,
genus %unlike% " [0-9]"
)
# integrity tests
sort ( table ( taxonomy_gbif $ rank ) )
sort ( table ( taxonomy_gbif $ status ) )
taxonomy_gbif
2024-09-29 22:17:56 +02:00
# Save intermediate results -----------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
saveRDS ( taxonomy_lpsn , " data-raw/taxonomy_lpsn.rds" , version = 2 )
2024-07-16 14:51:57 +02:00
saveRDS ( taxonomy_mycobank , " data-raw/taxonomy_mycobank.rds" , version = 2 )
2024-09-29 22:17:56 +02:00
saveRDS ( taxonomy_gbif , " data-raw/taxonomy_gbif.rds" , version = 2 )
2022-12-12 00:14:56 +01:00
# this allows to always get back to this point by simply loading the files from data-raw/.
2022-10-05 09:12:22 +02:00
2024-02-24 15:16:52 +01:00
2024-09-29 22:17:56 +02:00
# Add full names ----------------------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
taxonomy_gbif <- taxonomy_gbif %>%
# clean NAs and add fullname
2024-09-29 22:17:56 +02:00
mutate ( across ( kingdom : subspecies , function ( x ) if_else ( is.na ( x ) , " " , x ) ) ,
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies ) # already trimmed 6 lines up
) ) , .before = 1
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
# keep only one GBIF taxon ID per full name
arrange ( fullname , gbif ) %>%
2022-12-12 00:14:56 +01:00
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2020-05-27 16:37:49 +02:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
# clean NAs and add fullname
2024-09-29 22:17:56 +02:00
mutate ( across ( kingdom : subspecies , function ( x ) if_else ( is.na ( x ) , " " , x ) ) ,
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies ) # already trimmed 6 lines up
) ) , .before = 1
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
# keep only one LPSN record ID per full name
arrange ( fullname , lpsn ) %>%
2022-12-12 00:14:56 +01:00
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2022-10-05 09:12:22 +02:00
2024-07-16 14:51:57 +02:00
taxonomy_mycobank <- taxonomy_mycobank %>%
# clean NAs and add fullname
2024-09-29 22:17:56 +02:00
mutate ( across ( kingdom : subspecies , function ( x ) if_else ( is.na ( x ) , " " , x ) ) ,
2024-07-16 14:51:57 +02:00
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies ) # already trimmed 6 lines up
) ) , .before = 1
) %>%
# keep only one MycoBank record ID per full name
arrange ( fullname , mycobank ) %>%
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Combine the datasets ----------------------------------------------------------------------------
2024-09-19 11:44:56 +02:00
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy_lpsn %>%
# add fungi
bind_rows ( taxonomy_mycobank ) %>%
2024-09-29 22:17:56 +02:00
# add GBIF to the bottom
2024-07-16 14:51:57 +02:00
bind_rows ( taxonomy_gbif ) %>%
2024-02-24 15:16:52 +01:00
# group on unique species
2024-07-16 14:51:57 +02:00
group_by ( kingdom , fullname ) %>%
2024-02-24 15:16:52 +01:00
# fill the NAs in LPSN/GBIF fields and ref with the other source (so LPSN: 123 and GBIF: NA will become LPSN: 123 and GBIF: 123)
2024-07-16 14:51:57 +02:00
mutate ( across ( matches ( " ^(lpsn|mycobank|gbif|ref)" ) , function ( x ) rep ( x [ ! is.na ( x ) ] [1 ] , length ( x ) ) ) ) %>%
2024-02-24 15:16:52 +01:00
# ungroup again
2024-07-16 14:51:57 +02:00
ungroup ( ) %>%
2024-02-24 15:16:52 +01:00
# only keep unique species per kingdom
2024-07-16 14:51:57 +02:00
distinct ( kingdom , fullname , .keep_all = TRUE ) %>%
2024-02-24 15:16:52 +01:00
arrange ( fullname )
2019-09-22 17:19:59 +02:00
2022-12-16 16:10:43 +01:00
# get missing entries from existing microorganisms data set
2024-02-24 15:16:52 +01:00
taxonomy.old <- AMR :: microorganisms %>%
2024-09-29 22:17:56 +02:00
select ( any_of ( colnames ( taxonomy ) ) ) %>%
filter (
! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ,
# these will be added later:
source != " manually added" )
2024-02-24 15:16:52 +01:00
taxonomy <- taxonomy %>%
bind_rows ( taxonomy.old ) %>%
2022-12-16 16:10:43 +01:00
arrange ( fullname ) %>%
filter ( fullname != " " )
2022-10-05 09:12:22 +02:00
# fix rank
2024-07-16 14:51:57 +02:00
taxonomy %>% count ( rank , sort = TRUE )
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
mutate ( rank = case_when (
subspecies != " " ~ " subspecies" ,
species != " " ~ " species" ,
genus != " " ~ " genus" ,
family != " " ~ " family" ,
order != " " ~ " order" ,
class != " " ~ " class" ,
phylum != " " ~ " phylum" ,
kingdom != " " ~ " kingdom" ,
TRUE ~ NA_character_
) )
2024-07-16 14:51:57 +02:00
taxonomy %>% count ( rank , sort = TRUE )
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
taxonomy0 <- taxonomy
2024-02-24 15:16:52 +01:00
# at this point, it happens that some genera within kingdoms have multiple families / orders, etc., see here:
2024-07-16 14:51:57 +02:00
taxonomy %>% filter ( genus != " " ) %>% group_by ( kingdom , genus ) %>% filter ( n_distinct ( family ) > 1 ) %>% View ( )
2024-02-24 15:16:52 +01:00
# so make this universal
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
group_by ( kingdom , genus ) %>%
mutate ( family = get_top_lvl ( family , rank , source , " genus" , genus ) ) %>%
group_by ( kingdom , family ) %>%
mutate ( order = get_top_lvl ( order , rank , source , " family" , family ) ) %>%
group_by ( kingdom , order ) %>%
mutate ( class = get_top_lvl ( class , rank , source , " order" , order ) ) %>%
group_by ( kingdom , class ) %>%
2024-09-29 22:17:56 +02:00
mutate ( phylum = get_top_lvl ( phylum , rank , source , " class" , class ) ) %>%
2024-02-24 15:16:52 +01:00
ungroup ( )
# and remove the taxonomy where it must remain empty
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
2024-09-29 22:17:56 +02:00
mutate ( phylum = if_else ( rank %in% c ( " kingdom" ) , " " , phylum ) ,
class = if_else ( rank %in% c ( " kingdom" , " phylum" ) , " " , class ) ,
order = if_else ( rank %in% c ( " kingdom" , " phylum" , " class" ) , " " , order ) ,
family = if_else ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" ) , " " , family ) ,
genus = if_else ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" ) , " " , genus ) ,
species = if_else ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" , " genus" ) , " " , species ) ,
subspecies = if_else ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" , " genus" , " species" ) , " " , subspecies ) )
2024-02-24 15:16:52 +01:00
2022-12-16 16:10:43 +01:00
2024-09-29 22:17:56 +02:00
# Save intermediate results (0) -------------------------------------------------------------------
2022-12-16 16:10:43 +01:00
saveRDS ( taxonomy , " data-raw/taxonomy0.rds" )
2024-09-29 22:17:56 +02:00
# taxonomy <- readRDS("data-raw/taxonomy0.rds")
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Add missing and fix old taxonomic entries -------------------------------------------------------
2022-10-05 09:12:22 +02:00
# this part will make sure that the whole taxonomy of every included species exists, so no missing genera, classes, etc.
current_gbif <- taxonomy_gbif.bak %>%
filter ( is.na ( acceptedNameUsageID ) ) %>%
mutate (
taxonID = as.character ( taxonID ) ,
parentNameUsageID = as.character ( parentNameUsageID )
)
# add missing kingdoms
2024-02-24 15:16:52 +01:00
taxonomy_all_missing <- taxonomy %>%
filter ( kingdom != " " ) %>%
distinct ( kingdom ) %>%
mutate (
fullname = kingdom ,
rank = " kingdom"
) %>%
filter ( ! paste ( kingdom , rank ) %in% paste ( taxonomy $ kingdom , taxonomy $ rank ) ) %>%
left_join (
current_gbif %>%
select ( kingdom , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" )
) %>%
2024-09-29 22:17:56 +02:00
mutate ( source = if_else ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = if_else ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
2022-10-05 09:12:22 +02:00
# 2 = phylum ... 6 = genus
for ( i in 2 : 6 ) {
i_name <- colnames ( taxonomy ) [i + 1 ]
message ( " Adding missing: " , i_name , " ... " , appendLF = FALSE )
to_add <- taxonomy %>%
filter ( .[ [i + 1 ] ] != " " ) %>%
distinct ( kingdom , .[ [i + 1 ] ] , .keep_all = TRUE ) %>%
select ( kingdom : ( i + 1 ) ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
fullname = .[ [ncol ( .) ] ] ,
2024-02-24 15:16:52 +01:00
rank = i_name
2022-10-05 09:12:22 +02:00
) %>%
2024-02-24 15:16:52 +01:00
filter ( ! paste ( kingdom , .[ [ncol ( .) - 2 ] ] , rank ) %in% paste ( taxonomy $ kingdom , taxonomy [ [i + 1 ] ] , taxonomy $ rank ) ) %>%
2023-01-23 15:01:21 +01:00
# get GBIF identifier where available
left_join (
current_gbif %>%
select ( kingdom , all_of ( i_name ) , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , i_name )
) %>%
2024-09-29 22:17:56 +02:00
mutate ( source = if_else ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = if_else ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
2022-10-05 09:12:22 +02:00
message ( " n = " , nrow ( to_add ) )
2024-02-24 15:16:52 +01:00
taxonomy_all_missing <- taxonomy_all_missing %>%
bind_rows ( to_add )
2024-09-29 22:17:56 +02:00
rm ( to_add )
2022-10-29 14:15:23 +02:00
}
2022-12-12 00:14:56 +01:00
taxonomy_all_missing %>% View ( )
2022-10-30 14:31:45 +01:00
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>%
bind_rows ( taxonomy_all_missing )
# fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class)
2023-01-23 15:01:21 +01:00
taxonomy <- taxonomy %>%
mutate (
rank_index = case_when (
rank == " subspecies" ~ 1 ,
rank == " species" ~ 2 ,
rank == " genus" ~ 3 ,
rank == " family" ~ 4 ,
rank == " order" ~ 5 ,
rank == " class" ~ 6 ,
TRUE ~ 7
) ,
fullname_rank = paste0 ( fullname , " {" , rank , " }" )
) %>%
arrange ( kingdom , fullname , rank_index ) %>%
group_by ( kingdom , fullname ) %>%
mutate ( fullname = if_else ( row_number ( ) > 1 , fullname_rank , fullname ) ) %>%
ungroup ( ) %>%
select ( - fullname_rank , - rank_index ) %>%
2022-12-12 00:14:56 +01:00
arrange ( fullname )
2022-10-29 14:15:23 +02:00
2024-02-24 15:16:52 +01:00
# now also add missing species that have subspecies (requires combination with genus)
2024-07-16 14:51:57 +02:00
missing_species <- taxonomy %>%
filter ( species != " " ) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) %>%
select ( kingdom : species ) %>%
mutate (
fullname = paste ( genus , species ) ,
rank = " species"
) %>%
filter ( ! paste ( kingdom , genus , species , rank ) %in% paste ( taxonomy $ kingdom , taxonomy $ genus , taxonomy $ species , taxonomy $ rank ) ) %>%
# get GBIF identifier where available
left_join (
current_gbif %>%
select ( kingdom , genus , species = specificEpithet , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , " genus" , " species" )
) %>%
2024-09-29 22:17:56 +02:00
mutate ( source = if_else ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = if_else ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
2024-07-16 14:51:57 +02:00
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
2024-07-16 14:51:57 +02:00
bind_rows ( missing_species )
2022-10-05 09:12:22 +02:00
# remove NAs from taxonomy again, and keep unique full names
taxonomy <- taxonomy %>%
2024-09-29 22:17:56 +02:00
mutate ( across ( kingdom : subspecies , function ( x ) if_else ( is.na ( x ) , " " , x ) ) ) %>%
2024-07-16 14:51:57 +02:00
arrange ( kingdom , fullname , ref ) %>%
2022-08-28 10:31:50 +02:00
distinct ( kingdom , fullname , .keep_all = TRUE ) %>%
2022-10-05 09:12:22 +02:00
filter ( kingdom != " " )
2019-03-18 14:29:41 +01:00
2022-12-12 00:14:56 +01:00
2024-09-29 22:17:56 +02:00
# Save intermediate results (1) -------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
saveRDS ( taxonomy , " data-raw/taxonomy1.rds" )
2024-09-29 22:17:56 +02:00
# taxonomy <- readRDS("data-raw/taxonomy1.rds")
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Get previously manually added entries -----------------------------------------------------------
2022-10-05 09:12:22 +02:00
manually_added <- AMR :: microorganisms %>%
2024-02-24 15:16:52 +01:00
filter ( source == " manually added" ,
! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ,
! rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" ) ) %>%
2022-10-05 09:12:22 +02:00
select ( fullname : subspecies , ref , source , rank )
# get latest taxonomy for those entries
2024-09-29 22:17:56 +02:00
`%if_na%` <- function ( x , y ) if ( is.na ( x ) ) y else x
2022-10-05 09:12:22 +02:00
for ( g in unique ( manually_added $ genus [manually_added $ genus != " " & manually_added $ genus %in% taxonomy $ genus ] ) ) {
2024-09-29 22:17:56 +02:00
manually_added $ family [which ( manually_added $ genus == g ) ] <-
taxonomy $ family [which ( taxonomy $ genus == g & ! is.na ( taxonomy $ lpsn ) ) ] [1 ] %if_na%
taxonomy $ family [which ( taxonomy $ genus == g & ! is.na ( taxonomy $ mycobank ) ) ] [1 ] %if_na%
taxonomy $ family [which ( taxonomy $ genus == g ) ] [1 ]
2022-10-05 09:12:22 +02:00
}
for ( f in unique ( manually_added $ family [manually_added $ family != " " & manually_added $ family %in% taxonomy $ family ] ) ) {
2024-09-29 22:17:56 +02:00
manually_added $ order [which ( manually_added $ family == f ) ] <-
taxonomy $ order [which ( taxonomy $ family == f & ! is.na ( taxonomy $ lpsn ) ) ] [1 ] %if_na%
taxonomy $ order [which ( taxonomy $ family == f & ! is.na ( taxonomy $ mycobank ) ) ] [1 ] %if_na%
taxonomy $ order [which ( taxonomy $ family == f ) ] [1 ]
2022-10-05 09:12:22 +02:00
}
for ( o in unique ( manually_added $ order [manually_added $ order != " " & manually_added $ order %in% taxonomy $ order ] ) ) {
2024-09-29 22:17:56 +02:00
manually_added $ class [which ( manually_added $ order == o ) ] <-
taxonomy $ class [which ( taxonomy $ order == o & ! is.na ( taxonomy $ lpsn ) ) ] [1 ] %if_na%
taxonomy $ class [which ( taxonomy $ order == o & ! is.na ( taxonomy $ mycobank ) ) ] [1 ] %if_na%
taxonomy $ class [which ( taxonomy $ order == o ) ] [1 ]
2022-10-05 09:12:22 +02:00
}
for ( cc in unique ( manually_added $ class [manually_added $ class != " " & manually_added $ class %in% taxonomy $ class ] ) ) {
2024-09-29 22:17:56 +02:00
manually_added $ phylum [which ( manually_added $ class == cc ) ] <-
taxonomy $ phylum [which ( taxonomy $ class == cc & ! is.na ( taxonomy $ lpsn ) ) ] [1 ] %if_na%
taxonomy $ phylum [which ( taxonomy $ class == cc & ! is.na ( taxonomy $ mycobank ) ) ] [1 ] %if_na%
taxonomy $ phylum [which ( taxonomy $ class == cc ) ] [1 ]
2022-10-05 09:12:22 +02:00
}
2022-12-16 16:10:43 +01:00
for ( p in unique ( manually_added $ phylum [manually_added $ phylum != " " & manually_added $ phylum %in% taxonomy $ phylum ] ) ) {
2024-09-29 22:17:56 +02:00
manually_added $ kingdom [which ( manually_added $ phylum == p ) ] <-
taxonomy $ kingdom [which ( taxonomy $ phylum == p & ! is.na ( taxonomy $ lpsn ) ) ] [1 ] %if_na%
taxonomy $ kingdom [which ( taxonomy $ phylum == p & ! is.na ( taxonomy $ mycobank ) ) ] [1 ] %if_na%
taxonomy $ kingdom [which ( taxonomy $ phylum == p ) ] [1 ]
2022-12-16 16:10:43 +01:00
}
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
manually_added <- manually_added %>%
mutate (
2024-02-24 15:16:52 +01:00
status = " unknown" ,
2024-09-29 22:17:56 +02:00
rank = if_else ( fullname %like% " unknown" , " (unknown rank)" , rank )
2022-12-12 00:14:56 +01:00
)
2024-07-16 14:51:57 +02:00
manually_added %>% View ( )
2022-10-05 09:12:22 +02:00
2024-02-24 15:16:52 +01:00
# these are now included in the new taxonomy, check them
2024-07-16 14:51:57 +02:00
manually_added %>% filter ( fullname %in% taxonomy $ fullname )
2024-02-24 15:16:52 +01:00
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
2022-12-16 16:10:43 +01:00
# here also the 'unknowns' are added, such as "(unknown fungus)"
2022-12-12 00:14:56 +01:00
bind_rows ( manually_added ) %>%
2022-10-05 09:12:22 +02:00
arrange ( fullname )
table ( taxonomy $ rank , useNA = " always" )
2024-07-16 14:51:57 +02:00
2024-09-29 22:17:56 +02:00
# Get LPSN data for records missing from `taxonomy_lpsn` ------------------------------------------
2024-02-24 15:16:52 +01:00
2024-09-29 22:17:56 +02:00
# Weirdly enough, some LPSN records are lacking from the API and the CSV file (i.e., `taxonomy_lpsn`),
# such as family Thiotrichaceae and its order Thiotrichales, or the genus Coleospermum. When running
2024-07-16 14:51:57 +02:00
# get_lpsn_and_author("family", "Thiotrichaceae") you do get a result, vs. taxonomy_lpsn %>% filter(family == "Thiotrichaceae").
2024-02-24 15:16:52 +01:00
# So check every non-LPSN records from the kingdom of Bacteria and add it
2024-09-29 22:17:56 +02:00
gbif_bacteria <- which ( taxonomy $ kingdom == " Bacteria" & taxonomy $ source %in% c ( " GBIF" , " manually added" ) & taxonomy $ rank %in% c ( " phylum" , " class" , " order" , " family" , " genus" ) )
2024-02-24 15:16:52 +01:00
added <- 0
pb <- progress_bar $ new ( total = length ( gbif_bacteria ) , format = " [:bar] :current/:total :eta" )
for ( record in gbif_bacteria ) {
pb $ tick ( )
lpsn <- get_lpsn_and_author ( rank = taxonomy $ rank [record ] ,
name = taxonomy $ fullname [record ] )
if ( is.na ( lpsn [ " lpsn" ] ) ) {
next
} else {
added <- added + 1
taxonomy $ source [record ] <- " LPSN"
taxonomy $ lpsn [record ] <- unname ( lpsn [ " lpsn" ] )
taxonomy $ ref [record ] <- unname ( lpsn [ " ref" ] )
taxonomy $ status [record ] <- unname ( lpsn [ " status" ] )
}
}
2024-07-16 14:51:57 +02:00
warnings ( )
2024-02-24 15:16:52 +01:00
message ( added , " GBIF records altered to latest LPSN" )
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# parent LPSNs will be added later
saveRDS ( taxonomy , " data-raw/taxonomy1b.rds" )
# taxonomy <- readRDS("data-raw/taxonomy1b.rds")
# Clean scientific reference ----------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
mutate ( ref = get_author_year ( ref ) )
2019-08-06 14:39:22 +02:00
2019-09-18 15:46:09 +02:00
2024-09-29 22:17:56 +02:00
# Get the latest upper taxonomy from LPSN/MycoBank for GBIF data ---------------------------------------
2022-12-16 16:10:43 +01:00
2024-09-29 22:17:56 +02:00
# we did this for the manually added ones (from the current AMR pkg version), but not for the new GBIF records
2024-02-24 15:16:52 +01:00
# (e.g., phylum above class "Bacilli" was still "Firmicutes" in 2023, should be "Bacillota")
2022-12-16 16:10:43 +01:00
for ( k in unique ( taxonomy $ kingdom [taxonomy $ kingdom != " " ] ) ) {
2024-07-16 14:51:57 +02:00
if ( k == " Fungi" ) {
src <- " MycoBank"
} else {
src <- " LPSN"
}
message ( " Fixing GBIF taxonomy for kingdom " , k , " based on " , src , " ." , appendLF = FALSE )
2022-12-16 16:10:43 +01:00
i <- 0
2024-07-16 14:51:57 +02:00
for ( g in unique ( taxonomy $ genus [taxonomy $ genus != " " & taxonomy $ kingdom == k & taxonomy $ source == src ] ) ) {
2022-12-16 16:10:43 +01:00
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
2024-07-16 14:51:57 +02:00
taxonomy $ family [which ( taxonomy $ genus == g & taxonomy $ kingdom == k ) ] <- taxonomy $ family [which ( taxonomy $ genus == g & taxonomy $ kingdom == k & taxonomy $ source == src ) ] [1 ]
2022-12-16 16:10:43 +01:00
}
2024-07-16 14:51:57 +02:00
for ( f in unique ( taxonomy $ family [taxonomy $ family != " " & taxonomy $ kingdom == k & taxonomy $ source == src ] ) ) {
2022-12-16 16:10:43 +01:00
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
2024-07-16 14:51:57 +02:00
taxonomy $ order [which ( taxonomy $ family == f & taxonomy $ kingdom == k ) ] <- taxonomy $ order [which ( taxonomy $ family == f & taxonomy $ kingdom == k & taxonomy $ source == src ) ] [1 ]
2022-12-16 16:10:43 +01:00
}
2024-07-16 14:51:57 +02:00
for ( o in unique ( taxonomy $ order [taxonomy $ order != " " & taxonomy $ kingdom == k & taxonomy $ source == src ] ) ) {
2022-12-16 16:10:43 +01:00
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
2024-07-16 14:51:57 +02:00
taxonomy $ class [which ( taxonomy $ order == o & taxonomy $ kingdom == k ) ] <- taxonomy $ class [which ( taxonomy $ order == o & taxonomy $ kingdom == k & taxonomy $ source == src ) ] [1 ]
2022-12-16 16:10:43 +01:00
}
2024-07-16 14:51:57 +02:00
for ( cc in unique ( taxonomy $ class [taxonomy $ class != " " & taxonomy $ kingdom == k & taxonomy $ source == src ] ) ) {
2022-12-16 16:10:43 +01:00
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
2024-07-16 14:51:57 +02:00
taxonomy $ phylum [which ( taxonomy $ class == cc & taxonomy $ kingdom == k ) ] <- taxonomy $ phylum [which ( taxonomy $ class == cc & taxonomy $ kingdom == k & taxonomy $ source == src ) ] [1 ]
2022-12-16 16:10:43 +01:00
}
message ( " OK." )
}
2022-12-12 00:14:56 +01:00
# fix rank
taxonomy <- taxonomy %>%
mutate ( rank = case_when (
subspecies != " " ~ " subspecies" ,
species != " " ~ " species" ,
genus != " " ~ " genus" ,
family != " " ~ " family" ,
order != " " ~ " order" ,
class != " " ~ " class" ,
phylum != " " ~ " phylum" ,
kingdom != " " ~ " kingdom" ,
TRUE ~ NA_character_
) )
2019-09-18 15:46:09 +02:00
2024-09-29 22:17:56 +02:00
saveRDS ( taxonomy , " data-raw/taxonomy1c.rds" )
# taxonomy <- readRDS("data-raw/taxonomy1c.rds")
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Add parent identifiers --------------------------------------------------------------------------
2024-07-16 14:51:57 +02:00
# requires full name and full taxonomy
taxonomy <- taxonomy %>%
mutate (
lpsn_parent = case_when (
rank == " phylum" ~ lpsn [match ( kingdom , fullname ) ] ,
# in class, take parent from phylum if available, otherwise kingdom
rank == " class" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " class" ~ lpsn [match ( kingdom , fullname ) ] ,
# in order, take parent from class if available, otherwise phylum, otherwise kingdom
rank == " order" & class != " " ~ lpsn [match ( class , fullname ) ] ,
rank == " order" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " order" ~ lpsn [match ( kingdom , fullname ) ] ,
# family
rank == " family" & order != " " ~ lpsn [match ( order , fullname ) ] ,
rank == " family" & class != " " ~ lpsn [match ( class , fullname ) ] ,
rank == " family" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " family" ~ lpsn [match ( kingdom , fullname ) ] ,
# genus
rank == " genus" & family != " " ~ lpsn [match ( family , fullname ) ] ,
rank == " genus" & order != " " ~ lpsn [match ( order , fullname ) ] ,
rank == " genus" & class != " " ~ lpsn [match ( class , fullname ) ] ,
rank == " genus" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " genus" ~ lpsn [match ( kingdom , fullname ) ] ,
# species, always has a genus
rank == " species" ~ lpsn [match ( genus , fullname ) ] ,
2024-07-17 14:29:55 +02:00
# subspecies, always has a genus + species
rank == " subspecies" ~ lpsn [match ( paste ( genus , species ) , fullname ) ] ,
2024-07-16 14:51:57 +02:00
TRUE ~ NA_character_ ) ,
mycobank_parent = case_when (
rank == " phylum" ~ mycobank [match ( kingdom , fullname ) ] ,
# class
rank == " class" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " class" ~ mycobank [match ( kingdom , fullname ) ] ,
# order
rank == " order" & class != " " ~ mycobank [match ( class , fullname ) ] ,
rank == " order" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " order" ~ mycobank [match ( kingdom , fullname ) ] ,
# family
rank == " family" & order != " " ~ mycobank [match ( order , fullname ) ] ,
rank == " family" & class != " " ~ mycobank [match ( class , fullname ) ] ,
rank == " family" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " family" ~ mycobank [match ( kingdom , fullname ) ] ,
# genus
rank == " genus" & family != " " ~ mycobank [match ( family , fullname ) ] ,
rank == " genus" & order != " " ~ mycobank [match ( order , fullname ) ] ,
rank == " genus" & class != " " ~ mycobank [match ( class , fullname ) ] ,
rank == " genus" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " genus" ~ mycobank [match ( kingdom , fullname ) ] ,
# species
rank == " species" ~ mycobank [match ( genus , fullname ) ] ,
2024-07-17 14:29:55 +02:00
# subspecies
rank == " subspecies" ~ mycobank [match ( paste ( genus , species ) , fullname ) ] ,
2024-07-16 14:51:57 +02:00
TRUE ~ NA_character_ ) ,
gbif_parent = case_when (
rank == " phylum" ~ gbif [match ( kingdom , fullname ) ] ,
# class
rank == " class" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " class" ~ gbif [match ( kingdom , fullname ) ] ,
# order
rank == " order" & class != " " ~ gbif [match ( class , fullname ) ] ,
rank == " order" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " order" ~ gbif [match ( kingdom , fullname ) ] ,
# family
rank == " family" & order != " " ~ gbif [match ( order , fullname ) ] ,
rank == " family" & class != " " ~ gbif [match ( class , fullname ) ] ,
rank == " family" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " family" ~ gbif [match ( kingdom , fullname ) ] ,
# genus
rank == " genus" & family != " " ~ gbif [match ( family , fullname ) ] ,
rank == " genus" & order != " " ~ gbif [match ( order , fullname ) ] ,
rank == " genus" & class != " " ~ gbif [match ( class , fullname ) ] ,
rank == " genus" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " genus" ~ gbif [match ( kingdom , fullname ) ] ,
# species
rank == " species" ~ gbif [match ( genus , fullname ) ] ,
2024-07-17 14:29:55 +02:00
# subspecies
rank == " subspecies" ~ gbif [match ( paste ( genus , species ) , fullname ) ] ,
2024-07-16 14:51:57 +02:00
TRUE ~ NA_character_ ) )
# these still have no record in our data set:
which ( ! taxonomy $ lpsn_parent %in% taxonomy $ lpsn )
which ( ! taxonomy $ mycobank_parent %in% taxonomy $ mycobank )
which ( ! taxonomy $ gbif_parent %in% taxonomy $ gbif )
2024-09-29 22:17:56 +02:00
# Add prevalence ----------------------------------------------------------------------------------
# this part is required here, because it's needed for filtering on 'relevant' species to keep later on
2022-12-19 15:32:41 +01:00
pathogens <- read_excel ( file_bartlett , sheet = " Tab 6 Full List" )
# get all established, both old and current taxonomic names
2023-01-23 15:01:21 +01:00
established <- pathogens %>%
filter ( status == " established" ) %>%
2022-12-19 15:32:41 +01:00
mutate ( fullname = paste ( genus , species ) ) %>%
2023-01-23 15:01:21 +01:00
pull ( fullname ) %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
2024-09-29 22:17:56 +02:00
sapply ( function ( x ) if ( length ( x ) == 1 ) x else paste ( x [1 ] , x [2 ] ) ) %>%
2023-01-23 15:01:21 +01:00
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
# get all putative, both old and current taxonomic names
2023-01-23 15:01:21 +01:00
putative <- pathogens %>%
filter ( status == " putative" ) %>%
2022-12-19 15:32:41 +01:00
mutate ( fullname = paste ( genus , species ) ) %>%
2023-01-23 15:01:21 +01:00
pull ( fullname ) %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
2024-09-29 22:17:56 +02:00
sapply ( function ( x ) if ( length ( x ) == 1 ) x else paste ( x [1 ] , x [2 ] ) ) %>%
2023-01-23 15:01:21 +01:00
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
established <- established [established %unlike% " unknown" ]
putative <- putative [putative %unlike% " unknown" ]
2023-01-23 15:01:21 +01:00
established_genera <- established %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
2023-01-23 15:01:21 +01:00
putative_genera <- putative %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-20 16:14:04 +01:00
unique ( )
2024-09-29 22:17:56 +02:00
nonbacterial_genera <- AMR ::: MO_RELEVANT_GENERA %>%
2023-01-23 15:01:21 +01:00
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
2022-12-20 16:14:04 +01:00
nonbacterial_genera <- nonbacterial_genera [nonbacterial_genera %unlike% " unknown" ]
2022-12-19 15:32:41 +01:00
# update prevalence based on taxonomy (following the recent and thorough work of Bartlett et al., 2022)
# see https://doi.org/10.1099/mic.0.001269
2023-01-23 15:01:21 +01:00
taxonomy <- taxonomy %>%
2022-12-19 15:32:41 +01:00
mutate ( prevalence = case_when (
2024-09-30 18:46:55 +02:00
# genera of the pathogens mentioned in the World Health Organization's (WHO) Priority Pathogen List
genus %in% MO_WHO_PRIORITY_GENERA ~ 1.0 ,
2022-12-20 16:14:04 +01:00
# 'established' means 'have infected at least three persons in three or more references'
2024-09-30 18:46:55 +02:00
paste ( genus , species ) %in% established & rank %in% c ( " species" , " subspecies" ) ~ 1.15 ,
2022-12-20 16:14:04 +01:00
# other genera in the 'established' group
2024-09-30 18:46:55 +02:00
genus %in% established_genera & rank == " genus" ~ 1.15 ,
2024-09-29 22:17:56 +02:00
2022-12-20 16:14:04 +01:00
# 'putative' means 'fewer than three known cases'
paste ( genus , species ) %in% putative & rank %in% c ( " species" , " subspecies" ) ~ 1.25 ,
# other genera in the 'putative' group
genus %in% putative_genera & rank == " genus" ~ 1.25 ,
2024-07-16 14:51:57 +02:00
# we keep track of prevalent genera too of non-bacterial species
2024-09-29 22:17:56 +02:00
genus %in% AMR ::: MO_RELEVANT_GENERA & kingdom != " Bacteria" & rank %in% c ( " genus" , " species" , " subspecies" ) ~ 1.25 ,
2024-07-16 14:51:57 +02:00
2022-12-20 16:14:04 +01:00
# species and subspecies in 'established' and 'putative' groups
genus %in% c ( established_genera , putative_genera ) & rank %in% c ( " species" , " subspecies" ) ~ 1.5 ,
# other species from a genus in either group
genus %in% nonbacterial_genera & rank %in% c ( " genus" , " species" , " subspecies" ) ~ 1.5 ,
2024-07-16 14:51:57 +02:00
2022-12-20 16:14:04 +01:00
# all others
2023-01-23 15:01:21 +01:00
TRUE ~ 2.0
) )
2022-12-19 15:32:41 +01:00
table ( taxonomy $ prevalence , useNA = " always" )
# (a lot will be removed further below)
2024-09-29 22:17:56 +02:00
# Save intermediate results (2) -------------------------------------------------------------------
2022-12-16 16:10:43 +01:00
saveRDS ( taxonomy , " data-raw/taxonomy2.rds" )
2024-07-16 14:51:57 +02:00
# taxonomy <- readRDS("data-raw/taxonomy2.rds")
2024-09-29 22:17:56 +02:00
# Remove unwanted taxonomic entries ---------------------------------------------------------------
2024-07-16 14:51:57 +02:00
2024-09-19 11:44:56 +02:00
part1 <- taxonomy %>%
2024-07-16 14:51:57 +02:00
filter (
# keep all we added ourselves:
source == " manually added" |
# keep all bacteria anyway, main focus of our package:
kingdom == " Bacteria" |
# and these kingdoms are very small, and also mainly microorganisms:
kingdom == " Protozoa" |
kingdom == " Archaea" |
kingdom == " Chromista" |
# keep everything from family up, 'ghost' entries will be removed later on
rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" ) |
# other relevant genera to keep:
2024-09-29 22:17:56 +02:00
genus %in% AMR ::: MO_RELEVANT_GENERA |
2024-07-16 14:51:57 +02:00
# relevant for biotechnology:
genus %in% c ( " Archaeoglobus" , " Desulfurococcus" , " Ferroglobus" , " Ferroplasma" , " Halobacterium" , " Halococcus" , " Haloferax" , " Naegleria" , " Nanoarchaeum" , " Nitrosopumilus" , " Nosema" , " Pleistophora" , " Pyrobaculum" , " Pyrococcus" , " Spraguea" , " Thelohania" , " Thermococcus" , " Thermoplasma" , " Thermoproteus" , " Tritrichomonas" ) |
genus %like_case% " Methano" |
# kingdom of Protozoa:
( phylum %in% c ( " Choanozoa" , " Mycetozoa" ) & prevalence < 2 ) |
# Fungi:
2024-09-19 11:44:56 +02:00
( kingdom == " Fungi" & ( ! rank %in% c ( " genus" , " species" , " subspecies" ) | prevalence < 2 | class == " Pichiomycetes" ) ) |
2024-07-16 14:51:57 +02:00
# Animalia:
genus %in% c ( " Lucilia" , " Lumbricus" ) |
( class == " Insecta" & ! rank %in% c ( " species" , " subspecies" ) ) | # keep only genus of insects, not all of their (sub)species
( genus == " Amoeba" & kingdom != " Animalia" ) # keep only in the protozoa, not the animalia
) %>%
# this kingdom only contained Curvularia and Hymenolepis, which have coincidental twin names with Fungi
filter ( kingdom != " Plantae" ,
! ( genus %in% c ( " Aedes" , " Anopheles" ) & rank %in% c ( " species" , " subspecies" ) ) )
2024-09-19 11:44:56 +02:00
# now get the parents and old names
part2 <- taxonomy %>%
filter ( gbif %in% c ( part1 $ gbif_parent [ ! is.na ( part1 $ gbif_parent ) ] , part1 $ gbif_renamed_to [ ! is.na ( part1 $ gbif_renamed_to ) ] ) |
mycobank %in% c ( part1 $ mycobank_parent [ ! is.na ( part1 $ mycobank_parent ) ] , part1 $ mycobank_renamed_to [ ! is.na ( part1 $ mycobank_renamed_to ) ] ) |
lpsn %in% c ( part1 $ lpsn_parent [ ! is.na ( part1 $ lpsn_parent ) ] , part1 $ lpsn_renamed_to [ ! is.na ( part1 $ lpsn_renamed_to ) ] ) )
parts <- bind_rows ( part1 , part2 )
part3 <- taxonomy %>%
filter ( gbif %in% c ( parts $ gbif_parent [ ! is.na ( parts $ gbif_parent ) ] , parts $ gbif_renamed_to [ ! is.na ( parts $ gbif_renamed_to ) ] ) |
mycobank %in% c ( parts $ mycobank_parent [ ! is.na ( parts $ mycobank_parent ) ] , parts $ mycobank_renamed_to [ ! is.na ( parts $ mycobank_renamed_to ) ] ) |
lpsn %in% c ( parts $ lpsn_parent [ ! is.na ( parts $ lpsn_parent ) ] , parts $ lpsn_renamed_to [ ! is.na ( parts $ lpsn_renamed_to ) ] ) )
parts <- bind_rows ( part1 , part2 , part3 )
part4 <- taxonomy %>%
filter ( gbif %in% c ( parts $ gbif_parent [ ! is.na ( parts $ gbif_parent ) ] , parts $ gbif_renamed_to [ ! is.na ( parts $ gbif_renamed_to ) ] ) |
mycobank %in% c ( parts $ mycobank_parent [ ! is.na ( parts $ mycobank_parent ) ] , parts $ mycobank_renamed_to [ ! is.na ( parts $ mycobank_renamed_to ) ] ) |
lpsn %in% c ( parts $ lpsn_parent [ ! is.na ( parts $ lpsn_parent ) ] , parts $ lpsn_renamed_to [ ! is.na ( parts $ lpsn_renamed_to ) ] ) )
parts <- bind_rows ( part1 , part2 , part3 , part4 )
taxonomy <- bind_rows ( part1 , part2 , part3 , part4 ) %>%
2024-09-29 22:17:56 +02:00
mutate (
# here we must prefer in this order: LPSN > MycoBank > GBIF
source_index = case_when ( source == " LPSN" ~ 1 ,
source == " MycoBank" ~ 2 ,
source == " GBIF" ~ 3 ,
# manually added:
TRUE ~ 4 ) ,
# also arrange on rank, otherwise e.g. Cyptococcus comes from Animalia, not Fungi
rank_index = case_when (
kingdom == " Bacteria" ~ 1 ,
kingdom == " Fungi" ~ 2 ,
kingdom == " Protozoa" ~ 3 ,
kingdom == " Archaea" ~ 4 ,
kingdom == " Chromista" ~ 5 ,
kingdom == " Animalia" ~ 6 ,
TRUE ~ 7 ) ) %>%
arrange ( source_index , rank_index , fullname ) %>%
distinct ( fullname , .keep_all = TRUE ) %>%
select ( - c ( source_index , rank_index ) )
# some manual updates, remove later
taxonomy $ family [taxonomy $ genus == " Blastocystis" ] <- " Blastocystidae"
taxonomy $ ref [which ( taxonomy $ fullname == " Salmonella Typhimurium" ) ] <- " Castellani et al., 1919"
taxonomy $ ref [which ( taxonomy $ fullname == " Salmonella Paratyphi" ) ] <- " Ezaki et al., 2000"
taxonomy $ ref [which ( taxonomy $ fullname == " Salmonella Typhi" ) ] <- " Warren et al., 1930"
# first make sure that species of a genus cannot be across multiple kingdoms, since otherwise IDs cannot be given correctly
# (e.g., Amoeba has some species in Protozoa and some in Animalia)
# we acknowledge that this may be taxonomically right, but we need the same KINGDOM_GENUS identifier for each record
taxonomy <- taxonomy %>%
group_by ( genus ) %>%
mutate (
kingdom = if_else ( rank != " genus" & genus != " " & fullname %unlike% " unknown" , first ( kingdom [rank == " genus" ] ) , kingdom ) ,
phylum = if_else ( rank != " genus" & genus != " " & fullname %unlike% " unknown" , first ( phylum [rank == " genus" ] ) , phylum ) ,
class = if_else ( rank != " genus" & genus != " " & fullname %unlike% " unknown" , first ( class [rank == " genus" ] ) , class ) ,
order = if_else ( rank != " genus" & genus != " " & fullname %unlike% " unknown" , first ( order [rank == " genus" ] ) , order ) ,
family = if_else ( rank != " genus" & genus != " " & fullname %unlike% " unknown" , first ( family [rank == " genus" ] ) , family ) ,
prevalence = if_else ( rank != " genus" & genus != " " & fullname %unlike% " unknown" , first ( prevalence [rank == " genus" ] ) , prevalence )
) %>%
ungroup ( ) %>%
arrange ( fullname )
# update the taxonomic names based on new genus classification
taxonomy <- taxonomy %>%
# first, fix kingdom based on family level (only when family is not empty)
group_by ( family ) %>%
mutate ( kingdom = if_else ( family != " " , get_top_lvl ( kingdom , rank , source , " phylum" , phylum ) , kingdom ) ,
phylum = if_else ( family != " " , first ( phylum [phylum != " " & rank != " phylum" ] ) , phylum ) ,
class = if_else ( family != " " , first ( class [class != " " & rank != " class" ] ) , class ) ,
order = if_else ( family != " " , first ( order [order != " " & rank != " order" ] ) , order ) ) %>%
# next, update all taxonomic layers
group_by ( kingdom , genus ) %>%
mutate ( family = get_top_lvl ( family , rank , source , " genus" , genus ) ) %>%
group_by ( kingdom , family ) %>%
mutate ( order = get_top_lvl ( order , rank , source , " family" , family ) ) %>%
group_by ( kingdom , order ) %>%
mutate ( class = get_top_lvl ( class , rank , source , " order" , order ) ) %>%
group_by ( kingdom , class ) %>%
mutate ( phylum = get_top_lvl ( phylum , rank , source , " class" , class ) ) %>%
ungroup ( ) %>%
2024-09-19 11:44:56 +02:00
arrange ( fullname ) %>%
2024-09-29 22:17:56 +02:00
mutate ( across ( phylum : family , ~ if_else ( is.na ( .x ) , " " , .x ) ) )
2024-09-19 11:44:56 +02:00
2024-09-29 22:17:56 +02:00
# no ghost families, orders, classes, phyla
# (but keep the ghost families of bacteria)
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
group_by ( kingdom , family ) %>%
2024-09-29 22:17:56 +02:00
filter ( n ( ) > 1 | fullname %like% " unknown" | rank %in% c ( " kingdom" , " genus" , " species" , " subspecies" ) | kingdom == " Bacteria" ) %>%
2024-07-16 14:51:57 +02:00
group_by ( kingdom , order ) %>%
2024-09-29 22:17:56 +02:00
filter ( n ( ) > 1 | fullname %like% " unknown" | rank %in% c ( " kingdom" , " genus" , " species" , " subspecies" ) | family != " " ) %>%
2024-07-16 14:51:57 +02:00
group_by ( kingdom , class ) %>%
2024-09-29 22:17:56 +02:00
filter ( n ( ) > 1 | fullname %like% " unknown" | rank %in% c ( " kingdom" , " genus" , " species" , " subspecies" ) | family != " " | order != " " ) %>%
2024-07-16 14:51:57 +02:00
group_by ( kingdom , phylum ) %>%
2024-09-29 22:17:56 +02:00
filter ( n ( ) > 1 | fullname %like% " unknown" | rank %in% c ( " kingdom" , " genus" , " species" , " subspecies" ) | family != " " | order != " " | class != " " ) %>%
2024-07-16 14:51:57 +02:00
ungroup ( )
2022-12-16 16:10:43 +01:00
2024-09-29 22:17:56 +02:00
saveRDS ( taxonomy , " data-raw/taxonomy2b.rds" )
# taxonomy <- readRDS("data-raw/taxonomy2b.rds")
2022-12-16 16:10:43 +01:00
2024-09-29 22:17:56 +02:00
# Add microbial IDs -------------------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# (MO codes in the AMR package have the form: KINGDOM_GENUS_SPECIES_SUBSPECIES where all are abbreviated)
2022-10-05 09:12:22 +02:00
# Kingdom is abbreviated with 1 character, with exceptions for Animalia and Plantae
mo_kingdom <- taxonomy %>%
filter ( rank == " kingdom" ) %>%
select ( kingdom ) %>%
mutate ( mo_kingdom = case_when (
kingdom == " Animalia" ~ " AN" ,
kingdom == " Archaea" ~ " A" ,
kingdom == " Bacteria" ~ " B" ,
kingdom == " Chromista" ~ " C" ,
kingdom == " Fungi" ~ " F" ,
2024-09-29 22:17:56 +02:00
kingdom == " Plantae" ~ " PL" , # this is actually not part of the scope, having 0 results (2024)
2022-10-05 09:12:22 +02:00
kingdom == " Protozoa" ~ " P" ,
TRUE ~ " "
) )
2024-02-24 15:16:52 +01:00
2022-10-05 09:12:22 +02:00
# phylum until family are abbreviated with 8 characters and prefixed with their rank
# Phylum - keep old and fill up for new ones
mo_phylum <- taxonomy %>%
filter ( rank == " phylum" ) %>%
distinct ( kingdom , phylum ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " phylum" ) %>%
transmute ( kingdom ,
2024-09-29 22:17:56 +02:00
phylum = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
2023-01-23 15:01:21 +01:00
) ,
by = c ( " kingdom" , " phylum" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_phylum8 = AMR ::: abbreviate_mo ( phylum , minlength = 8 , prefix = " [PHL]_" ) ,
mo_phylum9 = AMR ::: abbreviate_mo ( phylum , minlength = 9 , prefix = " [PHL]_" ) ,
2024-09-29 22:17:56 +02:00
mo_phylum = if_else ( ! is.na ( mo_old ) , mo_old , mo_phylum8 ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_phylum ) ,
2024-09-29 22:17:56 +02:00
mo_phylum = if_else ( mo_duplicated , mo_phylum9 , mo_phylum ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_phylum )
) %>%
ungroup ( )
if ( any ( mo_phylum $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for phylum!" )
mo_phylum <- mo_phylum %>%
select ( kingdom , phylum , mo_phylum )
# Class - keep old and fill up for new ones
mo_class <- taxonomy %>%
filter ( rank == " class" ) %>%
distinct ( kingdom , class ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " class" ) %>%
transmute ( kingdom ,
2024-09-29 22:17:56 +02:00
class = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
2023-01-23 15:01:21 +01:00
) ,
by = c ( " kingdom" , " class" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2023-05-08 13:04:18 +02:00
mo_class8 = AMR ::: abbreviate_mo ( class , minlength = 8 , prefix = " [CLS]_" ) ,
mo_class9 = AMR ::: abbreviate_mo ( class , minlength = 9 , prefix = " [CLS]_" ) ,
2024-09-29 22:17:56 +02:00
mo_class = if_else ( ! is.na ( mo_old ) , mo_old , mo_class8 ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_class ) ,
2024-09-29 22:17:56 +02:00
mo_class = if_else ( mo_duplicated , mo_class9 , mo_class ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_class )
) %>%
ungroup ( )
if ( any ( mo_class $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for class!" )
mo_class <- mo_class %>%
select ( kingdom , class , mo_class )
# Order - keep old and fill up for new ones
mo_order <- taxonomy %>%
filter ( rank == " order" ) %>%
distinct ( kingdom , order ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " order" ) %>%
transmute ( kingdom ,
2024-09-29 22:17:56 +02:00
order = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
2023-01-23 15:01:21 +01:00
) ,
by = c ( " kingdom" , " order" )
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
group_by ( kingdom ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2023-05-08 13:04:18 +02:00
mo_order8 = AMR ::: abbreviate_mo ( order , minlength = 8 , prefix = " [ORD]_" ) ,
mo_order9 = AMR ::: abbreviate_mo ( order , minlength = 9 , prefix = " [ORD]_" ) ,
2024-09-29 22:17:56 +02:00
mo_order = if_else ( ! is.na ( mo_old ) , mo_old , mo_order8 ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_order ) ,
2024-09-29 22:17:56 +02:00
mo_order = if_else ( mo_duplicated , mo_order9 , mo_order ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_order )
) %>%
ungroup ( )
if ( any ( mo_order $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for order!" )
mo_order <- mo_order %>%
select ( kingdom , order , mo_order )
# Family - keep old and fill up for new ones
mo_family <- taxonomy %>%
filter ( rank == " family" ) %>%
distinct ( kingdom , family ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " family" ) %>%
transmute ( kingdom ,
2024-09-29 22:17:56 +02:00
family = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
2023-01-23 15:01:21 +01:00
) ,
by = c ( " kingdom" , " family" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_family8 = AMR ::: abbreviate_mo ( family , minlength = 8 , prefix = " [FAM]_" ) ,
mo_family9 = AMR ::: abbreviate_mo ( family , minlength = 9 , prefix = " [FAM]_" ) ,
2024-09-29 22:17:56 +02:00
mo_family = if_else ( ! is.na ( mo_old ) , mo_old , mo_family8 ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_family ) ,
2024-09-29 22:17:56 +02:00
mo_family = if_else ( mo_duplicated , mo_family9 , mo_family ) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_family )
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
ungroup ( )
if ( any ( mo_family $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for family!" )
mo_family <- mo_family %>%
select ( kingdom , family , mo_family )
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# construct code part for genus - keep old code where available and generate new ones where needed
mo_genus <- taxonomy %>%
filter ( rank == " genus" ) %>%
distinct ( kingdom , genus ) %>%
# get available old MO codes
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " genus" ) %>%
transmute ( mo_genus_old = gsub ( " ^[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus ) %>%
distinct ( kingdom , genus , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , .keep_all = TRUE ) %>%
# since kingdom is part of the code, genus abbreviations may be duplicated between kingdoms
group_by ( kingdom ) %>%
# generate new MO codes for genus and set the right one
mutate (
2023-05-08 13:04:18 +02:00
mo_genus_new5 = AMR ::: abbreviate_mo ( genus , 5 ) ,
mo_genus_new5b = paste0 ( AMR ::: abbreviate_mo ( genus , 5 ) , 1 ) ,
2024-07-16 14:51:57 +02:00
mo_genus_new5c = paste0 ( AMR ::: abbreviate_mo ( genus , 5 ) , 2 ) ,
2023-05-08 13:04:18 +02:00
mo_genus_new6 = AMR ::: abbreviate_mo ( genus , 6 ) ,
mo_genus_new7 = AMR ::: abbreviate_mo ( genus , 7 ) ,
mo_genus_new8 = AMR ::: abbreviate_mo ( genus , 8 ) ,
2022-10-05 09:12:22 +02:00
mo_genus_new = case_when (
! is.na ( mo_genus_old ) ~ mo_genus_old ,
! mo_genus_new5 %in% mo_genus_old ~ mo_genus_new5 ,
! mo_genus_new6 %in% mo_genus_old ~ mo_genus_new6 ,
! mo_genus_new7 %in% mo_genus_old ~ mo_genus_new7 ,
! mo_genus_new8 %in% mo_genus_old ~ mo_genus_new8 ,
! mo_genus_new5b %in% mo_genus_old ~ mo_genus_new5b ,
TRUE ~ mo_genus_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_genus_new ) ,
mo_genus_new = case_when (
! mo_duplicated ~ mo_genus_new ,
2024-07-16 14:51:57 +02:00
mo_duplicated & mo_genus_new == mo_genus_old ~ mo_genus_new6 ,
2022-10-05 09:12:22 +02:00
mo_duplicated & mo_genus_new == mo_genus_new5 ~ mo_genus_new6 ,
mo_duplicated & mo_genus_new == mo_genus_new6 ~ mo_genus_new7 ,
mo_duplicated & mo_genus_new == mo_genus_new7 ~ mo_genus_new8 ,
mo_duplicated & mo_genus_new == mo_genus_new8 ~ mo_genus_new5b ,
TRUE ~ NA_character_
2022-08-28 10:31:50 +02:00
) ,
2024-07-16 14:51:57 +02:00
mo_duplicated = duplicated ( mo_genus_new ) ,
mo_genus_new = case_when (
! mo_duplicated ~ mo_genus_new ,
mo_duplicated & mo_genus_new == mo_genus_new6 ~ mo_genus_new7 ,
mo_duplicated & mo_genus_new == mo_genus_new7 ~ mo_genus_new8 ,
mo_duplicated & mo_genus_new == mo_genus_new8 ~ mo_genus_new5b ,
mo_duplicated ~ mo_genus_new5c ,
TRUE ~ NA_character_
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_genus_new )
) %>%
ungroup ( )
if ( any ( mo_genus $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_genus $ mo_genus_new ) ) stop ( " Duplicate MO codes for genus!" )
2024-07-16 14:51:57 +02:00
# mo_genus %>% filter(mo_duplicated)
2022-10-05 09:12:22 +02:00
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_genus <- mo_genus %>%
select ( kingdom , genus , mo_genus = mo_genus_new )
# same for species - keep old where available and create new per kingdom-genus where needed:
mo_species <- taxonomy %>%
filter ( rank == " species" ) %>%
distinct ( kingdom , genus , species ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " species" ) %>%
transmute ( mo_species_old = gsub ( " ^[A-Z]+_[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus , species ) %>%
filter ( mo_species_old %unlike% " -" ) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" , " species" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) %>%
group_by ( kingdom , genus ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_species_new4 = AMR ::: abbreviate_mo ( species , 4 , hyphen_as_space = TRUE ) ,
mo_species_new5 = AMR ::: abbreviate_mo ( species , 5 , hyphen_as_space = TRUE ) ,
mo_species_new5b = paste0 ( AMR ::: abbreviate_mo ( species , 5 , hyphen_as_space = TRUE ) , 1 ) ,
mo_species_new6 = AMR ::: abbreviate_mo ( species , 6 , hyphen_as_space = TRUE ) ,
mo_species_new7 = AMR ::: abbreviate_mo ( species , 7 , hyphen_as_space = TRUE ) ,
mo_species_new8 = AMR ::: abbreviate_mo ( species , 8 , hyphen_as_space = TRUE ) ,
2022-10-05 09:12:22 +02:00
mo_species_new = case_when (
! is.na ( mo_species_old ) ~ mo_species_old ,
! mo_species_new4 %in% mo_species_old ~ mo_species_new4 ,
! mo_species_new5 %in% mo_species_old ~ mo_species_new5 ,
! mo_species_new6 %in% mo_species_old ~ mo_species_new6 ,
! mo_species_new7 %in% mo_species_old ~ mo_species_new7 ,
! mo_species_new8 %in% mo_species_old ~ mo_species_new8 ,
! mo_species_new5b %in% mo_species_old ~ mo_species_new5b ,
TRUE ~ mo_species_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_species_new ) ,
mo_species_new = case_when (
! mo_duplicated ~ mo_species_new ,
mo_duplicated & mo_species_new == mo_species_new4 ~ mo_species_new5 ,
mo_duplicated & mo_species_new == mo_species_new5 ~ mo_species_new6 ,
mo_duplicated & mo_species_new == mo_species_new6 ~ mo_species_new7 ,
mo_duplicated & mo_species_new == mo_species_new7 ~ mo_species_new8 ,
mo_duplicated & mo_species_new == mo_species_new8 ~ mo_species_new5b ,
TRUE ~ NA_character_
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_species_new )
) %>%
ungroup ( )
if ( any ( mo_species $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_species $ mo_species_new ) ) stop ( " Duplicate MO codes for species!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_species <- mo_species %>%
select ( kingdom , genus , species , mo_species = mo_species_new )
# same for subspecies - keep old where available and create new per kingdom-genus-species where needed:
mo_subspecies <- taxonomy %>%
filter ( rank == " subspecies" ) %>%
distinct ( kingdom , genus , species , subspecies ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank %in% c ( " subspecies" , " subsp." , " infraspecies" ) ) %>%
transmute ( mo_subspecies_old = gsub ( " ^[A-Z]+_[A-Z]+_[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus , species , subspecies ) %>%
filter ( mo_subspecies_old %unlike% " -" ) %>%
distinct ( kingdom , genus , species , subspecies , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" , " species" , " subspecies" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , species , subspecies , .keep_all = TRUE ) %>%
group_by ( kingdom , genus , species ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_subspecies_new4 = AMR ::: abbreviate_mo ( subspecies , 4 , hyphen_as_space = TRUE ) ,
mo_subspecies_new5 = AMR ::: abbreviate_mo ( subspecies , 5 , hyphen_as_space = TRUE ) ,
mo_subspecies_new5b = paste0 ( AMR ::: abbreviate_mo ( subspecies , 5 , hyphen_as_space = TRUE ) , 1 ) ,
mo_subspecies_new6 = AMR ::: abbreviate_mo ( subspecies , 6 , hyphen_as_space = TRUE ) ,
mo_subspecies_new7 = AMR ::: abbreviate_mo ( subspecies , 7 , hyphen_as_space = TRUE ) ,
mo_subspecies_new8 = AMR ::: abbreviate_mo ( subspecies , 8 , hyphen_as_space = TRUE ) ,
2022-10-05 09:12:22 +02:00
mo_subspecies_new = case_when (
! is.na ( mo_subspecies_old ) ~ mo_subspecies_old ,
! mo_subspecies_new4 %in% mo_subspecies_old ~ mo_subspecies_new4 ,
! mo_subspecies_new5 %in% mo_subspecies_old ~ mo_subspecies_new5 ,
! mo_subspecies_new6 %in% mo_subspecies_old ~ mo_subspecies_new6 ,
! mo_subspecies_new7 %in% mo_subspecies_old ~ mo_subspecies_new7 ,
! mo_subspecies_new8 %in% mo_subspecies_old ~ mo_subspecies_new8 ,
! mo_subspecies_new5b %in% mo_subspecies_old ~ mo_subspecies_new5b ,
TRUE ~ mo_subspecies_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_subspecies_new ) ,
mo_subspecies_new = case_when (
! mo_duplicated ~ mo_subspecies_new ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new4 ~ mo_subspecies_new5 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new5 ~ mo_subspecies_new6 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new6 ~ mo_subspecies_new7 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new7 ~ mo_subspecies_new8 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new8 ~ mo_subspecies_new5b ,
TRUE ~ NA_character_
) ,
mo_duplicated = duplicated ( mo_subspecies_new )
) %>%
ungroup ( )
if ( any ( mo_subspecies $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_subspecies $ mo_subspecies_new ) ) stop ( " Duplicate MO codes for subspecies!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_subspecies <- mo_subspecies %>%
select ( kingdom , genus , species , subspecies , mo_subspecies = mo_subspecies_new )
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# unknowns - manually added
mo_unknown <- AMR :: microorganisms %>%
filter ( fullname %like% " unknown" ) %>%
transmute ( fullname , mo_unknown = as.character ( mo ) )
# apply the new codes!
taxonomy <- taxonomy %>%
left_join ( mo_kingdom , by = " kingdom" ) %>%
left_join ( mo_phylum , by = c ( " kingdom" , " phylum" ) ) %>%
left_join ( mo_class , by = c ( " kingdom" , " class" ) ) %>%
left_join ( mo_order , by = c ( " kingdom" , " order" ) ) %>%
left_join ( mo_family , by = c ( " kingdom" , " family" ) ) %>%
left_join ( mo_genus , by = c ( " kingdom" , " genus" ) ) %>%
left_join ( mo_species , by = c ( " kingdom" , " genus" , " species" ) ) %>%
left_join ( mo_subspecies , by = c ( " kingdom" , " genus" , " species" , " subspecies" ) ) %>%
left_join ( mo_unknown , by = " fullname" ) %>%
2024-09-29 22:17:56 +02:00
mutate ( across ( starts_with ( " mo_" ) , function ( x ) if_else ( is.na ( x ) , " " , x ) ) ) %>%
2022-10-05 09:12:22 +02:00
mutate (
mo = case_when (
fullname %like% " unknown" ~ mo_unknown ,
# add special cases for taxons higher than genus
rank == " kingdom" ~ paste ( mo_kingdom , " [KNG]" , toupper ( kingdom ) , sep = " _" ) ,
rank == " phylum" ~ paste ( mo_kingdom , mo_phylum , sep = " _" ) ,
rank == " class" ~ paste ( mo_kingdom , mo_class , sep = " _" ) ,
rank == " order" ~ paste ( mo_kingdom , mo_order , sep = " _" ) ,
rank == " family" ~ paste ( mo_kingdom , mo_family , sep = " _" ) ,
TRUE ~ paste ( mo_kingdom , mo_genus , mo_species , mo_subspecies , sep = " _" )
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo = trimws ( gsub ( " _+$" , " " , mo ) ) ,
.before = 1
) %>%
select ( ! starts_with ( " mo_" ) ) %>%
2022-10-30 14:31:45 +01:00
arrange ( fullname )
2022-10-29 14:15:23 +02:00
2024-09-29 22:17:56 +02:00
# now check these duplicate fullnames, they should not exist (didn't at least in 2024)
# this example shows that some could have same fullnames: taxonomy %>% filter(class == genus & class != "")
# such as Kapibacteria (both genus and class), but the class last time correctly got the " {class}" suffix in the fullname
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( fullname %in% .[duplicated ( fullname ) , " fullname" , drop = TRUE ] ) %>%
View ( )
2024-09-29 22:17:56 +02:00
# OTHERWISE uncomment this section:
# # we will prefer kingdoms as Bacteria > Fungi > Protozoa > Archaea > Animalia, so check the ones within 1 kingdom
# taxonomy %>%
# filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE]) %>%
# group_by(fullname) %>%
# filter(n_distinct(kingdom) == 1)
#
# # fullnames must be unique, we'll keep the most relevant ones only
# taxonomy <- taxonomy %>%
# mutate(rank_index = case_when(
# kingdom == "Bacteria" ~ 1,
# kingdom == "Fungi" ~ 2,
# kingdom == "Protozoa" ~ 3,
# kingdom == "Archaea" ~ 4,
# kingdom == "Chromista" ~ 5,
# kingdom == "Animalia" ~ 6,
# TRUE ~ 7
# )) %>%
# arrange(fullname, rank_index) %>%
# distinct(fullname, .keep_all = TRUE) %>%
# select(-rank_index) %>%
# filter(mo != "")
# These were overwritten to Bacteria, fix them
taxonomy $ kingdom [taxonomy $ mo == " UNKNOWN" ] <- " (unknown kingdom)"
taxonomy $ kingdom [taxonomy $ mo == " F_FUNGUS" ] <- " Fungi"
taxonomy $ kingdom [taxonomy $ mo == " F_YEAST" ] <- " Fungi"
taxonomy $ kingdom [taxonomy $ mo == " P_PROTOZOAN" ] <- " Protozoa"
2024-02-24 15:16:52 +01:00
2022-12-12 00:14:56 +01:00
2024-02-24 15:16:52 +01:00
# keep the codes from manually added ones
manual_mos <- as.character ( AMR :: microorganisms $ mo ) [match ( taxonomy $ fullname [taxonomy $ source == " manually added" ] , AMR :: microorganisms $ fullname ) ]
taxonomy $ mo [taxonomy $ source == " manually added" ] [ ! is.na ( manual_mos ) ] <- manual_mos [ ! is.na ( manual_mos ) ]
2024-09-29 22:17:56 +02:00
# fix the kingdoms for manual codes (sometimes species go from Fungi to Chromista or so)
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 1 ) %in% c ( " B" , " U" ) & taxonomy $ kingdom == " Bacteria" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 1 ) <- " B"
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 1 ) %in% c ( " F" , " U" ) & taxonomy $ kingdom == " Fungi" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 1 ) <- " F"
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 1 ) %in% c ( " C" , " U" ) & taxonomy $ kingdom == " Chromista" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 1 ) <- " C"
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 1 ) %in% c ( " A" , " U" ) & taxonomy $ kingdom == " Archaea" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 1 ) <- " A"
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 1 ) %in% c ( " P" , " U" ) & taxonomy $ kingdom == " Protozoa" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 1 ) <- " P"
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 2 ) %in% c ( " AN" , " UN" ) & taxonomy $ kingdom == " Animalia" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 2 ) <- " AN"
substr ( taxonomy $ mo [ ! substr ( taxonomy $ mo , 1 , 2 ) %in% c ( " PL" , " UN" ) & taxonomy $ kingdom == " Plantae" & ! is.na ( taxonomy $ kingdom ) ] , 1 , 2 ) <- " PL"
2024-02-24 15:16:52 +01:00
2024-07-16 14:51:57 +02:00
# remove the manually added ones that are now ghosts
2024-09-29 22:17:56 +02:00
taxonomy %>%
filter ( ( mo %like% " __" & source == " manually added" ) ) %>%
View ( )
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
filter ( ! ( mo %like% " __" & source == " manually added" ) )
2024-09-29 22:17:56 +02:00
2024-07-16 14:51:57 +02:00
# this must not exist (2024: GBIF mess with missing genera - remove them):
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( mo %like% " __" ) %>%
View ( )
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>% filter ( mo %unlike% " __" )
2024-09-29 22:17:56 +02:00
saveRDS ( taxonomy , " data-raw/taxonomy2c.rds" )
# taxonomy <- readRDS("data-raw/taxonomy2c.rds")
2022-12-12 00:14:56 +01:00
2022-12-16 16:10:43 +01:00
2024-09-29 22:17:56 +02:00
# Some integrity checks ---------------------------------------------------------------------------
2022-12-16 16:10:43 +01:00
# are mo codes unique?
2024-07-16 14:51:57 +02:00
taxonomy %>% filter ( mo %in% .[duplicated ( mo ) , " mo" , drop = TRUE ] ) %>% arrange ( mo ) %>% View ( )
2024-09-29 22:17:56 +02:00
# Nope: (2024)
# - There is one weird entry from MycoBank, this is recorded as class while it's a phylum
2024-07-16 14:51:57 +02:00
fix <- which ( taxonomy $ fullname == " Chytridiopsidomycota" )
taxonomy $ class [fix ] <- " "
taxonomy $ rank [fix ] <- " phylum"
taxonomy $ mo [fix ] <- paste0 ( " F_" , AMR ::: abbreviate_mo ( " Chytridiopsidomycota" , minlength = 8 , prefix = " [PHL]_" ) )
2024-09-29 22:17:56 +02:00
# Also:
# - There are multiple cases where the suffix " {species}" was added - these are just duplicates.
# Some are Salmonella errors at LPSN's side - typhimurium is NOT a species of it, species is enterica, serovar is Typhimurium (that we use as subspecies)
2024-07-16 14:51:57 +02:00
taxonomy %>% filter ( mo %in% .[duplicated ( mo ) , " mo" , drop = TRUE ] ) %>% arrange ( mo ) %>% View ( )
# keep the firsts
taxonomy <- taxonomy %>% arrange ( mo ) %>% distinct ( mo , .keep_all = TRUE )
2022-12-16 16:10:43 +01:00
# are fullnames unique?
2024-07-16 14:51:57 +02:00
taxonomy %>% arrange ( fullname ) %>% filter ( fullname %in% .[duplicated ( fullname ) , " fullname" , drop = TRUE ] ) %>% View ( )
# # we can now solve it like this, but check every year if this is okay
# taxonomy <- taxonomy %>%
# group_by(fullname) %>%
# mutate(fullname = if_else(duplicated(fullname), paste0(fullname, " {", rank, "}"), fullname)) %>%
# ungroup()
2020-05-27 16:37:49 +02:00
2022-12-16 16:10:43 +01:00
# are all GBIFs available?
2023-01-23 15:01:21 +01:00
taxonomy %>%
2024-07-16 14:51:57 +02:00
filter ( ( ! gbif_parent %in% gbif ) | ( ! lpsn_parent %in% lpsn ) | ( ! mycobank_parent %in% mycobank ) ) %>%
count ( source = case_when ( ! gbif_parent %in% gbif ~ " GBIF" ,
! lpsn_parent %in% lpsn ~ " LPSN" ,
! mycobank_parent %in% mycobank ~ " MycoBank" ,
TRUE ~ " ?" ) ,
2024-02-24 15:16:52 +01:00
rank )
2024-09-29 22:17:56 +02:00
# so fix again all parent identifiers (exact same code as earlier)
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
mutate (
lpsn_parent = case_when (
rank == " phylum" ~ lpsn [match ( kingdom , fullname ) ] ,
# in class, take parent from phylum if available, otherwise kingdom
rank == " class" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " class" ~ lpsn [match ( kingdom , fullname ) ] ,
# in order, take parent from class if available, otherwise phylum, otherwise kingdom
rank == " order" & class != " " ~ lpsn [match ( class , fullname ) ] ,
rank == " order" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " order" ~ lpsn [match ( kingdom , fullname ) ] ,
# family
rank == " family" & order != " " ~ lpsn [match ( order , fullname ) ] ,
rank == " family" & class != " " ~ lpsn [match ( class , fullname ) ] ,
rank == " family" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " family" ~ lpsn [match ( kingdom , fullname ) ] ,
# genus
rank == " genus" & family != " " ~ lpsn [match ( family , fullname ) ] ,
rank == " genus" & order != " " ~ lpsn [match ( order , fullname ) ] ,
rank == " genus" & class != " " ~ lpsn [match ( class , fullname ) ] ,
rank == " genus" & phylum != " " ~ lpsn [match ( phylum , fullname ) ] ,
rank == " genus" ~ lpsn [match ( kingdom , fullname ) ] ,
# species, always has a genus
rank == " species" ~ lpsn [match ( genus , fullname ) ] ,
2024-07-17 14:29:55 +02:00
# subspecies, always has a genus + species
rank == " subspecies" ~ lpsn [match ( paste ( genus , species ) , fullname ) ] ,
2024-07-16 14:51:57 +02:00
TRUE ~ NA_character_ ) ,
mycobank_parent = case_when (
rank == " phylum" ~ mycobank [match ( kingdom , fullname ) ] ,
# class
rank == " class" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " class" ~ mycobank [match ( kingdom , fullname ) ] ,
# order
rank == " order" & class != " " ~ mycobank [match ( class , fullname ) ] ,
rank == " order" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " order" ~ mycobank [match ( kingdom , fullname ) ] ,
# family
rank == " family" & order != " " ~ mycobank [match ( order , fullname ) ] ,
rank == " family" & class != " " ~ mycobank [match ( class , fullname ) ] ,
rank == " family" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " family" ~ mycobank [match ( kingdom , fullname ) ] ,
# genus
rank == " genus" & family != " " ~ mycobank [match ( family , fullname ) ] ,
rank == " genus" & order != " " ~ mycobank [match ( order , fullname ) ] ,
rank == " genus" & class != " " ~ mycobank [match ( class , fullname ) ] ,
rank == " genus" & phylum != " " ~ mycobank [match ( phylum , fullname ) ] ,
rank == " genus" ~ mycobank [match ( kingdom , fullname ) ] ,
# species
rank == " species" ~ mycobank [match ( genus , fullname ) ] ,
2024-07-17 14:29:55 +02:00
# subspecies
rank == " subspecies" ~ mycobank [match ( paste ( genus , species ) , fullname ) ] ,
2024-07-16 14:51:57 +02:00
TRUE ~ NA_character_ ) ,
gbif_parent = case_when (
rank == " phylum" ~ gbif [match ( kingdom , fullname ) ] ,
# class
rank == " class" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " class" ~ gbif [match ( kingdom , fullname ) ] ,
# order
rank == " order" & class != " " ~ gbif [match ( class , fullname ) ] ,
rank == " order" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " order" ~ gbif [match ( kingdom , fullname ) ] ,
# family
rank == " family" & order != " " ~ gbif [match ( order , fullname ) ] ,
rank == " family" & class != " " ~ gbif [match ( class , fullname ) ] ,
rank == " family" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " family" ~ gbif [match ( kingdom , fullname ) ] ,
# genus
rank == " genus" & family != " " ~ gbif [match ( family , fullname ) ] ,
rank == " genus" & order != " " ~ gbif [match ( order , fullname ) ] ,
rank == " genus" & class != " " ~ gbif [match ( class , fullname ) ] ,
rank == " genus" & phylum != " " ~ gbif [match ( phylum , fullname ) ] ,
rank == " genus" ~ gbif [match ( kingdom , fullname ) ] ,
# species
rank == " species" ~ gbif [match ( genus , fullname ) ] ,
2024-07-17 14:29:55 +02:00
# subspecies
rank == " subspecies" ~ gbif [match ( paste ( genus , species ) , fullname ) ] ,
2024-07-16 14:51:57 +02:00
TRUE ~ NA_character_ ) )
2024-02-24 15:16:52 +01:00
# check again
2023-01-23 15:01:21 +01:00
taxonomy %>%
2024-07-16 14:51:57 +02:00
filter ( ( ! gbif_parent %in% gbif ) | ( ! lpsn_parent %in% lpsn ) | ( ! mycobank_parent %in% mycobank ) ) %>%
count ( source = case_when ( ! gbif_parent %in% gbif ~ " GBIF" ,
! lpsn_parent %in% lpsn ~ " LPSN" ,
! mycobank_parent %in% mycobank ~ " MycoBank" ,
TRUE ~ " ?" ) ,
2024-02-24 15:16:52 +01:00
rank )
2022-12-16 16:10:43 +01:00
2024-09-29 22:17:56 +02:00
# Save intermediate results (3) -------------------------------------------------------------------
2022-12-16 16:10:43 +01:00
saveRDS ( taxonomy , " data-raw/taxonomy3.rds" )
2024-09-29 22:17:56 +02:00
# taxonomy <- readRDS("data-raw/taxonomy3.rds")
2022-10-29 14:15:23 +02:00
2019-10-30 23:02:50 +01:00
2024-09-29 22:17:56 +02:00
# Finalised taxonomy (without additional data) ----------------------------------------------------
2024-02-24 15:16:52 +01:00
2023-01-23 15:01:21 +01:00
message (
2023-02-10 16:18:00 +01:00
" \nCongratulations! The new taxonomic table will contain " , format ( nrow ( taxonomy ) , big.mark = " " ) , " rows.\n" ,
" This was " , format ( nrow ( AMR :: microorganisms ) , big.mark = " " ) , " rows.\n"
2023-01-23 15:01:21 +01:00
)
2022-12-12 00:14:56 +01:00
# these are the new ones:
2023-01-23 15:01:21 +01:00
taxonomy %>%
2024-07-16 14:51:57 +02:00
# filter(!paste(kingdom, fullname) %in% paste(AMR::microorganisms$kingdom, AMR::microorganisms$fullname)) %>%
filter ( ! fullname %in% AMR :: microorganisms $ fullname ) %>%
2023-01-23 15:01:21 +01:00
View ( )
2024-09-29 22:17:56 +02:00
# these are to be removed:
2023-01-23 15:01:21 +01:00
AMR :: microorganisms %>%
filter ( ! fullname %in% taxonomy $ fullname ) %>%
View ( )
2020-07-08 14:48:06 +02:00
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Add SNOMED CT -----------------------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
# we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS)
# as a source, which copies directly from the latest US SNOMED CT version
# - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009
2022-10-29 14:15:23 +02:00
# - check that current online version is higher than TAXONOMY_VERSION$SNOMED
2022-10-05 09:12:22 +02:00
# - if so, click on 'Download Value Set', choose 'TXT'
snomed <- vroom ( " data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt" , skip = 3 ) %>%
select ( 1 : 2 ) %>%
setNames ( c ( " snomed" , " mo" ) ) %>%
mutate ( snomed = as.character ( snomed ) )
# try to get name of MO
snomed <- snomed %>%
mutate ( mo = gsub ( " ss. " , " " , mo , fixed = TRUE ) ) %>%
mutate ( fullname = case_when (
mo %like_case% " [A-Z][a-z]+ [a-z]+ [a-z]{4,} " ~ gsub ( " (^|.*)([A-Z][a-z]+ [a-z]+ [a-z]{4,}) .*" , " \\2" , mo ) ,
mo %like_case% " [A-Z][a-z]+ [a-z]{4,} " ~ gsub ( " (^|.*)([A-Z][a-z]+ [a-z]{4,}) .*" , " \\2" , mo ) ,
mo %like_case% " [A-Z][a-z]+" ~ gsub ( " (^|.*)([A-Z][a-z]+) .*" , " \\2" , mo ) ,
TRUE ~ NA_character_
2024-07-16 14:51:57 +02:00
) )
snomed <- snomed %>%
2022-10-05 09:12:22 +02:00
filter ( fullname %in% taxonomy $ fullname )
message ( nrow ( snomed ) , " SNOMED codes will be added to " , n_distinct ( snomed $ fullname ) , " microorganisms" )
snomed <- snomed %>%
group_by ( fullname ) %>%
summarise ( snomed = list ( snomed ) )
taxonomy <- taxonomy %>%
left_join ( snomed , by = " fullname" )
2024-09-29 22:17:56 +02:00
# Add oxygen tolerance (aerobe/anaerobe) ----------------------------------------------------------
2023-05-08 13:04:18 +02:00
# We will use the BacDive data base for this:
2024-07-16 14:51:57 +02:00
# - go to https://bacdive.dsmz.de/advsearch a
# - filter 'Oxygen tolerance' on "*" and click Submit
2023-05-08 13:04:18 +02:00
# - click on the 'Download tabel as CSV' button
bacdive <- vroom :: vroom ( " data-raw/bacdive.csv" , skip = 2 ) %>%
select ( species , oxygen = `Oxygen tolerance` )
bacdive <- bacdive %>%
# fill in missing species from previous rows
2024-07-16 14:51:57 +02:00
mutate ( fullname = if_else ( is.na ( species ) , lag ( species ) , species ) ) %>%
2023-05-11 21:56:27 +02:00
filter ( ! is.na ( species ) , ! is.na ( oxygen ) , oxygen %unlike% " tolerant" , species %unlike% " unclassified" ) %>%
2024-07-16 14:51:57 +02:00
select ( - species )
2023-05-08 13:04:18 +02:00
bacdive <- bacdive %>%
# now determine type per species
2024-07-16 14:51:57 +02:00
group_by ( fullname ) %>%
summarise ( fullname = first ( fullname ) ,
2023-05-11 21:56:27 +02:00
oxygen_tolerance = case_when ( any ( oxygen %like% " facultative" ) ~ " facultative anaerobe" ,
2023-05-08 13:04:18 +02:00
all ( oxygen == " microaerophile" ) ~ " microaerophile" ,
all ( oxygen %in% c ( " anaerobe" , " obligate anaerobe" ) ) ~ " anaerobe" ,
all ( oxygen %in% c ( " anaerobe" , " obligate anaerobe" , " microaerophile" ) ) ~ " anaerobe/microaerophile" ,
all ( oxygen %in% c ( " aerobe" , " obligate aerobe" ) ) ~ " aerobe" ,
all ( ! oxygen %in% c ( " anaerobe" , " obligate anaerobe" ) ) ~ " aerobe" ,
all ( c ( " aerobe" , " anaerobe" ) %in% oxygen ) ~ " facultative anaerobe" ,
TRUE ~ NA_character_ ) )
2023-05-11 21:56:27 +02:00
# now find all synonyms and copy them from their current taxonomic names
2024-07-16 14:51:57 +02:00
synonyms <- taxonomy %>%
filter ( status == " synonym" ) %>%
transmute ( mo ,
fullname_old = fullname ,
current = synonym_mo_to_accepted_mo ( mo , fill_in_accepted = FALSE , dataset = taxonomy ) ) %>%
filter ( ! is.na ( current ) ) %>%
mutate ( fullname = taxonomy $ fullname [match ( current , taxonomy $ mo ) ] ) %>%
left_join ( bacdive , by = " fullname" ) %>%
filter ( ! is.na ( oxygen_tolerance ) ) %>%
select ( fullname , oxygen_tolerance )
2023-05-11 21:56:27 +02:00
bacdive <- bacdive %>%
2024-07-16 14:51:57 +02:00
bind_rows ( synonyms ) %>%
2023-05-11 21:56:27 +02:00
distinct ( )
2023-05-08 13:04:18 +02:00
bacdive_genus <- bacdive %>%
2024-07-16 14:51:57 +02:00
mutate ( oxygen = oxygen_tolerance ,
genus = taxonomy $ genus [match ( fullname , taxonomy $ fullname ) ] ) %>%
group_by ( fullname = genus ) %>%
2023-05-08 13:04:18 +02:00
summarise ( oxygen_tolerance = case_when ( any ( oxygen == " facultative anaerobe" ) ~ " facultative anaerobe" ,
any ( oxygen == " anaerobe/microaerophile" ) ~ " anaerobe/microaerophile" ,
all ( oxygen == " microaerophile" ) ~ " microaerophile" ,
all ( oxygen == " anaerobe" ) ~ " anaerobe" ,
all ( oxygen == " aerobe" ) ~ " aerobe" ,
TRUE ~ " facultative anaerobe" ) )
bacdive <- bacdive %>%
bind_rows ( bacdive_genus ) %>%
2024-07-16 14:51:57 +02:00
arrange ( fullname )
2023-05-08 13:04:18 +02:00
2024-07-16 14:51:57 +02:00
bacdive_other <- taxonomy %>%
filter ( kingdom == " Bacteria" , rank == " species" , ! fullname %in% bacdive $ fullname , genus %in% bacdive $ fullname ) %>%
select ( fullname , genus ) %>%
left_join ( bacdive , by = c ( " genus" = " fullname" ) ) %>%
2024-09-29 22:17:56 +02:00
mutate ( oxygen_tolerance = if_else ( oxygen_tolerance %in% c ( " aerobe" , " anaerobe" , " microaerophile" , " anaerobe/microaerophile" ) ,
oxygen_tolerance ,
paste ( " likely" , oxygen_tolerance ) ) ) %>%
2024-07-16 14:51:57 +02:00
select ( fullname , oxygen_tolerance ) %>%
distinct ( fullname , .keep_all = TRUE )
2023-05-08 13:04:18 +02:00
bacdive <- bacdive %>%
2024-07-16 14:51:57 +02:00
bind_rows ( bacdive_other ) %>%
arrange ( fullname ) %>%
distinct ( fullname , .keep_all = TRUE )
2023-05-08 13:04:18 +02:00
taxonomy <- taxonomy %>%
2024-07-16 14:51:57 +02:00
left_join ( bacdive , by = " fullname" ) %>%
2023-05-11 21:56:27 +02:00
relocate ( oxygen_tolerance , .after = ref )
2023-05-08 13:04:18 +02:00
2024-09-29 22:17:56 +02:00
# Restore 'synonym' microorganisms to 'accepted' --------------------------------------------------
2022-10-05 09:12:22 +02:00
2024-07-16 14:51:57 +02:00
# If there are some synonyms that need to be corrected to 'accepted', you can do that here.
2024-09-29 22:17:56 +02:00
# Before 2024, we encountered this (but currently, this is all good, no action needed):
2024-07-16 14:51:57 +02:00
2023-02-26 21:26:58 +01:00
# according to LPSN: Stenotrophomonas maltophilia is the correct name if this species is regarded as a separate species (i.e., if its nomenclatural type is not assigned to another species whose name is validly published, legitimate and not rejected and has priority) within a separate genus Stenotrophomonas.
# https://lpsn.dsmz.de/species/stenotrophomonas-maltophilia
2024-07-16 14:51:57 +02:00
# # all MO's to keep as 'accepted', not as 'synonym':
# to_restore <- c(
# "Stenotrophomonas maltophilia",
# "Moraxella catarrhalis"
# )
# taxonomy %>% filter(fullname %in% to_restore) %>% View()
# all(to_restore %in% taxonomy$fullname)
# for (nm in to_restore) {
# taxonomy$lpsn_renamed_to[which(taxonomy$fullname == nm)] <- NA
# taxonomy$gbif_renamed_to[which(taxonomy$fullname == nm)] <- NA
# taxonomy$status[which(taxonomy$fullname == nm)] <- "accepted"
# }
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Add species groups ------------------------------------------------------------------------------
2024-07-16 14:51:57 +02:00
# just before the end, make sure we get the species group from the previous data set
2024-09-29 22:17:56 +02:00
old_groups <- microorganisms %>%
filter ( rank == " species group" ) %>%
mutate ( mo = as.character ( mo ) )
# fix, remove later
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 1 ) %in% c ( " B" , " U" ) & old_groups $ kingdom == " Bacteria" & ! is.na ( old_groups $ kingdom ) ] , 1 , 1 ) <- " B"
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 1 ) %in% c ( " F" , " U" ) & old_groups $ kingdom == " Fungi" & ! is.na ( old_groups $ kingdom ) ] , 1 , 1 ) <- " F"
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 1 ) %in% c ( " C" , " U" ) & old_groups $ kingdom == " Chromista" & ! is.na ( old_groups $ kingdom ) ] , 1 , 1 ) <- " C"
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 1 ) %in% c ( " A" , " U" ) & old_groups $ kingdom == " Archaea" & ! is.na ( old_groups $ kingdom ) ] , 1 , 1 ) <- " A"
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 1 ) %in% c ( " P" , " U" ) & old_groups $ kingdom == " Protozoa" & ! is.na ( old_groups $ kingdom ) ] , 1 , 1 ) <- " P"
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 2 ) %in% c ( " AN" , " UN" ) & old_groups $ kingdom == " Animalia" & ! is.na ( old_groups $ kingdom ) ] , 1 , 2 ) <- " AN"
substr ( old_groups $ mo [ ! substr ( old_groups $ mo , 1 , 2 ) %in% c ( " PL" , " UN" ) & old_groups $ kingdom == " Plantae" & ! is.na ( old_groups $ kingdom ) ] , 1 , 2 ) <- " PL"
2024-07-16 14:51:57 +02:00
# use current taxonomy
groups <- taxonomy [match ( old_groups $ genus , taxonomy $ genus ) , ]
groups <- groups %>%
mutate ( mo = as.character ( old_groups $ mo ) ,
fullname = old_groups $ fullname ,
species = old_groups $ species ,
subspecies = old_groups $ subspecies ,
rank = old_groups $ rank ,
ref = old_groups $ ref ,
status = " accepted" ,
source = " manually added" ,
lpsn = NA_character_ ,
lpsn_renamed_to = NA_character_ ,
mycobank = NA_character_ ,
mycobank_renamed_to = NA_character_ ,
gbif = NA_character_ ,
gbif_renamed_to = NA_character_ ) %>%
select ( - c ( lpsn , lpsn_renamed_to , mycobank , mycobank_renamed_to , gbif , gbif_renamed_to , snomed ) )
2022-10-05 09:12:22 +02:00
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
filter ( ! mo %in% groups $ mo ) %>%
bind_rows ( groups )
2022-10-05 09:12:22 +02:00
2024-09-19 11:44:56 +02:00
# we added an MO code, so make sure everything is still unique
2024-07-16 14:51:57 +02:00
any ( duplicated ( taxonomy $ mo ) )
2024-09-29 22:17:56 +02:00
taxonomy $ mo [duplicated ( taxonomy $ mo ) ]
2024-07-16 14:51:57 +02:00
any ( duplicated ( taxonomy $ fullname ) )
2024-09-29 22:17:56 +02:00
taxonomy $ fullname [duplicated ( taxonomy $ fullname ) ]
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Some final checks -------------------------------------------------------------------------------
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
fix_old_mos <- function ( dataset ) {
df <- dataset %>% mutate ( mo = as.character ( mo ) )
df $ mo [which ( ! df $ mo %in% taxonomy $ mo ) ] <- df %>% filter ( ! mo %in% taxonomy $ mo ) %>% mutate ( name = mo_name ( mo , keep_synonyms = TRUE ) , new_mo = taxonomy $ mo [match ( name , taxonomy $ fullname ) ] ) %>% pull ( new_mo )
class ( df $ mo ) <- c ( " mo" , " character" )
df
}
2022-10-05 09:12:22 +02:00
# and check: these codes should not be missing (will otherwise throw a unit test error):
AMR :: microorganisms.codes %>% filter ( ! mo %in% taxonomy $ mo )
2024-09-29 22:17:56 +02:00
# fix:
microorganisms.codes <- fix_old_mos ( AMR :: microorganisms.codes )
usethis :: use_data ( microorganisms.codes , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( microorganisms.codes )
2023-01-21 23:47:20 +01:00
AMR :: clinical_breakpoints %>% filter ( ! mo %in% taxonomy $ mo )
2024-09-29 22:17:56 +02:00
2022-10-05 09:12:22 +02:00
AMR :: example_isolates %>% filter ( ! mo %in% taxonomy $ mo )
2024-09-29 22:17:56 +02:00
2022-10-05 09:12:22 +02:00
AMR :: intrinsic_resistant %>% filter ( ! mo %in% taxonomy $ mo )
2024-09-29 22:17:56 +02:00
# fix:
intrinsic_resistant <- fix_old_mos ( AMR :: intrinsic_resistant )
usethis :: use_data ( intrinsic_resistant , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( intrinsic_resistant )
2020-05-27 16:37:49 +02:00
2024-09-19 11:44:56 +02:00
# all our previously manually added names should be in it
all ( microorganisms $ fullname [microorganisms $ source == " manually added" ] %in% taxonomy $ fullname )
microorganisms $ fullname [ ! microorganisms $ fullname [microorganisms $ source == " manually added" ] %in% taxonomy $ fullname ]
# we added an MO code, so make sure everything is still unique
2024-07-16 14:51:57 +02:00
any ( duplicated ( taxonomy $ mo ) )
any ( duplicated ( taxonomy $ fullname ) )
2024-09-29 22:17:56 +02:00
# Save to package ---------------------------------------------------------------------------------
2024-07-16 14:51:57 +02:00
# format to tibble and remove non-ASCII characters
2024-09-29 22:17:56 +02:00
saveRDS ( taxonomy , " data-raw/taxonomy3b.rds" )
# taxonomy <- readRDS("data-raw/taxonomy3b.rds")
2024-07-16 14:51:57 +02:00
taxonomy <- taxonomy %>%
arrange ( fullname ) %>%
select ( mo , fullname , status , kingdom : subspecies , rank , ref , oxygen_tolerance , source , starts_with ( " lpsn" ) , starts_with ( " mycobank" ) , starts_with ( " gbif" ) , prevalence , snomed ) %>%
df_remove_nonASCII ( )
microorganisms <- taxonomy
# set class <mo>
class ( microorganisms $ mo ) <- c ( " mo" , " character" )
2024-09-30 18:46:55 +02:00
microorganisms <- microorganisms %>% arrange ( fullname ) %>% df_remove_nonASCII ( )
2024-07-16 14:51:57 +02:00
usethis :: use_data ( microorganisms , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( microorganisms )
# DON'T FORGET TO UPDATE R/_globals.R!
# load new data set
2020-05-27 16:37:49 +02:00
devtools :: load_all ( " ." )
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
# Reset previously changed mo codes ---------------------------------------------------------------
2024-07-16 14:51:57 +02:00
2024-09-29 22:17:56 +02:00
if ( ! identical ( clinical_breakpoints $ mo , as.mo ( clinical_breakpoints $ mo , language = NULL , keep_synonyms = FALSE ) ) ) {
clinical_breakpoints $ mo <- as.mo ( clinical_breakpoints $ mo , language = NULL , keep_synonyms = FALSE )
2023-01-21 23:47:20 +01:00
usethis :: use_data ( clinical_breakpoints , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( clinical_breakpoints )
2022-12-12 00:14:56 +01:00
}
2019-09-20 12:33:05 +02:00
2024-09-29 22:17:56 +02:00
if ( ! identical ( microorganisms.codes $ mo , as.mo ( microorganisms.codes $ mo , language = NULL , keep_synonyms = FALSE ) ) ) {
2022-12-12 00:14:56 +01:00
microorganisms.codes <- microorganisms.codes %>% filter ( mo %in% microorganisms $ mo )
2024-09-29 22:17:56 +02:00
microorganisms.codes $ mo <- as.mo ( microorganisms.codes $ mo , language = NULL , keep_synonyms = FALSE )
2022-12-12 00:14:56 +01:00
usethis :: use_data ( microorganisms.codes , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( microorganisms.codes )
}
2020-05-27 16:37:49 +02:00
2024-09-29 22:17:56 +02:00
if ( ! identical ( microorganisms.groups $ mo_group , as.mo ( microorganisms.groups $ mo_group , language = NULL , keep_synonyms = FALSE ) ) ||
! identical ( microorganisms.groups $ mo , as.mo ( microorganisms.groups $ mo , language = NULL , keep_synonyms = FALSE ) ) ) {
microorganisms.groups $ mo_group <- as.mo ( microorganisms.groups $ mo_group_name , language = NULL , keep_synonyms = FALSE )
microorganisms.groups $ mo <- as.mo ( microorganisms.groups $ mo_name , language = NULL , keep_synonyms = FALSE )
usethis :: use_data ( microorganisms.groups , overwrite = TRUE , version = 2 )
rm ( microorganisms.groups )
}
if ( ! identical ( example_isolates $ mo , as.mo ( example_isolates $ mo , language = NULL , keep_synonyms = FALSE ) ) ) {
example_isolates $ mo <- as.mo ( example_isolates $ mo , language = NULL , keep_synonyms = FALSE )
2022-12-12 00:14:56 +01:00
usethis :: use_data ( example_isolates , overwrite = TRUE , version = 2 )
rm ( example_isolates )
}
2022-10-05 09:12:22 +02:00
2024-09-29 22:17:56 +02:00
if ( ! identical ( intrinsic_resistant $ mo , as.mo ( intrinsic_resistant $ mo , language = NULL ) ) ) {
intrinsic_resistant $ mo <- as.mo ( intrinsic_resistant $ mo , language = NULL , keep_synonyms = FALSE )
intrinsic_resistant <- distinct ( intrinsic_resistant )
usethis :: use_data ( intrinsic_resistant , overwrite = TRUE , version = 2 )
rm ( intrinsic_resistant )
}
2024-07-16 14:51:57 +02:00
anyNA ( microorganisms $ mo )
anyNA ( microorganisms.codes $ mo )
anyNA ( intrinsic_resistant $ mo )
anyNA ( clinical_breakpoints $ mo )
2022-10-05 09:12:22 +02:00
# load new data sets again
devtools :: load_all ( " ." )
2024-04-23 10:55:48 +02:00
source ( " data-raw/_pre_commit_checks.R" )
2022-10-05 09:12:22 +02:00
devtools :: load_all ( " ." )
2020-05-27 16:37:49 +02:00
# run the unit tests
2022-10-05 09:12:22 +02:00
Sys.setenv ( NOT_CRAN = " true" )
2024-07-16 14:51:57 +02:00
testthat :: test_file ( " inst/tinytest/test-data.R" )
testthat :: test_file ( " inst/tinytest/test-mo.R" )
testthat :: test_file ( " inst/tinytest/test-mo_property.R" )