2020-01-05 17:22:09 +01:00
# ==================================================================== #
2023-06-26 13:52:02 +02:00
# TITLE: #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2020-01-05 17:22:09 +01:00
# #
2023-06-26 13:52:02 +02:00
# SOURCE CODE: #
2020-07-09 20:07:39 +02:00
# https://github.com/msberends/AMR #
2020-01-05 17:22:09 +01:00
# #
2023-06-26 13:52:02 +02:00
# PLEASE CITE THIS SOFTWARE AS: #
2022-10-05 09:12:22 +02:00
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
2023-05-27 10:39:22 +02:00
# https://doi.org/10.18637/jss.v104.i03 #
2022-10-05 09:12:22 +02:00
# #
2022-12-27 15:16:15 +01:00
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
2020-01-05 17:22:09 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2020-01-05 17:22:09 +01:00
# ==================================================================== #
2022-10-05 09:12:22 +02:00
# THIS SCRIPT REQUIRES AT LEAST 16 GB RAM
# (at least 10 GB will be used by the R session for the size of the files)
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
# 1. Go to https://doi.org/10.15468/39omei and find the download link for the
2022-12-12 00:14:56 +01:00
# latest GBIF backbone taxonony under "Endpoints" and unpack Taxon.tsv from it (~2.2 GB)
2022-10-05 09:12:22 +02:00
# ALSO BE SURE to get the date of release and update R/aa_globals.R later!
# 2. Go to https://lpsn.dsmz.de/downloads (register first) and download the latest
2022-12-12 00:14:56 +01:00
# CSV file (~12,5 MB) as "taxonomy.csv". Their API unfortunately does
2022-10-05 09:12:22 +02:00
# not include the full taxonomy and is currently (2022) pretty worthless.
2022-12-19 15:32:41 +01:00
# 3. For data about human pathogens, we use Bartlett et al. (2022),
2023-01-23 15:01:21 +01:00
# https://doi.org/10.1099/mic.0.001269. Their latest supplementary material
2022-12-19 15:32:41 +01:00
# can be found here: https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens.
2023-01-23 15:01:21 +01:00
# . Download their latest xlsx file in the `data` folder and save it to our
# . `data-raw` folder.
2022-12-19 15:32:41 +01:00
# 4. Set this folder_location to the path where these two files are:
2022-10-05 09:12:22 +02:00
folder_location <- " ~/Downloads/backbone/"
file_gbif <- paste0 ( folder_location , " Taxon.tsv" )
file_lpsn <- paste0 ( folder_location , " taxonomy.csv" )
2019-02-20 00:04:48 +01:00
2022-12-19 15:32:41 +01:00
file_bartlett <- " data-raw/bartlett_et_al_2022_human_pathogens.xlsx"
2022-10-05 09:12:22 +02:00
# 4. Run the rest of this script line by line and check everything :)
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
if ( ! file.exists ( file_gbif ) ) stop ( " GBIF file not found" )
if ( ! file.exists ( file_lpsn ) ) stop ( " LPSN file not found" )
2022-12-19 15:32:41 +01:00
if ( ! file.exists ( file_bartlett ) ) stop ( " Bartlett et al. Excel file not found" )
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
library ( dplyr )
2022-12-12 00:14:56 +01:00
library ( vroom ) # to import files
library ( rvest ) # to scape LPSN website
library ( progress ) # to show progress bars
2022-12-19 15:32:41 +01:00
library ( readxl ) # for reading the Bartlett Excel file
2022-12-16 16:10:43 +01:00
devtools :: load_all ( " ." ) # load AMR package
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
# Helper functions --------------------------------------------------------
2019-04-05 18:47:39 +02:00
2020-05-27 16:37:49 +02:00
get_author_year <- function ( ref ) {
# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
2023-01-23 15:01:21 +01:00
2020-05-27 16:37:49 +02:00
authors2 <- iconv ( ref , from = " UTF-8" , to = " ASCII//TRANSLIT" )
2022-10-05 09:12:22 +02:00
authors2 <- gsub ( " ?\\(Approved Lists [0-9]+\\) ?" , " () " , authors2 )
authors2 <- gsub ( " [)(]+ $" , " " , authors2 )
2020-05-27 16:37:49 +02:00
# remove leading and trailing brackets
2022-10-05 09:12:22 +02:00
authors2 <- trimws ( gsub ( " ^[(](.*)[)]$" , " \\1" , authors2 ) )
2020-05-27 16:37:49 +02:00
# only take part after brackets if there's a name
authors2 <- ifelse ( grepl ( " .*[)] [a-zA-Z]+.*" , authors2 ) ,
2023-01-23 15:01:21 +01:00
gsub ( " .*[)] (.*)" , " \\1" , authors2 ) ,
authors2
2022-08-28 10:31:50 +02:00
)
2022-10-05 09:12:22 +02:00
# replace parentheses with emend. to get the latest authors
authors2 <- gsub ( " (" , " emend. " , authors2 , fixed = TRUE )
authors2 <- gsub ( " )" , " " , authors2 , fixed = TRUE )
authors2 <- gsub ( " +" , " " , authors2 )
authors2 <- trimws ( authors2 )
2023-01-23 15:01:21 +01:00
2020-05-27 16:37:49 +02:00
# get year from last 4 digits
2022-08-28 10:31:50 +02:00
lastyear <- as.integer ( gsub ( " .*([0-9]{4})$" , " \\1" , authors2 ) )
2020-05-27 16:37:49 +02:00
# can never be later than now
2022-08-28 10:31:50 +02:00
lastyear <- ifelse ( lastyear > as.integer ( format ( Sys.Date ( ) , " %Y" ) ) ,
2023-01-23 15:01:21 +01:00
NA ,
lastyear
2022-08-28 10:31:50 +02:00
)
2020-05-27 16:37:49 +02:00
# get authors without last year
authors <- gsub ( " (.*)[0-9]{4}$" , " \\1" , authors2 )
2022-10-05 09:12:22 +02:00
# not sure what this is
authors <- gsub ( " (Saito)" , " " , authors , fixed = TRUE )
authors <- gsub ( " (Oudem.)" , " " , authors , fixed = TRUE )
2020-05-27 16:37:49 +02:00
# remove nonsense characters from names
2022-10-05 09:12:22 +02:00
authors <- gsub ( " [^a-zA-Z,'&. -]" , " " , authors )
# no initials, only surname
authors <- gsub ( " [A-Z][.]" , " " , authors , ignore.case = FALSE )
2020-05-27 16:37:49 +02:00
# remove trailing and leading spaces
authors <- trimws ( authors )
2022-10-05 09:12:22 +02:00
# keep only the part after last 'emend.' to get the latest authors
authors <- gsub ( " .*emend[.] ?" , " " , authors )
2020-05-27 16:37:49 +02:00
# only keep first author and replace all others by 'et al'
authors <- gsub ( " (,| and| et| &| ex| emend\\.?) .*" , " et al." , authors )
# et al. always with ending dot
authors <- gsub ( " et al\\.?" , " et al." , authors )
authors <- gsub ( " ?,$" , " " , authors )
# don't start with 'sensu' or 'ehrenb'
2022-10-05 09:12:22 +02:00
authors <- gsub ( " ^(sensu|Ehrenb.?|corrig.?) " , " " , authors , ignore.case = TRUE )
2020-05-27 16:37:49 +02:00
# no initials, only surname
2022-10-05 09:12:22 +02:00
authors <- trimws ( authors )
authors <- gsub ( " ^([A-Z][.])+( & ?)?" , " " , authors , ignore.case = FALSE )
2020-05-27 16:37:49 +02:00
authors <- gsub ( " ^([A-Z]+ )+" , " " , authors , ignore.case = FALSE )
2022-10-05 09:12:22 +02:00
# remove dots
authors <- gsub ( " ." , " " , authors , fixed = TRUE )
authors <- gsub ( " et al" , " et al." , authors , fixed = TRUE )
authors [nchar ( authors ) <= 3 ] <- " "
2020-05-27 16:37:49 +02:00
# combine author and year if year is available
ref <- ifelse ( ! is.na ( lastyear ) ,
2023-01-23 15:01:21 +01:00
paste0 ( authors , " , " , lastyear ) ,
authors
2022-08-28 10:31:50 +02:00
)
2020-05-27 16:37:49 +02:00
# fix beginning and ending
ref <- gsub ( " , $" , " " , ref )
ref <- gsub ( " ^, " , " " , ref )
ref <- gsub ( " ^(emend|et al.,?)" , " " , ref )
ref <- trimws ( ref )
2022-10-05 09:12:22 +02:00
ref <- gsub ( " '" , " " , ref )
2023-01-23 15:01:21 +01:00
2020-05-27 16:37:49 +02:00
# a lot start with a lowercase character - fix that
ref [ ! grepl ( " ^d[A-Z]" , ref ) ] <- gsub ( " ^([a-z])" , " \\U\\1" , ref [ ! grepl ( " ^d[A-Z]" , ref ) ] , perl = TRUE )
2022-08-28 10:31:50 +02:00
# specific one for the French that are named dOrbigny
2020-05-27 16:37:49 +02:00
ref [grepl ( " ^d[A-Z]" , ref ) ] <- gsub ( " ^d" , " d'" , ref [grepl ( " ^d[A-Z]" , ref ) ] )
ref <- gsub ( " +" , " " , ref )
2022-10-05 09:12:22 +02:00
ref [ref == " " ] <- NA_character_
2020-05-27 16:37:49 +02:00
ref
}
2022-10-05 09:12:22 +02:00
df_remove_nonASCII <- function ( df ) {
# Remove non-ASCII characters (these are not allowed by CRAN)
df %>%
mutate_if ( is.character , iconv , from = " UTF-8" , to = " ASCII//TRANSLIT" ) %>%
# also remove invalid characters
mutate_if ( is.character , ~ gsub ( " [\"'`]+" , " " , .) ) %>%
AMR ::: dataset_UTF8_to_ASCII ( )
}
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# to retrieve LPSN and authors from LPSN website
get_lpsn_and_author <- function ( rank , name ) {
2024-02-24 15:16:52 +01:00
name <- gsub ( " ^Candidatus " , " " , name )
2022-10-05 09:12:22 +02:00
url <- paste0 ( " https://lpsn.dsmz.de/" , tolower ( rank ) , " /" , tolower ( name ) )
2022-12-12 00:14:56 +01:00
page_txt <- tryCatch ( read_html ( url ) , error = function ( e ) NULL )
2022-10-05 09:12:22 +02:00
if ( is.null ( page_txt ) ) {
warning ( " No LPSN found for " , tolower ( rank ) , " '" , name , " '" )
lpsn <- NA_character_
ref <- NA_character_
2024-02-24 15:16:52 +01:00
status <- " unknown"
2022-10-05 09:12:22 +02:00
} else {
page_txt <- page_txt %>%
2022-12-12 00:14:56 +01:00
html_element ( " #detail-page" ) %>%
html_text ( )
2022-10-05 09:12:22 +02:00
lpsn <- gsub ( " .*Record number:[\r\n\t ]*([0-9]+).*" , " \\1" , page_txt , perl = FALSE )
ref <- page_txt %>%
gsub ( " .*?Name: (.*[0-9]{4}?).*" , " \\1" , ., perl = FALSE ) %>%
gsub ( name , " " , ., fixed = TRUE ) %>%
2024-02-24 15:16:52 +01:00
gsub ( " ^\"?Candidatus ?\"?" , " " , .) %>%
2022-10-05 09:12:22 +02:00
trimws ( )
2024-02-24 15:16:52 +01:00
status <- trimws ( gsub ( " .*Nomenclatural status:[\r\n\t ]*([a-zA-Z, ]+)[\r\n\t].*" , " \\1" , page_txt , perl = FALSE ) )
if ( ( status %like% " validly published" & status %unlike% " not valid" ) | status %like% " [\r\n\t]" ) {
# we used to take "accepted" for every LPSN record, also candidates. Now only for missing values and explicit accepted ones.
status <- " accepted"
} else {
status <- " not validly published"
}
}
c ( " lpsn" = lpsn , " ref" = ref , " status" = status )
}
# this will e.g. take the family from the root genus record, and gives all species that family
get_top_lvl <- function ( current , rank , rank_target ) {
if ( ! rank_target %in% rank ) {
current [1 ]
} else {
out <- current [rank == rank_target ] [1 ]
if ( out %in% c ( " " , NA ) ) {
out <- names ( sort ( table ( current [which ( ! current %in% c ( " " , NA ) ) ] ) , decreasing = TRUE ) [1 ] )
if ( is.null ( out ) ) {
out <- " "
}
}
out
2022-10-05 09:12:22 +02:00
}
}
# MB/ August 2022: useless, does not contain full taxonomy, e.g. LPSN::request(cred, category = "family") is empty.
# get_from_lpsn <- function (user, pw) {
# if (!"LPSN" %in% rownames(utils::installed.packages())) {
# stop("Install the official LPSN package for R using: install.packages('LPSN', repos = 'https://r-forge.r-project.org')")
# }
# cred <- LPSN::open_lpsn(user, pw)
#
# lpsn_genus <- LPSN::request(cred, category = "genus")
# message("Downloading genus data (n = ", lpsn_genus$count, ") from LPSN API...")
# lpsn_genus <- as.data.frame(LPSN::retrieve(cred, category = "genus"))
#
# lpsn_species <- LPSN::request(cred, category = "species")
# message("Downloading species data (n = ", lpsn_species$count, ") from LPSN API...")
# lpsn_species <- as.data.frame(LPSN::retrieve(cred, category = "species"))
#
# lpsn_subspecies <- LPSN::request(cred, category = "subspecies")
# message("Downloading subspecies data (n = ", lpsn_subspecies$count, ") from LPSN API...")
# lpsn_subspecies <- as.data.frame(LPSN::retrieve(cred, category = "subspecies"))
#
# message("Binding rows...")
# lpsn_total <- bind_rows(lpsn_genus, lpsn_species, lpsn_subspecies)
# message("Done.")
# lpsn_total
# }
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# Read GBIF data ----------------------------------------------------------
taxonomy_gbif.bak <- vroom ( file_gbif )
2022-12-12 00:14:56 +01:00
include_fungal_orders <- c (
" Eurotiales" , " Microascales" , " Mucorales" , " Saccharomycetales" ,
" Schizosaccharomycetales" , " Tremellales" , " Onygenales" , " Pneumocystales"
)
# get latest taxonomic names of these fungal orders
include_fungal_orders_ids <- taxonomy_gbif.bak %>%
filter ( order %in% include_fungal_orders )
2023-01-23 15:01:21 +01:00
include_fungal_orders <- taxonomy_gbif.bak %>%
filter ( taxonID %in% c ( include_fungal_orders_ids $ taxonID , include_fungal_orders_ids $ acceptedNameUsageID ) ) %>%
distinct ( order ) %>%
2024-02-24 15:16:52 +01:00
pull ( order ) | >
sort ( )
2022-12-12 00:14:56 +01:00
# check some columns to validate below filters
2024-02-24 15:16:52 +01:00
taxonomy_gbif.bak $ taxonomicStatus | > table ( ) | > sort ( ) | > as.data.frame ( )
taxonomy_gbif.bak $ taxonRank | > table ( ) | > sort ( ) | > as.data.frame ( )
2022-12-12 00:14:56 +01:00
2022-10-05 09:12:22 +02:00
taxonomy_gbif <- taxonomy_gbif.bak %>%
# immediately filter rows we really never want
filter (
# never doubtful status, only accepted and all synonyms, and only ranked items
taxonomicStatus != " doubtful" ,
taxonRank != " unranked" ,
# include these kingdoms (no Chromista)
kingdom %in% c ( " Archaea" , " Bacteria" , " Protozoa" ) |
# include all of these fungal orders
2024-02-24 15:16:52 +01:00
order %in% include_fungal_orders |
2022-10-05 09:12:22 +02:00
# and all of these important genera (see "data-raw/_pre_commit_hook.R")
# (they also contain bacteria and protozoa, but these will get prevalence = 2 later on)
genus %in% AMR ::: MO_PREVALENT_GENERA
) %>%
select (
kingdom ,
phylum ,
class ,
order ,
family ,
genus ,
species = specificEpithet ,
subspecies = infraspecificEpithet ,
rank = taxonRank ,
status = taxonomicStatus ,
ref = scientificNameAuthorship ,
gbif = taxonID ,
gbif_parent = parentNameUsageID ,
gbif_renamed_to = acceptedNameUsageID
) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
# do this mutate after the original selection/filtering, as it decreases computing time tremendously
status = ifelse ( status == " accepted" , " accepted" , " synonym" ) ,
# checked taxonRank - the "form" and "variety" always have a subspecies, so:
rank = ifelse ( rank %in% c ( " form" , " variety" ) , " subspecies" , rank ) ,
source = " GBIF"
) %>%
filter (
# their data is messy - keep only these:
rank == " kingdom" & ! is.na ( kingdom ) |
rank == " phylum" & ! is.na ( phylum ) |
rank == " class" & ! is.na ( class ) |
rank == " order" & ! is.na ( order ) |
rank == " family" & ! is.na ( family ) |
rank == " genus" & ! is.na ( genus ) |
rank == " species" & ! is.na ( species ) |
rank == " subspecies" & ! is.na ( subspecies )
) %>%
# some items end with _A or _B... why??
mutate_all ( ~ gsub ( " _[A-Z]$" , " " , .x , perl = TRUE ) ) %>%
# now we have duplicates, remove these, but prioritise "accepted" status and highest taxon ID
arrange ( status , gbif ) %>%
distinct ( kingdom , phylum , class , order , family , genus , species , subspecies , .keep_all = TRUE ) %>%
filter (
kingdom %unlike% " [0-9]" ,
phylum %unlike% " [0-9]" ,
class %unlike% " [0-9]" ,
order %unlike% " [0-9]" ,
family %unlike% " [0-9]" ,
genus %unlike% " [0-9]"
)
# integrity tests
sort ( table ( taxonomy_gbif $ rank ) )
sort ( table ( taxonomy_gbif $ status ) )
2022-12-12 00:14:56 +01:00
taxonomy_gbif
2022-10-05 09:12:22 +02:00
# Read LPSN data ----------------------------------------------------------
taxonomy_lpsn.bak <- vroom ( file_lpsn )
2022-12-12 00:14:56 +01:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn.bak %>%
transmute (
genus = genus_name ,
species = sp_epithet ,
subspecies = subsp_epithet ,
rank = case_when (
! is.na ( subsp_epithet ) ~ " subspecies" ,
! is.na ( sp_epithet ) ~ " species" ,
TRUE ~ " genus"
) ,
status = ifelse ( is.na ( record_lnk ) , " accepted" , " synonym" ) ,
ref = authors ,
lpsn = as.character ( record_no ) ,
lpsn_parent = NA_character_ ,
lpsn_renamed_to = as.character ( record_lnk )
) %>%
mutate ( source = " LPSN" )
2024-02-24 15:16:52 +01:00
# integrity tests
sort ( table ( taxonomy_lpsn $ rank ) )
sort ( table ( taxonomy_lpsn $ status ) )
2022-10-05 09:12:22 +02:00
taxonomy_lpsn
# download additional taxonomy to the domain/kingdom level (their API is not sufficient...)
taxonomy_lpsn_missing <- tibble (
kingdom = character ( 0 ) ,
phylum = character ( 0 ) ,
class = character ( 0 ) ,
order = character ( 0 ) ,
family = character ( 0 ) ,
genus = character ( 0 )
)
for ( page in LETTERS ) {
2022-12-12 00:14:56 +01:00
# this will not alter `taxonomy_lpsn` yet
2024-02-24 15:16:52 +01:00
message ( " Downloading page " , page , " ..." , appendLF = TRUE )
2022-10-05 09:12:22 +02:00
url <- paste0 ( " https://lpsn.dsmz.de/genus?page=" , page )
2024-02-24 15:16:52 +01:00
x <- tryCatch ( read_html ( url ) ,
error = function ( e ) {
message ( " Waiting 10 seconds because of error: " , e $ message )
Sys.sleep ( 10 )
read_html ( url )
} )
x <- x %>%
2022-10-05 09:12:22 +02:00
# class "main-list" is the main table
2022-12-12 00:14:56 +01:00
html_element ( " .main-list" ) %>%
2022-10-05 09:12:22 +02:00
# get every list element with a set <id> attribute
2022-12-12 00:14:56 +01:00
html_elements ( " li[id]" )
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( x ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( i in seq_len ( length ( x ) ) ) {
2024-02-24 15:16:52 +01:00
pb $ tick ( )
2022-12-12 00:14:56 +01:00
elements <- x [ [i ] ] %>% html_elements ( " a" )
hrefs <- elements %>% html_attr ( " href" )
2022-10-05 09:12:22 +02:00
ranks <- hrefs %>% gsub ( " .*/(.*?)/.*" , " \\1" , .)
names <- elements %>%
2022-12-12 00:14:56 +01:00
html_text ( ) %>%
2022-10-05 09:12:22 +02:00
gsub ( ' "' , " " , ., fixed = TRUE )
# no species, this must be until genus level
hrefs <- hrefs [ranks != " species" ]
names <- names [ranks != " species" ]
ranks <- ranks [ranks != " species" ]
ranks [ranks == " domain" ] <- " kingdom"
2024-02-24 15:16:52 +01:00
suppressMessages (
df <- names %>%
tibble ( ) %>%
t ( ) %>%
as_tibble ( .name_repair = " unique" ) %>%
setNames ( ranks ) %>%
# no candidates please
filter ( genus %unlike% " ^(Candidatus|\\[)" )
)
2023-01-23 15:01:21 +01:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>%
bind_rows ( df )
}
2024-02-24 15:16:52 +01:00
message ( " => " , length ( x ) , " entries incl. candidates (cleaned total: " , nrow ( taxonomy_lpsn_missing ) , " )" )
2022-10-05 09:12:22 +02:00
}
2024-02-24 15:16:52 +01:00
taxonomy_lpsn_missing <- taxonomy_lpsn_missing | > distinct ( )
# had to pick the right genus/family combination here:
taxonomy_lpsn_missing <- taxonomy_lpsn_missing | > filter ( ! ( genus == " Pusillimonas" & family == " Oscillospiraceae" ) )
taxonomy_lpsn.bak2 <- taxonomy_lpsn.bak
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
left_join ( taxonomy_lpsn_missing , by = " genus" ) %>%
select ( kingdom : family , everything ( ) ) %>%
# remove entries like "[Bacteria, no family]" and "[Bacteria, no class]"
mutate_all ( function ( x ) ifelse ( x %like_case% " no " , NA_character_ , x ) )
taxonomy_lpsn.bak2 <- taxonomy_lpsn
2024-02-24 15:16:52 +01:00
# download family directly from LPSN website using scraping, by using get_lpsn_and_author()
# try it first:
# get_lpsn_and_author("genus", "Escherichia")
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ family ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( f in unique ( taxonomy_lpsn $ family ) ) {
pb $ tick ( )
if ( is.na ( f ) ) next
tax_info <- get_lpsn_and_author ( " Family" , f )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
class = taxonomy_lpsn $ class [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
order = taxonomy_lpsn $ order [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
family = f ,
rank = " family" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download order directly from LPSN website using scraping
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ order ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( o in unique ( taxonomy_lpsn $ order ) ) {
pb $ tick ( )
if ( is.na ( o ) ) next
tax_info <- get_lpsn_and_author ( " Order" , o )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
class = taxonomy_lpsn $ class [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
order = o ,
rank = " order" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download class directly from LPSN website using scraping
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ class ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( cc in unique ( taxonomy_lpsn $ class ) ) {
pb $ tick ( )
if ( is.na ( cc ) ) next
tax_info <- get_lpsn_and_author ( " Class" , cc )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ class == cc ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ class == cc ) [1 ] ] ,
class = cc ,
rank = " class" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download phylum directly from LPSN website using scraping
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ phylum ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( p in unique ( taxonomy_lpsn $ phylum ) ) {
pb $ tick ( )
if ( is.na ( p ) ) next
tax_info <- get_lpsn_and_author ( " Phylum" , p )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ phylum == p ) [1 ] ] ,
phylum = p ,
rank = " phylum" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download kingdom directly from LPSN website using scraping
2024-02-24 15:16:52 +01:00
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ kingdom ) ) , format = " [:bar] :current/:total :eta" )
2022-10-05 09:12:22 +02:00
for ( k in unique ( taxonomy_lpsn $ kingdom ) ) {
pb $ tick ( )
if ( is.na ( k ) ) next
tax_info <- get_lpsn_and_author ( " Domain" , k )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = k ,
rank = " kingdom" ,
2024-02-24 15:16:52 +01:00
status = unname ( tax_info [ " status" ] ) ,
2022-10-05 09:12:22 +02:00
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
# integrity tests
sort ( table ( taxonomy_lpsn $ rank ) )
sort ( table ( taxonomy_lpsn $ status ) )
# Save intermediate results -----------------------------------------------
saveRDS ( taxonomy_gbif , " data-raw/taxonomy_gbif.rds" , version = 2 )
saveRDS ( taxonomy_lpsn , " data-raw/taxonomy_lpsn.rds" , version = 2 )
2022-12-12 00:14:56 +01:00
# this allows to always get back to this point by simply loading the files from data-raw/.
2022-10-05 09:12:22 +02:00
2024-02-24 15:16:52 +01:00
2022-10-05 09:12:22 +02:00
# Add full names ----------------------------------------------------------
taxonomy_gbif <- taxonomy_gbif %>%
# clean NAs and add fullname
mutate ( across ( kingdom : subspecies , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ,
2023-01-23 15:01:21 +01:00
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies )
) ) , .before = 1
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
# keep only one GBIF taxon ID per full name
arrange ( fullname , gbif ) %>%
2022-12-12 00:14:56 +01:00
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2020-05-27 16:37:49 +02:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
# clean NAs and add fullname
mutate ( across ( kingdom : subspecies , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ,
2023-01-23 15:01:21 +01:00
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies )
) ) , .before = 1
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
# keep only one LPSN record ID per full name
arrange ( fullname , lpsn ) %>%
2022-12-12 00:14:56 +01:00
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2022-10-05 09:12:22 +02:00
# set parent LPSN IDs, requires full name
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " phylum" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ kingdom [taxonomy_lpsn $ rank == " phylum" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " class" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ phylum [taxonomy_lpsn $ rank == " class" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " order" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ class [taxonomy_lpsn $ rank == " order" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " family" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ order [taxonomy_lpsn $ rank == " family" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " genus" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ family [taxonomy_lpsn $ rank == " genus" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " species" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ genus [taxonomy_lpsn $ rank == " species" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " subspecies" ] <- taxonomy_lpsn $ lpsn [match ( paste ( taxonomy_lpsn $ genus [taxonomy_lpsn $ rank == " subspecies" ] , taxonomy_lpsn $ species [taxonomy_lpsn $ rank == " subspecies" ] ) , taxonomy_lpsn $ fullname ) ]
# Combine the datasets ----------------------------------------------------
2024-02-24 15:16:52 +01:00
taxonomy <- taxonomy_lpsn | >
# start by adding GBIF to the bottom
bind_rows ( taxonomy_gbif ) | >
# group on unique species
group_by ( kingdom , fullname ) | >
# fill the NAs in LPSN/GBIF fields and ref with the other source (so LPSN: 123 and GBIF: NA will become LPSN: 123 and GBIF: 123)
mutate ( across ( matches ( " ^(lpsn|gbif|ref)" ) , function ( x ) rep ( x [ ! is.na ( x ) ] [1 ] , length ( x ) ) ) ) | >
# ungroup again
ungroup ( ) | >
# only keep unique species per kingdom
distinct ( kingdom , fullname , .keep_all = TRUE ) | >
arrange ( fullname )
2019-09-22 17:19:59 +02:00
2022-12-16 16:10:43 +01:00
# get missing entries from existing microorganisms data set
2024-02-24 15:16:52 +01:00
taxonomy.old <- AMR :: microorganisms %>%
2023-01-23 15:01:21 +01:00
select ( all_of ( colnames ( taxonomy ) ) ) %>%
filter (
! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ,
# these will be added later:
2024-02-24 15:16:52 +01:00
source != " manually added" )
taxonomy <- taxonomy %>%
bind_rows ( taxonomy.old ) %>%
2022-12-16 16:10:43 +01:00
arrange ( fullname ) %>%
filter ( fullname != " " )
2022-10-05 09:12:22 +02:00
# fix rank
2022-12-12 00:14:56 +01:00
table ( taxonomy $ rank , useNA = " always" )
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
mutate ( rank = case_when (
subspecies != " " ~ " subspecies" ,
species != " " ~ " species" ,
genus != " " ~ " genus" ,
family != " " ~ " family" ,
order != " " ~ " order" ,
class != " " ~ " class" ,
phylum != " " ~ " phylum" ,
kingdom != " " ~ " kingdom" ,
TRUE ~ NA_character_
) )
table ( taxonomy $ rank , useNA = " always" )
2024-02-24 15:16:52 +01:00
# at this point, it happens that some genera within kingdoms have multiple families / orders, etc., see here:
taxonomy | > filter ( genus != " " ) | > group_by ( kingdom , genus ) | > filter ( n_distinct ( family ) > 1 ) | > View ( )
# so make this universal
taxonomy <- taxonomy | >
group_by ( kingdom , genus ) | >
mutate ( family = get_top_lvl ( family , rank , " genus" ) ) | >
group_by ( kingdom , family ) | >
mutate ( order = get_top_lvl ( order , rank , " family" ) ) | >
group_by ( kingdom , order ) | >
mutate ( class = get_top_lvl ( class , rank , " order" ) ) | >
group_by ( kingdom , class ) | >
mutate ( phylum = get_top_lvl ( phylum , rank , " class" ) ) | >
ungroup ( )
# and remove the taxonomy where it must remain empty
taxonomy <- taxonomy | >
mutate ( phylum = ifelse ( rank %in% c ( " kingdom" ) , " " , phylum ) ,
class = ifelse ( rank %in% c ( " kingdom" , " phylum" ) , " " , class ) ,
order = ifelse ( rank %in% c ( " kingdom" , " phylum" , " class" ) , " " , order ) ,
family = ifelse ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" ) , " " , family ) ,
genus = ifelse ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" ) , " " , genus ) ,
species = ifelse ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" , " genus" ) , " " , species ) ,
subspecies = ifelse ( rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" , " genus" , " species" ) , " " , subspecies ) )
2022-12-16 16:10:43 +01:00
# Save intermediate results (0) -------------------------------------------
saveRDS ( taxonomy , " data-raw/taxonomy0.rds" )
2022-10-05 09:12:22 +02:00
2022-12-16 16:10:43 +01:00
# Add missing and fix old taxonomic entries -------------------------------
2022-10-05 09:12:22 +02:00
# this part will make sure that the whole taxonomy of every included species exists, so no missing genera, classes, etc.
current_gbif <- taxonomy_gbif.bak %>%
filter ( is.na ( acceptedNameUsageID ) ) %>%
mutate (
taxonID = as.character ( taxonID ) ,
parentNameUsageID = as.character ( parentNameUsageID )
)
# add missing kingdoms
2024-02-24 15:16:52 +01:00
taxonomy_all_missing <- taxonomy %>%
filter ( kingdom != " " ) %>%
distinct ( kingdom ) %>%
mutate (
fullname = kingdom ,
rank = " kingdom"
) %>%
filter ( ! paste ( kingdom , rank ) %in% paste ( taxonomy $ kingdom , taxonomy $ rank ) ) %>%
left_join (
current_gbif %>%
select ( kingdom , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" )
) %>%
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = ifelse ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
2022-10-05 09:12:22 +02:00
# 2 = phylum ... 6 = genus
for ( i in 2 : 6 ) {
i_name <- colnames ( taxonomy ) [i + 1 ]
message ( " Adding missing: " , i_name , " ... " , appendLF = FALSE )
to_add <- taxonomy %>%
filter ( .[ [i + 1 ] ] != " " ) %>%
distinct ( kingdom , .[ [i + 1 ] ] , .keep_all = TRUE ) %>%
select ( kingdom : ( i + 1 ) ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
fullname = .[ [ncol ( .) ] ] ,
2024-02-24 15:16:52 +01:00
rank = i_name
2022-10-05 09:12:22 +02:00
) %>%
2024-02-24 15:16:52 +01:00
filter ( ! paste ( kingdom , .[ [ncol ( .) - 2 ] ] , rank ) %in% paste ( taxonomy $ kingdom , taxonomy [ [i + 1 ] ] , taxonomy $ rank ) ) %>%
2023-01-23 15:01:21 +01:00
# get GBIF identifier where available
left_join (
current_gbif %>%
select ( kingdom , all_of ( i_name ) , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , i_name )
) %>%
2024-02-24 15:16:52 +01:00
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = ifelse ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
2022-10-05 09:12:22 +02:00
message ( " n = " , nrow ( to_add ) )
2024-02-24 15:16:52 +01:00
taxonomy_all_missing <- taxonomy_all_missing %>%
bind_rows ( to_add )
2022-10-29 14:15:23 +02:00
}
2022-12-12 00:14:56 +01:00
taxonomy_all_missing %>% View ( )
2022-10-30 14:31:45 +01:00
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>%
bind_rows ( taxonomy_all_missing )
# fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class)
2023-01-23 15:01:21 +01:00
taxonomy <- taxonomy %>%
mutate (
rank_index = case_when (
rank == " subspecies" ~ 1 ,
rank == " species" ~ 2 ,
rank == " genus" ~ 3 ,
rank == " family" ~ 4 ,
rank == " order" ~ 5 ,
rank == " class" ~ 6 ,
TRUE ~ 7
) ,
fullname_rank = paste0 ( fullname , " {" , rank , " }" )
) %>%
arrange ( kingdom , fullname , rank_index ) %>%
group_by ( kingdom , fullname ) %>%
mutate ( fullname = if_else ( row_number ( ) > 1 , fullname_rank , fullname ) ) %>%
ungroup ( ) %>%
select ( - fullname_rank , - rank_index ) %>%
2022-12-12 00:14:56 +01:00
arrange ( fullname )
2022-10-29 14:15:23 +02:00
2024-02-24 15:16:52 +01:00
# now also add missing species that have subspecies (requires combination with genus)
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
2022-12-12 00:14:56 +01:00
bind_rows (
taxonomy %>%
filter ( species != " " ) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) %>%
select ( kingdom : species ) %>%
mutate (
fullname = paste ( genus , species ) ,
2024-02-24 15:16:52 +01:00
rank = " species"
2022-12-12 00:14:56 +01:00
) %>%
filter ( ! paste ( kingdom , genus , species , rank ) %in% paste ( taxonomy $ kingdom , taxonomy $ genus , taxonomy $ species , taxonomy $ rank ) ) %>%
# get GBIF identifier where available
2023-01-23 15:01:21 +01:00
left_join (
current_gbif %>%
select ( kingdom , genus , species = specificEpithet , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , " genus" , " species" )
2022-12-12 00:14:56 +01:00
) %>%
2024-02-24 15:16:52 +01:00
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = ifelse ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
2023-01-23 15:01:21 +01:00
)
2022-10-05 09:12:22 +02:00
2020-05-27 16:37:49 +02:00
2022-10-05 09:12:22 +02:00
# remove NAs from taxonomy again, and keep unique full names
taxonomy <- taxonomy %>%
mutate ( across ( kingdom : subspecies , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ) %>%
2024-02-24 15:16:52 +01:00
arrange ( kingdom , fullname , ref ) | >
2022-08-28 10:31:50 +02:00
distinct ( kingdom , fullname , .keep_all = TRUE ) %>%
2022-10-05 09:12:22 +02:00
filter ( kingdom != " " )
2019-03-18 14:29:41 +01:00
2022-12-12 00:14:56 +01:00
2022-12-16 16:10:43 +01:00
# Save intermediate results (1) -------------------------------------------
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
saveRDS ( taxonomy , " data-raw/taxonomy1.rds" )
2022-10-05 09:12:22 +02:00
# Get previously manually added entries -----------------------------------
manually_added <- AMR :: microorganisms %>%
2024-02-24 15:16:52 +01:00
filter ( source == " manually added" ,
! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ,
! rank %in% c ( " kingdom" , " phylum" , " class" , " order" , " family" ) ) %>%
2022-10-05 09:12:22 +02:00
select ( fullname : subspecies , ref , source , rank )
# get latest taxonomy for those entries
for ( g in unique ( manually_added $ genus [manually_added $ genus != " " & manually_added $ genus %in% taxonomy $ genus ] ) ) {
manually_added $ family [which ( manually_added $ genus == g ) ] <- taxonomy $ family [which ( taxonomy $ genus == g & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
for ( f in unique ( manually_added $ family [manually_added $ family != " " & manually_added $ family %in% taxonomy $ family ] ) ) {
manually_added $ order [which ( manually_added $ family == f ) ] <- taxonomy $ order [which ( taxonomy $ family == f & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
for ( o in unique ( manually_added $ order [manually_added $ order != " " & manually_added $ order %in% taxonomy $ order ] ) ) {
manually_added $ class [which ( manually_added $ order == o ) ] <- taxonomy $ class [which ( taxonomy $ order == o & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
for ( cc in unique ( manually_added $ class [manually_added $ class != " " & manually_added $ class %in% taxonomy $ class ] ) ) {
manually_added $ phylum [which ( manually_added $ class == cc ) ] <- taxonomy $ phylum [which ( taxonomy $ class == cc & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
2022-12-16 16:10:43 +01:00
for ( p in unique ( manually_added $ phylum [manually_added $ phylum != " " & manually_added $ phylum %in% taxonomy $ phylum ] ) ) {
manually_added $ kingdom [which ( manually_added $ phylum == p ) ] <- taxonomy $ kingdom [which ( taxonomy $ phylum == p & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
manually_added <- manually_added %>%
mutate (
2024-02-24 15:16:52 +01:00
status = " unknown" ,
2022-12-12 00:14:56 +01:00
rank = ifelse ( fullname %like% " unknown" , " (unknown rank)" , rank )
)
2022-12-16 16:10:43 +01:00
manually_added
2022-10-05 09:12:22 +02:00
2024-02-24 15:16:52 +01:00
# these are now included in the new taxonomy, check them
manually_added | > filter ( fullname %in% taxonomy $ fullname )
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
2022-12-16 16:10:43 +01:00
# here also the 'unknowns' are added, such as "(unknown fungus)"
2022-12-12 00:14:56 +01:00
bind_rows ( manually_added ) %>%
2022-10-05 09:12:22 +02:00
arrange ( fullname )
table ( taxonomy $ rank , useNA = " always" )
2024-02-24 15:16:52 +01:00
# Get LPSN data for records missing from `taxonomy_lpsn` ------------------
# Weirdly enough, some LPSN records were lacking from `taxonomy_lpsn`,
# such as family Thiotrichaceae and its order Thiotrichales. When running
# get_lpsn_and_author("family", "Thiotrichaceae") you do get a result.
# So check every non-LPSN records from the kingdom of Bacteria and add it
gbif_bacteria <- which ( taxonomy $ kingdom == " Bacteria" & taxonomy $ source == " GBIF" & taxonomy $ rank %in% c ( " phylum" , " class" , " order" , " family" ) )
added <- 0
pb <- progress_bar $ new ( total = length ( gbif_bacteria ) , format = " [:bar] :current/:total :eta" )
for ( record in gbif_bacteria ) {
pb $ tick ( )
lpsn <- get_lpsn_and_author ( rank = taxonomy $ rank [record ] ,
name = taxonomy $ fullname [record ] )
if ( is.na ( lpsn [ " lpsn" ] ) ) {
next
} else {
added <- added + 1
taxonomy $ source [record ] <- " LPSN"
taxonomy $ lpsn [record ] <- unname ( lpsn [ " lpsn" ] )
taxonomy $ ref [record ] <- unname ( lpsn [ " ref" ] )
taxonomy $ status [record ] <- unname ( lpsn [ " status" ] )
}
}
message ( added , " GBIF records altered to latest LPSN" )
taxbak <- taxonomy
2022-10-05 09:12:22 +02:00
# Clean scientific reference ----------------------------------------------
taxonomy <- taxonomy %>%
mutate ( ref = get_author_year ( ref ) )
2019-08-06 14:39:22 +02:00
2019-09-18 15:46:09 +02:00
2022-12-16 16:10:43 +01:00
# Get the latest upper taxonomy from LPSN for non-LPSN data ---------------
2024-02-24 15:16:52 +01:00
# (e.g., phylum above class "Bacilli" was still "Firmicutes" in 2023, should be "Bacillota")
2022-12-16 16:10:43 +01:00
for ( k in unique ( taxonomy $ kingdom [taxonomy $ kingdom != " " ] ) ) {
message ( " Fixing GBIF taxonomy for kingdom " , k , " ." , appendLF = FALSE )
i <- 0
for ( g in unique ( taxonomy $ genus [taxonomy $ genus != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ family [which ( taxonomy $ genus == g & taxonomy $ kingdom == k ) ] <- taxonomy $ family [which ( taxonomy $ genus == g & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
for ( f in unique ( taxonomy $ family [taxonomy $ family != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ order [which ( taxonomy $ family == f & taxonomy $ kingdom == k ) ] <- taxonomy $ order [which ( taxonomy $ family == f & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
for ( o in unique ( taxonomy $ order [taxonomy $ order != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ class [which ( taxonomy $ order == o & taxonomy $ kingdom == k ) ] <- taxonomy $ class [which ( taxonomy $ order == o & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
for ( cc in unique ( taxonomy $ class [taxonomy $ class != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ phylum [which ( taxonomy $ class == cc & taxonomy $ kingdom == k ) ] <- taxonomy $ phylum [which ( taxonomy $ class == cc & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
message ( " OK." )
}
# we need to fix parent GBIF identifiers
2024-02-24 15:16:52 +01:00
taxonomy $ gbif_parent [taxonomy $ rank == " phylum" ] <- taxonomy $ gbif [match ( taxonomy $ kingdom [taxonomy $ rank == " phylum" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " class" ] <- taxonomy $ gbif [match ( taxonomy $ phylum [taxonomy $ rank == " class" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " order" ] <- taxonomy $ gbif [match ( taxonomy $ class [taxonomy $ rank == " order" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " family" ] <- taxonomy $ gbif [match ( taxonomy $ order [taxonomy $ rank == " family" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " genus" ] <- taxonomy $ gbif [match ( taxonomy $ family [taxonomy $ rank == " genus" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " species" ] <- taxonomy $ gbif [match ( taxonomy $ genus [taxonomy $ rank == " species" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " subspecies" ] <- taxonomy $ gbif [match ( paste ( taxonomy $ genus [taxonomy $ rank == " subspecies" ] , taxonomy $ species [taxonomy $ rank == " subspecies" ] ) , taxonomy $ fullname ) ]
# and LPSN parents
taxonomy $ lpsn_parent [taxonomy $ rank == " phylum" ] <- taxonomy $ lpsn [match ( taxonomy $ kingdom [taxonomy $ rank == " phylum" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " class" ] <- taxonomy $ lpsn [match ( taxonomy $ phylum [taxonomy $ rank == " class" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " order" ] <- taxonomy $ lpsn [match ( taxonomy $ class [taxonomy $ rank == " order" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " family" ] <- taxonomy $ lpsn [match ( taxonomy $ order [taxonomy $ rank == " family" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " genus" ] <- taxonomy $ lpsn [match ( taxonomy $ family [taxonomy $ rank == " genus" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " species" ] <- taxonomy $ lpsn [match ( taxonomy $ genus [taxonomy $ rank == " species" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " subspecies" ] <- taxonomy $ lpsn [match ( paste ( taxonomy $ genus [taxonomy $ rank == " subspecies" ] , taxonomy $ species [taxonomy $ rank == " subspecies" ] ) , taxonomy $ fullname ) ]
2022-12-16 16:10:43 +01:00
# these still have no record in our data set:
2024-02-24 15:16:52 +01:00
which ( ! taxonomy $ lpsn_parent %in% taxonomy $ lpsn )
which ( ! taxonomy $ gbif_parent %in% taxonomy $ gbif )
2022-12-16 16:10:43 +01:00
2022-12-12 00:14:56 +01:00
# fix rank
taxonomy <- taxonomy %>%
mutate ( rank = case_when (
subspecies != " " ~ " subspecies" ,
species != " " ~ " species" ,
genus != " " ~ " genus" ,
family != " " ~ " family" ,
order != " " ~ " order" ,
class != " " ~ " class" ,
phylum != " " ~ " phylum" ,
kingdom != " " ~ " kingdom" ,
TRUE ~ NA_character_
) )
2019-09-18 15:46:09 +02:00
2022-10-05 09:12:22 +02:00
2022-12-19 15:32:41 +01:00
# Add prevalence ----------------------------------------------------------
2024-02-24 15:16:52 +01:00
taxonomy_lpsn.bak3 <- taxonomy
2022-12-19 15:32:41 +01:00
pathogens <- read_excel ( file_bartlett , sheet = " Tab 6 Full List" )
# get all established, both old and current taxonomic names
2023-01-23 15:01:21 +01:00
established <- pathogens %>%
filter ( status == " established" ) %>%
2022-12-19 15:32:41 +01:00
mutate ( fullname = paste ( genus , species ) ) %>%
2023-01-23 15:01:21 +01:00
pull ( fullname ) %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) ifelse ( length ( x ) == 1 , x , paste ( x [1 ] , x [2 ] ) ) ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
# get all putative, both old and current taxonomic names
2023-01-23 15:01:21 +01:00
putative <- pathogens %>%
filter ( status == " putative" ) %>%
2022-12-19 15:32:41 +01:00
mutate ( fullname = paste ( genus , species ) ) %>%
2023-01-23 15:01:21 +01:00
pull ( fullname ) %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) ifelse ( length ( x ) == 1 , x , paste ( x [1 ] , x [2 ] ) ) ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
established <- established [established %unlike% " unknown" ]
putative <- putative [putative %unlike% " unknown" ]
2023-01-23 15:01:21 +01:00
established_genera <- established %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
2023-01-23 15:01:21 +01:00
putative_genera <- putative %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-20 16:14:04 +01:00
unique ( )
2023-01-23 15:01:21 +01:00
nonbacterial_genera <- AMR ::: MO_PREVALENT_GENERA %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
2022-12-20 16:14:04 +01:00
nonbacterial_genera <- nonbacterial_genera [nonbacterial_genera %unlike% " unknown" ]
2022-12-19 15:32:41 +01:00
# update prevalence based on taxonomy (following the recent and thorough work of Bartlett et al., 2022)
# see https://doi.org/10.1099/mic.0.001269
2023-01-23 15:01:21 +01:00
taxonomy <- taxonomy %>%
2022-12-19 15:32:41 +01:00
mutate ( prevalence = case_when (
2022-12-20 16:14:04 +01:00
# 'established' means 'have infected at least three persons in three or more references'
paste ( genus , species ) %in% established & rank %in% c ( " species" , " subspecies" ) ~ 1.0 ,
# other genera in the 'established' group
genus %in% established_genera & rank == " genus" ~ 1.0 ,
2023-01-23 15:01:21 +01:00
2022-12-20 16:14:04 +01:00
# 'putative' means 'fewer than three known cases'
paste ( genus , species ) %in% putative & rank %in% c ( " species" , " subspecies" ) ~ 1.25 ,
# other genera in the 'putative' group
genus %in% putative_genera & rank == " genus" ~ 1.25 ,
2023-01-23 15:01:21 +01:00
2022-12-20 16:14:04 +01:00
# species and subspecies in 'established' and 'putative' groups
genus %in% c ( established_genera , putative_genera ) & rank %in% c ( " species" , " subspecies" ) ~ 1.5 ,
# other species from a genus in either group
genus %in% nonbacterial_genera & rank %in% c ( " genus" , " species" , " subspecies" ) ~ 1.5 ,
2022-12-19 15:32:41 +01:00
# we keep track of prevalent genera too of non-bacterial species
2023-07-08 17:30:05 +02:00
genus %in% AMR ::: MO_PREVALENT_GENERA & kingdom != " Bacteria" & rank %in% c ( " genus" , " species" , " subspecies" ) ~ 1.25 ,
2023-01-23 15:01:21 +01:00
2022-12-20 16:14:04 +01:00
# all others
2023-01-23 15:01:21 +01:00
TRUE ~ 2.0
) )
2022-12-19 15:32:41 +01:00
table ( taxonomy $ prevalence , useNA = " always" )
# (a lot will be removed further below)
2022-12-16 16:10:43 +01:00
# Save intermediate results (2) -------------------------------------------
saveRDS ( taxonomy , " data-raw/taxonomy2.rds" )
2022-10-05 09:12:22 +02:00
# Add microbial IDs -------------------------------------------------------
# MO codes in the AMR package have the form KINGDOM_GENUS_SPECIES_SUBSPECIES where all are abbreviated.
# Kingdom is abbreviated with 1 character, with exceptions for Animalia and Plantae
mo_kingdom <- taxonomy %>%
filter ( rank == " kingdom" ) %>%
select ( kingdom ) %>%
mutate ( mo_kingdom = case_when (
kingdom == " Animalia" ~ " AN" ,
kingdom == " Archaea" ~ " A" ,
kingdom == " Bacteria" ~ " B" ,
kingdom == " Chromista" ~ " C" ,
kingdom == " Fungi" ~ " F" ,
kingdom == " Plantae" ~ " PL" ,
kingdom == " Protozoa" ~ " P" ,
TRUE ~ " "
) )
2024-02-24 15:16:52 +01:00
2022-10-05 09:12:22 +02:00
# phylum until family are abbreviated with 8 characters and prefixed with their rank
# Phylum - keep old and fill up for new ones
mo_phylum <- taxonomy %>%
filter ( rank == " phylum" ) %>%
distinct ( kingdom , phylum ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " phylum" ) %>%
transmute ( kingdom ,
phylum = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " phylum" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_phylum8 = AMR ::: abbreviate_mo ( phylum , minlength = 8 , prefix = " [PHL]_" ) ,
mo_phylum9 = AMR ::: abbreviate_mo ( phylum , minlength = 9 , prefix = " [PHL]_" ) ,
2022-10-05 09:12:22 +02:00
mo_phylum = ifelse ( ! is.na ( mo_old ) , mo_old , mo_phylum8 ) ,
mo_duplicated = duplicated ( mo_phylum ) ,
mo_phylum = ifelse ( mo_duplicated , mo_phylum9 , mo_phylum ) ,
mo_duplicated = duplicated ( mo_phylum )
) %>%
ungroup ( )
if ( any ( mo_phylum $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for phylum!" )
mo_phylum <- mo_phylum %>%
select ( kingdom , phylum , mo_phylum )
# Class - keep old and fill up for new ones
mo_class <- taxonomy %>%
filter ( rank == " class" ) %>%
distinct ( kingdom , class ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " class" ) %>%
transmute ( kingdom ,
class = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " class" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2023-05-08 13:04:18 +02:00
mo_class8 = AMR ::: abbreviate_mo ( class , minlength = 8 , prefix = " [CLS]_" ) ,
mo_class9 = AMR ::: abbreviate_mo ( class , minlength = 9 , prefix = " [CLS]_" ) ,
2022-10-05 09:12:22 +02:00
mo_class = ifelse ( ! is.na ( mo_old ) , mo_old , mo_class8 ) ,
mo_duplicated = duplicated ( mo_class ) ,
mo_class = ifelse ( mo_duplicated , mo_class9 , mo_class ) ,
mo_duplicated = duplicated ( mo_class )
) %>%
ungroup ( )
if ( any ( mo_class $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for class!" )
mo_class <- mo_class %>%
select ( kingdom , class , mo_class )
# Order - keep old and fill up for new ones
mo_order <- taxonomy %>%
filter ( rank == " order" ) %>%
distinct ( kingdom , order ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " order" ) %>%
transmute ( kingdom ,
order = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " order" )
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
group_by ( kingdom ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2023-05-08 13:04:18 +02:00
mo_order8 = AMR ::: abbreviate_mo ( order , minlength = 8 , prefix = " [ORD]_" ) ,
mo_order9 = AMR ::: abbreviate_mo ( order , minlength = 9 , prefix = " [ORD]_" ) ,
2022-10-05 09:12:22 +02:00
mo_order = ifelse ( ! is.na ( mo_old ) , mo_old , mo_order8 ) ,
mo_duplicated = duplicated ( mo_order ) ,
mo_order = ifelse ( mo_duplicated , mo_order9 , mo_order ) ,
mo_duplicated = duplicated ( mo_order )
) %>%
ungroup ( )
if ( any ( mo_order $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for order!" )
mo_order <- mo_order %>%
select ( kingdom , order , mo_order )
# Family - keep old and fill up for new ones
mo_family <- taxonomy %>%
filter ( rank == " family" ) %>%
distinct ( kingdom , family ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " family" ) %>%
transmute ( kingdom ,
family = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " family" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_family8 = AMR ::: abbreviate_mo ( family , minlength = 8 , prefix = " [FAM]_" ) ,
mo_family9 = AMR ::: abbreviate_mo ( family , minlength = 9 , prefix = " [FAM]_" ) ,
2022-10-05 09:12:22 +02:00
mo_family = ifelse ( ! is.na ( mo_old ) , mo_old , mo_family8 ) ,
mo_duplicated = duplicated ( mo_family ) ,
mo_family = ifelse ( mo_duplicated , mo_family9 , mo_family ) ,
mo_duplicated = duplicated ( mo_family )
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
ungroup ( )
if ( any ( mo_family $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for family!" )
mo_family <- mo_family %>%
select ( kingdom , family , mo_family )
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# construct code part for genus - keep old code where available and generate new ones where needed
mo_genus <- taxonomy %>%
filter ( rank == " genus" ) %>%
distinct ( kingdom , genus ) %>%
# get available old MO codes
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " genus" ) %>%
transmute ( mo_genus_old = gsub ( " ^[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus ) %>%
distinct ( kingdom , genus , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , .keep_all = TRUE ) %>%
# since kingdom is part of the code, genus abbreviations may be duplicated between kingdoms
group_by ( kingdom ) %>%
# generate new MO codes for genus and set the right one
mutate (
2023-05-08 13:04:18 +02:00
mo_genus_new5 = AMR ::: abbreviate_mo ( genus , 5 ) ,
mo_genus_new5b = paste0 ( AMR ::: abbreviate_mo ( genus , 5 ) , 1 ) ,
mo_genus_new6 = AMR ::: abbreviate_mo ( genus , 6 ) ,
mo_genus_new7 = AMR ::: abbreviate_mo ( genus , 7 ) ,
mo_genus_new8 = AMR ::: abbreviate_mo ( genus , 8 ) ,
2022-10-05 09:12:22 +02:00
mo_genus_new = case_when (
! is.na ( mo_genus_old ) ~ mo_genus_old ,
! mo_genus_new5 %in% mo_genus_old ~ mo_genus_new5 ,
! mo_genus_new6 %in% mo_genus_old ~ mo_genus_new6 ,
! mo_genus_new7 %in% mo_genus_old ~ mo_genus_new7 ,
! mo_genus_new8 %in% mo_genus_old ~ mo_genus_new8 ,
! mo_genus_new5b %in% mo_genus_old ~ mo_genus_new5b ,
TRUE ~ mo_genus_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_genus_new ) ,
mo_genus_new = case_when (
! mo_duplicated ~ mo_genus_new ,
mo_duplicated & mo_genus_new == mo_genus_new5 ~ mo_genus_new6 ,
mo_duplicated & mo_genus_new == mo_genus_new6 ~ mo_genus_new7 ,
mo_duplicated & mo_genus_new == mo_genus_new7 ~ mo_genus_new8 ,
mo_duplicated & mo_genus_new == mo_genus_new8 ~ mo_genus_new5b ,
TRUE ~ NA_character_
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_genus_new )
) %>%
ungroup ( )
if ( any ( mo_genus $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_genus $ mo_genus_new ) ) stop ( " Duplicate MO codes for genus!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_genus <- mo_genus %>%
select ( kingdom , genus , mo_genus = mo_genus_new )
# same for species - keep old where available and create new per kingdom-genus where needed:
mo_species <- taxonomy %>%
filter ( rank == " species" ) %>%
distinct ( kingdom , genus , species ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " species" ) %>%
transmute ( mo_species_old = gsub ( " ^[A-Z]+_[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus , species ) %>%
filter ( mo_species_old %unlike% " -" ) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" , " species" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) %>%
group_by ( kingdom , genus ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_species_new4 = AMR ::: abbreviate_mo ( species , 4 , hyphen_as_space = TRUE ) ,
mo_species_new5 = AMR ::: abbreviate_mo ( species , 5 , hyphen_as_space = TRUE ) ,
mo_species_new5b = paste0 ( AMR ::: abbreviate_mo ( species , 5 , hyphen_as_space = TRUE ) , 1 ) ,
mo_species_new6 = AMR ::: abbreviate_mo ( species , 6 , hyphen_as_space = TRUE ) ,
mo_species_new7 = AMR ::: abbreviate_mo ( species , 7 , hyphen_as_space = TRUE ) ,
mo_species_new8 = AMR ::: abbreviate_mo ( species , 8 , hyphen_as_space = TRUE ) ,
2022-10-05 09:12:22 +02:00
mo_species_new = case_when (
! is.na ( mo_species_old ) ~ mo_species_old ,
! mo_species_new4 %in% mo_species_old ~ mo_species_new4 ,
! mo_species_new5 %in% mo_species_old ~ mo_species_new5 ,
! mo_species_new6 %in% mo_species_old ~ mo_species_new6 ,
! mo_species_new7 %in% mo_species_old ~ mo_species_new7 ,
! mo_species_new8 %in% mo_species_old ~ mo_species_new8 ,
! mo_species_new5b %in% mo_species_old ~ mo_species_new5b ,
TRUE ~ mo_species_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_species_new ) ,
mo_species_new = case_when (
! mo_duplicated ~ mo_species_new ,
mo_duplicated & mo_species_new == mo_species_new4 ~ mo_species_new5 ,
mo_duplicated & mo_species_new == mo_species_new5 ~ mo_species_new6 ,
mo_duplicated & mo_species_new == mo_species_new6 ~ mo_species_new7 ,
mo_duplicated & mo_species_new == mo_species_new7 ~ mo_species_new8 ,
mo_duplicated & mo_species_new == mo_species_new8 ~ mo_species_new5b ,
TRUE ~ NA_character_
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_species_new )
) %>%
ungroup ( )
if ( any ( mo_species $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_species $ mo_species_new ) ) stop ( " Duplicate MO codes for species!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_species <- mo_species %>%
select ( kingdom , genus , species , mo_species = mo_species_new )
# same for subspecies - keep old where available and create new per kingdom-genus-species where needed:
mo_subspecies <- taxonomy %>%
filter ( rank == " subspecies" ) %>%
distinct ( kingdom , genus , species , subspecies ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank %in% c ( " subspecies" , " subsp." , " infraspecies" ) ) %>%
transmute ( mo_subspecies_old = gsub ( " ^[A-Z]+_[A-Z]+_[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus , species , subspecies ) %>%
filter ( mo_subspecies_old %unlike% " -" ) %>%
distinct ( kingdom , genus , species , subspecies , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" , " species" , " subspecies" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , species , subspecies , .keep_all = TRUE ) %>%
group_by ( kingdom , genus , species ) %>%
mutate (
2023-05-08 13:04:18 +02:00
mo_subspecies_new4 = AMR ::: abbreviate_mo ( subspecies , 4 , hyphen_as_space = TRUE ) ,
mo_subspecies_new5 = AMR ::: abbreviate_mo ( subspecies , 5 , hyphen_as_space = TRUE ) ,
mo_subspecies_new5b = paste0 ( AMR ::: abbreviate_mo ( subspecies , 5 , hyphen_as_space = TRUE ) , 1 ) ,
mo_subspecies_new6 = AMR ::: abbreviate_mo ( subspecies , 6 , hyphen_as_space = TRUE ) ,
mo_subspecies_new7 = AMR ::: abbreviate_mo ( subspecies , 7 , hyphen_as_space = TRUE ) ,
mo_subspecies_new8 = AMR ::: abbreviate_mo ( subspecies , 8 , hyphen_as_space = TRUE ) ,
2022-10-05 09:12:22 +02:00
mo_subspecies_new = case_when (
! is.na ( mo_subspecies_old ) ~ mo_subspecies_old ,
! mo_subspecies_new4 %in% mo_subspecies_old ~ mo_subspecies_new4 ,
! mo_subspecies_new5 %in% mo_subspecies_old ~ mo_subspecies_new5 ,
! mo_subspecies_new6 %in% mo_subspecies_old ~ mo_subspecies_new6 ,
! mo_subspecies_new7 %in% mo_subspecies_old ~ mo_subspecies_new7 ,
! mo_subspecies_new8 %in% mo_subspecies_old ~ mo_subspecies_new8 ,
! mo_subspecies_new5b %in% mo_subspecies_old ~ mo_subspecies_new5b ,
TRUE ~ mo_subspecies_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_subspecies_new ) ,
mo_subspecies_new = case_when (
! mo_duplicated ~ mo_subspecies_new ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new4 ~ mo_subspecies_new5 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new5 ~ mo_subspecies_new6 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new6 ~ mo_subspecies_new7 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new7 ~ mo_subspecies_new8 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new8 ~ mo_subspecies_new5b ,
TRUE ~ NA_character_
) ,
mo_duplicated = duplicated ( mo_subspecies_new )
) %>%
ungroup ( )
if ( any ( mo_subspecies $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_subspecies $ mo_subspecies_new ) ) stop ( " Duplicate MO codes for subspecies!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_subspecies <- mo_subspecies %>%
select ( kingdom , genus , species , subspecies , mo_subspecies = mo_subspecies_new )
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# unknowns - manually added
mo_unknown <- AMR :: microorganisms %>%
filter ( fullname %like% " unknown" ) %>%
transmute ( fullname , mo_unknown = as.character ( mo ) )
# apply the new codes!
taxonomy <- taxonomy %>%
left_join ( mo_kingdom , by = " kingdom" ) %>%
left_join ( mo_phylum , by = c ( " kingdom" , " phylum" ) ) %>%
left_join ( mo_class , by = c ( " kingdom" , " class" ) ) %>%
left_join ( mo_order , by = c ( " kingdom" , " order" ) ) %>%
left_join ( mo_family , by = c ( " kingdom" , " family" ) ) %>%
left_join ( mo_genus , by = c ( " kingdom" , " genus" ) ) %>%
left_join ( mo_species , by = c ( " kingdom" , " genus" , " species" ) ) %>%
left_join ( mo_subspecies , by = c ( " kingdom" , " genus" , " species" , " subspecies" ) ) %>%
left_join ( mo_unknown , by = " fullname" ) %>%
mutate ( across ( starts_with ( " mo_" ) , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ) %>%
mutate (
mo = case_when (
fullname %like% " unknown" ~ mo_unknown ,
# add special cases for taxons higher than genus
rank == " kingdom" ~ paste ( mo_kingdom , " [KNG]" , toupper ( kingdom ) , sep = " _" ) ,
rank == " phylum" ~ paste ( mo_kingdom , mo_phylum , sep = " _" ) ,
rank == " class" ~ paste ( mo_kingdom , mo_class , sep = " _" ) ,
rank == " order" ~ paste ( mo_kingdom , mo_order , sep = " _" ) ,
rank == " family" ~ paste ( mo_kingdom , mo_family , sep = " _" ) ,
TRUE ~ paste ( mo_kingdom , mo_genus , mo_species , mo_subspecies , sep = " _" )
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo = trimws ( gsub ( " _+$" , " " , mo ) ) ,
.before = 1
) %>%
select ( ! starts_with ( " mo_" ) ) %>%
2022-10-30 14:31:45 +01:00
arrange ( fullname )
2022-10-29 14:15:23 +02:00
# now check these - e.g. Nitrospira is the name of a genus AND its class
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( fullname %in% .[duplicated ( fullname ) , " fullname" , drop = TRUE ] ) %>%
View ( )
2024-02-24 15:16:52 +01:00
# fullnames must be unique, we'll keep the most relevant ones only
2022-10-30 14:31:45 +01:00
taxonomy <- taxonomy %>%
2023-01-23 15:01:21 +01:00
mutate ( rank_index = case_when (
kingdom == " Bacteria" ~ 1 ,
kingdom == " Fungi" ~ 2 ,
kingdom == " Protozoa" ~ 3 ,
kingdom == " Archaea" ~ 4 ,
2024-02-24 15:16:52 +01:00
kingdom == " Animalia" ~ 5 ,
TRUE ~ 6
2023-01-23 15:01:21 +01:00
) ) %>%
arrange ( fullname , rank_index ) %>%
distinct ( fullname , .keep_all = TRUE ) %>%
select ( - rank_index ) %>%
2022-12-12 00:14:56 +01:00
filter ( mo != " " )
2024-02-24 15:16:52 +01:00
# keep the codes from manually added ones
manual_mos <- as.character ( AMR :: microorganisms $ mo ) [match ( taxonomy $ fullname [taxonomy $ source == " manually added" ] , AMR :: microorganisms $ fullname ) ]
taxonomy $ mo [taxonomy $ source == " manually added" ] [ ! is.na ( manual_mos ) ] <- manual_mos [ ! is.na ( manual_mos ) ]
2022-12-12 00:14:56 +01:00
# this must not exist:
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( mo %like% " __" ) %>%
View ( )
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>% filter ( mo %unlike% " __" )
2024-02-24 15:16:52 +01:00
taxonomy_lpsn.bak4 <- taxonomy
2022-12-12 00:14:56 +01:00
2022-12-16 16:10:43 +01:00
# Some integrity checks ---------------------------------------------------
# are mo codes unique?
2024-02-24 15:16:52 +01:00
taxonomy %>% filter ( mo %in% .[duplicated ( mo ) , " mo" , drop = TRUE ] ) | > arrange ( mo ) | > View ( )
# no, there are not, so sort on MO and keep the first
taxonomy <- taxonomy %>% arrange ( mo ) | > distinct ( mo , .keep_all = TRUE )
taxonomy <- taxonomy | >
mutate ( fullname = case_match ( rank ,
" phylum" ~ phylum ,
" class" ~ class ,
" order" ~ order ,
" family" ~ family ,
.default = fullname ) )
2022-12-16 16:10:43 +01:00
# are fullnames unique?
taxonomy %>% filter ( fullname %in% .[duplicated ( fullname ) , " fullname" , drop = TRUE ] )
2020-05-27 16:37:49 +02:00
2022-12-16 16:10:43 +01:00
# are all GBIFs available?
2023-01-23 15:01:21 +01:00
taxonomy %>%
2024-02-24 15:16:52 +01:00
filter ( ( ! gbif_parent %in% gbif ) | ( ! lpsn_parent %in% lpsn ) ) %>%
count ( source = ifelse ( ! gbif_parent %in% gbif , " GBIF" , " LPSN" ) ,
rank )
# so fix again all parent GBIF identifiers
taxonomy $ gbif_parent [taxonomy $ rank == " phylum" ] <- taxonomy $ gbif [match ( taxonomy $ kingdom [taxonomy $ rank == " phylum" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " class" ] <- taxonomy $ gbif [match ( taxonomy $ phylum [taxonomy $ rank == " class" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " order" ] <- taxonomy $ gbif [match ( taxonomy $ class [taxonomy $ rank == " order" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " family" ] <- taxonomy $ gbif [match ( taxonomy $ order [taxonomy $ rank == " family" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " genus" ] <- taxonomy $ gbif [match ( taxonomy $ family [taxonomy $ rank == " genus" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " species" ] <- taxonomy $ gbif [match ( taxonomy $ genus [taxonomy $ rank == " species" ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " subspecies" ] <- taxonomy $ gbif [match ( paste ( taxonomy $ genus [taxonomy $ rank == " subspecies" ] , taxonomy $ species [taxonomy $ rank == " subspecies" ] ) , taxonomy $ fullname ) ]
# and LPSN identifiers
taxonomy $ lpsn_parent [taxonomy $ rank == " phylum" ] <- taxonomy $ lpsn [match ( taxonomy $ kingdom [taxonomy $ rank == " phylum" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " class" ] <- taxonomy $ lpsn [match ( taxonomy $ phylum [taxonomy $ rank == " class" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " order" ] <- taxonomy $ lpsn [match ( taxonomy $ class [taxonomy $ rank == " order" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " family" ] <- taxonomy $ lpsn [match ( taxonomy $ order [taxonomy $ rank == " family" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " genus" ] <- taxonomy $ lpsn [match ( taxonomy $ family [taxonomy $ rank == " genus" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " species" ] <- taxonomy $ lpsn [match ( taxonomy $ genus [taxonomy $ rank == " species" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " subspecies" ] <- taxonomy $ lpsn [match ( paste ( taxonomy $ genus [taxonomy $ rank == " subspecies" ] , taxonomy $ species [taxonomy $ rank == " subspecies" ] ) , taxonomy $ fullname ) ]
# check again
2023-01-23 15:01:21 +01:00
taxonomy %>%
2024-02-24 15:16:52 +01:00
filter ( ( ! gbif_parent %in% gbif ) | ( ! lpsn_parent %in% lpsn ) ) %>%
count ( source = ifelse ( ! gbif_parent %in% gbif , " GBIF" , " LPSN" ) ,
rank )
2022-12-16 16:10:43 +01:00
# Save intermediate results (3) -------------------------------------------
saveRDS ( taxonomy , " data-raw/taxonomy3.rds" )
2022-10-29 14:15:23 +02:00
2019-10-30 23:02:50 +01:00
2024-02-24 15:16:52 +01:00
# Redo LPSN missings and parents ------------------------------------------
gbif_bacteria_second_run <- which ( taxonomy $ kingdom == " Bacteria" & taxonomy $ source == " GBIF" & taxonomy $ rank %in% c ( " phylum" , " class" , " order" , " family" ) )
gbif_bacteria_second_run <- gbif_bacteria_second_run [ ! gbif_bacteria_second_run %in% gbif_bacteria ]
added <- 0
pb <- progress_bar $ new ( total = length ( gbif_bacteria_second_run ) , format = " [:bar] :current/:total :eta" )
for ( record in gbif_bacteria_second_run ) {
pb $ tick ( )
lpsn <- get_lpsn_and_author ( rank = taxonomy $ rank [record ] ,
name = taxonomy $ fullname [record ] )
if ( is.na ( lpsn [ " lpsn" ] ) ) {
next
} else {
added <- added + 1
taxonomy $ source [record ] <- " LPSN"
taxonomy $ lpsn [record ] <- unname ( lpsn [ " lpsn" ] )
taxonomy $ ref [record ] <- unname ( lpsn [ " ref" ] )
taxonomy $ status [record ] <- unname ( lpsn [ " status" ] )
}
}
message ( added , " GBIF records altered to latest LPSN" )
taxbak <- taxonomy
taxonomy $ lpsn_parent [taxonomy $ rank == " phylum" ] <- taxonomy $ lpsn [match ( taxonomy $ kingdom [taxonomy $ rank == " phylum" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " class" ] <- taxonomy $ lpsn [match ( taxonomy $ phylum [taxonomy $ rank == " class" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " order" ] <- taxonomy $ lpsn [match ( taxonomy $ class [taxonomy $ rank == " order" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " family" ] <- taxonomy $ lpsn [match ( taxonomy $ order [taxonomy $ rank == " family" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " genus" ] <- taxonomy $ lpsn [match ( taxonomy $ family [taxonomy $ rank == " genus" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " species" ] <- taxonomy $ lpsn [match ( taxonomy $ genus [taxonomy $ rank == " species" ] , taxonomy $ fullname ) ]
taxonomy $ lpsn_parent [taxonomy $ rank == " subspecies" ] <- taxonomy $ lpsn [match ( paste ( taxonomy $ genus [taxonomy $ rank == " subspecies" ] , taxonomy $ species [taxonomy $ rank == " subspecies" ] ) , taxonomy $ fullname ) ]
# TODO: there is no order Eggerthellales anymore
2022-10-05 09:12:22 +02:00
# Remove unwanted taxonomic entries from Protoza/Fungi --------------------
2019-09-20 14:18:29 +02:00
2022-10-05 09:12:22 +02:00
# this must be done after the microbial ID generation, since it will otherwise generate a lot of different IDs
taxonomy <- taxonomy %>%
filter (
# Protozoa:
! ( phylum %in% c ( " Choanozoa" , " Mycetozoa" ) & prevalence == 3 ) ,
# Fungi:
2024-02-24 15:16:52 +01:00
! ( phylum %in% c ( " Ascomycota" , " Zygomycota" , " Basidiomycota" ) & prevalence == 3 & rank %in% c ( " genus" , " species" , " subspecies" ) ) ,
2022-12-12 00:14:56 +01:00
! ( genus %in% c ( " Phoma" , " Leptosphaeria" , " Physarum" ) & rank %in% c ( " species" , " subspecies" ) ) , # only genus of this rare fungus, with resp. 1300 and 800 species
2022-10-05 09:12:22 +02:00
# (leave Alternaria in there, part of human mycobiome and opportunistic pathogen)
# Animalia:
! genus %in% c ( " Lucilia" , " Lumbricus" ) ,
2024-02-24 15:16:52 +01:00
! ( class == " Insecta" & rank %in% c ( " species" , " subspecies" ) ) , # keep only genus of insects, not all of their (sub)species
2022-12-12 00:14:56 +01:00
! ( genus == " Amoeba" & kingdom == " Animalia" ) ,
2022-10-05 09:12:22 +02:00
! ( genus %in% c ( " Aedes" , " Anopheles" ) & rank %in% c ( " species" , " subspecies" ) ) , # only genus of the many hundreds of mosquitoes species
kingdom != " Plantae"
) # this kingdom only contained Curvularia and Hymenolepis, which have coincidental twin names with Fungi
2019-02-28 13:56:28 +01:00
2022-12-12 00:14:56 +01:00
# no ghost families, orders classes, phyla
taxonomy <- taxonomy %>%
2023-01-23 15:01:21 +01:00
group_by ( kingdom , family ) %>%
2024-02-24 15:16:52 +01:00
# (but keep the ghost families of bacteria)
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" | kingdom == " Bacteria" ) %>%
2023-01-23 15:01:21 +01:00
group_by ( kingdom , order ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
group_by ( kingdom , class ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
group_by ( kingdom , phylum ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
2022-12-12 00:14:56 +01:00
ungroup ( )
2024-02-24 15:16:52 +01:00
for ( i in which ( colnames ( taxonomy ) %in% c ( " phylum" , " class" , " order" , " family" ) ) - 1 ) {
i_name <- colnames ( taxonomy ) [i + 1 ]
message ( " Adding missing: " , i_name , " ... " , appendLF = FALSE )
to_add <- taxonomy %>%
filter ( .[ [i + 1 ] ] != " " ) %>%
distinct ( kingdom , .[ [i + 1 ] ] , .keep_all = TRUE ) %>%
select ( kingdom : ( i + 1 ) ) %>%
mutate (
fullname = .[ [ncol ( .) ] ] ,
rank = i_name
) %>%
filter ( ! paste ( kingdom , .[ [ncol ( .) - 2 ] ] , rank ) %in% paste ( taxonomy $ kingdom , taxonomy [ [i + 1 ] ] , taxonomy $ rank ) ) %>%
# get GBIF identifier where available
left_join (
current_gbif %>%
select ( kingdom , all_of ( i_name ) , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , i_name )
) %>%
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , " manually added" ) ,
status = ifelse ( ! is.na ( gbif ) , " accepted" , " unknown" ) )
message ( " n = " , nrow ( to_add ) )
# taxonomy_all_missing <- taxonomy_all_missing %>%
# bind_rows(to_add)
}
2023-01-23 15:01:21 +01:00
message (
2023-02-10 16:18:00 +01:00
" \nCongratulations! The new taxonomic table will contain " , format ( nrow ( taxonomy ) , big.mark = " " ) , " rows.\n" ,
" This was " , format ( nrow ( AMR :: microorganisms ) , big.mark = " " ) , " rows.\n"
2023-01-23 15:01:21 +01:00
)
2022-12-12 00:14:56 +01:00
# these are the new ones:
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( ! paste ( kingdom , fullname ) %in% paste ( AMR :: microorganisms $ kingdom , AMR :: microorganisms $ fullname ) ) %>%
View ( )
2022-12-12 00:14:56 +01:00
# these were removed:
2023-01-23 15:01:21 +01:00
AMR :: microorganisms %>%
filter ( ! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ) %>%
View ( )
AMR :: microorganisms %>%
filter ( ! fullname %in% taxonomy $ fullname ) %>%
View ( )
2020-07-08 14:48:06 +02:00
2022-10-05 09:12:22 +02:00
2023-07-08 17:30:05 +02:00
# Some manual fixes -------------------------------------------------------
# Candida haemulonis and C. duobushaemulonis should be Candida haemulonii and C. duobushaemulonii
# not sure how this can be, but GBIF contained spelling errors?
# see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3486233/
taxonomy $ species [which ( taxonomy $ fullname %like% " ^Candida .*haemulonis" ) ] <- gsub ( " nis$" , " nii" , taxonomy $ species [which ( taxonomy $ fullname %like% " ^Candida .*haemulonis" ) ] )
taxonomy $ fullname [which ( taxonomy $ fullname %like% " ^Candida .*haemulonis" ) ] <- gsub ( " nis$" , " nii" , taxonomy $ fullname [which ( taxonomy $ fullname %like% " ^Candida .*haemulonis" ) ] )
2022-10-05 09:12:22 +02:00
# Add SNOMED CT -----------------------------------------------------------
# we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS)
# as a source, which copies directly from the latest US SNOMED CT version
# - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009
2022-10-29 14:15:23 +02:00
# - check that current online version is higher than TAXONOMY_VERSION$SNOMED
2022-10-05 09:12:22 +02:00
# - if so, click on 'Download Value Set', choose 'TXT'
snomed <- vroom ( " data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt" , skip = 3 ) %>%
select ( 1 : 2 ) %>%
setNames ( c ( " snomed" , " mo" ) ) %>%
mutate ( snomed = as.character ( snomed ) )
# try to get name of MO
snomed <- snomed %>%
mutate ( mo = gsub ( " ss. " , " " , mo , fixed = TRUE ) ) %>%
mutate ( fullname = case_when (
mo %like_case% " [A-Z][a-z]+ [a-z]+ [a-z]{4,} " ~ gsub ( " (^|.*)([A-Z][a-z]+ [a-z]+ [a-z]{4,}) .*" , " \\2" , mo ) ,
mo %like_case% " [A-Z][a-z]+ [a-z]{4,} " ~ gsub ( " (^|.*)([A-Z][a-z]+ [a-z]{4,}) .*" , " \\2" , mo ) ,
mo %like_case% " [A-Z][a-z]+" ~ gsub ( " (^|.*)([A-Z][a-z]+) .*" , " \\2" , mo ) ,
TRUE ~ NA_character_
) ) %>%
filter ( fullname %in% taxonomy $ fullname )
message ( nrow ( snomed ) , " SNOMED codes will be added to " , n_distinct ( snomed $ fullname ) , " microorganisms" )
snomed <- snomed %>%
group_by ( fullname ) %>%
summarise ( snomed = list ( snomed ) )
taxonomy <- taxonomy %>%
left_join ( snomed , by = " fullname" )
2023-05-08 13:04:18 +02:00
# Add oxygen tolerance (aerobe/anaerobe) ----------------------------------
# We will use the BacDive data base for this:
# - go to https://bacdive.dsmz.de/advsearch and filter 'Oxygen tolerance' on "*"
# - click on the 'Download tabel as CSV' button
#
bacdive <- vroom :: vroom ( " data-raw/bacdive.csv" , skip = 2 ) %>%
select ( species , oxygen = `Oxygen tolerance` )
bacdive <- bacdive %>%
# fill in missing species from previous rows
mutate ( species = ifelse ( is.na ( species ) , lag ( species ) , species ) ) %>%
2023-05-11 21:56:27 +02:00
filter ( ! is.na ( species ) , ! is.na ( oxygen ) , oxygen %unlike% " tolerant" , species %unlike% " unclassified" ) %>%
mutate ( mo = as.mo ( species , keep_synonyms = FALSE ) )
2023-05-08 13:04:18 +02:00
bacdive <- bacdive %>%
# now determine type per species
2023-05-11 21:56:27 +02:00
group_by ( mo ) %>%
summarise ( species = first ( species ) ,
oxygen_tolerance = case_when ( any ( oxygen %like% " facultative" ) ~ " facultative anaerobe" ,
2023-05-08 13:04:18 +02:00
all ( oxygen == " microaerophile" ) ~ " microaerophile" ,
all ( oxygen %in% c ( " anaerobe" , " obligate anaerobe" ) ) ~ " anaerobe" ,
all ( oxygen %in% c ( " anaerobe" , " obligate anaerobe" , " microaerophile" ) ) ~ " anaerobe/microaerophile" ,
all ( oxygen %in% c ( " aerobe" , " obligate aerobe" ) ) ~ " aerobe" ,
all ( ! oxygen %in% c ( " anaerobe" , " obligate anaerobe" ) ) ~ " aerobe" ,
all ( c ( " aerobe" , " anaerobe" ) %in% oxygen ) ~ " facultative anaerobe" ,
TRUE ~ NA_character_ ) )
2023-05-11 21:56:27 +02:00
# now find all synonyms and copy them from their current taxonomic names
synonyms <- as.mo ( unique ( unlist ( mo_synonyms ( bacdive $ mo , keep_synonyms = TRUE ) ) ) ,
keep_synonyms = TRUE )
syns <- tibble ( species = synonyms ,
mo = synonyms %>% mo_current ( ) %>% as.mo ( ) ) %>%
filter ( species != mo ) %>%
mutate ( species = mo_name ( species , keep_synonyms = TRUE ) ) %>%
left_join ( bacdive %>% select ( mo , oxygen_tolerance ) ) %>%
# set mo to mo of the synonym
mutate ( mo = as.mo ( species , keep_synonyms = TRUE ) ) %>%
select ( all_of ( colnames ( bacdive ) ) )
bacdive <- bacdive %>%
bind_rows ( syns ) %>%
distinct ( )
2023-05-08 13:04:18 +02:00
bacdive_genus <- bacdive %>%
2023-05-11 21:56:27 +02:00
mutate ( oxygen = oxygen_tolerance ) %>%
group_by ( species = mo_genus ( mo ) ) %>%
2023-05-08 13:04:18 +02:00
summarise ( oxygen_tolerance = case_when ( any ( oxygen == " facultative anaerobe" ) ~ " facultative anaerobe" ,
any ( oxygen == " anaerobe/microaerophile" ) ~ " anaerobe/microaerophile" ,
all ( oxygen == " microaerophile" ) ~ " microaerophile" ,
all ( oxygen == " anaerobe" ) ~ " anaerobe" ,
all ( oxygen == " aerobe" ) ~ " aerobe" ,
TRUE ~ " facultative anaerobe" ) )
bacdive <- bacdive %>%
filter ( species %unlike% " sp[.]" ) %>%
bind_rows ( bacdive_genus ) %>%
arrange ( species ) %>%
2023-05-11 21:56:27 +02:00
mutate ( mo = as.mo ( species , keep_synonyms = TRUE ) )
2023-05-08 13:04:18 +02:00
other_species <- microorganisms %>%
filter ( kingdom == " Bacteria" , rank == " species" , ! mo %in% bacdive $ mo , genus %in% bacdive $ species ) %>%
select ( species = fullname , genus , mo2 = mo ) %>%
left_join ( bacdive , by = c ( " genus" = " species" ) ) %>%
mutate ( oxygen_tolerance = ifelse ( oxygen_tolerance %in% c ( " aerobe" , " anaerobe" , " microaerophile" , " anaerobe/microaerophile" ) ,
oxygen_tolerance ,
paste ( " likely" , oxygen_tolerance ) ) ) %>%
2023-05-11 21:56:27 +02:00
select ( species , oxygen_tolerance , mo = mo2 ) %>%
distinct ( species , .keep_all = TRUE )
2023-05-08 13:04:18 +02:00
bacdive <- bacdive %>%
bind_rows ( other_species ) %>%
2023-05-11 21:56:27 +02:00
arrange ( species ) %>%
distinct ( mo , .keep_all = TRUE ) %>%
select ( - species )
2023-05-08 13:04:18 +02:00
taxonomy <- taxonomy %>%
2023-05-11 21:56:27 +02:00
left_join ( bacdive , by = " mo" ) %>%
relocate ( oxygen_tolerance , .after = ref )
2023-05-08 13:04:18 +02:00
2022-10-05 09:12:22 +02:00
# Clean data set ----------------------------------------------------------
# format to tibble and check again for invalid characters
taxonomy <- taxonomy %>%
select ( mo , fullname , status , kingdom : subspecies , rank , ref , source , starts_with ( " lpsn" ) , starts_with ( " gbif" ) , prevalence , snomed ) %>%
df_remove_nonASCII ( )
# set class <mo>
class ( taxonomy $ mo ) <- c ( " mo" , " character" )
2023-02-26 21:26:58 +01:00
microorganisms <- taxonomy
# Restore 'synonym' microorganisms to 'accepted' --------------------------
2022-10-05 09:12:22 +02:00
2023-02-26 21:26:58 +01:00
# according to LPSN: Stenotrophomonas maltophilia is the correct name if this species is regarded as a separate species (i.e., if its nomenclatural type is not assigned to another species whose name is validly published, legitimate and not rejected and has priority) within a separate genus Stenotrophomonas.
# https://lpsn.dsmz.de/species/stenotrophomonas-maltophilia
# all MO's to keep as 'accepted', not as 'synonym':
2023-03-11 14:24:34 +01:00
to_restore <- c (
" Stenotrophomonas maltophilia" ,
" Moraxella catarrhalis"
)
2023-02-26 21:26:58 +01:00
all ( to_restore %in% microorganisms $ fullname )
for ( nm in to_restore ) {
microorganisms $ lpsn_renamed_to [which ( microorganisms $ fullname == nm ) ] <- NA
microorganisms $ gbif_renamed_to [which ( microorganisms $ fullname == nm ) ] <- NA
microorganisms $ status [which ( microorganisms $ fullname == nm ) ] <- " accepted"
}
2022-10-05 09:12:22 +02:00
# Save to package ---------------------------------------------------------
2023-04-15 09:32:13 +02:00
# set class <mo> if still needed (if you run only this part coming from other scripts)
class ( microorganisms $ mo ) <- c ( " mo" , " character" )
microorganisms <- microorganisms %>% arrange ( fullname )
2022-10-05 09:12:22 +02:00
usethis :: use_data ( microorganisms , overwrite = TRUE , version = 2 , compress = " xz" )
2019-02-28 13:56:28 +01:00
rm ( microorganisms )
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
# DON'T FORGET TO UPDATE R/_globals.R!
2022-10-05 09:12:22 +02:00
# Test updates ------------------------------------------------------------
# and check: these codes should not be missing (will otherwise throw a unit test error):
AMR :: microorganisms.codes %>% filter ( ! mo %in% taxonomy $ mo )
2023-01-21 23:47:20 +01:00
AMR :: clinical_breakpoints %>% filter ( ! mo %in% taxonomy $ mo )
2022-10-05 09:12:22 +02:00
AMR :: example_isolates %>% filter ( ! mo %in% taxonomy $ mo )
AMR :: intrinsic_resistant %>% filter ( ! mo %in% taxonomy $ mo )
2020-05-27 16:37:49 +02:00
# load new data sets
devtools :: load_all ( " ." )
2022-10-05 09:12:22 +02:00
2020-05-27 16:37:49 +02:00
# reset previously changed mo codes
2023-01-21 23:47:20 +01:00
if ( ! identical ( clinical_breakpoints $ mo , as.mo ( clinical_breakpoints $ mo , language = NULL ) ) ) {
clinical_breakpoints $ mo <- as.mo ( clinical_breakpoints $ mo , language = NULL )
usethis :: use_data ( clinical_breakpoints , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( clinical_breakpoints )
2022-12-12 00:14:56 +01:00
}
2019-09-20 12:33:05 +02:00
2022-12-12 00:14:56 +01:00
if ( ! identical ( microorganisms.codes $ mo , as.mo ( microorganisms.codes $ mo , language = NULL ) ) ) {
microorganisms.codes <- microorganisms.codes %>% filter ( mo %in% microorganisms $ mo )
microorganisms.codes $ mo <- as.mo ( microorganisms.codes $ mo , language = NULL )
usethis :: use_data ( microorganisms.codes , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( microorganisms.codes )
}
2020-05-27 16:37:49 +02:00
2022-12-12 00:14:56 +01:00
if ( ! identical ( example_isolates $ mo , as.mo ( example_isolates $ mo , language = NULL ) ) ) {
example_isolates $ mo <- as.mo ( example_isolates $ mo , language = NULL )
usethis :: use_data ( example_isolates , overwrite = TRUE , version = 2 )
rm ( example_isolates )
}
2022-10-05 09:12:22 +02:00
# load new data sets again
devtools :: load_all ( " ." )
source ( " data-raw/_pre_commit_hook.R" )
devtools :: load_all ( " ." )
2020-05-27 16:37:49 +02:00
2022-12-12 00:14:56 +01:00
if ( ! identical ( intrinsic_resistant $ mo , as.mo ( intrinsic_resistant $ mo , language = NULL ) ) ) {
stop ( " Run data-raw/reproduction_of_intrinsic_resistant.R again" )
}
2020-05-27 16:37:49 +02:00
# run the unit tests
2022-10-05 09:12:22 +02:00
Sys.setenv ( NOT_CRAN = " true" )
2020-05-27 16:37:49 +02:00
testthat :: test_file ( " tests/testthat/test-data.R" )
testthat :: test_file ( " tests/testthat/test-mo.R" )
testthat :: test_file ( " tests/testthat/test-mo_property.R" )