2020-01-05 17:22:09 +01:00
# ==================================================================== #
# TITLE #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2020-01-05 17:22:09 +01:00
# #
# SOURCE #
2020-07-09 20:07:39 +02:00
# https://github.com/msberends/AMR #
2020-01-05 17:22:09 +01:00
# #
2022-10-05 09:12:22 +02:00
# CITE AS #
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
# doi:10.18637/jss.v104.i03 #
# #
2022-12-27 15:16:15 +01:00
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
2020-01-05 17:22:09 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2020-01-05 17:22:09 +01:00
# ==================================================================== #
2022-10-05 09:12:22 +02:00
# THIS SCRIPT REQUIRES AT LEAST 16 GB RAM
# (at least 10 GB will be used by the R session for the size of the files)
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
# 1. Go to https://doi.org/10.15468/39omei and find the download link for the
2022-12-12 00:14:56 +01:00
# latest GBIF backbone taxonony under "Endpoints" and unpack Taxon.tsv from it (~2.2 GB)
2022-10-05 09:12:22 +02:00
# ALSO BE SURE to get the date of release and update R/aa_globals.R later!
# 2. Go to https://lpsn.dsmz.de/downloads (register first) and download the latest
2022-12-12 00:14:56 +01:00
# CSV file (~12,5 MB) as "taxonomy.csv". Their API unfortunately does
2022-10-05 09:12:22 +02:00
# not include the full taxonomy and is currently (2022) pretty worthless.
2022-12-19 15:32:41 +01:00
# 3. For data about human pathogens, we use Bartlett et al. (2022),
2023-01-23 15:01:21 +01:00
# https://doi.org/10.1099/mic.0.001269. Their latest supplementary material
2022-12-19 15:32:41 +01:00
# can be found here: https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens.
2023-01-23 15:01:21 +01:00
# . Download their latest xlsx file in the `data` folder and save it to our
# . `data-raw` folder.
2022-12-19 15:32:41 +01:00
# 4. Set this folder_location to the path where these two files are:
2022-10-05 09:12:22 +02:00
folder_location <- " ~/Downloads/backbone/"
file_gbif <- paste0 ( folder_location , " Taxon.tsv" )
file_lpsn <- paste0 ( folder_location , " taxonomy.csv" )
2019-02-20 00:04:48 +01:00
2022-12-19 15:32:41 +01:00
file_bartlett <- " data-raw/bartlett_et_al_2022_human_pathogens.xlsx"
2022-10-05 09:12:22 +02:00
# 4. Run the rest of this script line by line and check everything :)
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
if ( ! file.exists ( file_gbif ) ) stop ( " GBIF file not found" )
if ( ! file.exists ( file_lpsn ) ) stop ( " LPSN file not found" )
2022-12-19 15:32:41 +01:00
if ( ! file.exists ( file_bartlett ) ) stop ( " Bartlett et al. Excel file not found" )
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
library ( dplyr )
2022-12-12 00:14:56 +01:00
library ( vroom ) # to import files
library ( rvest ) # to scape LPSN website
library ( progress ) # to show progress bars
2022-12-19 15:32:41 +01:00
library ( readxl ) # for reading the Bartlett Excel file
2022-12-16 16:10:43 +01:00
devtools :: load_all ( " ." ) # load AMR package
2019-03-18 14:29:41 +01:00
2022-10-05 09:12:22 +02:00
# Helper functions --------------------------------------------------------
2019-04-05 18:47:39 +02:00
2020-05-27 16:37:49 +02:00
get_author_year <- function ( ref ) {
# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
2023-01-23 15:01:21 +01:00
2020-05-27 16:37:49 +02:00
authors2 <- iconv ( ref , from = " UTF-8" , to = " ASCII//TRANSLIT" )
2022-10-05 09:12:22 +02:00
authors2 <- gsub ( " ?\\(Approved Lists [0-9]+\\) ?" , " () " , authors2 )
authors2 <- gsub ( " [)(]+ $" , " " , authors2 )
2020-05-27 16:37:49 +02:00
# remove leading and trailing brackets
2022-10-05 09:12:22 +02:00
authors2 <- trimws ( gsub ( " ^[(](.*)[)]$" , " \\1" , authors2 ) )
2020-05-27 16:37:49 +02:00
# only take part after brackets if there's a name
authors2 <- ifelse ( grepl ( " .*[)] [a-zA-Z]+.*" , authors2 ) ,
2023-01-23 15:01:21 +01:00
gsub ( " .*[)] (.*)" , " \\1" , authors2 ) ,
authors2
2022-08-28 10:31:50 +02:00
)
2022-10-05 09:12:22 +02:00
# replace parentheses with emend. to get the latest authors
authors2 <- gsub ( " (" , " emend. " , authors2 , fixed = TRUE )
authors2 <- gsub ( " )" , " " , authors2 , fixed = TRUE )
authors2 <- gsub ( " +" , " " , authors2 )
authors2 <- trimws ( authors2 )
2023-01-23 15:01:21 +01:00
2020-05-27 16:37:49 +02:00
# get year from last 4 digits
2022-08-28 10:31:50 +02:00
lastyear <- as.integer ( gsub ( " .*([0-9]{4})$" , " \\1" , authors2 ) )
2020-05-27 16:37:49 +02:00
# can never be later than now
2022-08-28 10:31:50 +02:00
lastyear <- ifelse ( lastyear > as.integer ( format ( Sys.Date ( ) , " %Y" ) ) ,
2023-01-23 15:01:21 +01:00
NA ,
lastyear
2022-08-28 10:31:50 +02:00
)
2020-05-27 16:37:49 +02:00
# get authors without last year
authors <- gsub ( " (.*)[0-9]{4}$" , " \\1" , authors2 )
2022-10-05 09:12:22 +02:00
# not sure what this is
authors <- gsub ( " (Saito)" , " " , authors , fixed = TRUE )
authors <- gsub ( " (Oudem.)" , " " , authors , fixed = TRUE )
2020-05-27 16:37:49 +02:00
# remove nonsense characters from names
2022-10-05 09:12:22 +02:00
authors <- gsub ( " [^a-zA-Z,'&. -]" , " " , authors )
# no initials, only surname
authors <- gsub ( " [A-Z][.]" , " " , authors , ignore.case = FALSE )
2020-05-27 16:37:49 +02:00
# remove trailing and leading spaces
authors <- trimws ( authors )
2022-10-05 09:12:22 +02:00
# keep only the part after last 'emend.' to get the latest authors
authors <- gsub ( " .*emend[.] ?" , " " , authors )
2020-05-27 16:37:49 +02:00
# only keep first author and replace all others by 'et al'
authors <- gsub ( " (,| and| et| &| ex| emend\\.?) .*" , " et al." , authors )
# et al. always with ending dot
authors <- gsub ( " et al\\.?" , " et al." , authors )
authors <- gsub ( " ?,$" , " " , authors )
# don't start with 'sensu' or 'ehrenb'
2022-10-05 09:12:22 +02:00
authors <- gsub ( " ^(sensu|Ehrenb.?|corrig.?) " , " " , authors , ignore.case = TRUE )
2020-05-27 16:37:49 +02:00
# no initials, only surname
2022-10-05 09:12:22 +02:00
authors <- trimws ( authors )
authors <- gsub ( " ^([A-Z][.])+( & ?)?" , " " , authors , ignore.case = FALSE )
2020-05-27 16:37:49 +02:00
authors <- gsub ( " ^([A-Z]+ )+" , " " , authors , ignore.case = FALSE )
2022-10-05 09:12:22 +02:00
# remove dots
authors <- gsub ( " ." , " " , authors , fixed = TRUE )
authors <- gsub ( " et al" , " et al." , authors , fixed = TRUE )
authors [nchar ( authors ) <= 3 ] <- " "
2020-05-27 16:37:49 +02:00
# combine author and year if year is available
ref <- ifelse ( ! is.na ( lastyear ) ,
2023-01-23 15:01:21 +01:00
paste0 ( authors , " , " , lastyear ) ,
authors
2022-08-28 10:31:50 +02:00
)
2020-05-27 16:37:49 +02:00
# fix beginning and ending
ref <- gsub ( " , $" , " " , ref )
ref <- gsub ( " ^, " , " " , ref )
ref <- gsub ( " ^(emend|et al.,?)" , " " , ref )
ref <- trimws ( ref )
2022-10-05 09:12:22 +02:00
ref <- gsub ( " '" , " " , ref )
2023-01-23 15:01:21 +01:00
2020-05-27 16:37:49 +02:00
# a lot start with a lowercase character - fix that
ref [ ! grepl ( " ^d[A-Z]" , ref ) ] <- gsub ( " ^([a-z])" , " \\U\\1" , ref [ ! grepl ( " ^d[A-Z]" , ref ) ] , perl = TRUE )
2022-08-28 10:31:50 +02:00
# specific one for the French that are named dOrbigny
2020-05-27 16:37:49 +02:00
ref [grepl ( " ^d[A-Z]" , ref ) ] <- gsub ( " ^d" , " d'" , ref [grepl ( " ^d[A-Z]" , ref ) ] )
ref <- gsub ( " +" , " " , ref )
2022-10-05 09:12:22 +02:00
ref [ref == " " ] <- NA_character_
2020-05-27 16:37:49 +02:00
ref
}
2022-10-05 09:12:22 +02:00
df_remove_nonASCII <- function ( df ) {
# Remove non-ASCII characters (these are not allowed by CRAN)
df %>%
mutate_if ( is.character , iconv , from = " UTF-8" , to = " ASCII//TRANSLIT" ) %>%
# also remove invalid characters
mutate_if ( is.character , ~ gsub ( " [\"'`]+" , " " , .) ) %>%
AMR ::: dataset_UTF8_to_ASCII ( )
}
abbreviate_mo <- function ( x , minlength = 5 , prefix = " " , hyphen_as_space = FALSE , ... ) {
if ( hyphen_as_space == TRUE ) {
x <- gsub ( " -" , " " , x , fixed = TRUE )
}
# keep a starting Latin ae
suppressWarnings (
gsub ( " ^ae" , " \u00E6\u00E6" , x , ignore.case = TRUE ) %>%
abbreviate (
minlength = minlength ,
use.classes = TRUE ,
method = " both.sides" , ...
) %>%
paste0 ( prefix , .) %>%
toupper ( ) %>%
gsub ( " (\u00C6|\u00E6)+" , " AE" , .)
)
}
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# to retrieve LPSN and authors from LPSN website
get_lpsn_and_author <- function ( rank , name ) {
url <- paste0 ( " https://lpsn.dsmz.de/" , tolower ( rank ) , " /" , tolower ( name ) )
2022-12-12 00:14:56 +01:00
page_txt <- tryCatch ( read_html ( url ) , error = function ( e ) NULL )
2022-10-05 09:12:22 +02:00
if ( is.null ( page_txt ) ) {
warning ( " No LPSN found for " , tolower ( rank ) , " '" , name , " '" )
lpsn <- NA_character_
ref <- NA_character_
} else {
page_txt <- page_txt %>%
2022-12-12 00:14:56 +01:00
html_element ( " #detail-page" ) %>%
html_text ( )
2022-10-05 09:12:22 +02:00
lpsn <- gsub ( " .*Record number:[\r\n\t ]*([0-9]+).*" , " \\1" , page_txt , perl = FALSE )
ref <- page_txt %>%
gsub ( " .*?Name: (.*[0-9]{4}?).*" , " \\1" , ., perl = FALSE ) %>%
gsub ( name , " " , ., fixed = TRUE ) %>%
trimws ( )
}
c ( " lpsn" = lpsn , " ref" = ref )
}
# MB/ August 2022: useless, does not contain full taxonomy, e.g. LPSN::request(cred, category = "family") is empty.
# get_from_lpsn <- function (user, pw) {
# if (!"LPSN" %in% rownames(utils::installed.packages())) {
# stop("Install the official LPSN package for R using: install.packages('LPSN', repos = 'https://r-forge.r-project.org')")
# }
# cred <- LPSN::open_lpsn(user, pw)
#
# lpsn_genus <- LPSN::request(cred, category = "genus")
# message("Downloading genus data (n = ", lpsn_genus$count, ") from LPSN API...")
# lpsn_genus <- as.data.frame(LPSN::retrieve(cred, category = "genus"))
#
# lpsn_species <- LPSN::request(cred, category = "species")
# message("Downloading species data (n = ", lpsn_species$count, ") from LPSN API...")
# lpsn_species <- as.data.frame(LPSN::retrieve(cred, category = "species"))
#
# lpsn_subspecies <- LPSN::request(cred, category = "subspecies")
# message("Downloading subspecies data (n = ", lpsn_subspecies$count, ") from LPSN API...")
# lpsn_subspecies <- as.data.frame(LPSN::retrieve(cred, category = "subspecies"))
#
# message("Binding rows...")
# lpsn_total <- bind_rows(lpsn_genus, lpsn_species, lpsn_subspecies)
# message("Done.")
# lpsn_total
# }
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# Read GBIF data ----------------------------------------------------------
taxonomy_gbif.bak <- vroom ( file_gbif )
2022-12-12 00:14:56 +01:00
include_fungal_orders <- c (
" Eurotiales" , " Microascales" , " Mucorales" , " Saccharomycetales" ,
" Schizosaccharomycetales" , " Tremellales" , " Onygenales" , " Pneumocystales"
)
# get latest taxonomic names of these fungal orders
include_fungal_orders_ids <- taxonomy_gbif.bak %>%
filter ( order %in% include_fungal_orders )
2023-01-23 15:01:21 +01:00
include_fungal_orders <- taxonomy_gbif.bak %>%
filter ( taxonID %in% c ( include_fungal_orders_ids $ taxonID , include_fungal_orders_ids $ acceptedNameUsageID ) ) %>%
distinct ( order ) %>%
2022-12-12 00:14:56 +01:00
pull ( order )
# check some columns to validate below filters
sort ( table ( taxonomy_gbif.bak $ taxonomicStatus ) )
sort ( table ( taxonomy_gbif.bak $ taxonRank ) )
2022-10-05 09:12:22 +02:00
taxonomy_gbif <- taxonomy_gbif.bak %>%
# immediately filter rows we really never want
filter (
# never doubtful status, only accepted and all synonyms, and only ranked items
taxonomicStatus != " doubtful" ,
taxonRank != " unranked" ,
# include these kingdoms (no Chromista)
kingdom %in% c ( " Archaea" , " Bacteria" , " Protozoa" ) |
# include all of these fungal orders
order %in% c (
" Eurotiales" , " Microascales" , " Mucorales" , " Saccharomycetales" ,
" Schizosaccharomycetales" , " Tremellales" , " Onygenales" , " Pneumocystales"
) |
# and all of these important genera (see "data-raw/_pre_commit_hook.R")
# (they also contain bacteria and protozoa, but these will get prevalence = 2 later on)
genus %in% AMR ::: MO_PREVALENT_GENERA
) %>%
select (
kingdom ,
phylum ,
class ,
order ,
family ,
genus ,
species = specificEpithet ,
subspecies = infraspecificEpithet ,
rank = taxonRank ,
status = taxonomicStatus ,
ref = scientificNameAuthorship ,
gbif = taxonID ,
gbif_parent = parentNameUsageID ,
gbif_renamed_to = acceptedNameUsageID
) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
# do this mutate after the original selection/filtering, as it decreases computing time tremendously
status = ifelse ( status == " accepted" , " accepted" , " synonym" ) ,
# checked taxonRank - the "form" and "variety" always have a subspecies, so:
rank = ifelse ( rank %in% c ( " form" , " variety" ) , " subspecies" , rank ) ,
source = " GBIF"
) %>%
filter (
# their data is messy - keep only these:
rank == " kingdom" & ! is.na ( kingdom ) |
rank == " phylum" & ! is.na ( phylum ) |
rank == " class" & ! is.na ( class ) |
rank == " order" & ! is.na ( order ) |
rank == " family" & ! is.na ( family ) |
rank == " genus" & ! is.na ( genus ) |
rank == " species" & ! is.na ( species ) |
rank == " subspecies" & ! is.na ( subspecies )
) %>%
# some items end with _A or _B... why??
mutate_all ( ~ gsub ( " _[A-Z]$" , " " , .x , perl = TRUE ) ) %>%
# now we have duplicates, remove these, but prioritise "accepted" status and highest taxon ID
arrange ( status , gbif ) %>%
distinct ( kingdom , phylum , class , order , family , genus , species , subspecies , .keep_all = TRUE ) %>%
filter (
kingdom %unlike% " [0-9]" ,
phylum %unlike% " [0-9]" ,
class %unlike% " [0-9]" ,
order %unlike% " [0-9]" ,
family %unlike% " [0-9]" ,
genus %unlike% " [0-9]"
)
# integrity tests
sort ( table ( taxonomy_gbif $ rank ) )
sort ( table ( taxonomy_gbif $ status ) )
2022-12-12 00:14:56 +01:00
taxonomy_gbif
2022-10-05 09:12:22 +02:00
# Read LPSN data ----------------------------------------------------------
taxonomy_lpsn.bak <- vroom ( file_lpsn )
2022-12-12 00:14:56 +01:00
# check some columns to validate below filters
sort ( table ( is.na ( taxonomy_lpsn.bak $ record_lnk ) ) ) # accepted = TRUE
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn.bak %>%
transmute (
genus = genus_name ,
species = sp_epithet ,
subspecies = subsp_epithet ,
rank = case_when (
! is.na ( subsp_epithet ) ~ " subspecies" ,
! is.na ( sp_epithet ) ~ " species" ,
TRUE ~ " genus"
) ,
status = ifelse ( is.na ( record_lnk ) , " accepted" , " synonym" ) ,
ref = authors ,
lpsn = as.character ( record_no ) ,
lpsn_parent = NA_character_ ,
lpsn_renamed_to = as.character ( record_lnk )
) %>%
mutate ( source = " LPSN" )
taxonomy_lpsn
# download additional taxonomy to the domain/kingdom level (their API is not sufficient...)
taxonomy_lpsn_missing <- tibble (
kingdom = character ( 0 ) ,
phylum = character ( 0 ) ,
class = character ( 0 ) ,
order = character ( 0 ) ,
family = character ( 0 ) ,
genus = character ( 0 )
)
for ( page in LETTERS ) {
2022-12-12 00:14:56 +01:00
# this will not alter `taxonomy_lpsn` yet
message ( " Downloading page " , page , " ..." , appendLF = FALSE )
2022-10-05 09:12:22 +02:00
url <- paste0 ( " https://lpsn.dsmz.de/genus?page=" , page )
2022-12-12 00:14:56 +01:00
x <- read_html ( url ) %>%
2022-10-05 09:12:22 +02:00
# class "main-list" is the main table
2022-12-12 00:14:56 +01:00
html_element ( " .main-list" ) %>%
2022-10-05 09:12:22 +02:00
# get every list element with a set <id> attribute
2022-12-12 00:14:56 +01:00
html_elements ( " li[id]" )
2022-10-05 09:12:22 +02:00
for ( i in seq_len ( length ( x ) ) ) {
if ( i %% 25 == 0 ) {
message ( " ." , appendLF = FALSE )
}
2022-12-12 00:14:56 +01:00
elements <- x [ [i ] ] %>% html_elements ( " a" )
hrefs <- elements %>% html_attr ( " href" )
2022-10-05 09:12:22 +02:00
ranks <- hrefs %>% gsub ( " .*/(.*?)/.*" , " \\1" , .)
names <- elements %>%
2022-12-12 00:14:56 +01:00
html_text ( ) %>%
2022-10-05 09:12:22 +02:00
gsub ( ' "' , " " , ., fixed = TRUE )
# no species, this must be until genus level
hrefs <- hrefs [ranks != " species" ]
names <- names [ranks != " species" ]
ranks <- ranks [ranks != " species" ]
ranks [ranks == " domain" ] <- " kingdom"
2023-01-23 15:01:21 +01:00
2022-10-05 09:12:22 +02:00
df <- names %>%
tibble ( ) %>%
t ( ) %>%
2022-12-12 00:14:56 +01:00
as_tibble ( .name_repair = " unique" ) %>%
2022-10-05 09:12:22 +02:00
setNames ( ranks ) %>%
# no candidates please
filter ( genus %unlike% " ^(Candidatus|\\[)" )
2023-01-23 15:01:21 +01:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn_missing <- taxonomy_lpsn_missing %>%
bind_rows ( df )
}
message ( length ( x ) , " entries incl. candidates (cleaned total: " , nrow ( taxonomy_lpsn_missing ) , " )" )
}
2022-12-12 00:14:56 +01:00
taxonomy_lpsn_missing
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
left_join ( taxonomy_lpsn_missing , by = " genus" ) %>%
select ( kingdom : family , everything ( ) ) %>%
# remove entries like "[Bacteria, no family]" and "[Bacteria, no class]"
mutate_all ( function ( x ) ifelse ( x %like_case% " no " , NA_character_ , x ) )
taxonomy_lpsn.bak2 <- taxonomy_lpsn
2022-12-12 00:14:56 +01:00
# download family directly from LPSN website using scraping
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ family ) ) )
2022-10-05 09:12:22 +02:00
for ( f in unique ( taxonomy_lpsn $ family ) ) {
pb $ tick ( )
if ( is.na ( f ) ) next
tax_info <- get_lpsn_and_author ( " Family" , f )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
class = taxonomy_lpsn $ class [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
order = taxonomy_lpsn $ order [which ( taxonomy_lpsn $ family == f ) [1 ] ] ,
family = f ,
rank = " family" ,
status = " accepted" ,
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download order directly from LPSN website using scraping
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ order ) ) )
2022-10-05 09:12:22 +02:00
for ( o in unique ( taxonomy_lpsn $ order ) ) {
pb $ tick ( )
if ( is.na ( o ) ) next
tax_info <- get_lpsn_and_author ( " Order" , o )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
class = taxonomy_lpsn $ class [which ( taxonomy_lpsn $ order == o ) [1 ] ] ,
order = o ,
rank = " order" ,
status = " accepted" ,
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download class directly from LPSN website using scraping
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ class ) ) )
2022-10-05 09:12:22 +02:00
for ( cc in unique ( taxonomy_lpsn $ class ) ) {
pb $ tick ( )
if ( is.na ( cc ) ) next
tax_info <- get_lpsn_and_author ( " Class" , cc )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ class == cc ) [1 ] ] ,
phylum = taxonomy_lpsn $ phylum [which ( taxonomy_lpsn $ class == cc ) [1 ] ] ,
class = cc ,
rank = " class" ,
status = " accepted" ,
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download phylum directly from LPSN website using scraping
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ phylum ) ) )
2022-10-05 09:12:22 +02:00
for ( p in unique ( taxonomy_lpsn $ phylum ) ) {
pb $ tick ( )
if ( is.na ( p ) ) next
tax_info <- get_lpsn_and_author ( " Phylum" , p )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = taxonomy_lpsn $ kingdom [which ( taxonomy_lpsn $ phylum == p ) [1 ] ] ,
phylum = p ,
rank = " phylum" ,
status = " accepted" ,
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
2022-12-12 00:14:56 +01:00
# download kingdom directly from LPSN website using scraping
pb <- progress_bar $ new ( total = length ( unique ( taxonomy_lpsn $ kingdom ) ) )
2022-10-05 09:12:22 +02:00
for ( k in unique ( taxonomy_lpsn $ kingdom ) ) {
pb $ tick ( )
if ( is.na ( k ) ) next
tax_info <- get_lpsn_and_author ( " Domain" , k )
taxonomy_lpsn <- taxonomy_lpsn %>%
bind_rows ( tibble (
kingdom = k ,
rank = " kingdom" ,
status = " accepted" ,
source = " LPSN" ,
lpsn = unname ( tax_info [ " lpsn" ] ) ,
ref = unname ( tax_info [ " ref" ] )
) )
}
# integrity tests
sort ( table ( taxonomy_lpsn $ rank ) )
sort ( table ( taxonomy_lpsn $ status ) )
# Save intermediate results -----------------------------------------------
saveRDS ( taxonomy_gbif , " data-raw/taxonomy_gbif.rds" , version = 2 )
saveRDS ( taxonomy_lpsn , " data-raw/taxonomy_lpsn.rds" , version = 2 )
2022-12-12 00:14:56 +01:00
# this allows to always get back to this point by simply loading the files from data-raw/.
2022-10-05 09:12:22 +02:00
# Add full names ----------------------------------------------------------
taxonomy_gbif <- taxonomy_gbif %>%
# clean NAs and add fullname
mutate ( across ( kingdom : subspecies , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ,
2023-01-23 15:01:21 +01:00
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies )
) ) , .before = 1
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
# keep only one GBIF taxon ID per full name
arrange ( fullname , gbif ) %>%
2022-12-12 00:14:56 +01:00
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2020-05-27 16:37:49 +02:00
2022-10-05 09:12:22 +02:00
taxonomy_lpsn <- taxonomy_lpsn %>%
# clean NAs and add fullname
mutate ( across ( kingdom : subspecies , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ,
2023-01-23 15:01:21 +01:00
fullname = trimws ( case_when (
rank == " family" ~ family ,
rank == " order" ~ order ,
rank == " class" ~ class ,
rank == " phylum" ~ phylum ,
rank == " kingdom" ~ kingdom ,
TRUE ~ paste ( genus , species , subspecies )
) ) , .before = 1
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
# keep only one LPSN record ID per full name
arrange ( fullname , lpsn ) %>%
2022-12-12 00:14:56 +01:00
distinct ( kingdom , rank , fullname , .keep_all = TRUE )
2022-10-05 09:12:22 +02:00
# set parent LPSN IDs, requires full name
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " phylum" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ kingdom [taxonomy_lpsn $ rank == " phylum" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " class" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ phylum [taxonomy_lpsn $ rank == " class" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " order" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ class [taxonomy_lpsn $ rank == " order" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " family" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ order [taxonomy_lpsn $ rank == " family" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " genus" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ family [taxonomy_lpsn $ rank == " genus" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " species" ] <- taxonomy_lpsn $ lpsn [match ( taxonomy_lpsn $ genus [taxonomy_lpsn $ rank == " species" ] , taxonomy_lpsn $ fullname ) ]
taxonomy_lpsn $ lpsn_parent [taxonomy_lpsn $ rank == " subspecies" ] <- taxonomy_lpsn $ lpsn [match ( paste ( taxonomy_lpsn $ genus [taxonomy_lpsn $ rank == " subspecies" ] , taxonomy_lpsn $ species [taxonomy_lpsn $ rank == " subspecies" ] ) , taxonomy_lpsn $ fullname ) ]
# Combine the datasets ----------------------------------------------------
# basis must be LPSN as it's most recent
taxonomy <- taxonomy_lpsn %>%
# join GBIF identifiers to them
left_join ( taxonomy_gbif %>% select ( kingdom , fullname , starts_with ( " gbif" ) ) ,
2023-01-23 15:01:21 +01:00
by = c ( " kingdom" , " fullname" )
2022-10-05 09:12:22 +02:00
)
# for everything else, add the GBIF data
taxonomy <- taxonomy %>%
bind_rows ( taxonomy_gbif %>%
2023-01-23 15:01:21 +01:00
filter ( ! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ) ) %>%
2022-08-28 10:31:50 +02:00
arrange ( fullname ) %>%
2022-10-05 09:12:22 +02:00
filter ( fullname != " " )
2019-09-22 17:19:59 +02:00
2022-12-16 16:10:43 +01:00
# get missing entries from existing microorganisms data set
taxonomy <- taxonomy %>%
bind_rows ( AMR :: microorganisms %>%
2023-01-23 15:01:21 +01:00
select ( all_of ( colnames ( taxonomy ) ) ) %>%
filter (
! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ,
# these will be added later:
source != " manually added"
) ) %>%
2022-12-16 16:10:43 +01:00
arrange ( fullname ) %>%
filter ( fullname != " " )
2022-10-05 09:12:22 +02:00
# fix rank
2022-12-12 00:14:56 +01:00
table ( taxonomy $ rank , useNA = " always" )
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
mutate ( rank = case_when (
subspecies != " " ~ " subspecies" ,
species != " " ~ " species" ,
genus != " " ~ " genus" ,
family != " " ~ " family" ,
order != " " ~ " order" ,
class != " " ~ " class" ,
phylum != " " ~ " phylum" ,
kingdom != " " ~ " kingdom" ,
TRUE ~ NA_character_
) )
table ( taxonomy $ rank , useNA = " always" )
2022-12-16 16:10:43 +01:00
# Save intermediate results (0) -------------------------------------------
saveRDS ( taxonomy , " data-raw/taxonomy0.rds" )
2022-10-05 09:12:22 +02:00
2022-12-16 16:10:43 +01:00
# Add missing and fix old taxonomic entries -------------------------------
2022-10-05 09:12:22 +02:00
# this part will make sure that the whole taxonomy of every included species exists, so no missing genera, classes, etc.
current_gbif <- taxonomy_gbif.bak %>%
filter ( is.na ( acceptedNameUsageID ) ) %>%
mutate (
taxonID = as.character ( taxonID ) ,
parentNameUsageID = as.character ( parentNameUsageID )
)
# add missing kingdoms
taxonomy <- taxonomy %>%
bind_rows (
taxonomy %>%
filter ( kingdom != " " ) %>%
distinct ( kingdom ) %>%
mutate (
fullname = kingdom ,
rank = " kingdom" ,
status = " accepted" ,
source = " manually added"
) %>%
filter ( ! paste ( kingdom , rank ) %in% paste ( taxonomy $ kingdom , taxonomy $ rank ) ) %>%
2023-01-23 15:01:21 +01:00
left_join (
current_gbif %>%
select ( kingdom , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" )
2022-10-05 09:12:22 +02:00
) %>%
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , source ) )
)
# 2 = phylum ... 6 = genus
2022-12-12 00:14:56 +01:00
taxonomy_all_missing <- NULL
2022-10-05 09:12:22 +02:00
for ( i in 2 : 6 ) {
i_name <- colnames ( taxonomy ) [i + 1 ]
message ( " Adding missing: " , i_name , " ... " , appendLF = FALSE )
to_add <- taxonomy %>%
filter ( .[ [i + 1 ] ] != " " ) %>%
distinct ( kingdom , .[ [i + 1 ] ] , .keep_all = TRUE ) %>%
select ( kingdom : ( i + 1 ) ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
fullname = .[ [ncol ( .) ] ] ,
rank = i_name ,
status = " accepted" ,
source = " manually added"
) %>%
2022-12-12 00:14:56 +01:00
filter ( ! paste ( kingdom , .[ [ncol ( .) - 4 ] ] , rank ) %in% paste ( taxonomy $ kingdom , taxonomy [ [i + 1 ] ] , taxonomy $ rank ) ) %>%
2023-01-23 15:01:21 +01:00
# get GBIF identifier where available
left_join (
current_gbif %>%
select ( kingdom , all_of ( i_name ) , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , i_name )
) %>%
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , source ) )
2022-10-05 09:12:22 +02:00
message ( " n = " , nrow ( to_add ) )
2022-12-12 00:14:56 +01:00
if ( is.null ( taxonomy_all_missing ) ) {
taxonomy_all_missing <- to_add
} else {
2023-01-23 15:01:21 +01:00
taxonomy_all_missing <- taxonomy_all_missing %>%
2022-12-12 00:14:56 +01:00
bind_rows ( to_add )
2022-10-29 14:15:23 +02:00
}
}
2022-12-12 00:14:56 +01:00
taxonomy_all_missing %>% View ( )
2022-10-30 14:31:45 +01:00
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>%
bind_rows ( taxonomy_all_missing )
# fix for duplicate fullnames within a kingdom (such as Nitrospira which is the name of the genus AND its class)
2023-01-23 15:01:21 +01:00
taxonomy <- taxonomy %>%
mutate (
rank_index = case_when (
rank == " subspecies" ~ 1 ,
rank == " species" ~ 2 ,
rank == " genus" ~ 3 ,
rank == " family" ~ 4 ,
rank == " order" ~ 5 ,
rank == " class" ~ 6 ,
TRUE ~ 7
) ,
fullname_rank = paste0 ( fullname , " {" , rank , " }" )
) %>%
arrange ( kingdom , fullname , rank_index ) %>%
group_by ( kingdom , fullname ) %>%
mutate ( fullname = if_else ( row_number ( ) > 1 , fullname_rank , fullname ) ) %>%
ungroup ( ) %>%
select ( - fullname_rank , - rank_index ) %>%
2022-12-12 00:14:56 +01:00
arrange ( fullname )
2022-10-29 14:15:23 +02:00
2022-12-12 00:14:56 +01:00
# now also add missing species (requires combination with genus)
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
2022-12-12 00:14:56 +01:00
bind_rows (
taxonomy %>%
filter ( species != " " ) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) %>%
select ( kingdom : species ) %>%
mutate (
fullname = paste ( genus , species ) ,
rank = " species" ,
status = " accepted" ,
source = " manually added"
) %>%
filter ( ! paste ( kingdom , genus , species , rank ) %in% paste ( taxonomy $ kingdom , taxonomy $ genus , taxonomy $ species , taxonomy $ rank ) ) %>%
# get GBIF identifier where available
2023-01-23 15:01:21 +01:00
left_join (
current_gbif %>%
select ( kingdom , genus , species = specificEpithet , rank = taxonRank , ref = scientificNameAuthorship , gbif = taxonID , gbif_parent = parentNameUsageID ) ,
by = c ( " kingdom" , " rank" , " genus" , " species" )
2022-12-12 00:14:56 +01:00
) %>%
mutate ( source = ifelse ( ! is.na ( gbif ) , " GBIF" , source ) )
2023-01-23 15:01:21 +01:00
)
2022-10-05 09:12:22 +02:00
2020-05-27 16:37:49 +02:00
2022-10-05 09:12:22 +02:00
# remove NAs from taxonomy again, and keep unique full names
taxonomy <- taxonomy %>%
mutate ( across ( kingdom : subspecies , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ) %>%
2022-08-28 10:31:50 +02:00
distinct ( kingdom , fullname , .keep_all = TRUE ) %>%
2022-10-05 09:12:22 +02:00
filter ( kingdom != " " )
2019-03-18 14:29:41 +01:00
2022-12-12 00:14:56 +01:00
2022-12-16 16:10:43 +01:00
# Save intermediate results (1) -------------------------------------------
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
saveRDS ( taxonomy , " data-raw/taxonomy1.rds" )
2022-10-05 09:12:22 +02:00
# Get previously manually added entries -----------------------------------
manually_added <- AMR :: microorganisms %>%
2022-12-12 00:14:56 +01:00
filter ( source == " manually added" , ! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ) %>%
2022-10-05 09:12:22 +02:00
select ( fullname : subspecies , ref , source , rank )
2023-01-23 15:01:21 +01:00
manually_added <- manually_added %>%
2022-12-16 16:10:43 +01:00
bind_rows ( salmonellae )
2022-10-05 09:12:22 +02:00
# get latest taxonomy for those entries
for ( g in unique ( manually_added $ genus [manually_added $ genus != " " & manually_added $ genus %in% taxonomy $ genus ] ) ) {
manually_added $ family [which ( manually_added $ genus == g ) ] <- taxonomy $ family [which ( taxonomy $ genus == g & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
for ( f in unique ( manually_added $ family [manually_added $ family != " " & manually_added $ family %in% taxonomy $ family ] ) ) {
manually_added $ order [which ( manually_added $ family == f ) ] <- taxonomy $ order [which ( taxonomy $ family == f & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
for ( o in unique ( manually_added $ order [manually_added $ order != " " & manually_added $ order %in% taxonomy $ order ] ) ) {
manually_added $ class [which ( manually_added $ order == o ) ] <- taxonomy $ class [which ( taxonomy $ order == o & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
for ( cc in unique ( manually_added $ class [manually_added $ class != " " & manually_added $ class %in% taxonomy $ class ] ) ) {
manually_added $ phylum [which ( manually_added $ class == cc ) ] <- taxonomy $ phylum [which ( taxonomy $ class == cc & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
2022-12-16 16:10:43 +01:00
for ( p in unique ( manually_added $ phylum [manually_added $ phylum != " " & manually_added $ phylum %in% taxonomy $ phylum ] ) ) {
manually_added $ kingdom [which ( manually_added $ phylum == p ) ] <- taxonomy $ kingdom [which ( taxonomy $ phylum == p & is.na ( taxonomy $ lpsn ) ) ] [1 ]
}
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
manually_added <- manually_added %>%
mutate (
status = " accepted" ,
rank = ifelse ( fullname %like% " unknown" , " (unknown rank)" , rank )
)
2022-12-16 16:10:43 +01:00
manually_added
2022-10-05 09:12:22 +02:00
taxonomy <- taxonomy %>%
2022-12-16 16:10:43 +01:00
# here also the 'unknowns' are added, such as "(unknown fungus)"
2022-12-12 00:14:56 +01:00
bind_rows ( manually_added ) %>%
2022-10-05 09:12:22 +02:00
arrange ( fullname )
table ( taxonomy $ rank , useNA = " always" )
# Clean scientific reference ----------------------------------------------
taxonomy <- taxonomy %>%
mutate ( ref = get_author_year ( ref ) )
2019-08-06 14:39:22 +02:00
2019-09-18 15:46:09 +02:00
2022-12-16 16:10:43 +01:00
# Get the latest upper taxonomy from LPSN for non-LPSN data ---------------
# (e.g., phylum above class "Bacilli" was still "Firmicutes", should be "Bacillota" in 2022)
for ( k in unique ( taxonomy $ kingdom [taxonomy $ kingdom != " " ] ) ) {
message ( " Fixing GBIF taxonomy for kingdom " , k , " ." , appendLF = FALSE )
i <- 0
for ( g in unique ( taxonomy $ genus [taxonomy $ genus != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ family [which ( taxonomy $ genus == g & taxonomy $ kingdom == k ) ] <- taxonomy $ family [which ( taxonomy $ genus == g & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
for ( f in unique ( taxonomy $ family [taxonomy $ family != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ order [which ( taxonomy $ family == f & taxonomy $ kingdom == k ) ] <- taxonomy $ order [which ( taxonomy $ family == f & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
for ( o in unique ( taxonomy $ order [taxonomy $ order != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ class [which ( taxonomy $ order == o & taxonomy $ kingdom == k ) ] <- taxonomy $ class [which ( taxonomy $ order == o & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
for ( cc in unique ( taxonomy $ class [taxonomy $ class != " " & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ] ) ) {
i <- i + 1
if ( i %% 50 == 0 ) message ( " ." , appendLF = FALSE )
taxonomy $ phylum [which ( taxonomy $ class == cc & taxonomy $ kingdom == k ) ] <- taxonomy $ phylum [which ( taxonomy $ class == cc & taxonomy $ kingdom == k & taxonomy $ source == " LPSN" ) ] [1 ]
}
message ( " OK." )
}
# we need to fix parent GBIF identifiers
taxonomy $ gbif_parent [taxonomy $ rank == " phylum" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( taxonomy $ kingdom [taxonomy $ rank == " phylum" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " class" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( taxonomy $ phylum [taxonomy $ rank == " class" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " order" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( taxonomy $ class [taxonomy $ rank == " order" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " family" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( taxonomy $ order [taxonomy $ rank == " family" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " genus" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( taxonomy $ family [taxonomy $ rank == " genus" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " species" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( taxonomy $ genus [taxonomy $ rank == " species" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ fullname ) ]
taxonomy $ gbif_parent [taxonomy $ rank == " subspecies" & ! is.na ( taxonomy $ gbif ) ] <- taxonomy $ gbif [match ( paste ( taxonomy $ genus [taxonomy $ rank == " subspecies" & ! is.na ( taxonomy $ gbif ) ] , taxonomy $ species [taxonomy $ rank == " subspecies" & ! is.na ( taxonomy $ gbif ) ] ) , taxonomy $ fullname ) ]
# these still have no record in our data set:
all ( taxonomy $ lpsn_parent %in% taxonomy $ lpsn )
all ( taxonomy $ gbif_parent %in% taxonomy $ gbif )
2022-12-12 00:14:56 +01:00
# fix rank
taxonomy <- taxonomy %>%
mutate ( rank = case_when (
subspecies != " " ~ " subspecies" ,
species != " " ~ " species" ,
genus != " " ~ " genus" ,
family != " " ~ " family" ,
order != " " ~ " order" ,
class != " " ~ " class" ,
phylum != " " ~ " phylum" ,
kingdom != " " ~ " kingdom" ,
TRUE ~ NA_character_
) )
2019-09-18 15:46:09 +02:00
2022-10-05 09:12:22 +02:00
2022-12-19 15:32:41 +01:00
# Add prevalence ----------------------------------------------------------
pathogens <- read_excel ( file_bartlett , sheet = " Tab 6 Full List" )
# get all established, both old and current taxonomic names
2023-01-23 15:01:21 +01:00
established <- pathogens %>%
filter ( status == " established" ) %>%
2022-12-19 15:32:41 +01:00
mutate ( fullname = paste ( genus , species ) ) %>%
2023-01-23 15:01:21 +01:00
pull ( fullname ) %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) ifelse ( length ( x ) == 1 , x , paste ( x [1 ] , x [2 ] ) ) ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
# get all putative, both old and current taxonomic names
2023-01-23 15:01:21 +01:00
putative <- pathogens %>%
filter ( status == " putative" ) %>%
2022-12-19 15:32:41 +01:00
mutate ( fullname = paste ( genus , species ) ) %>%
2023-01-23 15:01:21 +01:00
pull ( fullname ) %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) ifelse ( length ( x ) == 1 , x , paste ( x [1 ] , x [2 ] ) ) ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
established <- established [established %unlike% " unknown" ]
putative <- putative [putative %unlike% " unknown" ]
2023-01-23 15:01:21 +01:00
established_genera <- established %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
2023-01-23 15:01:21 +01:00
putative_genera <- putative %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-20 16:14:04 +01:00
unique ( )
2023-01-23 15:01:21 +01:00
nonbacterial_genera <- AMR ::: MO_PREVALENT_GENERA %>%
c (
unlist ( mo_current ( .) ) ,
unlist ( mo_synonyms ( ., keep_synonyms = FALSE ) )
) %>%
strsplit ( " " , fixed = TRUE ) %>%
sapply ( function ( x ) x [1 ] ) %>%
sort ( ) %>%
2022-12-19 15:32:41 +01:00
unique ( )
2022-12-20 16:14:04 +01:00
nonbacterial_genera <- nonbacterial_genera [nonbacterial_genera %unlike% " unknown" ]
2022-12-19 15:32:41 +01:00
# update prevalence based on taxonomy (following the recent and thorough work of Bartlett et al., 2022)
# see https://doi.org/10.1099/mic.0.001269
2023-01-23 15:01:21 +01:00
taxonomy <- taxonomy %>%
2022-12-19 15:32:41 +01:00
mutate ( prevalence = case_when (
2022-12-20 16:14:04 +01:00
# 'established' means 'have infected at least three persons in three or more references'
paste ( genus , species ) %in% established & rank %in% c ( " species" , " subspecies" ) ~ 1.0 ,
# other genera in the 'established' group
genus %in% established_genera & rank == " genus" ~ 1.0 ,
2023-01-23 15:01:21 +01:00
2022-12-20 16:14:04 +01:00
# 'putative' means 'fewer than three known cases'
paste ( genus , species ) %in% putative & rank %in% c ( " species" , " subspecies" ) ~ 1.25 ,
# other genera in the 'putative' group
genus %in% putative_genera & rank == " genus" ~ 1.25 ,
2023-01-23 15:01:21 +01:00
2022-12-20 16:14:04 +01:00
# species and subspecies in 'established' and 'putative' groups
genus %in% c ( established_genera , putative_genera ) & rank %in% c ( " species" , " subspecies" ) ~ 1.5 ,
# other species from a genus in either group
genus %in% nonbacterial_genera & rank %in% c ( " genus" , " species" , " subspecies" ) ~ 1.5 ,
2022-12-19 15:32:41 +01:00
# we keep track of prevalent genera too of non-bacterial species
2022-12-20 16:14:04 +01:00
genus %in% AMR ::: MO_PREVALENT_GENERA & kingdom != " Bacteria" & rank %in% c ( " genus" , " species" , " subspecies" ) ~ 1.5 ,
2023-01-23 15:01:21 +01:00
2022-12-20 16:14:04 +01:00
# all others
2023-01-23 15:01:21 +01:00
TRUE ~ 2.0
) )
2022-12-19 15:32:41 +01:00
table ( taxonomy $ prevalence , useNA = " always" )
# (a lot will be removed further below)
2022-12-16 16:10:43 +01:00
# Save intermediate results (2) -------------------------------------------
saveRDS ( taxonomy , " data-raw/taxonomy2.rds" )
2022-10-05 09:12:22 +02:00
# Add microbial IDs -------------------------------------------------------
# MO codes in the AMR package have the form KINGDOM_GENUS_SPECIES_SUBSPECIES where all are abbreviated.
# Kingdom is abbreviated with 1 character, with exceptions for Animalia and Plantae
mo_kingdom <- taxonomy %>%
filter ( rank == " kingdom" ) %>%
select ( kingdom ) %>%
mutate ( mo_kingdom = case_when (
kingdom == " Animalia" ~ " AN" ,
kingdom == " Archaea" ~ " A" ,
kingdom == " Bacteria" ~ " B" ,
kingdom == " Chromista" ~ " C" ,
kingdom == " Fungi" ~ " F" ,
kingdom == " Plantae" ~ " PL" ,
kingdom == " Protozoa" ~ " P" ,
TRUE ~ " "
) )
# phylum until family are abbreviated with 8 characters and prefixed with their rank
# Phylum - keep old and fill up for new ones
mo_phylum <- taxonomy %>%
filter ( rank == " phylum" ) %>%
distinct ( kingdom , phylum ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " phylum" ) %>%
transmute ( kingdom ,
phylum = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " phylum" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
mutate (
mo_phylum8 = abbreviate_mo ( phylum , minlength = 8 , prefix = " [PHL]_" ) ,
mo_phylum9 = abbreviate_mo ( phylum , minlength = 9 , prefix = " [PHL]_" ) ,
mo_phylum = ifelse ( ! is.na ( mo_old ) , mo_old , mo_phylum8 ) ,
mo_duplicated = duplicated ( mo_phylum ) ,
mo_phylum = ifelse ( mo_duplicated , mo_phylum9 , mo_phylum ) ,
mo_duplicated = duplicated ( mo_phylum )
) %>%
ungroup ( )
if ( any ( mo_phylum $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for phylum!" )
mo_phylum <- mo_phylum %>%
select ( kingdom , phylum , mo_phylum )
# Class - keep old and fill up for new ones
mo_class <- taxonomy %>%
filter ( rank == " class" ) %>%
distinct ( kingdom , class ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " class" ) %>%
transmute ( kingdom ,
class = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " class" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
mo_class8 = abbreviate_mo ( class , minlength = 8 , prefix = " [CLS]_" ) ,
mo_class9 = abbreviate_mo ( class , minlength = 9 , prefix = " [CLS]_" ) ,
mo_class = ifelse ( ! is.na ( mo_old ) , mo_old , mo_class8 ) ,
mo_duplicated = duplicated ( mo_class ) ,
mo_class = ifelse ( mo_duplicated , mo_class9 , mo_class ) ,
mo_duplicated = duplicated ( mo_class )
) %>%
ungroup ( )
if ( any ( mo_class $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for class!" )
mo_class <- mo_class %>%
select ( kingdom , class , mo_class )
# Order - keep old and fill up for new ones
mo_order <- taxonomy %>%
filter ( rank == " order" ) %>%
distinct ( kingdom , order ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " order" ) %>%
transmute ( kingdom ,
order = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " order" )
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
group_by ( kingdom ) %>%
2022-08-28 10:31:50 +02:00
mutate (
2022-10-05 09:12:22 +02:00
mo_order8 = abbreviate_mo ( order , minlength = 8 , prefix = " [ORD]_" ) ,
mo_order9 = abbreviate_mo ( order , minlength = 9 , prefix = " [ORD]_" ) ,
mo_order = ifelse ( ! is.na ( mo_old ) , mo_old , mo_order8 ) ,
mo_duplicated = duplicated ( mo_order ) ,
mo_order = ifelse ( mo_duplicated , mo_order9 , mo_order ) ,
mo_duplicated = duplicated ( mo_order )
) %>%
ungroup ( )
if ( any ( mo_order $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for order!" )
mo_order <- mo_order %>%
select ( kingdom , order , mo_order )
# Family - keep old and fill up for new ones
mo_family <- taxonomy %>%
filter ( rank == " family" ) %>%
distinct ( kingdom , family ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " family" ) %>%
transmute ( kingdom ,
family = fullname ,
mo_old = gsub ( " [A-Z]{1,2}_" , " " , as.character ( mo ) )
) ,
by = c ( " kingdom" , " family" )
2022-10-05 09:12:22 +02:00
) %>%
group_by ( kingdom ) %>%
mutate (
mo_family8 = abbreviate_mo ( family , minlength = 8 , prefix = " [FAM]_" ) ,
mo_family9 = abbreviate_mo ( family , minlength = 9 , prefix = " [FAM]_" ) ,
mo_family = ifelse ( ! is.na ( mo_old ) , mo_old , mo_family8 ) ,
mo_duplicated = duplicated ( mo_family ) ,
mo_family = ifelse ( mo_duplicated , mo_family9 , mo_family ) ,
mo_duplicated = duplicated ( mo_family )
2022-08-28 10:31:50 +02:00
) %>%
2022-10-05 09:12:22 +02:00
ungroup ( )
if ( any ( mo_family $ mo_duplicated , na.rm = TRUE ) ) stop ( " Duplicate MO codes for family!" )
mo_family <- mo_family %>%
select ( kingdom , family , mo_family )
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# construct code part for genus - keep old code where available and generate new ones where needed
mo_genus <- taxonomy %>%
filter ( rank == " genus" ) %>%
distinct ( kingdom , genus ) %>%
# get available old MO codes
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " genus" ) %>%
transmute ( mo_genus_old = gsub ( " ^[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus ) %>%
distinct ( kingdom , genus , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , .keep_all = TRUE ) %>%
# since kingdom is part of the code, genus abbreviations may be duplicated between kingdoms
group_by ( kingdom ) %>%
# generate new MO codes for genus and set the right one
mutate (
mo_genus_new5 = abbreviate_mo ( genus , 5 ) ,
mo_genus_new5b = paste0 ( abbreviate_mo ( genus , 5 ) , 1 ) ,
mo_genus_new6 = abbreviate_mo ( genus , 6 ) ,
mo_genus_new7 = abbreviate_mo ( genus , 7 ) ,
mo_genus_new8 = abbreviate_mo ( genus , 8 ) ,
mo_genus_new = case_when (
! is.na ( mo_genus_old ) ~ mo_genus_old ,
! mo_genus_new5 %in% mo_genus_old ~ mo_genus_new5 ,
! mo_genus_new6 %in% mo_genus_old ~ mo_genus_new6 ,
! mo_genus_new7 %in% mo_genus_old ~ mo_genus_new7 ,
! mo_genus_new8 %in% mo_genus_old ~ mo_genus_new8 ,
! mo_genus_new5b %in% mo_genus_old ~ mo_genus_new5b ,
TRUE ~ mo_genus_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_genus_new ) ,
mo_genus_new = case_when (
! mo_duplicated ~ mo_genus_new ,
mo_duplicated & mo_genus_new == mo_genus_new5 ~ mo_genus_new6 ,
mo_duplicated & mo_genus_new == mo_genus_new6 ~ mo_genus_new7 ,
mo_duplicated & mo_genus_new == mo_genus_new7 ~ mo_genus_new8 ,
mo_duplicated & mo_genus_new == mo_genus_new8 ~ mo_genus_new5b ,
TRUE ~ NA_character_
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_genus_new )
) %>%
ungroup ( )
if ( any ( mo_genus $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_genus $ mo_genus_new ) ) stop ( " Duplicate MO codes for genus!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_genus <- mo_genus %>%
select ( kingdom , genus , mo_genus = mo_genus_new )
# same for species - keep old where available and create new per kingdom-genus where needed:
mo_species <- taxonomy %>%
filter ( rank == " species" ) %>%
distinct ( kingdom , genus , species ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank == " species" ) %>%
transmute ( mo_species_old = gsub ( " ^[A-Z]+_[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus , species ) %>%
filter ( mo_species_old %unlike% " -" ) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" , " species" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , species , .keep_all = TRUE ) %>%
group_by ( kingdom , genus ) %>%
mutate (
mo_species_new4 = abbreviate_mo ( species , 4 , hyphen_as_space = TRUE ) ,
mo_species_new5 = abbreviate_mo ( species , 5 , hyphen_as_space = TRUE ) ,
mo_species_new5b = paste0 ( abbreviate_mo ( species , 5 , hyphen_as_space = TRUE ) , 1 ) ,
mo_species_new6 = abbreviate_mo ( species , 6 , hyphen_as_space = TRUE ) ,
mo_species_new7 = abbreviate_mo ( species , 7 , hyphen_as_space = TRUE ) ,
mo_species_new8 = abbreviate_mo ( species , 8 , hyphen_as_space = TRUE ) ,
mo_species_new = case_when (
! is.na ( mo_species_old ) ~ mo_species_old ,
! mo_species_new4 %in% mo_species_old ~ mo_species_new4 ,
! mo_species_new5 %in% mo_species_old ~ mo_species_new5 ,
! mo_species_new6 %in% mo_species_old ~ mo_species_new6 ,
! mo_species_new7 %in% mo_species_old ~ mo_species_new7 ,
! mo_species_new8 %in% mo_species_old ~ mo_species_new8 ,
! mo_species_new5b %in% mo_species_old ~ mo_species_new5b ,
TRUE ~ mo_species_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_species_new ) ,
mo_species_new = case_when (
! mo_duplicated ~ mo_species_new ,
mo_duplicated & mo_species_new == mo_species_new4 ~ mo_species_new5 ,
mo_duplicated & mo_species_new == mo_species_new5 ~ mo_species_new6 ,
mo_duplicated & mo_species_new == mo_species_new6 ~ mo_species_new7 ,
mo_duplicated & mo_species_new == mo_species_new7 ~ mo_species_new8 ,
mo_duplicated & mo_species_new == mo_species_new8 ~ mo_species_new5b ,
TRUE ~ NA_character_
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_species_new )
) %>%
ungroup ( )
if ( any ( mo_species $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_species $ mo_species_new ) ) stop ( " Duplicate MO codes for species!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_species <- mo_species %>%
select ( kingdom , genus , species , mo_species = mo_species_new )
# same for subspecies - keep old where available and create new per kingdom-genus-species where needed:
mo_subspecies <- taxonomy %>%
filter ( rank == " subspecies" ) %>%
distinct ( kingdom , genus , species , subspecies ) %>%
2023-01-23 15:01:21 +01:00
left_join (
AMR :: microorganisms %>%
filter ( rank %in% c ( " subspecies" , " subsp." , " infraspecies" ) ) %>%
transmute ( mo_subspecies_old = gsub ( " ^[A-Z]+_[A-Z]+_[A-Z]+_" , " " , as.character ( mo ) ) , kingdom , genus , species , subspecies ) %>%
filter ( mo_subspecies_old %unlike% " -" ) %>%
distinct ( kingdom , genus , species , subspecies , .keep_all = TRUE ) ,
by = c ( " kingdom" , " genus" , " species" , " subspecies" )
2022-10-05 09:12:22 +02:00
) %>%
distinct ( kingdom , genus , species , subspecies , .keep_all = TRUE ) %>%
group_by ( kingdom , genus , species ) %>%
mutate (
mo_subspecies_new4 = abbreviate_mo ( subspecies , 4 , hyphen_as_space = TRUE ) ,
mo_subspecies_new5 = abbreviate_mo ( subspecies , 5 , hyphen_as_space = TRUE ) ,
mo_subspecies_new5b = paste0 ( abbreviate_mo ( subspecies , 5 , hyphen_as_space = TRUE ) , 1 ) ,
mo_subspecies_new6 = abbreviate_mo ( subspecies , 6 , hyphen_as_space = TRUE ) ,
mo_subspecies_new7 = abbreviate_mo ( subspecies , 7 , hyphen_as_space = TRUE ) ,
mo_subspecies_new8 = abbreviate_mo ( subspecies , 8 , hyphen_as_space = TRUE ) ,
mo_subspecies_new = case_when (
! is.na ( mo_subspecies_old ) ~ mo_subspecies_old ,
! mo_subspecies_new4 %in% mo_subspecies_old ~ mo_subspecies_new4 ,
! mo_subspecies_new5 %in% mo_subspecies_old ~ mo_subspecies_new5 ,
! mo_subspecies_new6 %in% mo_subspecies_old ~ mo_subspecies_new6 ,
! mo_subspecies_new7 %in% mo_subspecies_old ~ mo_subspecies_new7 ,
! mo_subspecies_new8 %in% mo_subspecies_old ~ mo_subspecies_new8 ,
! mo_subspecies_new5b %in% mo_subspecies_old ~ mo_subspecies_new5b ,
TRUE ~ mo_subspecies_old
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo_duplicated = duplicated ( mo_subspecies_new ) ,
mo_subspecies_new = case_when (
! mo_duplicated ~ mo_subspecies_new ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new4 ~ mo_subspecies_new5 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new5 ~ mo_subspecies_new6 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new6 ~ mo_subspecies_new7 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new7 ~ mo_subspecies_new8 ,
mo_duplicated & mo_subspecies_new == mo_subspecies_new8 ~ mo_subspecies_new5b ,
TRUE ~ NA_character_
) ,
mo_duplicated = duplicated ( mo_subspecies_new )
) %>%
ungroup ( )
if ( any ( mo_subspecies $ mo_duplicated , na.rm = TRUE ) | anyNA ( mo_subspecies $ mo_subspecies_new ) ) stop ( " Duplicate MO codes for subspecies!" )
# no duplicates *within kingdoms*, so keep the right columns for left joining later
mo_subspecies <- mo_subspecies %>%
select ( kingdom , genus , species , subspecies , mo_subspecies = mo_subspecies_new )
2019-02-20 00:04:48 +01:00
2022-10-05 09:12:22 +02:00
# unknowns - manually added
mo_unknown <- AMR :: microorganisms %>%
filter ( fullname %like% " unknown" ) %>%
transmute ( fullname , mo_unknown = as.character ( mo ) )
# apply the new codes!
taxonomy <- taxonomy %>%
left_join ( mo_kingdom , by = " kingdom" ) %>%
left_join ( mo_phylum , by = c ( " kingdom" , " phylum" ) ) %>%
left_join ( mo_class , by = c ( " kingdom" , " class" ) ) %>%
left_join ( mo_order , by = c ( " kingdom" , " order" ) ) %>%
left_join ( mo_family , by = c ( " kingdom" , " family" ) ) %>%
left_join ( mo_genus , by = c ( " kingdom" , " genus" ) ) %>%
left_join ( mo_species , by = c ( " kingdom" , " genus" , " species" ) ) %>%
left_join ( mo_subspecies , by = c ( " kingdom" , " genus" , " species" , " subspecies" ) ) %>%
left_join ( mo_unknown , by = " fullname" ) %>%
mutate ( across ( starts_with ( " mo_" ) , function ( x ) ifelse ( is.na ( x ) , " " , x ) ) ) %>%
mutate (
mo = case_when (
fullname %like% " unknown" ~ mo_unknown ,
# add special cases for taxons higher than genus
rank == " kingdom" ~ paste ( mo_kingdom , " [KNG]" , toupper ( kingdom ) , sep = " _" ) ,
rank == " phylum" ~ paste ( mo_kingdom , mo_phylum , sep = " _" ) ,
rank == " class" ~ paste ( mo_kingdom , mo_class , sep = " _" ) ,
rank == " order" ~ paste ( mo_kingdom , mo_order , sep = " _" ) ,
rank == " family" ~ paste ( mo_kingdom , mo_family , sep = " _" ) ,
TRUE ~ paste ( mo_kingdom , mo_genus , mo_species , mo_subspecies , sep = " _" )
2022-08-28 10:31:50 +02:00
) ,
2022-10-05 09:12:22 +02:00
mo = trimws ( gsub ( " _+$" , " " , mo ) ) ,
.before = 1
) %>%
select ( ! starts_with ( " mo_" ) ) %>%
2022-10-30 14:31:45 +01:00
arrange ( fullname )
2022-10-29 14:15:23 +02:00
# now check these - e.g. Nitrospira is the name of a genus AND its class
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( fullname %in% .[duplicated ( fullname ) , " fullname" , drop = TRUE ] ) %>%
View ( )
2022-10-30 14:31:45 +01:00
taxonomy <- taxonomy %>%
2023-01-23 15:01:21 +01:00
mutate ( rank_index = case_when (
kingdom == " Bacteria" ~ 1 ,
kingdom == " Fungi" ~ 2 ,
kingdom == " Protozoa" ~ 3 ,
kingdom == " Archaea" ~ 4 ,
TRUE ~ 5
) ) %>%
arrange ( fullname , rank_index ) %>%
distinct ( fullname , .keep_all = TRUE ) %>%
select ( - rank_index ) %>%
2022-12-12 00:14:56 +01:00
filter ( mo != " " )
# this must not exist:
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( mo %like% " __" ) %>%
View ( )
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>% filter ( mo %unlike% " __" )
2022-12-16 16:10:43 +01:00
# Some integrity checks ---------------------------------------------------
# are mo codes unique?
taxonomy %>% filter ( mo %in% .[duplicated ( mo ) , " mo" , drop = TRUE ] )
2022-12-12 00:14:56 +01:00
taxonomy <- taxonomy %>% distinct ( mo , .keep_all = TRUE )
2022-12-16 16:10:43 +01:00
# are fullnames unique?
taxonomy %>% filter ( fullname %in% .[duplicated ( fullname ) , " fullname" , drop = TRUE ] )
2020-05-27 16:37:49 +02:00
2022-12-16 16:10:43 +01:00
# are all GBIFs available?
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( ! gbif_parent %in% gbif ) %>%
count ( rank )
2022-12-16 16:10:43 +01:00
# try to find the right gbif IDs
taxonomy $ gbif_parent [which ( ! taxonomy $ gbif_parent %in% taxonomy $ gbif & taxonomy $ rank == " species" ) ] <- taxonomy $ gbif [match ( taxonomy $ genus [which ( ! taxonomy $ gbif_parent %in% taxonomy $ gbif & taxonomy $ rank == " species" ) ] , taxonomy $ genus ) ]
taxonomy $ gbif_parent [which ( ! taxonomy $ gbif_parent %in% taxonomy $ gbif & taxonomy $ rank == " class" ) ] <- taxonomy $ gbif [match ( taxonomy $ phylum [which ( ! taxonomy $ gbif_parent %in% taxonomy $ gbif & taxonomy $ rank == " class" ) ] , taxonomy $ phylum ) ]
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( ! gbif_parent %in% gbif ) %>%
count ( rank )
2022-12-16 16:10:43 +01:00
# are all LPSNs available?
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( ! lpsn_parent %in% lpsn ) %>%
count ( rank )
2022-12-16 16:10:43 +01:00
# make GBIF refer to newest renaming according to LPSN
taxonomy $ gbif_renamed_to [which ( ! is.na ( taxonomy $ gbif_renamed_to ) & ! is.na ( taxonomy $ lpsn_renamed_to ) ) ] <- taxonomy $ gbif [match ( taxonomy $ lpsn_renamed_to [which ( ! is.na ( taxonomy $ gbif_renamed_to ) & ! is.na ( taxonomy $ lpsn_renamed_to ) ) ] , taxonomy $ lpsn ) ]
# Save intermediate results (3) -------------------------------------------
saveRDS ( taxonomy , " data-raw/taxonomy3.rds" )
2022-10-29 14:15:23 +02:00
2019-10-30 23:02:50 +01:00
2022-10-05 09:12:22 +02:00
# Remove unwanted taxonomic entries from Protoza/Fungi --------------------
2019-09-20 14:18:29 +02:00
2022-10-05 09:12:22 +02:00
# this must be done after the microbial ID generation, since it will otherwise generate a lot of different IDs
taxonomy <- taxonomy %>%
filter (
# Protozoa:
! ( phylum %in% c ( " Choanozoa" , " Mycetozoa" ) & prevalence == 3 ) ,
# Fungi:
! ( phylum %in% c ( " Ascomycota" , " Zygomycota" , " Basidiomycota" ) & prevalence == 3 ) ,
2022-12-12 00:14:56 +01:00
! ( genus %in% c ( " Phoma" , " Leptosphaeria" , " Physarum" ) & rank %in% c ( " species" , " subspecies" ) ) , # only genus of this rare fungus, with resp. 1300 and 800 species
2022-10-05 09:12:22 +02:00
# (leave Alternaria in there, part of human mycobiome and opportunistic pathogen)
# Animalia:
! genus %in% c ( " Lucilia" , " Lumbricus" ) ,
2022-12-12 00:14:56 +01:00
! ( class == " Insecta" & rank %in% c ( " species" , " subspecies" ) ) , # keep only genus of insects
! ( genus == " Amoeba" & kingdom == " Animalia" ) ,
2022-10-05 09:12:22 +02:00
! ( genus %in% c ( " Aedes" , " Anopheles" ) & rank %in% c ( " species" , " subspecies" ) ) , # only genus of the many hundreds of mosquitoes species
kingdom != " Plantae"
) # this kingdom only contained Curvularia and Hymenolepis, which have coincidental twin names with Fungi
2019-02-28 13:56:28 +01:00
2022-12-12 00:14:56 +01:00
# no ghost families, orders classes, phyla
taxonomy <- taxonomy %>%
2023-01-23 15:01:21 +01:00
group_by ( kingdom , family ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
group_by ( kingdom , order ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
group_by ( kingdom , class ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
group_by ( kingdom , phylum ) %>%
filter ( n ( ) > 1 | fullname %like% " unknown" | rank == " kingdom" ) %>%
2022-12-12 00:14:56 +01:00
ungroup ( )
2023-01-23 15:01:21 +01:00
message (
2023-02-10 16:18:00 +01:00
" \nCongratulations! The new taxonomic table will contain " , format ( nrow ( taxonomy ) , big.mark = " " ) , " rows.\n" ,
" This was " , format ( nrow ( AMR :: microorganisms ) , big.mark = " " ) , " rows.\n"
2023-01-23 15:01:21 +01:00
)
2022-12-12 00:14:56 +01:00
# these are the new ones:
2023-01-23 15:01:21 +01:00
taxonomy %>%
filter ( ! paste ( kingdom , fullname ) %in% paste ( AMR :: microorganisms $ kingdom , AMR :: microorganisms $ fullname ) ) %>%
View ( )
2022-12-12 00:14:56 +01:00
# these were removed:
2023-01-23 15:01:21 +01:00
AMR :: microorganisms %>%
filter ( ! paste ( kingdom , fullname ) %in% paste ( taxonomy $ kingdom , taxonomy $ fullname ) ) %>%
View ( )
AMR :: microorganisms %>%
filter ( ! fullname %in% taxonomy $ fullname ) %>%
View ( )
2020-07-08 14:48:06 +02:00
2022-10-05 09:12:22 +02:00
# Add SNOMED CT -----------------------------------------------------------
# we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS)
# as a source, which copies directly from the latest US SNOMED CT version
# - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009
2022-10-29 14:15:23 +02:00
# - check that current online version is higher than TAXONOMY_VERSION$SNOMED
2022-10-05 09:12:22 +02:00
# - if so, click on 'Download Value Set', choose 'TXT'
snomed <- vroom ( " data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt" , skip = 3 ) %>%
select ( 1 : 2 ) %>%
setNames ( c ( " snomed" , " mo" ) ) %>%
mutate ( snomed = as.character ( snomed ) )
# try to get name of MO
snomed <- snomed %>%
mutate ( mo = gsub ( " ss. " , " " , mo , fixed = TRUE ) ) %>%
mutate ( fullname = case_when (
mo %like_case% " [A-Z][a-z]+ [a-z]+ [a-z]{4,} " ~ gsub ( " (^|.*)([A-Z][a-z]+ [a-z]+ [a-z]{4,}) .*" , " \\2" , mo ) ,
mo %like_case% " [A-Z][a-z]+ [a-z]{4,} " ~ gsub ( " (^|.*)([A-Z][a-z]+ [a-z]{4,}) .*" , " \\2" , mo ) ,
mo %like_case% " [A-Z][a-z]+" ~ gsub ( " (^|.*)([A-Z][a-z]+) .*" , " \\2" , mo ) ,
TRUE ~ NA_character_
) ) %>%
filter ( fullname %in% taxonomy $ fullname )
message ( nrow ( snomed ) , " SNOMED codes will be added to " , n_distinct ( snomed $ fullname ) , " microorganisms" )
snomed <- snomed %>%
group_by ( fullname ) %>%
summarise ( snomed = list ( snomed ) )
taxonomy <- taxonomy %>%
left_join ( snomed , by = " fullname" )
# Clean data set ----------------------------------------------------------
# format to tibble and check again for invalid characters
taxonomy <- taxonomy %>%
select ( mo , fullname , status , kingdom : subspecies , rank , ref , source , starts_with ( " lpsn" ) , starts_with ( " gbif" ) , prevalence , snomed ) %>%
df_remove_nonASCII ( )
# set class <mo>
class ( taxonomy $ mo ) <- c ( " mo" , " character" )
2023-02-26 21:26:58 +01:00
microorganisms <- taxonomy
# Restore 'synonym' microorganisms to 'accepted' --------------------------
2022-10-05 09:12:22 +02:00
2023-02-26 21:26:58 +01:00
# according to LPSN: Stenotrophomonas maltophilia is the correct name if this species is regarded as a separate species (i.e., if its nomenclatural type is not assigned to another species whose name is validly published, legitimate and not rejected and has priority) within a separate genus Stenotrophomonas.
# https://lpsn.dsmz.de/species/stenotrophomonas-maltophilia
# all MO's to keep as 'accepted', not as 'synonym':
2023-03-11 14:24:34 +01:00
to_restore <- c (
" Stenotrophomonas maltophilia" ,
" Moraxella catarrhalis"
)
2023-02-26 21:26:58 +01:00
all ( to_restore %in% microorganisms $ fullname )
for ( nm in to_restore ) {
microorganisms $ lpsn_renamed_to [which ( microorganisms $ fullname == nm ) ] <- NA
microorganisms $ gbif_renamed_to [which ( microorganisms $ fullname == nm ) ] <- NA
microorganisms $ status [which ( microorganisms $ fullname == nm ) ] <- " accepted"
}
2022-10-05 09:12:22 +02:00
# Save to package ---------------------------------------------------------
usethis :: use_data ( microorganisms , overwrite = TRUE , version = 2 , compress = " xz" )
2019-02-28 13:56:28 +01:00
rm ( microorganisms )
2022-10-05 09:12:22 +02:00
2022-12-12 00:14:56 +01:00
# DON'T FORGET TO UPDATE R/_globals.R!
2022-10-05 09:12:22 +02:00
# Test updates ------------------------------------------------------------
# and check: these codes should not be missing (will otherwise throw a unit test error):
AMR :: microorganisms.codes %>% filter ( ! mo %in% taxonomy $ mo )
2023-01-21 23:47:20 +01:00
AMR :: clinical_breakpoints %>% filter ( ! mo %in% taxonomy $ mo )
2022-10-05 09:12:22 +02:00
AMR :: example_isolates %>% filter ( ! mo %in% taxonomy $ mo )
AMR :: intrinsic_resistant %>% filter ( ! mo %in% taxonomy $ mo )
2020-05-27 16:37:49 +02:00
# load new data sets
devtools :: load_all ( " ." )
2022-10-05 09:12:22 +02:00
2020-05-27 16:37:49 +02:00
# reset previously changed mo codes
2023-01-21 23:47:20 +01:00
if ( ! identical ( clinical_breakpoints $ mo , as.mo ( clinical_breakpoints $ mo , language = NULL ) ) ) {
clinical_breakpoints $ mo <- as.mo ( clinical_breakpoints $ mo , language = NULL )
usethis :: use_data ( clinical_breakpoints , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( clinical_breakpoints )
2022-12-12 00:14:56 +01:00
}
2019-09-20 12:33:05 +02:00
2022-12-12 00:14:56 +01:00
if ( ! identical ( microorganisms.codes $ mo , as.mo ( microorganisms.codes $ mo , language = NULL ) ) ) {
microorganisms.codes <- microorganisms.codes %>% filter ( mo %in% microorganisms $ mo )
microorganisms.codes $ mo <- as.mo ( microorganisms.codes $ mo , language = NULL )
usethis :: use_data ( microorganisms.codes , overwrite = TRUE , version = 2 , compress = " xz" )
rm ( microorganisms.codes )
}
2020-05-27 16:37:49 +02:00
2022-12-12 00:14:56 +01:00
if ( ! identical ( example_isolates $ mo , as.mo ( example_isolates $ mo , language = NULL ) ) ) {
example_isolates $ mo <- as.mo ( example_isolates $ mo , language = NULL )
usethis :: use_data ( example_isolates , overwrite = TRUE , version = 2 )
rm ( example_isolates )
}
2022-10-05 09:12:22 +02:00
# load new data sets again
devtools :: load_all ( " ." )
source ( " data-raw/_pre_commit_hook.R" )
devtools :: load_all ( " ." )
2020-05-27 16:37:49 +02:00
2022-12-12 00:14:56 +01:00
if ( ! identical ( intrinsic_resistant $ mo , as.mo ( intrinsic_resistant $ mo , language = NULL ) ) ) {
stop ( " Run data-raw/reproduction_of_intrinsic_resistant.R again" )
}
2020-05-27 16:37:49 +02:00
# run the unit tests
2022-10-05 09:12:22 +02:00
Sys.setenv ( NOT_CRAN = " true" )
2020-05-27 16:37:49 +02:00
testthat :: test_file ( " tests/testthat/test-data.R" )
testthat :: test_file ( " tests/testthat/test-mo.R" )
testthat :: test_file ( " tests/testthat/test-mo_property.R" )