AMR/reproduction_of_microorgani...

257 lines
11 KiB
R
Raw Normal View History

2019-02-20 00:04:48 +01:00
# Catalogue of Life
# Data retrieved from Encyclopaedia of Life:
# https://opendata.eol.org/dataset/catalogue-of-life/
# unzip and extract taxon.tab, then read it (path is relative to the
# working directory):
taxon <- data.table::fread("taxon.tab")
# result is over 3.7M rows
# Frequency table of the `kingdom` column; the output of the original run is
# reproduced in the comment below.
# NOTE(review): `%>%` and `freq()` are not attached anywhere in this script -
# presumably the required packages are loaded before running it; confirm.
taxon %>% freq(kingdom)
#      Item            Count   Percent   Cum. Count   Cum. Percent
# ---  ----------  ---------  --------  -----------  -------------
# 1    Animalia    2,225,627     59.1%    2,225,627          59.1%
# 2    Plantae     1,177,412     31.3%    3,403,039          90.4%
# 3    Fungi         290,145      7.7%    3,693,184          98.1%
# 4    Chromista      47,126      1.3%    3,740,310          99.3%
# 5    Bacteria       14,478      0.4%    3,754,788          99.7%
# 6    Protozoa        6,060      0.2%    3,760,848          99.9%
# 7    Viruses         3,827      0.1%    3,764,675         100.0%
# 8    Archaea           610      0.0%    3,765,285         100.0%
# Reduce an authorship string to its latest reference. The substitutions
# below depend on each other and must run in exactly this order.
clean_authorship <- function(ref) {
  # only latest ref, not original authors: drop everything up to and
  # including the last ") "
  ref <- trimws(gsub(".*[)] ", "", ref))
  # for emended names keep only what follows "emend."
  is_emended <- grepl(" emend[. ]", ref, ignore.case = TRUE)
  ref <- ifelse(is_emended,
                gsub("(.*)emend[. ]+(.*)", "\\2", ref, ignore.case = TRUE),
                ref)
  # drop all full stops, then write every "et al" uniformly as " et al."
  ref <- gsub(".", "", ref, fixed = TRUE)
  ref <- gsub(",? et al", " et al.", ref, fixed = FALSE, ignore.case = TRUE)
  # drop remaining parentheses
  ref <- gsub("[()]", "", ref)
  # year always preceded by comma
  ref <- gsub(" ([0-9]{4})$", ", \\1", ref)
  ref <- gsub(",,", ",", ref, fixed = TRUE)
  # only first author with *et al.*
  ref <- gsub(",.*,", " et al.,", ref)
  ref <- gsub(" (and|&) .*,", " et al.,", ref)
  # remove non-numeric text following a comma, and a dangling ", " at the end
  ref <- gsub(", [^0-9]+", ", ", ref)
  ref <- gsub(", $", "", ref)
  ref
}

MOs <- taxon %>%
  # tibble for future transformations
  as_tibble() %>%
  # we only want all microorganisms and viruses
  filter(!kingdom %in% c("Animalia", "Plantae")) %>%
  # and no entries above genus - they all already have a taxonomic tree
  filter(!taxonRank %in% c("kingdom", "phylum", "superfamily",
                           "class", "order", "family")) %>%
  # not all fungi: Aspergillus, Candida, Trichophyton and Pneumocystis are the
  # most important, so fungi are only kept when they belong to these orders:
  filter(kingdom != "Fungi" |
           order %in% c("Eurotiales", "Saccharomycetales",
                        "Schizosaccharomycetales", "Onygenales",
                        "Pneumocystales")) %>%
  # remove text if it contains 'Not assigned' like phylum in viruses
  mutate_all(funs(gsub("Not assigned", "", .))) %>%
  mutate(scientificNameAuthorship = clean_authorship(scientificNameAuthorship))
# CRAN does not allow non-ASCII characters: transliterate every column
# from UTF-8 to ASCII
MOs <- as_tibble(
  lapply(MOs, iconv, from = "UTF-8", to = "ASCII//TRANSLIT"),
  stringsAsFactors = FALSE
)
# Old taxonomic names are split off into their own table: their
# `acceptedNameUsageID` refers to the `taxonID` of the new name
MOs.old <- MOs %>%
  filter(!is.na(acceptedNameUsageID)) %>%
  filter(scientificNameAuthorship != "") %>%
  mutate(
    # take the (cleaned) authorship out of the scientific name ...
    fullname = stringr::str_replace(scientificName,
                                    stringr::fixed(scientificNameAuthorship),
                                    ""),
    # ... then cut everything from the last opening parenthesis onwards
    fullname = trimws(gsub("(.*)[(].*", "\\1", fullname))
  ) %>%
  select(col_id = taxonID,
         col_id_new = acceptedNameUsageID,
         fullname,
         ref = scientificNameAuthorship) %>%
  filter(!is.na(fullname)) %>%
  # one row per name: the first occurrence wins
  distinct(fullname, .keep_all = TRUE) %>%
  arrange(col_id)
# Accepted names: rows without an `acceptedNameUsageID`
MOs <- MOs %>%
  filter(is.na(acceptedNameUsageID)) %>%
  mutate(
    # built before `genus` is cleaned below, so it uses the raw genus;
    # viruses get a name without the genus
    fullname = trimws(ifelse(kingdom == "Viruses",
                             paste(specificEpithet, infraspecificEpithet),
                             paste(genus, specificEpithet, infraspecificEpithet))),
    genus = gsub(":", "", genus),
    # hexadecimal identifier after the last slash of the URL
    species_id = gsub(".*/([a-f0-9]+)", "\\1", furtherInformationURL)
  ) %>%
  select(col_id = taxonID,
         fullname,
         kingdom,
         phylum,
         class,
         order,
         family,
         genus,
         species = specificEpithet,
         subspecies = infraspecificEpithet,
         rank = taxonRank,
         ref = scientificNameAuthorship,
         species_id) %>%
  # one row per name: the first occurrence wins
  distinct(fullname, .keep_all = TRUE) %>%
  filter(!grepl("unassigned", fullname, ignore.case = TRUE))

# only old names of species that are in MOs:
MOs.old <- filter(MOs.old, col_id_new %in% MOs$col_id)
# Build the `mo` code of every entry from the first character of the kingdom
# and the abbreviated genus, species and subspecies, joined by underscores
MOs <- MOs %>%
  group_by(kingdom) %>%
  # abbreviations may be the same for genera between kingdoms,
  # because each code starts with the first character of the kingdom
  mutate(abbr_genus = abbreviate(genus,
                                 minlength = 5,
                                 use.classes = TRUE,
                                 method = "both.sides",
                                 strict = FALSE)) %>%
  ungroup() %>%
  group_by(genus) %>%
  # species abbreviations may be the same between genera
  # because the genus abbreviation is part of the code
  mutate(abbr_species = abbreviate(species,
                                   minlength = 3,
                                   use.classes = FALSE,
                                   method = "both.sides")) %>%
  ungroup() %>%
  group_by(genus, species) %>%
  mutate(abbr_subspecies = abbreviate(subspecies,
                                      minlength = 3,
                                      use.classes = FALSE,
                                      method = "both.sides")) %>%
  ungroup() %>%
  # remove trailing underscores (left behind by empty abbreviations)
  mutate(mo = gsub("_+$", "",
                   toupper(paste(substr(kingdom, 1, 1),
                                 abbr_genus,
                                 abbr_species,
                                 abbr_subspecies,
                                 sep = "_")))) %>%
  # make repeated codes unique: the first occurrence keeps its code, later
  # ones get a numeric suffix ("1", "2", ...) that never clashes with a code
  # that is already present
  mutate(mo = make.unique(mo, sep = "")) %>%
  select(mo, everything(), -abbr_genus, -abbr_species, -abbr_subspecies)
# everything distinct? (prints the number of duplicated codes; should be 0)
sum(duplicated(MOs$mo))
# add non-taxonomic entries
# Every group entry copies the first row of an existing species as a template,
# keeps all columns that are not overwritten in mutate(), and replaces a
# three-letter part of the template's `mo` code (presumably its species
# abbreviation) with a group code.
MOs <- MOs %>%
  bind_rows(
    # CoNS
    MOs %>%
      filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
      mutate(mo = gsub("EPI", "CNS", mo),
             col_id = NA_integer_,
             species = "coagulase negative",
             fullname = "Coagulase Negative Staphylococcus (CoNS)",
             ref = NA_character_),
    # CoPS
    MOs %>%
      filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
      mutate(mo = gsub("EPI", "CPS", mo),
             col_id = NA_integer_,
             species = "coagulase positive",
             fullname = "Coagulase Positive Staphylococcus (CoPS)",
             ref = NA_character_),
    # Streptococci groups A, B, C, D, F, G, H, K
    # NOTE(review): unlike all other entries, groups A and B do not reset
    # `ref`, so they keep the reference of their template species - confirm
    # that this is intended.
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRA", mo),
             col_id = NA_integer_,
             species = "group A" ,
             fullname = "Streptococcus group A"),
    MOs %>%
      filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>%
      mutate(mo = gsub("DYS", "GRB", mo),
             col_id = NA_integer_,
             species = "group B" ,
             fullname = "Streptococcus group B"),
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRC", mo),
             col_id = NA_integer_,
             species = "group C" ,
             fullname = "Streptococcus group C",
             ref = NA_character_),
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRD", mo),
             col_id = NA_integer_,
             species = "group D" ,
             fullname = "Streptococcus group D",
             ref = NA_character_),
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRF", mo),
             col_id = NA_integer_,
             species = "group F" ,
             fullname = "Streptococcus group F",
             ref = NA_character_),
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRG", mo),
             col_id = NA_integer_,
             species = "group G" ,
             fullname = "Streptococcus group G",
             ref = NA_character_),
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRH", mo),
             col_id = NA_integer_,
             species = "group H" ,
             fullname = "Streptococcus group H",
             ref = NA_character_),
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "GRK", mo),
             col_id = NA_integer_,
             species = "group K" ,
             fullname = "Streptococcus group K",
             ref = NA_character_),
    # Beta haemolytic Streptococci
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("AGA", "HAE", mo),
             col_id = NA_integer_,
             species = "beta-haemolytic" ,
             fullname = "Beta-haemolytic Streptococcus",
             ref = NA_character_),
    # unknowns: built from scratch, with only kingdom, genus, fullname and
    # rank filled in
    data.frame(mo = "B_GRAMN",
               col_id = NA_integer_,
               fullname = "(unknown Gram negatives)",
               kingdom = "Bacteria",
               phylum = NA_character_,
               class = NA_character_,
               order = NA_character_,
               family = NA_character_,
               genus = "(unknown Gram negatives)",
               species = NA_character_,
               subspecies = NA_character_,
               rank = "species",
               ref = NA_character_,
               stringsAsFactors = FALSE),
    data.frame(mo = "B_GRAMP",
               col_id = NA_integer_,
               fullname = "(unknown Gram positives)",
               kingdom = "Bacteria",
               phylum = NA_character_,
               class = NA_character_,
               order = NA_character_,
               family = NA_character_,
               genus = "(unknown Gram positives)",
               species = NA_character_,
               subspecies = NA_character_,
               rank = "species",
               ref = NA_character_,
               stringsAsFactors = FALSE)
  )
# Store both tables as plain data.frames, sorted by `mo` for the main table
MOs <- MOs %>%
  arrange(mo) %>%
  as.data.frame(stringsAsFactors = FALSE)
MOs.old <- MOs.old %>%
  as.data.frame(stringsAsFactors = FALSE)
# the `mo` column gets the S3 class "mo"
MOs$mo <- structure(MOs$mo, class = "mo")
# write both objects to the working directory
saveRDS(MOs, file = "microorganisms.rds")
saveRDS(MOs.old, file = "microorganisms.old.rds")
# on the server:
# usethis::use_data(microorganisms, overwrite = TRUE)
# usethis::use_data(microorganisms.old, overwrite = TRUE)