mirror of
https://github.com/msberends/AMR.git
synced 2025-07-08 11:51:59 +02:00
(v1.1.0.9021) 1st isolates update
This commit is contained in:
@ -90,9 +90,29 @@ countries_geometry <- sf::st_as_sf(map('world', plot = FALSE, fill = TRUE)) %>%
|
||||
not_antarctica = as.integer(ID != "Antarctica"),
|
||||
countries_name = ifelse(included == 1, as.character(ID), NA))
|
||||
|
||||
# add countries not in the list
|
||||
countries_missing <- unique(ip_tbl$country[!ip_tbl$country %in% countries_geometry$countries_code])
|
||||
for (i in seq_len(length(countries_missing))) {
|
||||
countries_geometry <- countries_geometry %>%
|
||||
rbind(countries_geometry %>%
|
||||
filter(ID == "Netherlands") %>%
|
||||
mutate(ID = countrycode::countrycode(countries_missing[i],
|
||||
origin = 'iso2c',
|
||||
destination = 'country.name'),
|
||||
countries_code = countries_missing[i],
|
||||
included = 1,
|
||||
not_antarctica = 1,
|
||||
countries_name = countrycode::countrycode(countries_missing[i],
|
||||
origin = 'iso2c',
|
||||
destination = 'country.name')))
|
||||
}
|
||||
|
||||
# how many?
|
||||
countries_geometry %>% filter(included == 1) %>% nrow()
|
||||
|
||||
countries_geometry$countries_name <- gsub("UK", "United Kingdom", countries_geometry$countries_name, fixed = TRUE)
|
||||
countries_geometry$countries_name <- gsub("USA", "United States", countries_geometry$countries_name, fixed = TRUE)
|
||||
|
||||
countries_plot <- ggplot(countries_geometry) +
|
||||
geom_sf(aes(fill = included, colour = not_antarctica),
|
||||
size = 0.25,
|
||||
@ -101,9 +121,9 @@ countries_plot <- ggplot(countries_geometry) +
|
||||
theme(panel.grid = element_blank(),
|
||||
axis.title = element_blank(),
|
||||
axis.text = element_blank()) +
|
||||
scale_fill_gradient(low = "white", high = "#CAD6EA", ) +
|
||||
scale_fill_gradient(low = "white", high = "#128f7645") +
|
||||
# this makes the border Antarctica turn white (invisible):
|
||||
scale_colour_gradient(low = "white", high = "#81899B")
|
||||
scale_colour_gradient(low = "white", high = "#128f76")
|
||||
|
||||
countries_plot_mini <- countries_plot
|
||||
countries_plot_mini$data <- countries_plot_mini$data %>% filter(ID != "Antarctica")
|
||||
|
@ -37251,6 +37251,7 @@
|
||||
"B_MYCBC_TKNS" "Mycobacterium tokaiense" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "tokaiense" "" "species" "Tsukamura, 1981" "c457ca4ae3a404100c8ce8c82a6100cc" "CoL" 2 "72477006"
|
||||
"B_MYCBC_TRPL" "Mycobacterium triplex" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "triplex" "" "species" "Floyd et al., 1997" "f23c2b6cad7a0e20374cdf3d3ff55dce" "CoL" 2 "113860005"
|
||||
"B_MYCBC_TRVL" "Mycobacterium triviale" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "triviale" "" "species" "Kubica, 1970" "9cb8b676cce27952821e173b12bfff3f" "CoL" 2 "40333002"
|
||||
"B_MYCBC_TBRC" "Mycobacterium tuberculosis" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "tuberculosis" "" "species" "Lehmann et al., 2018" "778540" "DSMZ" 2 "c(\"113861009\", \"113858008\")"
|
||||
"B_MYCBC_TUSC" "Mycobacterium tusciae" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "tusciae" "" "species" "Tortoli et al., 1999" "7a8ff8f5a2b16131366fe6e8dfb6b570" "CoL" 2
|
||||
"B_MYCBC_ULCR" "Mycobacterium ulcerans" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "ulcerans" "" "species" "MacCallum et al., 1950" "96b3a2e207e76f4725132034d7d0bde1" "CoL" 2 "40713003"
|
||||
"B_MYCBC_VACC" "Mycobacterium vaccae" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "vaccae" "" "species" "Bonicke et al., 1964" "adbc928aba39beadc25b2ba7e8214c91" "CoL" 2 "54925005"
|
||||
|
@ -920,6 +920,22 @@ testthat::test_file("tests/testthat/test-data.R")
|
||||
testthat::test_file("tests/testthat/test-mo.R")
|
||||
testthat::test_file("tests/testthat/test-mo_property.R")
|
||||
|
||||
# edit 2020-05-28
|
||||
# Not sure why it now says M. tuberculosis was renamed to M. africanum (B_MYCBC_AFRC), but that's not true
|
||||
microorganisms <- microorganisms %>%
|
||||
bind_rows(microorganisms %>%
|
||||
filter(mo == "B_MYCBC_AFRC") %>%
|
||||
mutate(mo = "B_MYCBC_TBRC", snomed = list(c("113861009", "113858008")),
|
||||
ref = "Lehmann et al., 2018",species_id = "778540",
|
||||
source = "DSMZ", species = "tuberculosis",
|
||||
fullname = "Mycobacterium tuberculosis")) %>%
|
||||
arrange(fullname)
|
||||
class(microorganisms$mo) <- c("mo", "character")
|
||||
microorganisms.old <- microorganisms.old %>% filter(fullname != "Mycobacterium tuberculosis")
|
||||
|
||||
usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
|
||||
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
|
||||
|
||||
|
||||
|
||||
# OLD CODE ----------------------------------------------------------------
|
||||
|
@ -1,682 +0,0 @@
|
||||
# ==================================================================== #
|
||||
# TITLE #
|
||||
# Antimicrobial Resistance (AMR) Analysis #
|
||||
# #
|
||||
# SOURCE #
|
||||
# https://gitlab.com/msberends/AMR #
|
||||
# #
|
||||
# LICENCE #
|
||||
# (c) 2018-2020 Berends MS, Luz CF et al. #
|
||||
# #
|
||||
# This R package is free software; you can freely use and distribute #
|
||||
# it for both personal and commercial purposes under the terms of the #
|
||||
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
|
||||
# the Free Software Foundation. #
|
||||
# #
|
||||
# We created this package for both routine data analysis and academic #
|
||||
# research and it was publicly released in the hope that it will be #
|
||||
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
|
||||
# Visit our website for more info: https://msberends.gitlab.io/AMR. #
|
||||
# ==================================================================== #
|
||||
|
||||
# ---------------------------------------------------------------------------------
|
||||
# Reproduction of the `microorganisms` data set
|
||||
# ---------------------------------------------------------------------------------
|
||||
# Data retrieved from:
|
||||
#
|
||||
# [1] Catalogue of Life (CoL) through the Encyclopaedia of Life
|
||||
# https://opendata.eol.org/dataset/catalogue-of-life/
|
||||
# * Download the resource file with a name like "Catalogue of Life yyyy-mm-dd"
|
||||
# * Extract "taxon.tab"
|
||||
#
|
||||
# [2] Global Biodiversity Information Facility (GBIF)
|
||||
# https://doi.org/10.15468/39omei
|
||||
# * Extract "Taxon.tsv"
|
||||
#
|
||||
# [3] Deutsche Sammlung von Mikroorganismen und Zellkulturen (DSMZ)
|
||||
# https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html
|
||||
# * Download the latest "Complete List" as xlsx file (DSMZ_bactnames.xlsx)
|
||||
# ---------------------------------------------------------------------------------
|
||||
|
||||
library(dplyr)
|
||||
library(AMR)
|
||||
|
||||
data_col <- data.table::fread("Documents/taxon.tab")
|
||||
data_gbif <- data.table::fread("Documents/Taxon.tsv")
|
||||
|
||||
# read the xlsx file from DSMZ (only around 2.5 MB):
|
||||
data_dsmz <- readxl::read_xlsx("Downloads/DSMZ_bactnames.xlsx")
|
||||
|
||||
# the CoL data is over 3.7M rows:
|
||||
data_col %>% freq(kingdom)
|
||||
# Item Count Percent Cum. Count Cum. Percent
|
||||
# --- ---------- ---------- -------- ----------- -------------
|
||||
# 1 Animalia 2,225,627 59.1% 2,225,627 59.1%
|
||||
# 2 Plantae 1,177,412 31.3% 3,403,039 90.4%
|
||||
# 3 Fungi 290,145 7.7% 3,693,184 98.1%
|
||||
# 4 Chromista 47,126 1.3% 3,740,310 99.3%
|
||||
# 5 Bacteria 14,478 0.4% 3,754,788 99.7%
|
||||
# 6 Protozoa 6,060 0.2% 3,760,848 99.9%
|
||||
# 7 Viruses 3,827 0.1% 3,764,675 100.0%
|
||||
# 8 Archaea 610 0.0% 3,765,285 100.0%
|
||||
|
||||
# the GBIF data is over 5.8M rows:
|
||||
data_gbif %>% freq(kingdom)
|
||||
# Item Count Percent Cum. Count Cum. Percent
|
||||
# --- --------------- ---------- -------- ----------- -------------
|
||||
# 1 Animalia 3,264,138 55.7% 3,264,138 55.7%
|
||||
# 2 Plantae 1,814,962 31.0% 5,079,100 86.7%
|
||||
# 3 Fungi 538,086 9.2% 5,617,186 95.9%
|
||||
# 4 Chromista 181,374 3.1% 5,798,560 99.0%
|
||||
# 5 Bacteria 24,048 0.4% 5,822,608 99.4%
|
||||
# 6 Protozoa 15,138 0.3% 5,837,746 99.7%
|
||||
# 7 incertae sedis 9,995 0.2% 5,847,741 99.8%
|
||||
# 8 Viruses 9,630 0.2% 5,857,371 100.0%
|
||||
# 9 Archaea 771 0.0% 5,858,142 100.0%
|
||||
|
||||
|
||||
# Clean up helper function ------------------------------------------------
|
||||
clean_new <- function(new) {
|
||||
new %>%
|
||||
# only the ones that have no new ID to refer to a newer name
|
||||
filter(is.na(col_id_new)) %>%
|
||||
filter(
|
||||
(
|
||||
# we only want all MICROorganisms and no viruses
|
||||
!kingdom %in% c("Animalia", "Chromista", "Plantae", "Viruses")
|
||||
# and not all fungi: Aspergillus, Candida, Trichphyton and Pneumocystis are the most important,
|
||||
# so only keep these orders from the fungi:
|
||||
& !(kingdom == "Fungi"
|
||||
& !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales"))
|
||||
)
|
||||
# or the family has to contain a genus we found in our hospitals last decades (Northern Netherlands, 2002-2018)
|
||||
| genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces",
|
||||
"Arachnia", "Ascaris", "Aureobacterium", "Aureobasidium", "Balantidum", "Bilophilia", "Branhamella", "Brochontrix",
|
||||
"Brugia", "Calymmatobacterium", "Catabacter", "Chilomastix", "Chryseomonas", "Cladophialophora", "Cladosporium",
|
||||
"Clonorchis", "Cordylobia", "Curvularia", "Demodex", "Dermatobia", "Diphyllobothrium", "Dracunculus", "Echinococcus",
|
||||
"Enterobius", "Euascomycetes", "Exophiala", "Fasciola", "Fusarium", "Hendersonula", "Hymenolepis", "Kloeckera",
|
||||
"Koserella", "Larva", "Leishmania", "Lelliottia", "Loa", "Lumbricus", "Malassezia", "Metagonimus", "Molonomonas",
|
||||
"Mucor", "Nattrassia", "Necator", "Novospingobium", "Onchocerca", "Opistorchis", "Paragonimus", "Paramyxovirus",
|
||||
"Pediculus", "Phoma", "Phthirus", "Pityrosporum", "Pseudallescheria", "Pulex", "Rhizomucor", "Rhizopus", "Rhodotorula",
|
||||
"Salinococcus", "Sanguibacteroides", "Schistosoma", "Scopulariopsis", "Scytalidium", "Sporobolomyces", "Stomatococcus",
|
||||
"Strongyloides", "Syncephalastraceae", "Taenia", "Torulopsis", "Trichinella", "Trichobilharzia", "Trichomonas",
|
||||
"Trichosporon", "Trichuris", "Trypanosoma", "Wuchereria")) %>%
|
||||
mutate(
|
||||
authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"),
|
||||
# remove leading and trailing brackets
|
||||
authors2 = gsub("^[(](.*)[)]$", "\\1", authors2),
|
||||
# only take part after brackets if there's a name
|
||||
authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
|
||||
gsub(".*[)] (.*)", "\\1", authors2),
|
||||
authors2),
|
||||
# get year from last 4 digits
|
||||
lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)),
|
||||
# can never be later than now
|
||||
lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
|
||||
NA,
|
||||
lastyear),
|
||||
# get authors without last year
|
||||
authors = gsub("(.*)[0-9]{4}$", "\\1", authors2),
|
||||
# remove nonsense characters from names
|
||||
authors = gsub("[^a-zA-Z,'& -]", "", authors),
|
||||
# remove trailing and leading spaces
|
||||
authors = trimws(authors),
|
||||
# only keep first author and replace all others by 'et al'
|
||||
authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors),
|
||||
# et al. always with ending dot
|
||||
authors = gsub(" et al\\.?", " et al.", authors),
|
||||
authors = gsub(" ?,$", "", authors),
|
||||
# don't start with 'sensu' or 'ehrenb'
|
||||
authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE),
|
||||
# no initials, only surname
|
||||
authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE),
|
||||
# combine author and year if year is available
|
||||
ref = ifelse(!is.na(lastyear),
|
||||
paste0(authors, ", ", lastyear),
|
||||
authors),
|
||||
# fix beginning and ending
|
||||
ref = gsub(", $", "", ref),
|
||||
ref = gsub("^, ", "", ref)) %>%
|
||||
# remove text if it contains 'Not assigned' like phylum in viruses
|
||||
mutate_all(~gsub("Not assigned", "", .)) %>%
|
||||
# Remove non-ASCII characters (these are not allowed by CRAN)
|
||||
lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
|
||||
as_tibble(stringsAsFactors = FALSE) %>%
|
||||
mutate(fullname = trimws(case_when(rank == "family" ~ family,
|
||||
rank == "order" ~ order,
|
||||
rank == "class" ~ class,
|
||||
rank == "phylum" ~ phylum,
|
||||
rank == "kingdom" ~ kingdom,
|
||||
TRUE ~ paste(genus, species, subspecies))))
|
||||
}
|
||||
clean_old <- function(old, new) {
|
||||
old %>%
|
||||
# only the ones that exist in the new data set
|
||||
filter(col_id_new %in% new$col_id) %>%
|
||||
mutate(
|
||||
authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"),
|
||||
# remove leading and trailing brackets
|
||||
authors2 = gsub("^[(](.*)[)]$", "\\1", authors2),
|
||||
# only take part after brackets if there's a name
|
||||
authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
|
||||
gsub(".*[)] (.*)", "\\1", authors2),
|
||||
authors2),
|
||||
# get year from last 4 digits
|
||||
lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)),
|
||||
# can never be later than now
|
||||
lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
|
||||
NA,
|
||||
lastyear),
|
||||
# get authors without last year
|
||||
authors = gsub("(.*)[0-9]{4}$", "\\1", authors2),
|
||||
# remove nonsense characters from names
|
||||
authors = gsub("[^a-zA-Z,'& -]", "", authors),
|
||||
# remove trailing and leading spaces
|
||||
authors = trimws(authors),
|
||||
# only keep first author and replace all others by 'et al'
|
||||
authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors),
|
||||
# et al. always with ending dot
|
||||
authors = gsub(" et al\\.?", " et al.", authors),
|
||||
authors = gsub(" ?,$", "", authors),
|
||||
# don't start with 'sensu' or 'ehrenb'
|
||||
authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE),
|
||||
# no initials, only surname
|
||||
authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE),
|
||||
# combine author and year if year is available
|
||||
ref = ifelse(!is.na(lastyear),
|
||||
paste0(authors, ", ", lastyear),
|
||||
authors),
|
||||
# fix beginning and ending
|
||||
ref = gsub(", $", "", ref),
|
||||
ref = gsub("^, ", "", ref)) %>%
|
||||
# remove text if it contains 'Not assigned' like phylum in viruses
|
||||
mutate_all(~gsub("Not assigned", "", .)) %>%
|
||||
# Remove non-ASCII characters (these are not allowed by CRAN)
|
||||
lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
|
||||
as_tibble(stringsAsFactors = FALSE) %>%
|
||||
select(col_id_new, fullname, ref, authors2) %>%
|
||||
left_join(new %>% select(col_id, fullname_new = fullname), by = c(col_id_new = "col_id")) %>%
|
||||
mutate(fullname = trimws(
|
||||
gsub("(.*)[(].*", "\\1",
|
||||
stringr::str_replace(
|
||||
string = fullname,
|
||||
pattern = stringr::fixed(authors2),
|
||||
replacement = "")) %>%
|
||||
gsub(" (var|f|subsp)[.]", "", .))) %>%
|
||||
select(-c("col_id_new", "authors2")) %>%
|
||||
filter(!is.na(fullname), !is.na(fullname_new)) %>%
|
||||
filter(fullname != fullname_new, !fullname %like% "^[?]")
|
||||
}
|
||||
|
||||
# clean CoL and GBIF ----
|
||||
# clean data_col
|
||||
data_col <- data_col %>%
|
||||
as_tibble() %>%
|
||||
select(col_id = taxonID,
|
||||
col_id_new = acceptedNameUsageID,
|
||||
fullname = scientificName,
|
||||
kingdom,
|
||||
phylum,
|
||||
class,
|
||||
order,
|
||||
family,
|
||||
genus,
|
||||
species = specificEpithet,
|
||||
subspecies = infraspecificEpithet,
|
||||
rank = taxonRank,
|
||||
ref = scientificNameAuthorship,
|
||||
species_id = furtherInformationURL) %>%
|
||||
mutate(source = "CoL")
|
||||
# split into old and new
|
||||
data_col.new <- data_col %>% clean_new()
|
||||
data_col.old <- data_col %>% clean_old(new = data_col.new)
|
||||
rm(data_col)
|
||||
|
||||
# clean data_gbif
|
||||
data_gbif <- data_gbif %>%
|
||||
as_tibble() %>%
|
||||
filter(
|
||||
# no uncertain taxonomic placements
|
||||
taxonRemarks != "doubtful",
|
||||
kingdom != "incertae sedis",
|
||||
taxonRank != "unranked") %>%
|
||||
transmute(col_id = taxonID,
|
||||
col_id_new = acceptedNameUsageID,
|
||||
fullname = scientificName,
|
||||
kingdom,
|
||||
phylum,
|
||||
class,
|
||||
order,
|
||||
family,
|
||||
genus,
|
||||
species = specificEpithet,
|
||||
subspecies = infraspecificEpithet,
|
||||
rank = taxonRank,
|
||||
ref = scientificNameAuthorship,
|
||||
species_id = as.character(parentNameUsageID)) %>%
|
||||
mutate(source = "GBIF")
|
||||
# split into old and new
|
||||
data_gbif.new <- data_gbif %>% clean_new()
|
||||
data_gbif.old <- data_gbif %>% clean_old(new = data_gbif.new)
|
||||
rm(data_gbif)
|
||||
|
||||
# put CoL and GBIF together ----
|
||||
MOs.new <- bind_rows(data_col.new,
|
||||
data_gbif.new) %>%
|
||||
mutate(taxonomic_tree_length = nchar(trimws(paste(kingdom, phylum, class, order, family, genus, species, subspecies)))) %>%
|
||||
arrange(desc(taxonomic_tree_length)) %>%
|
||||
distinct(fullname, .keep_all = TRUE) %>%
|
||||
select(-c("col_id_new", "authors2", "authors", "lastyear", "taxonomic_tree_length")) %>%
|
||||
arrange(fullname)
|
||||
MOs.old <- bind_rows(data_col.old,
|
||||
data_gbif.old) %>%
|
||||
distinct(fullname, .keep_all = TRUE) %>%
|
||||
arrange(fullname)
|
||||
|
||||
# clean up DSMZ ---
|
||||
data_dsmz <- data_dsmz %>%
|
||||
as_tibble() %>%
|
||||
transmute(col_id = NA_integer_,
|
||||
col_id_new = NA_integer_,
|
||||
fullname = "",
|
||||
# kingdom = "",
|
||||
# phylum = "",
|
||||
# class = "",
|
||||
# order = "",
|
||||
# family = "",
|
||||
genus = ifelse(is.na(GENUS), "", GENUS),
|
||||
species = ifelse(is.na(SPECIES), "", SPECIES),
|
||||
subspecies = ifelse(is.na(SUBSPECIES), "", SUBSPECIES),
|
||||
rank = ifelse(species == "", "genus", "species"),
|
||||
ref = AUTHORS,
|
||||
species_id = as.character(RECORD_NO),
|
||||
source = "DSMZ")
|
||||
|
||||
# DSMZ only contains genus/(sub)species, try to find taxonomic properties based on genus and data_col
|
||||
ref_taxonomy <- MOs.new %>%
|
||||
distinct(genus, .keep_all = TRUE) %>%
|
||||
filter(family != "") %>%
|
||||
filter(genus %in% data_dsmz$genus) %>%
|
||||
distinct(genus, .keep_all = TRUE) %>%
|
||||
select(kingdom, phylum, class, order, family, genus)
|
||||
|
||||
data_dsmz <- data_dsmz %>%
|
||||
left_join(ref_taxonomy, by = "genus") %>%
|
||||
mutate(kingdom = "Bacteria")
|
||||
|
||||
data_dsmz.new <- data_dsmz %>%
|
||||
clean_new() %>%
|
||||
distinct(fullname, .keep_all = TRUE) %>%
|
||||
select(colnames(MOs.new)) %>%
|
||||
arrange(fullname)
|
||||
|
||||
# combine everything ----
|
||||
MOs <- bind_rows(MOs.new,
|
||||
data_dsmz.new) %>%
|
||||
distinct(fullname, .keep_all = TRUE) %>%
|
||||
# not the ones that are old
|
||||
filter(!fullname %in% MOs.old$fullname) %>%
|
||||
arrange(fullname) %>%
|
||||
mutate(col_id = ifelse(source != "CoL", NA_integer_, col_id)) %>%
|
||||
filter(fullname != "")
|
||||
|
||||
rm(data_col.new)
|
||||
rm(data_col.old)
|
||||
rm(data_gbif.new)
|
||||
rm(data_gbif.old)
|
||||
rm(data_dsmz)
|
||||
rm(data_dsmz.new)
|
||||
rm(ref_taxonomy)
|
||||
rm(MOs.new)
|
||||
|
||||
MOs.bak <- MOs
|
||||
|
||||
# Trichomonas trick ----
|
||||
# for species in Trypanosoma and Trichomonas we observe al lot of taxonomic info missing
|
||||
MOs %>% filter(genus %in% c("Trypanosoma", "Trichomonas")) %>% View()
|
||||
MOs[which(MOs$genus == "Trypanosoma"), "kingdom"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$kingdom
|
||||
MOs[which(MOs$genus == "Trypanosoma"), "phylum"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$phylum
|
||||
MOs[which(MOs$genus == "Trypanosoma"), "class"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$class
|
||||
MOs[which(MOs$genus == "Trypanosoma"), "order"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$order
|
||||
MOs[which(MOs$genus == "Trypanosoma"), "family"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$family
|
||||
MOs[which(MOs$genus == "Trichomonas"), "kingdom"] <- MOs[which(MOs$fullname == "Trichomonas"),]$kingdom
|
||||
MOs[which(MOs$genus == "Trichomonas"), "phylum"] <- MOs[which(MOs$fullname == "Trichomonas"),]$phylum
|
||||
MOs[which(MOs$genus == "Trichomonas"), "class"] <- MOs[which(MOs$fullname == "Trichomonas"),]$class
|
||||
MOs[which(MOs$genus == "Trichomonas"), "order"] <- MOs[which(MOs$fullname == "Trichomonas"),]$order
|
||||
MOs[which(MOs$genus == "Trichomonas"), "family"] <- MOs[which(MOs$fullname == "Trichomonas"),]$family
|
||||
|
||||
# fill taxonomic properties that are missing
|
||||
MOs <- MOs %>%
|
||||
mutate(phylum = ifelse(phylum %in% c(NA, ""), "(unknown phylum)", phylum),
|
||||
class = ifelse(class %in% c(NA, ""), "(unknown class)", class),
|
||||
order = ifelse(order %in% c(NA, ""), "(unknown order)", order),
|
||||
family = ifelse(family %in% c(NA, ""), "(unknown family)", family))
|
||||
|
||||
# Abbreviations ----
|
||||
# Add abbreviations so we can easily know which ones are which ones.
|
||||
# These will become valid and unique microbial IDs for the AMR package.
|
||||
MOs <- MOs %>%
|
||||
arrange(kingdom, fullname) %>%
|
||||
group_by(kingdom) %>%
|
||||
mutate(abbr_other = case_when(
|
||||
rank == "family" ~ paste0("[FAM]_",
|
||||
abbreviate(family,
|
||||
minlength = 8,
|
||||
use.classes = TRUE,
|
||||
method = "both.sides",
|
||||
strict = FALSE)),
|
||||
rank == "order" ~ paste0("[ORD]_",
|
||||
abbreviate(order,
|
||||
minlength = 8,
|
||||
use.classes = TRUE,
|
||||
method = "both.sides",
|
||||
strict = FALSE)),
|
||||
rank == "class" ~ paste0("[CLS]_",
|
||||
abbreviate(class,
|
||||
minlength = 8,
|
||||
use.classes = TRUE,
|
||||
method = "both.sides",
|
||||
strict = FALSE)),
|
||||
rank == "phylum" ~ paste0("[PHL]_",
|
||||
abbreviate(phylum,
|
||||
minlength = 8,
|
||||
use.classes = TRUE,
|
||||
method = "both.sides",
|
||||
strict = FALSE)),
|
||||
rank == "kingdom" ~ paste0("[KNG]_", kingdom),
|
||||
TRUE ~ NA_character_
|
||||
)) %>%
|
||||
# abbreviations determined per kingdom and family
|
||||
# becuase they are part of the abbreviation
|
||||
mutate(abbr_genus = abbreviate(genus,
|
||||
minlength = 7,
|
||||
use.classes = TRUE,
|
||||
method = "both.sides",
|
||||
strict = FALSE)) %>%
|
||||
ungroup() %>%
|
||||
group_by(genus) %>%
|
||||
# species abbreviations may be the same between genera
|
||||
# because the genus abbreviation is part of the abbreviation
|
||||
mutate(abbr_species = abbreviate(stringr::str_to_title(species),
|
||||
minlength = 3,
|
||||
use.classes = FALSE,
|
||||
method = "both.sides")) %>%
|
||||
ungroup() %>%
|
||||
group_by(genus, species) %>%
|
||||
mutate(abbr_subspecies = abbreviate(stringr::str_to_title(subspecies),
|
||||
minlength = 3,
|
||||
use.classes = FALSE,
|
||||
method = "both.sides")) %>%
|
||||
ungroup() %>%
|
||||
# remove trailing underscores
|
||||
mutate(mo = gsub("_+$", "",
|
||||
toupper(paste(
|
||||
# first character: kingdom
|
||||
ifelse(kingdom %in% c("Animalia", "Plantae"),
|
||||
substr(kingdom, 1, 2),
|
||||
substr(kingdom, 1, 1)),
|
||||
# next: genus, species, subspecies
|
||||
ifelse(is.na(abbr_other),
|
||||
paste(abbr_genus,
|
||||
abbr_species,
|
||||
abbr_subspecies,
|
||||
sep = "_"),
|
||||
abbr_other),
|
||||
sep = "_")))) %>%
|
||||
mutate(mo = ifelse(duplicated(.$mo),
|
||||
# these one or two must be unique too
|
||||
paste0(mo, "1"),
|
||||
mo),
|
||||
fullname = ifelse(fullname == "",
|
||||
trimws(paste(genus, species, subspecies)),
|
||||
fullname)) %>%
|
||||
# put `mo` in front, followed by the rest
|
||||
select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies)
|
||||
|
||||
# add non-taxonomic entries
|
||||
MOs <- MOs %>%
|
||||
bind_rows(
|
||||
# Unknowns
|
||||
data.frame(mo = "UNKNOWN",
|
||||
col_id = NA_integer_,
|
||||
fullname = "(unknown name)",
|
||||
kingdom = "(unknown kingdom)",
|
||||
phylum = "(unknown phylum)",
|
||||
class = "(unknown class)",
|
||||
order = "(unknown order)",
|
||||
family = "(unknown family)",
|
||||
genus = "(unknown genus)",
|
||||
species = "(unknown species)",
|
||||
subspecies = "(unknown subspecies)",
|
||||
rank = "(unknown rank)",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added",
|
||||
stringsAsFactors = FALSE),
|
||||
data.frame(mo = "B_GRAMN",
|
||||
col_id = NA_integer_,
|
||||
fullname = "(unknown Gram-negatives)",
|
||||
kingdom = "Bacteria",
|
||||
phylum = "(unknown phylum)",
|
||||
class = "(unknown class)",
|
||||
order = "(unknown order)",
|
||||
family = "(unknown family)",
|
||||
genus = "(unknown Gram-negatives)",
|
||||
species = "(unknown species)",
|
||||
subspecies = "(unknown subspecies)",
|
||||
rank = "species",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added",
|
||||
stringsAsFactors = FALSE),
|
||||
data.frame(mo = "B_GRAMP",
|
||||
col_id = NA_integer_,
|
||||
fullname = "(unknown Gram-positives)",
|
||||
kingdom = "Bacteria",
|
||||
phylum = "(unknown phylum)",
|
||||
class = "(unknown class)",
|
||||
order = "(unknown order)",
|
||||
family = "(unknown family)",
|
||||
genus = "(unknown Gram-positives)",
|
||||
species = "(unknown species)",
|
||||
subspecies = "(unknown subspecies)",
|
||||
rank = "species",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added",
|
||||
stringsAsFactors = FALSE),
|
||||
# CoNS
|
||||
MOs %>%
|
||||
filter(genus == "Staphylococcus", species == "") %>% .[1,] %>%
|
||||
mutate(mo = paste(mo, "CNS", sep = "_"),
|
||||
rank = "species",
|
||||
col_id = NA_integer_,
|
||||
species = "coagulase-negative",
|
||||
fullname = "Coagulase-negative Staphylococcus (CoNS)",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
# CoPS
|
||||
MOs %>%
|
||||
filter(genus == "Staphylococcus", species == "") %>% .[1,] %>%
|
||||
mutate(mo = paste(mo, "CPS", sep = "_"),
|
||||
rank = "species",
|
||||
col_id = NA_integer_,
|
||||
species = "coagulase-positive",
|
||||
fullname = "Coagulase-positive Staphylococcus (CoPS)",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
# Streptococci groups A, B, C, F, H, K
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "pyogenes") %>% .[1,] %>%
|
||||
# we can keep all other details, since S. pyogenes is the only member of group A
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRA", sep = "_"),
|
||||
species = "group A" ,
|
||||
fullname = "Streptococcus group A"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
# we can keep all other details, since S. agalactiae is the only member of group B
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRB", sep = "_"),
|
||||
species = "group B" ,
|
||||
fullname = "Streptococcus group B"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRC", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "group C" ,
|
||||
fullname = "Streptococcus group C",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRD", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "group D" ,
|
||||
fullname = "Streptococcus group D",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRF", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "group F" ,
|
||||
fullname = "Streptococcus group F",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRG", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "group G" ,
|
||||
fullname = "Streptococcus group G",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRH", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "group H" ,
|
||||
fullname = "Streptococcus group H",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRK", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "group K" ,
|
||||
fullname = "Streptococcus group K",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added"),
|
||||
# Beta-haemolytic Streptococci
|
||||
MOs %>%
|
||||
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
|
||||
mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "HAE", sep = "_"),
|
||||
col_id = NA_integer_,
|
||||
species = "beta-haemolytic" ,
|
||||
fullname = "Beta-haemolytic Streptococcus",
|
||||
ref = NA_character_,
|
||||
species_id = "",
|
||||
source = "manually added")
|
||||
)
|
||||
|
||||
|
||||
# everything distinct?
|
||||
sum(duplicated(MOs$mo))
|
||||
colnames(MOs)
|
||||
|
||||
# set prevalence per species
|
||||
MOs <- MOs %>%
|
||||
mutate(prevalence = case_when(
|
||||
class == "Gammaproteobacteria"
|
||||
| genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
|
||||
| mo %in% c("UNKNOWN", "B_GRAMN", "B_GRAMP")
|
||||
~ 1,
|
||||
phylum %in% c("Proteobacteria",
|
||||
"Firmicutes",
|
||||
"Actinobacteria",
|
||||
"Sarcomastigophora")
|
||||
| genus %in% c("Aspergillus",
|
||||
"Bacteroides",
|
||||
"Candida",
|
||||
"Capnocytophaga",
|
||||
"Chryseobacterium",
|
||||
"Cryptococcus",
|
||||
"Elisabethkingia",
|
||||
"Flavobacterium",
|
||||
"Fusobacterium",
|
||||
"Giardia",
|
||||
"Leptotrichia",
|
||||
"Mycoplasma",
|
||||
"Prevotella",
|
||||
"Rhodotorula",
|
||||
"Treponema",
|
||||
"Trichophyton",
|
||||
"Trichomonas",
|
||||
"Ureaplasma")
|
||||
| rank %in% c("kingdom", "phylum", "class", "order", "family")
|
||||
~ 2,
|
||||
TRUE ~ 3
|
||||
))
|
||||
|
||||
# arrange
|
||||
MOs <- MOs %>% arrange(fullname)
|
||||
|
||||
# transform
|
||||
MOs <- as.data.frame(MOs, stringsAsFactors = FALSE)
|
||||
MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE)
|
||||
class(MOs$mo) <- "mo"
|
||||
MOs$col_id <- as.integer(MOs$col_id)
|
||||
|
||||
# get differences in MO codes between this data and the package version
|
||||
MO_diff <- AMR::microorganisms %>%
|
||||
mutate(pastedtext = paste(mo, fullname)) %>%
|
||||
filter(!pastedtext %in% (MOs %>% mutate(pastedtext = paste(mo, fullname)) %>% pull(pastedtext))) %>%
|
||||
select(mo_old = mo, fullname, pastedtext) %>%
|
||||
left_join(MOs %>%
|
||||
transmute(mo_new = mo, fullname_new = fullname, pastedtext = paste(mo, fullname)), "pastedtext") %>%
|
||||
select(mo_old, mo_new, fullname_new)
|
||||
|
||||
mo_diff2 <- AMR::microorganisms %>%
|
||||
select(mo, fullname) %>%
|
||||
left_join(MOs %>%
|
||||
select(mo, fullname),
|
||||
by = "fullname",
|
||||
suffix = c("_old", "_new")) %>%
|
||||
filter(mo_old != mo_new,
|
||||
#!mo_new %in% mo_old,
|
||||
!mo_old %like% "\\[")
|
||||
|
||||
mo_diff3 <- tibble(previous_old = names(AMR:::make_trans_tbl()),
|
||||
previous_new = AMR:::make_trans_tbl()) %>%
|
||||
left_join(AMR::microorganisms %>% select(mo, fullname), by = c(previous_new = "mo")) %>%
|
||||
left_join(MOs %>% select(mo_new = mo, fullname), by = "fullname")
|
||||
|
||||
# what did we win most?
|
||||
MOs %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% freq(genus)
|
||||
# what did we lose most?
|
||||
AMR::microorganisms %>%
|
||||
filter(kingdom != "Chromista" & !fullname %in% MOs$fullname & !fullname %in% MOs.old$fullname) %>%
|
||||
freq(genus)
|
||||
|
||||
|
||||
# save
|
||||
saveRDS(MOs, "microorganisms.rds")
|
||||
saveRDS(MOs.old, "microorganisms.old.rds")
|
||||
|
||||
# on the server, do:
|
||||
usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
|
||||
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
|
||||
rm(microorganisms)
|
||||
rm(microorganisms.old)
|
||||
|
||||
# TO DO AFTER THIS
|
||||
# * Update the year and dim()s in R/data.R
|
||||
# * Rerun data-raw/reproduction_of_rsi_translation.R
|
||||
# * Run unit tests
|
Reference in New Issue
Block a user