AMR/data-raw/reproduction_of_microorgani...

980 lines
42 KiB
R
Raw Normal View History

# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Data Analysis for R #
# #
# SOURCE #
# https://github.com/msberends/AMR #
# #
# LICENCE #
2021-12-23 18:56:28 +01:00
# (c) 2018-2022 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
# Diagnostics & Advice, and University Medical Center Groningen. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
# ==================================================================== #
2019-03-18 14:29:41 +01:00
# Reproduction of the `microorganisms` data set
# Data retrieved from the Catalogue of Life (CoL):
# https://download.catalogueoflife.org/col/monthly/
# (download latest dwca, such as https://download.catalogueoflife.org/col/monthly/2022-01-14_dwca.zip)
2020-05-27 16:37:49 +02:00
# Data retrieved from the Global Biodiversity Information Facility (GBIF):
2020-03-14 14:05:43 +01:00
# https://doi.org/10.15468/rffz4x
2020-05-27 16:37:49 +02:00
#
2021-02-18 23:23:14 +01:00
# And from the List of Prokaryotic names with Standing in Nomenclature (LPSN)
# (register first) https://lpsn.dsmz.de/downloads
# download the latest CSV file.
2019-02-20 00:04:48 +01:00
2019-02-21 23:32:30 +01:00
library(dplyr)
library(AMR)
2020-05-27 16:37:49 +02:00
# also needed: data.table, httr, jsonlite, cleaner, stringr
# unzip the downloads and extract taxon.tsv (from CoL) and taxa.txt (from GBIF); both are around 1.5 GB with 3.7-3.9M rows, then:
data_col_raw <- data.table::fread("data-raw/taxon.tsv", quote = "")
2020-05-27 16:37:49 +02:00
data_gbif <- data.table::fread("data-raw/taxa.txt", quote = "")
# merge the two
data_col <- data_gbif %>%
rename(referenceID = identifier) %>%
bind_rows(data_col_raw) %>%
distinct(scientificName, kingdom, genus, specificEpithet, infraspecificEpithet, .keep_all = TRUE)
rm(data_col_raw)
rm(data_gbif)
# read the data from the DSMZ API (around 19000 rows)
dsmz_username <- ""
dsmz_password <- ""
# Request one page from the DSMZ API and parse its JSON body.
#
# @param url Full URL of the API page to request.
# @param username,password Credentials for HTTP basic authentication. They
#   default to the script-level `dsmz_username` and `dsmz_password`, so
#   existing calls of the form `GET_df(url)` keep working.
# @return The parsed JSON, as returned by `jsonlite::fromJSON(flatten = TRUE)`.
#   Stops with an error when the HTTP status signals a failure.
GET_df <- function(url, username = dsmz_username, password = dsmz_password) {
  result <- httr::GET(url, httr::authenticate(username, password))
  # fail right away on 4xx/5xx instead of trying to parse an error page
  httr::stop_for_status(result)
  body <- httr::content(result, type = "text", encoding = "UTF-8")
  jsonlite::fromJSON(body, flatten = TRUE)
}
# Download every page of the DSMZ species list and stack them into `data_dsmz`.
dsmz_first <- GET_df("https://bacdive.dsmz.de/api/pnu/species?page=1&format=json")
# Number of pages, using the 100-records-per-page arithmetic of this script.
# ceiling() is used because round(x + 0.5) rounds half to even in R and would
# request a non-existing extra page for e.g. 300 records.
dsmz_pages <- ceiling(dsmz_first$count / 100)
# collect the pages in a preallocated list and bind once at the end,
# instead of growing a data.frame with rbind() on every iteration
dsmz_results <- vector("list", max(dsmz_pages, 1))
dsmz_results[[1]] <- dsmz_first$results
# this next process will take appr. `dsmz_first$count / 100 * 5 / 60` minutes
# seq_len(n)[-1] is empty when there is only one page (2:1 would count backwards)
for (i in seq_len(dsmz_pages)[-1]) {
  dsmz_results[[i]] <- GET_df(paste0("https://bacdive.dsmz.de/api/pnu/species/?page=", i, "&format=json"))$results
  cat(i, "-", AMR:::percentage(i / dsmz_pages), "\n")
}
data_dsmz <- do.call(rbind, dsmz_results)
rm(dsmz_first, dsmz_pages, dsmz_results)
2019-03-18 14:29:41 +01:00
# the CoL data is over 3.7M rows:
2020-05-27 16:37:49 +02:00
data_col %>% cleaner::freq(kingdom)
2019-02-20 00:04:48 +01:00
# Item Count Percent Cum. Count Cum. Percent
# --- ---------- ---------- -------- ----------- -------------
2020-05-27 16:37:49 +02:00
# 1 Animalia 2,494,992 55.43% 2,494,992 55.43%
# 2 Plantae 1,379,674 30.65% 3,874,666 86.08%
# 3 Fungi 547,619 12.17% 4,422,285 98.24%
# 4 Chromista 51,475 1.14% 4,473,760 99.39%
# 5 Bacteria 14,442 0.32% 4,488,202 99.71%
# 6 Protozoa 8,750 0.19% 4,496,952 99.90%
# 7 Viruses 3,805 0.08% 4,500,757 99.99%
# 8 Archaea 609 0.01% 4,501,366 100.00%
2019-02-20 00:04:48 +01:00
2019-03-18 14:29:41 +01:00
# clean data_col
2020-05-27 16:37:49 +02:00
data_col.bak <- data_col
data_col_old <- data_col %>%
# filter: has new accepted name
filter(!is.na(acceptedNameUsageID)) %>%
as_tibble() %>%
transmute(fullname = trimws(stringr::str_replace(scientificName,
pattern = stringr::fixed(scientificNameAuthorship),
replacement = "")),
fullname_new = trimws(paste(ifelse(is.na(genus), "", genus),
ifelse(is.na(specificEpithet), "", specificEpithet),
ifelse(is.na(infraspecificEpithet), "", infraspecificEpithet))),
ref = scientificNameAuthorship,
prevalence = NA_integer_)
2019-03-18 14:29:41 +01:00
data_col <- data_col %>%
2020-05-27 16:37:49 +02:00
# filter: has no new accepted name
filter(is.na(acceptedNameUsageID)) %>%
2019-03-18 14:29:41 +01:00
as_tibble() %>%
2020-05-27 16:37:49 +02:00
transmute(fullname = "",
kingdom,
phylum,
class,
order,
family,
genus,
species = specificEpithet,
subspecies = infraspecificEpithet,
rank = taxonRank,
ref = scientificNameAuthorship,
species_id = referenceID,
source = "CoL")
2019-03-18 14:29:41 +01:00
# clean data_dsmz
2020-05-27 16:37:49 +02:00
data_dsmz.bak <- data_dsmz
# Old (renamed) DSMZ names: every record that carries a `correct_name`.
# `fullname` is the old name built from genus/species/subspecies,
# `fullname_new` is the second element of `correct_name`.
data_dsmz_old <- data_dsmz %>%
  # filter: correct name is not NULL
  # (vapply() always returns a logical vector, also for zero rows)
  filter(!vapply(correct_name, is.null, logical(1))) %>%
  as_tibble() %>%
  transmute(fullname = trimws(paste(ifelse(is.na(genus), "", genus),
                                    ifelse(is.na(species_epithet), "", species_epithet),
                                    ifelse(is.na(subspecies_epithet), "", subspecies_epithet))),
            # second element of `correct_name`; NA when it has fewer than two elements
            fullname_new = vapply(correct_name,
                                  function(x) as.character(x[2L]),
                                  character(1)),
            ref = authors,
            prevalence = NA_integer_)
2019-03-18 14:29:41 +01:00
# Current DSMZ names: every record without a `correct_name`, reshaped to the
# same columns as the CoL data (except `order`, which DSMZ does not provide).
data_dsmz <- data_dsmz %>%
  # filter: correct name is NULL
  # (vapply() always returns a logical vector, also for zero rows)
  filter(vapply(correct_name, is.null, logical(1))) %>%
  as_tibble() %>%
  transmute(fullname = "",
            kingdom = regio,
            phylum,
            class = classis,
            # order = "", # does not contain order, will add later based on CoL
            family = familia,
            genus = ifelse(is.na(genus), "", genus),
            species = ifelse(is.na(species_epithet), "", species_epithet),
            subspecies = ifelse(is.na(subspecies_epithet), "", subspecies_epithet),
            # uses the `species` value created on the line above
            rank = ifelse(species == "", "genus", "species"),
            ref = authors,
            species_id = as.character(pnu_no),
            source = "DSMZ")
# DSMZ only contains genus/(sub)species, try to find taxonomic properties based on genus and data_col
ref_taxonomy <- data_col %>%
2020-05-27 16:37:49 +02:00
filter(family %in% data_dsmz$family & family != "") %>%
arrange(kingdom) %>%
2020-05-27 16:37:49 +02:00
distinct(family, .keep_all = TRUE) %>%
select(family, order)
2019-03-18 14:29:41 +01:00
data_dsmz <- data_dsmz %>%
2020-05-27 16:37:49 +02:00
left_join(ref_taxonomy, by = "family") # NAs will later become "(unknown ...)"
2019-03-18 14:29:41 +01:00
# combine everything
data_total <- data_col %>%
bind_rows(data_dsmz)
rm(data_col)
rm(data_dsmz)
rm(ref_taxonomy)
2020-05-27 16:37:49 +02:00
rm(data_col.bak)
rm(data_dsmz.bak)
2019-03-18 14:29:41 +01:00
MOs <- data_total %>%
2019-02-20 00:04:48 +01:00
filter(
2019-02-28 13:56:28 +01:00
(
2019-04-05 18:47:39 +02:00
# we only want all MICROorganisms and no viruses
!kingdom %in% c("Animalia", "Plantae", "Viruses")
2019-03-18 14:29:41 +01:00
# and not all fungi: Aspergillus, Candida, Trichophyton and Pneumocystis are the most important,
2019-02-28 13:56:28 +01:00
# so only keep these orders from the fungi:
& !(kingdom == "Fungi"
2019-08-09 23:22:10 +02:00
& !order %in% c("Eurotiales", "Microascales", "Mucorales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales"))
2019-02-28 13:56:28 +01:00
)
2019-03-18 14:29:41 +01:00
# or the genus has to be one of the genera we found in our hospitals last decades (Northern Netherlands, 2002-2018)
2022-06-10 13:15:23 +02:00
| genus %in% MO_PREVALENT_GENERA
2019-08-09 23:22:10 +02:00
) %>%
# really no Plantae (e.g. Dracunculus exist both as worm and as plant)
2019-09-18 15:46:09 +02:00
filter(kingdom != "Plantae") %>%
filter(!rank %in% c("kingdom", "phylum", "class", "order", "family", "genus"))
# include all ranks other than species for the included species
MOs <- MOs %>% bind_rows(data_total %>%
filter((kingdom %in% MOs$kingdom & rank == "kingdom")
| (phylum %in% MOs$phylum & rank == "phylum")
| (class %in% MOs$class & rank == "class")
| (order %in% MOs$order & rank == "order")
| (family %in% MOs$family & rank == "family")
| (genus %in% MOs$genus & rank == "genus")))
2019-04-05 18:47:39 +02:00
2020-05-27 16:37:49 +02:00
# Condense taxonomic authorship strings to "<first author>[ et al.], <year>".
#
# @param ref Character vector of authorship references,
#   e.g. "Smith, Jones, 2011" or "(Migula, 1895) Castellani & Chalmers, 1919".
# @return Character vector of the same length as `ref`. The year is taken from
#   the last four digits of the reference and is dropped when it is missing or
#   lies in the future; all authors after the first are replaced by "et al.".
get_author_year <- function(ref) {
  # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
  authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT")
  # remove leading and trailing brackets
  authors2 <- gsub("^[(](.*)[)]$", "\\1", authors2)
  # only take part after brackets if there's a name
  authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
                     gsub(".*[)] (.*)", "\\1", authors2),
                     authors2)
  # get year from last 4 digits; only coerce entries that really end in a year,
  # so references without a year become NA without a coercion warning
  has_year <- grepl("[0-9]{4}$", authors2)
  lastyear <- rep(NA_integer_, length(authors2))
  lastyear[has_year] <- as.integer(gsub(".*([0-9]{4})$", "\\1", authors2[has_year]))
  # can never be later than now
  current_year <- as.integer(format(Sys.Date(), "%Y"))
  lastyear[which(lastyear > current_year)] <- NA_integer_
  # get authors without last year
  authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2)
  # remove nonsense characters from names
  authors <- gsub("[^a-zA-Z,'& -]", "", authors)
  # remove trailing and leading spaces
  authors <- trimws(authors)
  # only keep first author and replace all others by 'et al'
  authors <- gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors)
  # et al. always with ending dot
  authors <- gsub(" et al\\.?", " et al.", authors)
  authors <- gsub(" ?,$", "", authors)
  # don't start with 'sensu' or 'ehrenb'
  authors <- gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE)
  # no initials, only surname
  authors <- gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE)
  # combine author and year if year is available
  ref <- ifelse(!is.na(lastyear),
                paste0(authors, ", ", lastyear),
                authors)
  # fix beginning and ending
  ref <- gsub(", $", "", ref)
  ref <- gsub("^, ", "", ref)
  ref <- gsub("^(emend|et al.,?)", "", ref)
  ref <- trimws(ref)
  # a lot start with a lowercase character - fix that,
  # except for names like dOrbigny that are handled separately below
  starts_with_d <- grepl("^d[A-Z]", ref)
  ref[!starts_with_d] <- gsub("^([a-z])", "\\U\\1", ref[!starts_with_d], perl = TRUE)
  # specific one for the French that are named dOrbigny
  ref[starts_with_d] <- gsub("^d", "d'", ref[starts_with_d])
  # collapse repeated spaces
  ref <- gsub(" +", " ", ref)
  ref
}
MOs <- MOs %>% mutate(ref = get_author_year(ref))
2019-02-20 00:04:48 +01:00
2019-03-18 14:29:41 +01:00
# Remove non-ASCII characters (these are not allowed by CRAN)
2019-02-20 00:04:48 +01:00
# Transliterate every column to ASCII and strip quote characters.
# lapply() returns a plain list of columns, so it is turned back into a tibble.
MOs <- MOs %>%
  lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
  # no `stringsAsFactors` here: as_tibble() never creates factors
  as_tibble() %>%
  # remove invalid characters
  mutate(across(everything(), ~ gsub("[\"'`]+", "", .x)))
2019-02-20 00:04:48 +01:00
2020-05-27 16:37:49 +02:00
# set new fullnames
MOs <- MOs %>%
mutate(fullname = trimws(case_when(rank == "family" ~ family,
rank == "order" ~ order,
rank == "class" ~ class,
rank == "phylum" ~ phylum,
rank == "kingdom" ~ kingdom,
TRUE ~ paste(genus, species, subspecies))),
fullname = gsub(" (var|f|subsp)[.]", "", fullname)) %>%
# remove text if it contains 'Not assigned', etc.
mutate_all(function(x) ifelse(x %like% "(not assigned|homonym|mistake)", NA, x)) %>%
# clean taxonomy
mutate(kingdom = ifelse(is.na(kingdom) | trimws(kingdom) == "", "(unknown kingdom)", trimws(kingdom)),
phylum = ifelse(is.na(phylum) | trimws(phylum) == "", "(unknown phylum)", trimws(phylum)),
class = ifelse(is.na(class) | trimws(class) == "", "(unknown class)", trimws(class)),
order = ifelse(is.na(order) | trimws(order) == "", "(unknown order)", trimws(order)),
family = ifelse(is.na(family) | trimws(family) == "", "(unknown family)", trimws(family)))
# Split old taxonomic names
MOs.old <- data_col_old %>%
filter(!gsub(" (var|f|subsp)[.]", "", fullname_new) %in% data_dsmz_old$fullname) %>%
bind_rows(data_dsmz_old) %>%
mutate(fullname_new = gsub(" (var|f|subsp)[.]", "", fullname_new),
fullname = gsub(" (var|f|subsp)[.]", "", fullname)) %>%
# for cases like Chlamydia pneumoniae -> Chlamydophila pneumoniae -> Chlamydia pneumoniae:
filter(!fullname %in% fullname_new &
fullname_new %in% MOs$fullname &
!is.na(fullname) &
fullname != fullname_new) %>%
2019-02-20 00:04:48 +01:00
distinct(fullname, .keep_all = TRUE) %>%
2020-05-27 16:37:49 +02:00
arrange(fullname) %>%
mutate(ref = get_author_year(ref))
2019-09-22 17:19:59 +02:00
2019-02-20 00:04:48 +01:00
# Drop the names that now live in `MOs.old`, fix the column set and order,
# and keep one row per kingdom/fullname combination.
MOs <- MOs %>%
  # remove entries that are old and in MOs.old
  filter(!fullname %in% MOs.old$fullname) %>%
  # mark up
  transmute(fullname,
            kingdom,
            phylum,
            class,
            order,
            family,
            genus,
            species,
            subspecies,
            rank,
            ref,
            # keep only the leading alphanumeric part of the ID
            species_id = gsub("[^a-zA-Z0-9].*", "", species_id),
            source) %>%
  # prefer known taxonomy over unknown taxonomy, then DSMZ over CoL (= desc);
  # desc() accepts a single vector, so every sort key gets its own desc()
  arrange(desc(kingdom), desc(genus), desc(species), desc(source)) %>%
  # distinct() keeps the first row per key, i.e. the preferred one after sorting
  distinct(kingdom, fullname, .keep_all = TRUE)
# remove all genera that have no species - they are irrelevant for microbiology and almost all from the kingdom of Animalia
# Outside Bacteria and Protozoa, find every kingdom/genus pair that occurs on
# exactly one row and drop those rows from MOs.
lone_genus_keys <- MOs %>%
  filter(!kingdom %in% c("Bacteria", "Protozoa")) %>%
  count(kingdom, genus) %>%
  filter(n == 1) %>%
  transmute(key = paste(kingdom, genus)) %>%
  pull(key)
MOs <- MOs %>%
  filter(!paste(kingdom, genus) %in% lone_genus_keys)
rm(lone_genus_keys)
2020-05-27 16:37:49 +02:00
# add all missing genera, families and orders
2019-09-18 15:46:09 +02:00
MOs <- MOs %>%
2020-05-27 16:37:49 +02:00
bind_rows(MOs %>%
arrange(genus, species) %>%
distinct(genus, .keep_all = TRUE) %>%
filter(rank == "species") %>%
mutate(fullname = genus,
species = "",
rank = "genus",
species_id = "",
ref = NA_character_)) %>%
bind_rows(MOs %>%
arrange(family, genus) %>%
distinct(family, .keep_all = TRUE) %>%
filter(rank == "genus") %>%
mutate(fullname = family,
genus = "",
rank = "family",
species_id = "",
ref = NA_character_)) %>%
bind_rows(MOs %>%
arrange(order, family) %>%
distinct(family, .keep_all = TRUE) %>%
filter(rank == "family") %>%
mutate(fullname = order,
family = "",
rank = "order",
species_id = "",
ref = NA_character_))
# remove the empty ones
# Cut each fullname at its first comma, drop rows whose fullname is empty,
# and keep one row per kingdom/fullname combination.
MOs <- MOs %>%
  mutate(fullname = gsub(",.*", "", fullname))
MOs <- MOs %>%
  filter(fullname != "") %>%
  distinct(kingdom, fullname, .keep_all = TRUE)
2019-03-18 14:29:41 +01:00
# what characters are in the fullnames?
2019-08-09 23:22:10 +02:00
table(sort(unlist(strsplit(x = paste(MOs$fullname, collapse = ""), split = ""))))
MOs %>% filter(fullname %unlike% "^[a-z ]+$") %>% arrange(fullname) %>% View()
2019-09-18 15:46:09 +02:00
table(MOs$kingdom, MOs$rank)
table(AMR::microorganisms$kingdom, AMR::microorganisms$rank)
# set prevalence per species
MOs <- MOs %>%
mutate(prevalence = case_when(
class == "Gammaproteobacteria"
| genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
~ 1,
2019-09-20 12:33:05 +02:00
kingdom %in% c("Archaea", "Bacteria", "Chromista", "Fungi")
& (phylum %in% c("Proteobacteria",
"Firmicutes",
"Actinobacteria",
"Sarcomastigophora")
2022-06-10 13:15:23 +02:00
| genus %in% MO_PREVALENT_GENERA
2019-09-20 12:33:05 +02:00
| rank %in% c("kingdom", "phylum", "class", "order", "family"))
2019-09-18 15:46:09 +02:00
~ 2,
TRUE ~ 3
))
2019-03-18 14:29:41 +01:00
# Add abbreviations so we can easily know which ones are which ones.
# These will become valid and unique microbial IDs for the AMR package.
2019-02-20 00:04:48 +01:00
MOs <- MOs %>%
2019-09-20 12:33:05 +02:00
arrange(prevalence, genus, species, subspecies) %>%
2019-02-20 00:04:48 +01:00
group_by(kingdom) %>%
2019-05-10 16:44:59 +02:00
mutate(abbr_other = case_when(
rank == "family" ~ paste0("[FAM]_",
abbreviate(family,
minlength = 8,
use.classes = TRUE,
method = "both.sides",
strict = FALSE)),
rank == "order" ~ paste0("[ORD]_",
abbreviate(order,
minlength = 8,
use.classes = TRUE,
method = "both.sides",
strict = FALSE)),
rank == "class" ~ paste0("[CLS]_",
abbreviate(class,
minlength = 8,
use.classes = TRUE,
method = "both.sides",
strict = FALSE)),
rank == "phylum" ~ paste0("[PHL]_",
abbreviate(phylum,
minlength = 8,
use.classes = TRUE,
method = "both.sides",
strict = FALSE)),
rank == "kingdom" ~ paste0("[KNG]_", kingdom),
TRUE ~ NA_character_
)) %>%
2019-02-20 00:04:48 +01:00
# abbreviations may be same for genera between kingdoms,
2019-03-18 14:29:41 +01:00
# because each abbreviation starts with the first character(s) of the kingdom
2019-09-20 12:33:05 +02:00
mutate(abbr_genus = abbreviate(gsub("^ae", "\u00E6\u00E6", genus, ignore.case = TRUE), # keep a starting Latin ae
2019-02-20 00:04:48 +01:00
minlength = 5,
use.classes = TRUE,
2019-09-20 12:33:05 +02:00
method = "both.sides")) %>%
2019-02-20 00:04:48 +01:00
ungroup() %>%
group_by(genus) %>%
# species abbreviations may be the same between genera
# because the genus abbreviation is part of the abbreviation
2019-09-20 12:33:05 +02:00
mutate(abbr_species = abbreviate(gsub("^ae", "\u00E6\u00E6", species),
2019-09-18 15:46:09 +02:00
minlength = 4,
use.classes = TRUE,
2019-02-20 00:04:48 +01:00
method = "both.sides")) %>%
ungroup() %>%
group_by(genus, species) %>%
2019-09-20 12:33:05 +02:00
mutate(abbr_subspecies = abbreviate(gsub("^ae", "\u00E6\u00E6", subspecies),
2019-09-18 15:46:09 +02:00
minlength = 4,
use.classes = TRUE,
2019-02-20 00:04:48 +01:00
method = "both.sides")) %>%
ungroup() %>%
# remove trailing underscores
mutate(mo = gsub("_+$", "",
2019-03-18 14:29:41 +01:00
toupper(paste(ifelse(kingdom %in% c("Animalia", "Plantae"),
substr(kingdom, 1, 2),
substr(kingdom, 1, 1)),
2019-05-10 16:44:59 +02:00
ifelse(is.na(abbr_other),
paste(abbr_genus,
abbr_species,
abbr_subspecies,
sep = "_"),
abbr_other),
2019-09-20 12:33:05 +02:00
sep = "_"))),
mo = gsub("(\u00C6|\u00E6)+", "AE", mo)) %>%
2019-02-26 12:33:26 +01:00
mutate(mo = ifelse(duplicated(.$mo),
2019-03-18 14:29:41 +01:00
# these one or two must be unique too
2019-02-26 12:33:26 +01:00
paste0(mo, "1"),
mo),
fullname = ifelse(fullname == "",
2019-02-28 13:56:28 +01:00
trimws(paste(genus, species, subspecies)),
fullname)) %>%
2019-03-18 14:29:41 +01:00
# put `mo` in front, followed by the rest
2019-05-10 16:44:59 +02:00
select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies)
2019-02-20 00:04:48 +01:00
# add non-taxonomic entries
MOs <- MOs %>%
bind_rows(
2019-03-02 22:47:04 +01:00
# Unknowns
2019-03-18 14:29:41 +01:00
data.frame(mo = "UNKNOWN",
fullname = "(unknown name)",
kingdom = "(unknown kingdom)",
phylum = "(unknown phylum)",
class = "(unknown class)",
order = "(unknown order)",
family = "(unknown family)",
genus = "(unknown genus)",
species = "(unknown species)",
subspecies = "(unknown subspecies)",
rank = "(unknown rank)",
ref = NA_character_,
species_id = "",
source = "manually added",
2019-09-18 15:46:09 +02:00
prevalence = 1,
2019-03-18 14:29:41 +01:00
stringsAsFactors = FALSE),
2019-03-02 22:47:04 +01:00
data.frame(mo = "B_GRAMN",
2019-06-11 14:18:25 +02:00
fullname = "(unknown Gram-negatives)",
2019-03-02 22:47:04 +01:00
kingdom = "Bacteria",
phylum = "(unknown phylum)",
class = "(unknown class)",
order = "(unknown order)",
family = "(unknown family)",
2019-06-11 14:18:25 +02:00
genus = "(unknown Gram-negatives)",
2019-03-02 22:47:04 +01:00
species = "(unknown species)",
subspecies = "(unknown subspecies)",
rank = "species",
ref = NA_character_,
2019-03-18 14:29:41 +01:00
species_id = "",
source = "manually added",
2019-09-18 15:46:09 +02:00
prevalence = 1,
2019-03-02 22:47:04 +01:00
stringsAsFactors = FALSE),
data.frame(mo = "B_GRAMP",
2019-06-11 14:18:25 +02:00
fullname = "(unknown Gram-positives)",
2019-03-02 22:47:04 +01:00
kingdom = "Bacteria",
phylum = "(unknown phylum)",
class = "(unknown class)",
order = "(unknown order)",
family = "(unknown family)",
2019-06-11 14:18:25 +02:00
genus = "(unknown Gram-positives)",
2019-03-02 22:47:04 +01:00
species = "(unknown species)",
subspecies = "(unknown subspecies)",
rank = "species",
ref = NA_character_,
2019-03-18 14:29:41 +01:00
species_id = "",
source = "manually added",
2019-09-18 15:46:09 +02:00
prevalence = 1,
2019-03-02 22:47:04 +01:00
stringsAsFactors = FALSE),
data.frame(mo = "F_YEAST",
fullname = "(unknown yeast)",
kingdom = "Fungi",
phylum = "(unknown phylum)",
class = "(unknown class)",
order = "(unknown order)",
family = "(unknown family)",
genus = "(unknown genus)",
species = "(unknown species)",
subspecies = "(unknown subspecies)",
rank = "species",
ref = NA_character_,
species_id = "",
source = "manually added",
2019-09-18 15:46:09 +02:00
prevalence = 2,
stringsAsFactors = FALSE),
data.frame(mo = "F_FUNGUS",
fullname = "(unknown fungus)",
kingdom = "Fungi",
phylum = "(unknown phylum)",
class = "(unknown class)",
order = "(unknown order)",
family = "(unknown family)",
genus = "(unknown genus)",
species = "(unknown species)",
subspecies = "(unknown subspecies)",
rank = "species",
ref = NA_character_,
species_id = "",
source = "manually added",
2019-09-18 15:46:09 +02:00
prevalence = 2,
stringsAsFactors = FALSE),
2019-02-20 00:04:48 +01:00
# CoNS
MOs %>%
filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_CONS", mo),
2019-03-18 14:29:41 +01:00
species = "coagulase-negative",
fullname = "Coagulase-negative Staphylococcus (CoNS)",
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
# CoPS
MOs %>%
filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_COPS", mo),
2019-03-18 14:29:41 +01:00
species = "coagulase-positive",
fullname = "Coagulase-positive Staphylococcus (CoPS)",
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
# Streptococci groups A, B, C, F, H, K
MOs %>%
2019-03-18 14:29:41 +01:00
filter(genus == "Streptococcus", species == "pyogenes") %>% .[1,] %>%
# we can keep all other details, since S. pyogenes is the only member of group A
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPA", mo),
2019-02-20 00:04:48 +01:00
species = "group A" ,
2019-09-18 15:46:09 +02:00
fullname = "Streptococcus group A",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
2019-03-18 14:29:41 +01:00
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
# we can keep all other details, since S. agalactiae is the only member of group B
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPB", mo),
2019-02-20 00:04:48 +01:00
species = "group B" ,
2019-09-18 15:46:09 +02:00
fullname = "Streptococcus group B",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
2019-03-18 14:29:41 +01:00
filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPC", mo),
2019-02-20 00:04:48 +01:00
species = "group C" ,
fullname = "Streptococcus group C",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPD", mo),
2019-02-20 00:04:48 +01:00
species = "group D" ,
fullname = "Streptococcus group D",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPF", mo),
2019-02-20 00:04:48 +01:00
species = "group F" ,
fullname = "Streptococcus group F",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPG", mo),
2019-03-18 14:29:41 +01:00
species = "group G" ,
2019-02-20 00:04:48 +01:00
fullname = "Streptococcus group G",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPH", mo),
2019-02-20 00:04:48 +01:00
species = "group H" ,
fullname = "Streptococcus group H",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPK", mo),
2019-02-20 00:04:48 +01:00
species = "group K" ,
fullname = "Streptococcus group K",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
source = "manually added"),
2019-02-20 00:04:48 +01:00
# Beta haemolytic Streptococci
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_HAEM", mo),
2019-02-20 00:04:48 +01:00
species = "beta-haemolytic" ,
fullname = "Beta-haemolytic Streptococcus",
2019-03-18 14:29:41 +01:00
ref = NA_character_,
species_id = "",
2019-06-13 14:28:46 +02:00
source = "manually added"),
2019-08-13 16:15:08 +02:00
# Viridans Streptococci
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_VIRI", mo),
2019-08-13 16:15:08 +02:00
species = "viridans" ,
fullname = "Viridans Group Streptococcus (VGS)",
ref = NA_character_,
species_id = "",
source = "manually added"),
# Milleri Streptococci
MOs %>%
filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_MILL", mo),
2019-08-13 16:15:08 +02:00
species = "milleri" ,
fullname = "Milleri Group Streptococcus (MGS)",
ref = NA_character_,
species_id = "",
source = "manually added"),
# Candida krusei
MOs %>%
filter(genus == "Candida", species == "glabrata") %>% .[1,] %>%
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_KRUS", mo),
species = "krusei" ,
fullname = "Candida krusei",
ref = NA_character_,
species_id = "",
source = "manually added"),
# Blastocystis hominis does not exist (it means 'got a Blastocystis from humans', PMID 15634993)
2019-09-22 17:19:59 +02:00
# but let's be nice to the clinical people in microbiology
MOs %>%
filter(fullname == "Blastocystis") %>%
mutate(mo = paste0(mo, "_HMNS"),
fullname = paste(fullname, "hominis"),
species = "hominis",
source = "manually added",
ref = NA_character_,
species_id = ""),
2019-06-13 14:28:46 +02:00
# Trichomonas vaginalis is missing, same order as Dientamoeba
MOs %>%
filter(fullname == "Dientamoeba") %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*?)_.*", "\\1_THMNS", mo),
2019-06-13 14:28:46 +02:00
fullname = "Trichomonas",
family = "Trichomonadidae",
genus = "Trichomonas",
source = "manually added",
ref = "Donne, 1836",
species_id = ""),
MOs %>%
filter(fullname == "Dientamoeba fragilis") %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*?)_.*", "\\1_THMNS_VAG", mo),
2019-06-13 14:28:46 +02:00
fullname = "Trichomonas vaginalis",
family = "Trichomonadidae",
genus = "Trichomonas",
species = "vaginalis",
source = "manually added",
ref = "Donne, 1836",
species_id = ""),
MOs %>% # add family as such too
filter(fullname == "Monocercomonadidae") %>%
2019-09-18 15:46:09 +02:00
mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_TRCHMNDD", mo),
2019-06-13 14:28:46 +02:00
fullname = "Trichomonadidae",
family = "Trichomonadidae",
rank = "family",
genus = "",
species = "",
source = "manually added",
ref = "",
species_id = ""),
2019-02-20 00:04:48 +01:00
)
2019-10-30 23:02:50 +01:00
# Incorporate new microbial order for Gammaproteobacteria - Adeolu et al. (2016), PMID 27620848
# Reset the old family first, so only the genera listed below end up in a
# family of the order Enterobacterales.
MOs[which(MOs$family == "Enterobacteriaceae"), "family"] <- ""
# family -> genera that are assigned to that family
enterobacterales_genera <- list(
  Enterobacteriaceae = c("Escherichia", "Atlantibacter", "Biostraticola",
                         "Buttiauxella", "Cedecea", "Citrobacter",
                         "Cronobacter", "Enterobacillus", "Enterobacter",
                         "Franconibacter", "Gibbsiella", "Izhakiella",
                         "Klebsiella", "Kluyvera", "Kosakonia", "Leclercia",
                         "Lelliottia", "Mangrovibacter", "Pluralibacter",
                         "Pseudocitrobacter", "Raoultella", "Rosenbergiella",
                         "Saccharobacter", "Salmonella", "Shigella",
                         "Shimwellia", "Siccibacter", "Trabulsiella",
                         "Yokenella"),
  Erwiniaceae = c("Erwinia", "Buchnera", "Pantoea", "Phaseolibacter",
                  "Tatumella", "Wigglesworthia"),
  Pectobacteriaceae = c("Pectobacterium", "Brenneria", "Dickeya", "Lonsdalea",
                        "Sodalis"),
  Yersiniaceae = c("Yersinia", "Chania", "Ewingella", "Rahnella", "Rouxiella",
                   "Samsonia", "Serratia"),
  Hafniaceae = c("Hafnia", "Edwardsiella", "Obesumbacterium"),
  Morganellaceae = c("Morganella", "Arsenophonus", "Cosenzaea", "Moellerella",
                     "Photorhabdus", "Proteus", "Providencia", "Xenorhabdus"),
  Budviciaceae = c("Budvicia", "Leminorella", "Pragia")
)
# assign each genus to its family, in the order of the table above
for (family_name in names(enterobacterales_genera)) {
  member_rows <- which(MOs$genus %in% enterobacterales_genera[[family_name]])
  MOs[member_rows, "family"] <- family_name
}
# all of these families belong to the order Enterobacterales
MOs[which(MOs$family %in% names(enterobacterales_genera)), "order"] <- "Enterobacterales"
rm(enterobacterales_genera, family_name, member_rows)
# Collect the family names that are now part of the order Enterobacterales and
# replace any existing family-level records of those names by freshly built ones.
new_families <- MOs %>%
  filter(order == "Enterobacterales") %>%
  pull(family) %>%
  unique()
# Rows without a family (empty or missing, e.g. genera that lost their
# "Enterobacteriaceae" assignment above, or records of a higher rank) must not
# lead to a family record with an empty name and the code "B_[FAM]_".
new_families <- new_families[!is.na(new_families) & new_families != ""]
MOs <- MOs %>%
  # drop the current family records, they are re-created below
  filter(!(rank == "family" & fullname %in% new_families)) %>%
  bind_rows(tibble(mo = paste0("B_[FAM]_",
                               # code suffix: upper case abbreviation of the family name
                               toupper(abbreviate(new_families,
                                                  minlength = 8,
                                                  use.classes = TRUE,
                                                  method = "both.sides",
                                                  strict = FALSE))),
                   fullname = new_families,
                   kingdom = "Bacteria",
                   phylum = "Proteobacteria",
                   class = "Gammaproteobacteria",
                   order = "Enterobacterales",
                   family = new_families,
                   genus = "",
                   species = "",
                   subspecies = "",
                   rank = "family",
                   ref = "Adeolu et al., 2016",
                   species_id = NA_character_,
                   source = "manually added",
                   prevalence = 1))
2019-10-30 23:02:50 +01:00
# replace the spelling "Enterobacteriales" by "Enterobacterales",
# both as order and as full name
for (column in c("order", "fullname")) {
  MOs[which(MOs[[column]] == "Enterobacteriales"), column] <- "Enterobacterales"
}
rm(column)

# add prevalence to old taxonomic names:
# an old name gets the prevalence of the name it was renamed to (fullname_new)
MOs.old <- left_join(
  select(MOs.old, -prevalence),
  select(MOs, fullname, prevalence),
  by = c(fullname_new = "fullname")
)
2019-09-20 14:18:29 +02:00
2019-02-28 13:56:28 +01:00
# everything distinct?
# number of repeated codes and repeated full names; anything above 0 means duplicates
sum(duplicated(MOs[["mo"]]))
sum(duplicated(MOs[["fullname"]]))
# show the available columns
names(MOs)
2019-02-28 13:56:28 +01:00
2020-07-08 14:48:06 +02:00
# add the ones we would delete now, that have non-existent codes and names (also in the old names)
# That is: records of the currently released AMR::microorganisms whose name is
# neither in the new MOs nor in the new MOs.old, and whose code is not in the new MOs.
# The comparison must be made against the *new* objects; comparing
# AMR::microorganisms with itself can never match, so nothing would be added.
# NOTE(review): `gen` is defined earlier in this script, outside this section -
# presumably the genera that are kept; please confirm.
MOs <- MOs %>%
  mutate(mo = as.character(mo)) %>%
  bind_rows(
    AMR::microorganisms %>%
      # plain character codes, so they can be combined with the codes in MOs
      mutate(mo = as.character(mo)) %>%
      filter(genus %in% gen &
               !fullname %in% MOs$fullname &
               !fullname %in% MOs.old$fullname &
               !mo %in% MOs$mo) %>%
      select(all_of(colnames(AMR::microorganisms)))
  )
# here we welcome the new ones:
# (names that are not in the currently released data sets)
MOs %>%
  filter(!fullname %in% AMR::microorganisms$fullname) %>%
  arrange(fullname) %>%
  View()
MOs.old %>%
  filter(!fullname %in% AMR::microorganisms.old$fullname) %>%
  arrange(fullname) %>%
  View()
# and the ones we lost:
# AMR::microorganisms %>% filter(!fullname %in% MOs$fullname) %>% View() # based on fullname
# excluding renamed ones
AMR::microorganisms %>%
  filter(!fullname %in% c(MOs$fullname, MOs.old$fullname)) %>%
  View()
# AMR::microorganisms %>% filter(!mo %in% MOs$mo) %>% View() # based on mo
# AMR::microorganisms %>% filter(!mo %in% MOs$mo & !fullname %in% MOs$fullname) %>% View()
2019-08-09 23:22:10 +02:00
# and these IDs have changed:
# records are matched on kingdom + full name between the new MOs and the
# currently released AMR::microorganisms; only those with a different code are kept
old_new <- mutate(MOs, kingdom_fullname = paste(kingdom, fullname))
old_new <- old_new %>%
  # only names that also exist in the released data set
  filter(kingdom_fullname %in% paste(AMR::microorganisms$kingdom,
                                     AMR::microorganisms$fullname)) %>%
  # add the released code: `mo` becomes `mo_new` (from MOs) and `mo_old` (released)
  left_join(
    AMR::microorganisms %>%
      transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
    by = "kingdom_fullname",
    suffix = c("_new", "_old")
  ) %>%
  filter(mo_new != mo_old) %>%
  select(mo_old, mo_new, everything())
View(old_new)
2020-05-27 16:37:49 +02:00
# set new MO codes as names to existing data sets
# (the codes are translated back with as.mo() once the new data sets are loaded)
rsi_translation[["mo"]] <- mo_name(rsi_translation[["mo"]], language = NULL)
microorganisms.codes[["mo"]] <- mo_name(microorganisms.codes[["mo"]], language = NULL)
# microorganisms.translation <- AMR:::microorganisms.translation %>%
#   bind_rows(tibble(mo_old = AMR:::microorganisms.translation$mo_new, mo_new = mo_old)) %>%
#   filter(!mo_old %in% MOs$mo) %>%
#   mutate(mo_new = mo_name(mo_new, language = NULL)) %>%
#   bind_rows(old_new %>% select(mo_old, mo_new)) %>%
#   distinct(mo_old, .keep_all = TRUE)

# arrange the data sets to save, and transform them to plain data.frames
MOs <- MOs %>%
  arrange(fullname) %>%
  as.data.frame(stringsAsFactors = FALSE)
MOs.old <- MOs.old %>%
  arrange(fullname) %>%
  as.data.frame(stringsAsFactors = FALSE)
microorganisms.codes <- as.data.frame(microorganisms.codes, stringsAsFactors = FALSE)
# mark the code column with the <mo> class
class(MOs$mo) <- c("mo", "character")
2019-02-20 00:04:48 +01:00
2019-09-20 12:33:05 +02:00
# SAVE
### for same server
# NOTE(review): dataset_UTF8_to_ASCII() is defined outside this section;
# judging by its name it converts the text to ASCII - please confirm.
microorganisms <- dataset_UTF8_to_ASCII(MOs)
microorganisms.old <- dataset_UTF8_to_ASCII(MOs.old)
### for other server
saveRDS(MOs, file = "microorganisms.rds")
saveRDS(MOs.old, file = "microorganisms.old.rds")
saveRDS(microorganisms.codes, file = "microorganisms.codes.rds")
# on the server, do:
# (each object is written to its own file in the data folder of the package)
usethis::use_data(microorganisms, microorganisms.old, overwrite = TRUE, version = 2)
# remove the local copies again
rm(microorganisms, microorganisms.old)
2020-05-27 16:37:49 +02:00
# load new data sets
devtools::load_all(path = ".")
# reset previously changed mo codes:
# the `mo` columns were set to names earlier in this script, as.mo() turns them into codes again
rsi_translation[["mo"]] <- as.mo(rsi_translation[["mo"]])
microorganisms.codes[["mo"]] <- structure(
  as.mo(microorganisms.codes[["mo"]]),
  class = c("mo", "character")
)
2021-05-30 22:14:38 +02:00
# microorganisms.translation <- microorganisms.translation %>%
# # (to do: add last package version to column pkg_version)
# left_join(microorganisms.old[, c("fullname", "fullname_new")], # microorganisms.old is now new and loaded
# by = c("mo_new" = "fullname")) %>%
# mutate(name = ifelse(!is.na(fullname_new), fullname_new, mo_new)) %>%
# left_join(microorganisms[, c("fullname", "mo")], # as is microorganisms
# by = c("name" = "fullname")) %>%
# select(mo_old, mo_new = mo) %>%
# filter(!is.na(mo_old), !is.na(mo_new))
# class(microorganisms.translation$mo_old) <- "character" # no class <mo> since those aren't valid MO codes
# class(microorganisms.translation$mo_new) <- c("mo", "character")
2020-05-27 16:37:49 +02:00
# save those to the package
# (each object is written to its own file in the data folder of the package)
usethis::use_data(rsi_translation, microorganisms.codes, overwrite = TRUE, version = 2)
# saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2)
# to save microorganisms.translation internally to the package
# source("data-raw/pre-commit-hook.R")
# load new data sets again
devtools::load_all(path = ".")
# and check: these codes should not be missing (will otherwise throw a unit test error):
# both results are expected to contain no rows
filter(AMR::microorganisms.codes, !mo %in% MOs$mo)
filter(AMR::rsi_translation, !mo %in% MOs$mo)
2021-05-30 22:14:38 +02:00
# AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo)
2020-05-27 16:37:49 +02:00
# update the example_isolates data set
# (re-evaluate the codes against the newly loaded microorganisms data set)
example_isolates$mo <- as.mo(example_isolates$mo)
# pin the serialisation format explicitly, like every other use_data() call in this script
usethis::use_data(example_isolates, overwrite = TRUE, version = 2)
# Don't forget to add SNOMED codes! (data-raw/snomed.R)
# run the unit tests
testthat::test_file("tests/testthat/test-data.R")
testthat::test_file("tests/testthat/test-mo.R")
testthat::test_file("tests/testthat/test-mo_property.R")
2020-05-28 10:51:56 +02:00
# edit 2020-05-28
# Not sure why it now says M. tuberculosis was renamed to M. africanum (B_MYCBC_AFRC), but that's not true
# Build the M. tuberculosis record from a copy of the M. africanum row,
# append it to the data set and restore the sorting on full name.
microorganisms <- microorganisms %>%
  filter(mo == "B_MYCBC_AFRC") %>%
  mutate(
    mo = "B_MYCBC_TBRC",
    snomed = list(c("113861009", "113858008")),
    ref = "Lehmann et al., 2018",
    species_id = "778540",
    source = "DSMZ",
    species = "tuberculosis",
    fullname = "Mycobacterium tuberculosis"
  ) %>%
  # the dot is the new row: existing records first, new record last
  bind_rows(microorganisms, .) %>%
  arrange(fullname)
# set the <mo> class on the code column again
class(microorganisms$mo) <- c("mo", "character")
# M. tuberculosis is a current name, so it must not be listed as an old name
microorganisms.old <- filter(microorganisms.old, fullname != "Mycobacterium tuberculosis")
usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
2020-05-27 16:37:49 +02:00
# OLD CODE ----------------------------------------------------------------
# to keep all the old IDs:
# MOs <- MOs %>% filter(!mo %in% old_new$mo_new) %>%
# rbind(microorganisms %>%
# filter(mo %in% old_new$mo_old) %>%
# select(mo, fullname) %>%
# left_join(MOs %>%
# select(-mo), by = "fullname"))
# this is how to fix it
# microorganisms.codes <- AMR::microorganisms.codes %>%
# left_join(MOs %>%
# mutate(kingdom_fullname = paste(kingdom, fullname)) %>%
# left_join(AMR::microorganisms %>%
# transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
# by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
# select(mo_old, mo_new),
# by = c("mo" = "mo_old")) %>%
# select(code, mo = mo_new) %>%
# filter(!is.na(mo))
# microorganisms.codes %>% filter(!mo %in% MOs$mo)
# # and for microorganisms.translation:
# microorganisms.translation <- AMR:::microorganisms.translation %>%
# select(mo = mo_new) %>%
# left_join(AMR::microorganisms %>%
# transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
# by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
# select(mo_old, mo_new)
# left_join(MOs %>%
# mutate(kingdom_fullname = paste(kingdom, fullname)) %>%
# left_join(AMR::microorganisms %>%
# transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
# by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
# select(mo_old, mo_new),
# by = c("mo" = "mo_old")) %>%
# select(code, mo = mo_new) %>%
# filter(!is.na(mo))
# microorganisms.codes %>% filter(!mo %in% MOs$mo)