(v1.1.0.9021) 1st isolates update

2026-02-24 03:09:10 +01:00 · 2020-05-28 10:51:56 +02:00
parent 86d44054f0
commit d9a4b0bcaf
51 changed files with 483 additions and 1106 deletions
--- a/data-raw/country_analysis.R
+++ b/data-raw/country_analysis.R
@@ -90,9 +90,29 @@ countries_geometry <- sf::st_as_sf(map('world', plot = FALSE, fill = TRUE)) %>%
         not_antarctica = as.integer(ID != "Antarctica"),
         countries_name = ifelse(included == 1, as.character(ID), NA))

+# Add countries that occur in `ip_tbl` but are missing from `countries_geometry`.
+# Each gets a copy of the Netherlands row (geometry included) as a stand-in and
+# is flagged as included; NA codes are skipped because they name no country.
+countries_missing <- setdiff(unique(ip_tbl$country[!is.na(ip_tbl$country)]),
+                             countries_geometry$countries_code)
+countries_template <- countries_geometry %>% filter(ID == "Netherlands")
+countries_added <- lapply(countries_missing, function(missing_code) {
+  # translate the ISO 3166-1 alpha-2 code once and reuse it for both columns
+  missing_name <- countrycode::countrycode(missing_code, origin = "iso2c",
+                                           destination = "country.name")
+  countries_template %>%
+    mutate(ID = missing_name, countries_code = missing_code,
+           included = 1, not_antarctica = 1, countries_name = missing_name)
+})
+# bind once at the end instead of growing the data set inside a loop
+countries_geometry <- do.call(rbind, c(list(countries_geometry), countries_added))
+
 # how many?
 countries_geometry %>% filter(included == 1) %>% nrow()

+# spell out the abbreviations "UK" and "USA" wherever they occur in a country name
+countries_geometry$countries_name <- gsub("UK", "United Kingdom", countries_geometry$countries_name, fixed = TRUE)
+countries_geometry$countries_name <- gsub("USA", "United States", countries_geometry$countries_name, fixed = TRUE)
 countries_plot <- ggplot(countries_geometry) +
    geom_sf(aes(fill = included, colour = not_antarctica),
            size = 0.25, 
@@ -101,9 +121,9 @@ countries_plot <- ggplot(countries_geometry) +
    theme(panel.grid = element_blank(),
          axis.title = element_blank(),
          axis.text = element_blank()) +
-    scale_fill_gradient(low = "white", high = "#CAD6EA", ) +
+    scale_fill_gradient(low = "white", high = "#128f7645") +
    # this makes the border Antarctica turn white (invisible):
-    scale_colour_gradient(low = "white", high = "#81899B")
+    scale_colour_gradient(low = "white", high = "#128f76")

 countries_plot_mini <- countries_plot
 countries_plot_mini$data <- countries_plot_mini$data %>% filter(ID != "Antarctica")
--- a/data-raw/microorganisms.txt
+++ b/data-raw/microorganisms.txt
@@ -37251,6 +37251,7 @@
 "B_MYCBC_TKNS"	"Mycobacterium tokaiense"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"tokaiense"	""	"species"	"Tsukamura, 1981"	"c457ca4ae3a404100c8ce8c82a6100cc"	"CoL"	2	"72477006"
 "B_MYCBC_TRPL"	"Mycobacterium triplex"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"triplex"	""	"species"	"Floyd et al., 1997"	"f23c2b6cad7a0e20374cdf3d3ff55dce"	"CoL"	2	"113860005"
 "B_MYCBC_TRVL"	"Mycobacterium triviale"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"triviale"	""	"species"	"Kubica, 1970"	"9cb8b676cce27952821e173b12bfff3f"	"CoL"	2	"40333002"
+"B_MYCBC_TBRC"	"Mycobacterium tuberculosis"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"tuberculosis"	""	"species"	"Lehmann et al., 2018"	"778540"	"DSMZ"	2	"c(\"113861009\", \"113858008\")"
 "B_MYCBC_TUSC"	"Mycobacterium tusciae"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"tusciae"	""	"species"	"Tortoli et al., 1999"	"7a8ff8f5a2b16131366fe6e8dfb6b570"	"CoL"	2	
 "B_MYCBC_ULCR"	"Mycobacterium ulcerans"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"ulcerans"	""	"species"	"MacCallum et al., 1950"	"96b3a2e207e76f4725132034d7d0bde1"	"CoL"	2	"40713003"
 "B_MYCBC_VACC"	"Mycobacterium vaccae"	"Bacteria"	"Actinobacteria"	"(unknown class)"	"Actinomycetales"	"Mycobacteriaceae"	"Mycobacterium"	"vaccae"	""	"species"	"Bonicke et al., 1964"	"adbc928aba39beadc25b2ba7e8214c91"	"CoL"	2	"54925005"
--- a/data-raw/reproduction_of_microorganisms.R
+++ b/data-raw/reproduction_of_microorganisms.R
@@ -920,6 +920,22 @@ testthat::test_file("tests/testthat/test-data.R")
 testthat::test_file("tests/testthat/test-mo.R")
 testthat::test_file("tests/testthat/test-mo_property.R")

+# edit 2020-05-28: re-add Mycobacterium tuberculosis (B_MYCBC_TBRC) as a species of its own
+# The data now says M. tuberculosis was renamed to M. africanum (B_MYCBC_AFRC), but that's not true
+microorganisms <- microorganisms %>% 
+  bind_rows(microorganisms %>% 
+              filter(mo == "B_MYCBC_AFRC") %>%  # start from the M. africanum row, override what differs
+              mutate(mo = "B_MYCBC_TBRC", snomed = list(c("113861009", "113858008")),
+                     ref = "Lehmann et al., 2018",species_id = "778540",
+                     source = "DSMZ", species = "tuberculosis",
+                     fullname = "Mycobacterium tuberculosis")) %>% 
+  arrange(fullname)  # keep the rows ordered by full name
+class(microorganisms$mo) <- c("mo", "character")  # NOTE(review): presumably lost in the pipeline above - confirm
+microorganisms.old <- microorganisms.old %>% filter(fullname != "Mycobacterium tuberculosis")  # not an old name
+
+usethis::use_data(microorganisms, overwrite = TRUE, version = 2)  # overwrite the stored data sets
+usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
+


 # OLD CODE ----------------------------------------------------------------
--- a/data-raw/reproduction_of_microorganisms_new.R
+++ b/data-raw/reproduction_of_microorganisms_new.R
@@ -1,682 +0,0 @@
-# ==================================================================== #
-# TITLE                                                                #
-# Antimicrobial Resistance (AMR) Analysis                              #
-#                                                                      #
-# SOURCE                                                               #
-# https://gitlab.com/msberends/AMR                                     #
-#                                                                      #
-# LICENCE                                                              #
-# (c) 2018-2020 Berends MS, Luz CF et al.                              #
-#                                                                      #
-# This R package is free software; you can freely use and distribute   #
-# it for both personal and commercial purposes under the terms of the  #
-# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
-# the Free Software Foundation.                                        #
-#                                                                      #
-# We created this package for both routine data analysis and academic  #
-# research and it was publicly released in the hope that it will be    #
-# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
-# Visit our website for more info: https://msberends.gitlab.io/AMR.    #
-# ==================================================================== #
-
-# ---------------------------------------------------------------------------------
-# Reproduction of the `microorganisms` data set
-# ---------------------------------------------------------------------------------
-# Data retrieved from:
-#
-# [1] Catalogue of Life (CoL) through the Encyclopaedia of Life
-#     https://opendata.eol.org/dataset/catalogue-of-life/
-#      * Download the resource file with a name like "Catalogue of Life yyyy-mm-dd"
-#      * Extract "taxon.tab"
-#
-# [2] Global Biodiversity Information Facility (GBIF)
-#     https://doi.org/10.15468/39omei
-#     * Extract "Taxon.tsv"
-#
-# [3] Deutsche Sammlung von Mikroorganismen und Zellkulturen (DSMZ)
-#     https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html
-#     * Download the latest "Complete List" as xlsx file (DSMZ_bactnames.xlsx)
-# ---------------------------------------------------------------------------------
-
-library(dplyr)
-library(AMR)
-
-data_col <- data.table::fread("Documents/taxon.tab")
-data_gbif <- data.table::fread("Documents/Taxon.tsv")
-
-# read the xlsx file from DSMZ (only around 2.5 MB):
-data_dsmz <- readxl::read_xlsx("Downloads/DSMZ_bactnames.xlsx")
-
-# the CoL data is over 3.7M rows:
-data_col %>% freq(kingdom)
-#      Item             Count   Percent   Cum. Count   Cum. Percent
-# ---  ----------  ----------  --------  -----------  -------------
-# 1    Animalia     2,225,627     59.1%    2,225,627          59.1%
-# 2    Plantae      1,177,412     31.3%    3,403,039          90.4%
-# 3    Fungi          290,145      7.7%    3,693,184          98.1%
-# 4    Chromista       47,126      1.3%    3,740,310          99.3%
-# 5    Bacteria        14,478      0.4%    3,754,788          99.7%
-# 6    Protozoa         6,060      0.2%    3,760,848          99.9%
-# 7    Viruses          3,827      0.1%    3,764,675         100.0%
-# 8    Archaea            610      0.0%    3,765,285         100.0%
-
-# the GBIF data is over 5.8M rows:
-data_gbif %>% freq(kingdom)
-#      Item                  Count   Percent   Cum. Count   Cum. Percent
-# ---  ---------------  ----------  --------  -----------  -------------
-# 1    Animalia          3,264,138     55.7%    3,264,138          55.7%
-# 2    Plantae           1,814,962     31.0%    5,079,100          86.7%
-# 3    Fungi               538,086      9.2%    5,617,186          95.9%
-# 4    Chromista           181,374      3.1%    5,798,560          99.0%
-# 5    Bacteria             24,048      0.4%    5,822,608          99.4%
-# 6    Protozoa             15,138      0.3%    5,837,746          99.7%
-# 7    incertae sedis        9,995      0.2%    5,847,741          99.8%
-# 8    Viruses               9,630      0.2%    5,857,371         100.0%
-# 9    Archaea                 771      0.0%    5,858,142         100.0%
-
-
-# Clean up helper function ------------------------------------------------
-clean_new <- function(new) {
-  new %>%
-    # only the ones that have no new ID to refer to a newer name
-    filter(is.na(col_id_new)) %>%
-    filter(
-      (
-        # we only want all MICROorganisms and no viruses
-        !kingdom %in% c("Animalia", "Chromista", "Plantae", "Viruses")
-        # and not all fungi: Aspergillus, Candida, Trichphyton and Pneumocystis are the most important,
-        # so only keep these orders from the fungi:
-        & !(kingdom == "Fungi"
-            & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales"))
-      )
-      # or the family has to contain a genus we found in our hospitals last decades (Northern Netherlands, 2002-2018)
-      | genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces",
-                     "Arachnia", "Ascaris", "Aureobacterium", "Aureobasidium", "Balantidum", "Bilophilia", "Branhamella", "Brochontrix",
-                     "Brugia", "Calymmatobacterium", "Catabacter", "Chilomastix", "Chryseomonas", "Cladophialophora", "Cladosporium",
-                     "Clonorchis", "Cordylobia", "Curvularia", "Demodex", "Dermatobia", "Diphyllobothrium", "Dracunculus", "Echinococcus",
-                     "Enterobius", "Euascomycetes", "Exophiala", "Fasciola", "Fusarium", "Hendersonula", "Hymenolepis", "Kloeckera",
-                     "Koserella", "Larva", "Leishmania", "Lelliottia", "Loa", "Lumbricus", "Malassezia", "Metagonimus", "Molonomonas",
-                     "Mucor", "Nattrassia", "Necator", "Novospingobium", "Onchocerca", "Opistorchis", "Paragonimus", "Paramyxovirus",
-                     "Pediculus", "Phoma", "Phthirus", "Pityrosporum", "Pseudallescheria", "Pulex", "Rhizomucor", "Rhizopus", "Rhodotorula",
-                     "Salinococcus", "Sanguibacteroides", "Schistosoma", "Scopulariopsis", "Scytalidium", "Sporobolomyces", "Stomatococcus",
-                     "Strongyloides", "Syncephalastraceae", "Taenia", "Torulopsis", "Trichinella", "Trichobilharzia", "Trichomonas",
-                     "Trichosporon", "Trichuris", "Trypanosoma", "Wuchereria")) %>%
-    mutate(
-      authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"),
-      # remove leading and trailing brackets
-      authors2 = gsub("^[(](.*)[)]$", "\\1", authors2),
-      # only take part after brackets if there's a name
-      authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
-                        gsub(".*[)] (.*)", "\\1", authors2),
-                        authors2),
-      # get year from last 4 digits
-      lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)),
-      # can never be later than now
-      lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
-                        NA,
-                        lastyear),
-      # get authors without last year
-      authors = gsub("(.*)[0-9]{4}$", "\\1", authors2),
-      # remove nonsense characters from names
-      authors = gsub("[^a-zA-Z,'& -]", "", authors),
-      # remove trailing and leading spaces
-      authors = trimws(authors),
-      # only keep first author and replace all others by 'et al'
-      authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors),
-      # et al. always with ending dot
-      authors = gsub(" et al\\.?", " et al.", authors),
-      authors = gsub(" ?,$", "", authors),
-      # don't start with 'sensu' or 'ehrenb'
-      authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE),
-      # no initials, only surname
-      authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE),
-      # combine author and year if year is available
-      ref = ifelse(!is.na(lastyear),
-                   paste0(authors, ", ", lastyear),
-                   authors),
-      # fix beginning and ending
-      ref = gsub(", $", "", ref),
-      ref = gsub("^, ", "", ref)) %>%
-    # remove text if it contains 'Not assigned' like phylum in viruses
-    mutate_all(~gsub("Not assigned", "", .)) %>%
-    # Remove non-ASCII characters (these are not allowed by CRAN)
-    lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
-    as_tibble(stringsAsFactors = FALSE) %>%
-    mutate(fullname = trimws(case_when(rank == "family" ~ family,
-                                       rank == "order" ~ order,
-                                       rank == "class" ~ class,
-                                       rank == "phylum" ~ phylum,
-                                       rank == "kingdom" ~ kingdom,
-                                       TRUE ~ paste(genus, species, subspecies))))
-}
-clean_old <- function(old, new) {
-  old %>%
-    # only the ones that exist in the new data set
-    filter(col_id_new %in% new$col_id) %>%
-    mutate(
-      authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"),
-      # remove leading and trailing brackets
-      authors2 = gsub("^[(](.*)[)]$", "\\1", authors2),
-      # only take part after brackets if there's a name
-      authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
-                        gsub(".*[)] (.*)", "\\1", authors2),
-                        authors2),
-      # get year from last 4 digits
-      lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)),
-      # can never be later than now
-      lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
-                        NA,
-                        lastyear),
-      # get authors without last year
-      authors = gsub("(.*)[0-9]{4}$", "\\1", authors2),
-      # remove nonsense characters from names
-      authors = gsub("[^a-zA-Z,'& -]", "", authors),
-      # remove trailing and leading spaces
-      authors = trimws(authors),
-      # only keep first author and replace all others by 'et al'
-      authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors),
-      # et al. always with ending dot
-      authors = gsub(" et al\\.?", " et al.", authors),
-      authors = gsub(" ?,$", "", authors),
-      # don't start with 'sensu' or 'ehrenb'
-      authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE),
-      # no initials, only surname
-      authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE),
-      # combine author and year if year is available
-      ref = ifelse(!is.na(lastyear),
-                   paste0(authors, ", ", lastyear),
-                   authors),
-      # fix beginning and ending
-      ref = gsub(", $", "", ref),
-      ref = gsub("^, ", "", ref)) %>%
-    # remove text if it contains 'Not assigned' like phylum in viruses
-    mutate_all(~gsub("Not assigned", "", .)) %>%
-    # Remove non-ASCII characters (these are not allowed by CRAN)
-    lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
-    as_tibble(stringsAsFactors = FALSE) %>%
-    select(col_id_new, fullname, ref, authors2) %>%
-    left_join(new %>% select(col_id, fullname_new = fullname), by = c(col_id_new = "col_id")) %>%
-    mutate(fullname = trimws(
-      gsub("(.*)[(].*", "\\1",
-           stringr::str_replace(
-             string = fullname,
-             pattern = stringr::fixed(authors2),
-             replacement = "")) %>%
-        gsub(" (var|f|subsp)[.]", "", .))) %>%
-    select(-c("col_id_new", "authors2")) %>%
-    filter(!is.na(fullname), !is.na(fullname_new)) %>%
-    filter(fullname != fullname_new, !fullname %like% "^[?]")
-}
-
-# clean CoL and GBIF ----
-# clean data_col
-data_col <- data_col %>%
-  as_tibble() %>%
-  select(col_id = taxonID,
-         col_id_new = acceptedNameUsageID,
-         fullname = scientificName,
-         kingdom,
-         phylum,
-         class,
-         order,
-         family,
-         genus,
-         species = specificEpithet,
-         subspecies = infraspecificEpithet,
-         rank = taxonRank,
-         ref = scientificNameAuthorship,
-         species_id = furtherInformationURL) %>%
-  mutate(source = "CoL")
-# split into old and new
-data_col.new <- data_col %>% clean_new()
-data_col.old <- data_col %>% clean_old(new = data_col.new)
-rm(data_col)
-
-# clean data_gbif
-data_gbif <- data_gbif %>%
-  as_tibble() %>%
-  filter(
-    # no uncertain taxonomic placements
-    taxonRemarks != "doubtful",
-    kingdom != "incertae sedis",
-    taxonRank != "unranked") %>%
-  transmute(col_id = taxonID,
-            col_id_new = acceptedNameUsageID,
-            fullname = scientificName,
-            kingdom,
-            phylum,
-            class,
-            order,
-            family,
-            genus,
-            species = specificEpithet,
-            subspecies = infraspecificEpithet,
-            rank = taxonRank,
-            ref = scientificNameAuthorship,
-            species_id = as.character(parentNameUsageID)) %>%
-  mutate(source = "GBIF")
-# split into old and new
-data_gbif.new <- data_gbif %>% clean_new()
-data_gbif.old <- data_gbif %>% clean_old(new = data_gbif.new)
-rm(data_gbif)
-
-# put CoL and GBIF together ----
-MOs.new <- bind_rows(data_col.new,
-                     data_gbif.new) %>%
-  mutate(taxonomic_tree_length = nchar(trimws(paste(kingdom, phylum, class, order, family, genus, species, subspecies)))) %>%
-  arrange(desc(taxonomic_tree_length)) %>%
-  distinct(fullname, .keep_all = TRUE) %>%
-  select(-c("col_id_new", "authors2", "authors", "lastyear", "taxonomic_tree_length")) %>%
-  arrange(fullname)
-MOs.old <- bind_rows(data_col.old,
-                     data_gbif.old) %>%
-  distinct(fullname, .keep_all = TRUE) %>%
-  arrange(fullname)
-
-# clean up DSMZ ---
-data_dsmz <- data_dsmz %>%
-  as_tibble() %>%
-  transmute(col_id = NA_integer_,
-            col_id_new = NA_integer_,
-            fullname = "",
-            # kingdom = "",
-            # phylum = "",
-            # class = "",
-            # order = "",
-            # family = "",
-            genus = ifelse(is.na(GENUS), "", GENUS),
-            species = ifelse(is.na(SPECIES), "", SPECIES),
-            subspecies = ifelse(is.na(SUBSPECIES), "", SUBSPECIES),
-            rank = ifelse(species == "", "genus", "species"),
-            ref = AUTHORS,
-            species_id = as.character(RECORD_NO),
-            source = "DSMZ")
-
-# DSMZ only contains genus/(sub)species, try to find taxonomic properties based on genus and data_col
-ref_taxonomy <- MOs.new %>%
-  distinct(genus, .keep_all = TRUE) %>%
-  filter(family != "") %>%
-  filter(genus %in% data_dsmz$genus) %>%
-  distinct(genus, .keep_all = TRUE) %>%
-  select(kingdom, phylum, class, order, family, genus)
-
-data_dsmz <- data_dsmz %>%
-  left_join(ref_taxonomy, by = "genus") %>%
-  mutate(kingdom = "Bacteria")
-
-data_dsmz.new <- data_dsmz %>%
-  clean_new() %>%
-  distinct(fullname, .keep_all = TRUE) %>%
-  select(colnames(MOs.new)) %>%
-  arrange(fullname)
-
-# combine everything ----
-MOs <- bind_rows(MOs.new,
-                 data_dsmz.new) %>%
-  distinct(fullname, .keep_all = TRUE) %>%
-  # not the ones that are old
-  filter(!fullname %in% MOs.old$fullname) %>%
-  arrange(fullname) %>%
-  mutate(col_id = ifelse(source != "CoL", NA_integer_, col_id)) %>%
-  filter(fullname != "")
-
-rm(data_col.new)
-rm(data_col.old)
-rm(data_gbif.new)
-rm(data_gbif.old)
-rm(data_dsmz)
-rm(data_dsmz.new)
-rm(ref_taxonomy)
-rm(MOs.new)
-
-MOs.bak <- MOs
-
-# Trichomonas trick ----
-# for species in Trypanosoma and Trichomonas we observe al lot of taxonomic info missing
-MOs %>% filter(genus %in% c("Trypanosoma", "Trichomonas")) %>% View()
-MOs[which(MOs$genus == "Trypanosoma"), "kingdom"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$kingdom
-MOs[which(MOs$genus == "Trypanosoma"), "phylum"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$phylum
-MOs[which(MOs$genus == "Trypanosoma"), "class"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$class
-MOs[which(MOs$genus == "Trypanosoma"), "order"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$order
-MOs[which(MOs$genus == "Trypanosoma"), "family"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$family
-MOs[which(MOs$genus == "Trichomonas"), "kingdom"] <- MOs[which(MOs$fullname == "Trichomonas"),]$kingdom
-MOs[which(MOs$genus == "Trichomonas"), "phylum"] <- MOs[which(MOs$fullname == "Trichomonas"),]$phylum
-MOs[which(MOs$genus == "Trichomonas"), "class"] <- MOs[which(MOs$fullname == "Trichomonas"),]$class
-MOs[which(MOs$genus == "Trichomonas"), "order"] <- MOs[which(MOs$fullname == "Trichomonas"),]$order
-MOs[which(MOs$genus == "Trichomonas"), "family"] <- MOs[which(MOs$fullname == "Trichomonas"),]$family
-
-# fill taxonomic properties that are missing
-MOs <- MOs %>%
-  mutate(phylum = ifelse(phylum %in% c(NA, ""), "(unknown phylum)", phylum),
-         class = ifelse(class %in% c(NA, ""), "(unknown class)", class),
-         order = ifelse(order %in% c(NA, ""), "(unknown order)", order),
-         family = ifelse(family %in% c(NA, ""), "(unknown family)", family))
-
-# Abbreviations ----
-# Add abbreviations so we can easily know which ones are which ones.
-# These will become valid and unique microbial IDs for the AMR package.
-MOs <- MOs %>%
-  arrange(kingdom, fullname) %>%
-  group_by(kingdom) %>%
-  mutate(abbr_other = case_when(
-    rank == "family" ~ paste0("[FAM]_",
-                              abbreviate(family,
-                                         minlength = 8,
-                                         use.classes = TRUE,
-                                         method = "both.sides",
-                                         strict = FALSE)),
-    rank == "order" ~ paste0("[ORD]_",
-                             abbreviate(order,
-                                        minlength = 8,
-                                        use.classes = TRUE,
-                                        method = "both.sides",
-                                        strict = FALSE)),
-    rank == "class" ~ paste0("[CLS]_",
-                             abbreviate(class,
-                                        minlength = 8,
-                                        use.classes = TRUE,
-                                        method = "both.sides",
-                                        strict = FALSE)),
-    rank == "phylum" ~ paste0("[PHL]_",
-                              abbreviate(phylum,
-                                         minlength = 8,
-                                         use.classes = TRUE,
-                                         method = "both.sides",
-                                         strict = FALSE)),
-    rank == "kingdom" ~ paste0("[KNG]_", kingdom),
-    TRUE ~ NA_character_
-  )) %>%
-  # abbreviations determined per kingdom and family
-  # becuase they are part of the abbreviation
-  mutate(abbr_genus = abbreviate(genus,
-                                 minlength = 7,
-                                 use.classes = TRUE,
-                                 method = "both.sides",
-                                 strict = FALSE)) %>%
-  ungroup() %>%
-  group_by(genus) %>%
-  # species abbreviations may be the same between genera
-  # because the genus abbreviation is part of the abbreviation
-  mutate(abbr_species = abbreviate(stringr::str_to_title(species),
-                                   minlength = 3,
-                                   use.classes = FALSE,
-                                   method = "both.sides")) %>%
-  ungroup() %>%
-  group_by(genus, species) %>%
-  mutate(abbr_subspecies = abbreviate(stringr::str_to_title(subspecies),
-                                      minlength = 3,
-                                      use.classes = FALSE,
-                                      method = "both.sides")) %>%
-  ungroup() %>%
-  # remove trailing underscores
-  mutate(mo = gsub("_+$", "",
-                   toupper(paste(
-                     # first character: kingdom
-                     ifelse(kingdom %in% c("Animalia", "Plantae"),
-                            substr(kingdom, 1, 2),
-                            substr(kingdom, 1, 1)),
-                     # next: genus, species, subspecies
-                     ifelse(is.na(abbr_other),
-                            paste(abbr_genus,
-                                  abbr_species,
-                                  abbr_subspecies,
-                                  sep = "_"),
-                            abbr_other),
-                     sep = "_")))) %>%
-  mutate(mo = ifelse(duplicated(.$mo),
-                     # these one or two must be unique too
-                     paste0(mo, "1"),
-                     mo),
-         fullname = ifelse(fullname == "",
-                           trimws(paste(genus, species, subspecies)),
-                           fullname)) %>%
-  # put `mo` in front, followed by the rest
-  select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies)
-
-# add non-taxonomic entries
-MOs <- MOs %>%
-  bind_rows(
-    # Unknowns
-    data.frame(mo = "UNKNOWN",
-               col_id = NA_integer_,
-               fullname = "(unknown name)",
-               kingdom = "(unknown kingdom)",
-               phylum = "(unknown phylum)",
-               class = "(unknown class)",
-               order = "(unknown order)",
-               family = "(unknown family)",
-               genus = "(unknown genus)",
-               species = "(unknown species)",
-               subspecies = "(unknown subspecies)",
-               rank = "(unknown rank)",
-               ref = NA_character_,
-               species_id = "",
-               source = "manually added",
-               stringsAsFactors = FALSE),
-    data.frame(mo = "B_GRAMN",
-               col_id = NA_integer_,
-               fullname = "(unknown Gram-negatives)",
-               kingdom = "Bacteria",
-               phylum = "(unknown phylum)",
-               class = "(unknown class)",
-               order = "(unknown order)",
-               family = "(unknown family)",
-               genus = "(unknown Gram-negatives)",
-               species = "(unknown species)",
-               subspecies = "(unknown subspecies)",
-               rank = "species",
-               ref = NA_character_,
-               species_id = "",
-               source = "manually added",
-               stringsAsFactors = FALSE),
-    data.frame(mo = "B_GRAMP",
-               col_id = NA_integer_,
-               fullname = "(unknown Gram-positives)",
-               kingdom = "Bacteria",
-               phylum = "(unknown phylum)",
-               class = "(unknown class)",
-               order = "(unknown order)",
-               family = "(unknown family)",
-               genus = "(unknown Gram-positives)",
-               species = "(unknown species)",
-               subspecies = "(unknown subspecies)",
-               rank = "species",
-               ref = NA_character_,
-               species_id = "",
-               source = "manually added",
-               stringsAsFactors = FALSE),
-    # CoNS
-    MOs %>%
-      filter(genus == "Staphylococcus", species == "") %>% .[1,] %>%
-      mutate(mo = paste(mo, "CNS", sep = "_"),
-             rank = "species",
-             col_id = NA_integer_,
-             species = "coagulase-negative",
-             fullname = "Coagulase-negative Staphylococcus (CoNS)",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    # CoPS
-    MOs %>%
-      filter(genus == "Staphylococcus", species == "") %>% .[1,] %>%
-      mutate(mo = paste(mo, "CPS", sep = "_"),
-             rank = "species",
-             col_id = NA_integer_,
-             species = "coagulase-positive",
-             fullname = "Coagulase-positive Staphylococcus (CoPS)",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    # Streptococci groups A, B, C, F, H, K
-    MOs %>%
-      filter(genus == "Streptococcus", species == "pyogenes") %>% .[1,] %>%
-      # we can keep all other details, since S. pyogenes is the only member of group A
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRA", sep = "_"),
-             species = "group A" ,
-             fullname = "Streptococcus group A"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      # we can keep all other details, since S. agalactiae is the only member of group B
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRB", sep = "_"),
-             species = "group B" ,
-             fullname = "Streptococcus group B"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRC", sep = "_"),
-             col_id = NA_integer_,
-             species = "group C" ,
-             fullname = "Streptococcus group C",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRD", sep = "_"),
-             col_id = NA_integer_,
-             species = "group D" ,
-             fullname = "Streptococcus group D",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRF", sep = "_"),
-             col_id = NA_integer_,
-             species = "group F" ,
-             fullname = "Streptococcus group F",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRG", sep = "_"),
-             col_id = NA_integer_,
-             species = "group G" ,
-             fullname = "Streptococcus group G",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRH", sep = "_"),
-             col_id = NA_integer_,
-             species = "group H" ,
-             fullname = "Streptococcus group H",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRK", sep = "_"),
-             col_id = NA_integer_,
-             species = "group K" ,
-             fullname = "Streptococcus group K",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added"),
-    # Beta-haemolytic Streptococci
-    MOs %>%
-      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
-      mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "HAE", sep = "_"),
-             col_id = NA_integer_,
-             species = "beta-haemolytic" ,
-             fullname = "Beta-haemolytic Streptococcus",
-             ref = NA_character_,
-             species_id = "",
-             source = "manually added")
-  )
-
-
-# everything distinct?
-sum(duplicated(MOs$mo))
-colnames(MOs)
-
-# set prevalence per species
-MOs <- MOs %>%
-  mutate(prevalence = case_when(
-    class == "Gammaproteobacteria"
-    | genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
-    | mo %in% c("UNKNOWN", "B_GRAMN", "B_GRAMP")
-    ~ 1,
-    phylum %in% c("Proteobacteria",
-                  "Firmicutes",
-                  "Actinobacteria",
-                  "Sarcomastigophora")
-    | genus %in% c("Aspergillus",
-                   "Bacteroides",
-                   "Candida",
-                   "Capnocytophaga",
-                   "Chryseobacterium",
-                   "Cryptococcus",
-                   "Elisabethkingia",
-                   "Flavobacterium",
-                   "Fusobacterium",
-                   "Giardia",
-                   "Leptotrichia",
-                   "Mycoplasma",
-                   "Prevotella",
-                   "Rhodotorula",
-                   "Treponema",
-                   "Trichophyton",
-                   "Trichomonas",
-                   "Ureaplasma")
-    | rank %in% c("kingdom", "phylum", "class", "order", "family")
-    ~ 2,
-    TRUE ~ 3
-  ))
-
-# arrange
-MOs <- MOs %>% arrange(fullname)
-
-# transform
-MOs <- as.data.frame(MOs, stringsAsFactors = FALSE)
-MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE)
-class(MOs$mo) <- "mo"
-MOs$col_id <- as.integer(MOs$col_id)
-
-# get differences in MO codes between this data and the package version
-MO_diff <- AMR::microorganisms %>%
-  mutate(pastedtext = paste(mo, fullname)) %>%
-  filter(!pastedtext %in% (MOs %>% mutate(pastedtext = paste(mo, fullname)) %>% pull(pastedtext))) %>%
-  select(mo_old = mo, fullname, pastedtext) %>%
-  left_join(MOs %>%
-              transmute(mo_new = mo, fullname_new = fullname, pastedtext = paste(mo, fullname)), "pastedtext") %>%
-  select(mo_old, mo_new, fullname_new)
-
-mo_diff2 <- AMR::microorganisms %>%
-  select(mo, fullname) %>%
-  left_join(MOs %>%
-              select(mo, fullname),
-            by = "fullname",
-            suffix = c("_old", "_new")) %>%
-  filter(mo_old != mo_new,
-         #!mo_new %in% mo_old,
-         !mo_old %like% "\\[")
-
-mo_diff3 <- tibble(previous_old = names(AMR:::make_trans_tbl()),
-                   previous_new = AMR:::make_trans_tbl()) %>%
-  left_join(AMR::microorganisms %>% select(mo, fullname), by = c(previous_new = "mo")) %>%
-  left_join(MOs %>% select(mo_new = mo, fullname), by = "fullname")
-
-# what did we win most?
-MOs %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% freq(genus)
-# what did we lose most?
-AMR::microorganisms %>%
-  filter(kingdom !=  "Chromista" & !fullname %in% MOs$fullname & !fullname %in% MOs.old$fullname) %>%
-  freq(genus)
-
-
-# save
-saveRDS(MOs, "microorganisms.rds")
-saveRDS(MOs.old, "microorganisms.old.rds")
-
-# on the server, do:
-usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
-usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
-rm(microorganisms)
-rm(microorganisms.old)
-
-# TO DO AFTER THIS
-# * Update the year and dim()s in R/data.R
-# * Rerun data-raw/reproduction_of_rsi_translation.R
-# * Run unit tests