(v1.1.0.9020) updated taxonomy

2026-02-24 03:09:10 +01:00 · 2020-05-27 16:37:49 +02:00
parent ae1969b941
commit 86d44054f0
55 changed files with 68063 additions and 70233 deletions
--- a/data-raw/antibiotics.txt
+++ b/data-raw/antibiotics.txt
@@ -89,7 +89,7 @@
 "CTF"	"J01DC07"	43708	"Cefotiam"	"Cephalosporins (2nd gen.)"	"Other beta-lactam antibacterials"	"Second-generation cephalosporins"	""	"c(\"cefotiam\", \"cefotiam?\", \"cefotiamum\", \"ceradolan\", \"ceradon\", \"haloapor\")"	1.2	"g"	4	"g"	
 "CHE"		125846	"Cefotiam hexetil"	"Cephalosporins (3rd gen.)"			""	"c(\"cefotiam cilexetil\", \"pansporin t\")"					
 "FOV"		9578573	"Cefovecin"	"Cephalosporins (3rd gen.)"			""	""					
-"FOX"	"J01DC01"	441199	"Cefoxitin"	"Cephalosporins (2nd gen.)"	"Other beta-lactam antibacterials"	"Second-generation cephalosporins"	"c(\"cfox\", \"cfx\", \"cfxt\", \"cx\", \"fox\", \"fx\")"	"c(\"cefoxitin\", \"cefoxitina\", \"cefoxitine\", \"cefoxitinum\", \"cefoxotin\", \"cephoxitin\", \"mefoxin\", \"mefoxitin\", \"rephoxitin\")"			6	"g"	"c(\"25240-3\", \"3448-8\")"
+"FOX"	"J01DC01"	441199	"Cefoxitin"	"Cephalosporins (2nd gen.)"	"Other beta-lactam antibacterials"	"Second-generation cephalosporins"	"c(\"cfox\", \"cfsc\", \"cfx\", \"cfxt\", \"cx\", \"fox\", \"fx\")"	"c(\"cefoxitin\", \"cefoxitina\", \"cefoxitine\", \"cefoxitinum\", \"cefoxotin\", \"cephoxitin\", \"mefoxin\", \"mefoxitin\", \"rephoxitin\")"			6	"g"	"c(\"25240-3\", \"3448-8\")"
 "ZOP"		9571080	"Cefozopran"	"Cephalosporins (4th gen.)"			""	"cefozopran"					
 "CFZ"		68597	"Cefpimizole"	"Cephalosporins (3rd gen.)"			""	"c(\"cefpimizol\", \"cefpimizole\", \"cefpimizole sodium\", \"cefpimizolum\")"					
 "CPM"	"J01DD11"	636405	"Cefpiramide"	"Cephalosporins (3rd gen.)"	"Other beta-lactam antibacterials"	"Third-generation cephalosporins"	""	"c(\"cefpiramide\", \"cefpiramide acid\", \"cefpiramido\", \"cefpiramidum\")"			2	"g"	
@@ -105,7 +105,7 @@
 "CPT"	"J01DI02"	56841980	"Ceftaroline"	"Cephalosporins (5th gen.)"			"c(\"\", \"cfro\")"	"c(\"teflaro\", \"zinforo\")"					
 "CPA"			"Ceftaroline/avibactam"	"Cephalosporins (5th gen.)"			""	""					
 "CAZ"	"J01DD02"	5481173	"Ceftazidime"	"Cephalosporins (3rd gen.)"	"Other beta-lactam antibacterials"	"Third-generation cephalosporins"	"c(\"caz\", \"cefta\", \"cfta\", \"cftz\", \"taz\", \"tz\", \"xtz\")"	"c(\"ceftazidim\", \"ceftazidima\", \"ceftazidime\", \"ceftazidimum\", \"ceptaz\", \"fortaz\", \"fortum\", \"pentacef\", \"tazicef\", \"tazidime\")"			4	"g"	"c(\"21151-6\", \"3449-6\", \"80960-8\")"
-"CZA"			"Ceftazidime/avibactam"	"Cephalosporins (3rd gen.)"			""	""					
+"CZA"			"Ceftazidime/avibactam"	"Cephalosporins (3rd gen.)"			"c(\"\", \"cfav\")"	""					
 "CCV"	"J01DD52"	9575352	"Ceftazidime/clavulanic acid"	"Cephalosporins (3rd gen.)"	"Other beta-lactam antibacterials"	"Third-generation cephalosporins"	"c(\"czcl\", \"xtzl\")"	""					
 "CEM"		6537431	"Cefteram"	"Cephalosporins (3rd gen.)"			""	"c(\"cefteram\", \"cefterame\", \"cefteramum\", \"ceftetrame\")"					
 "CPL"		5362114	"Cefteram pivoxil"	"Cephalosporins (3rd gen.)"			""	"c(\"cefteram pivoxil\", \"tomiron\")"					
--- a/data-raw/data_dsmz.rds
+++ b/data-raw/data_dsmz.rds
--- a/data-raw/eucast_rules.tsv
+++ b/data-raw/eucast_rules.tsv
@@ -9,22 +9,6 @@
 # >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<<
 # -------------------------------------------------------------------------------------------------------------------------------
 if_mo_property	like.is.one_of	this_value	and_these_antibiotics	have_these_values	then_change_these_antibiotics	to_value	reference.rule	reference.rule_group
-genus	like	.*	AMP	S	AMX	S	Non-EUCAST: inherit ampicillin results for unavailable amoxicillin	Other rules
-genus	like	.*	AMP	I	AMX	I	Non-EUCAST: inherit ampicillin results for unavailable amoxicillin	Other rules
-genus	like	.*	AMP	R	AMX	R	Non-EUCAST: inherit ampicillin results for unavailable amoxicillin	Other rules
-genus	like	.*	AMX	S	AMP	S	Non-EUCAST: inherit amoxicillin results for unavailable ampicillin	Other rules
-genus	like	.*	AMX	I	AMP	I	Non-EUCAST: inherit amoxicillin results for unavailable ampicillin	Other rules
-genus	like	.*	AMX	R	AMP	R	Non-EUCAST: inherit amoxicillin results for unavailable ampicillin	Other rules
-genus	like	.*	AMC	R	AMP, AMX	R	Non-EUCAST: set ampicillin = R where amoxicillin/clav acid = R	Other rules
-genus	like	.*	SAM	R	AMP, AMX	R	Non-EUCAST: set ampicillin = R where ampicillin/sulbactam = R	Other rules
-genus	like	.*	TZP	R	PIP	R	Non-EUCAST: set piperacillin = R where piperacillin/tazobactam = R	Other rules
-genus	like	.*	SXT	R	TMP	R	Non-EUCAST: set trimethoprim = R where trimethoprim/sulfa = R	Other rules
-genus	like	.*	AMP	S	AMC	S	Non-EUCAST: set amoxicillin/clav acid = S where ampicillin = S	Other rules
-genus	like	.*	AMX	S	AMC	S	Non-EUCAST: set amoxicillin/clav acid = S where ampicillin = S	Other rules
-genus	like	.*	AMP	S	SAM	S	Non-EUCAST: set ampicillin/sulbactam = S where ampicillin = S	Other rules
-genus	like	.*	AMX	S	SAM	S	Non-EUCAST: set ampicillin/sulbactam = S where ampicillin = S	Other rules
-genus	like	.*	PIP	S	TZP	S	Non-EUCAST: set piperacillin/tazobactam = S where piperacillin = S	Other rules
-genus	like	.*	TMP	S	SXT	S	Non-EUCAST: set trimethoprim/sulfa = S where trimethoprim = S	Other rules
 order	is	Enterobacterales	AMP	S	AMX	S	Enterobacterales (Order)	Breakpoints
 order	is	Enterobacterales	AMP	I	AMX	I	Enterobacterales (Order)	Breakpoints
 order	is	Enterobacterales	AMP	R	AMX	R	Enterobacterales (Order)	Breakpoints
--- a/data-raw/microorganisms.translation.rds
+++ b/data-raw/microorganisms.translation.rds
--- a/data-raw/microorganisms.txt
+++ b/data-raw/microorganisms.txt
--- a/data-raw/reproduction_of_antibiotics.R
+++ b/data-raw/reproduction_of_antibiotics.R
@@ -322,7 +322,7 @@ antibiotics[which(antibiotics$ab == as.ab("cefuroxim")), "abbreviations"][[1]] <
 antibiotics[which(antibiotics$ab == as.ab("cefotaxim")), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == as.ab("cefotaxim")), "abbreviations"][[1]], "cftx"))
 antibiotics[which(antibiotics$ab == as.ab("ceftazidime")), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == as.ab("ceftazidime")), "abbreviations"][[1]], "cftz"))
 antibiotics[which(antibiotics$ab == as.ab("cefepime")), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == as.ab("cefepime")), "abbreviations"][[1]], "cfpi"))
-antibiotics[which(antibiotics$ab == as.ab("cefoxitin")), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == as.ab("cefoxitin")), "abbreviations"][[1]], "cfxt"))
+antibiotics[which(antibiotics$ab == as.ab("cefoxitin")), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == as.ab("cefoxitin")), "abbreviations"][[1]], "cfxt", "cfsc"))
 # More GLIMS codes
 antibiotics[which(antibiotics$ab == "CAZ"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CAZ"), "abbreviations"][[1]], "cftz"))
 antibiotics[which(antibiotics$ab == "CRO"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CRO"), "abbreviations"][[1]], "cftr"))
@@ -377,6 +377,7 @@ antibiotics[which(antibiotics$ab == "CTX"), "abbreviations"][[1]] <- list(c(anti
 antibiotics[which(antibiotics$ab == "CAZ"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CAZ"), "abbreviations"][[1]], "cftz"))
 antibiotics[which(antibiotics$ab == "CFM"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CFM"), "abbreviations"][[1]], "cfxm"))
 antibiotics[which(antibiotics$ab == "FOX"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "FOX"), "abbreviations"][[1]], "cfxt"))
+antibiotics[which(antibiotics$ab == "CZA"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CZA"), "abbreviations"][[1]], "cfav"))
 antibiotics[which(antibiotics$ab == "CZO"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CZO"), "abbreviations"][[1]], "cfzl"))
 antibiotics[which(antibiotics$ab == "CZX"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CZX"), "abbreviations"][[1]], "cfzx"))
 antibiotics[which(antibiotics$ab == "CHL"), "abbreviations"][[1]] <- list(c(antibiotics[which(antibiotics$ab == "CHL"), "abbreviations"][[1]], "chlo"))
@@ -577,10 +578,10 @@ antibiotics <- antibiotics %>%

 # set as data.frame again
 antibiotics <- as.data.frame(antibiotics, stringsAsFactors = FALSE)
-class(antibiotics$ab) <- "ab"
+class(antibiotics$ab) <- c("ab", "character")
 antibiotics <- antibiotics %>% arrange(name)

-# make all abbreviations and synonyms lower case, unique and alphabetically sorted
+# make all abbreviations and synonyms lower case, unique and alphabetically sorted ----
 for (i in 1:nrow(antibiotics)) {
  abb <- sort(unique(tolower(antibiotics[i, "abbreviations"][[1]])))
  syn <- sort(unique(tolower(antibiotics[i, "synonyms"][[1]])))
--- a/data-raw/reproduction_of_microorganisms.R
+++ b/data-raw/reproduction_of_microorganisms.R
@@ -23,94 +23,134 @@

 # Data retrieved from the Catalogue of Life (CoL) through the Encyclopaedia of Life:
 # https://opendata.eol.org/dataset/catalogue-of-life/
+# Data retrieved from the Global Biodiversity Information Facility (GBIF): 
 # https://doi.org/10.15468/rffz4x
-# (download the resource file with a name like "Catalogue of Life yyyy-mm-dd")
-# and from the Leibniz Institute DSMZ-German Collection of Microorganisms and Cell Cultures
-# https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html
-# (download the latest "Complete List" as xlsx file)
+#
+# And from the Leibniz Institute: German Collection of Microorganisms and Cell Cultures (DSMZ)
+# (register first at https://bacdive.dsmz.de/api/pnu/registration/register/ and use API as done below)

 library(dplyr)
 library(AMR)
+# also needed: data.table, httr, jsonlite, cleaner, stringr

-# unzip and extract taxon.tab (around 1.5 GB) from the CoL archive, then:
-# data_col <- data.table::fread("data-raw/taxon.tab")
-data_col <- data.table::fread("data-raw/taxa.txt", quote = "")
+# unzip and extract taxa.txt (both around 1.5 GB, 3.7-3.9M rows) from Col and GBIF, then:
+data_col_raw <- data.table::fread("data-raw/taxon.tab", quote = "")
+data_gbif <- data.table::fread("data-raw/taxa.txt", quote = "")

-# read the xlsx file from DSMZ (only around 2.5 MB):
-data_dsmz <- readxl::read_xlsx("data-raw/DSMZ_bactnames.xlsx")
+# merge the two
+data_col <- data_gbif %>% 
+  rename(referenceID = identifier) %>% 
+  bind_rows(data_col_raw) %>% 
+  distinct(scientificName, kingdom, genus, specificEpithet, infraspecificEpithet, .keep_all = TRUE)
+rm(data_col_raw)
+rm(data_gbif)
+
+
+# read the data from the DSMZ API (around 19000 rows)
+dsmz_username <- ""
+dsmz_password <- ""
+GET_df <- function(url) {
+  result <- httr::GET(url, httr::authenticate(dsmz_username, dsmz_password))
+  httr::stop_for_status(result)
+  result %>%
+    httr::content(type = "text", encoding = "UTF-8") %>%
+    jsonlite::fromJSON(flatten = TRUE)
+}
+dsmz_first <- GET_df("https://bacdive.dsmz.de/api/pnu/species?page=1&format=json")
+data_dsmz <- dsmz_first$results
+# this next process will take appr. `dsmz_first$count / 100 * 5 / 60` minutes
+for (i in 2:round((dsmz_first$count / 100) + 0.5)) {
+  data_dsmz <<- rbind(data_dsmz,
+                      GET_df(paste0("https://bacdive.dsmz.de/api/pnu/species/?page=", i, "&format=json"))$results)
+  cat(i, "-", AMR:::percentage(i / round((dsmz_first$count / 100) + 0.5)), "\n")
+}
+rm(dsmz_first)

 # the CoL data is over 3.7M rows:
-data_col %>% freq(kingdom)
+data_col %>% cleaner::freq(kingdom)
 #      Item             Count   Percent   Cum. Count   Cum. Percent
 # ---  ----------  ----------  --------  -----------  -------------
-# 1    Animalia     2,225,627     59.1%    2,225,627          59.1%
-# 2    Plantae      1,177,412     31.3%    3,403,039          90.4%
-# 3    Fungi          290,145      7.7%    3,693,184          98.1%
-# 4    Chromista       47,126      1.3%    3,740,310          99.3%
-# 5    Bacteria        14,478      0.4%    3,754,788          99.7%
-# 6    Protozoa         6,060      0.2%    3,760,848          99.9%
-# 7    Viruses          3,827      0.1%    3,764,675         100.0%
-# 8    Archaea            610      0.0%    3,765,285         100.0%
+# 1    Animalia     2,494,992    55.43%    2,494,992         55.43%
+# 2    Plantae      1,379,674    30.65%    3,874,666         86.08%
+# 3    Fungi          547,619    12.17%    4,422,285         98.24%
+# 4    Chromista       51,475     1.14%    4,473,760         99.39%
+# 5    Bacteria        14,442     0.32%    4,488,202         99.71%
+# 6    Protozoa         8,750     0.19%    4,496,952         99.90%
+# 7    Viruses          3,805     0.08%    4,500,757         99.99%
+# 8    Archaea            609     0.01%    4,501,366        100.00%

 # clean data_col
-data_col <- data_col %>%
+data_col.bak <- data_col
+data_col_old <- data_col %>%
+  # filter: has new accepted name
+  filter(!is.na(acceptedNameUsageID)) %>% 
  as_tibble() %>%
-  select(col_id = taxonID,
-         col_id_new = acceptedNameUsageID,
-         fullname = scientificName,
-         kingdom,
-         phylum,
-         class,
-         order,
-         family,
-         genus,
-         species = specificEpithet,
-         subspecies = infraspecificEpithet,
-         rank = taxonRank,
-         ref = scientificNameAuthorship,
-         species_id = references)
-data_col$source <- "CoL"
+  transmute(fullname = trimws(stringr::str_replace(scientificName, 
+                                                   pattern = stringr::fixed(scientificNameAuthorship), 
+                                                   replacement = "")),
+            fullname_new = trimws(paste(ifelse(is.na(genus), "", genus), 
+                                        ifelse(is.na(specificEpithet), "", specificEpithet), 
+                                        ifelse(is.na(infraspecificEpithet), "", infraspecificEpithet))),
+            ref = scientificNameAuthorship,
+            prevalence = NA_integer_)
+data_col <- data_col %>%
+  # filter: has no new accepted name
+  filter(is.na(acceptedNameUsageID)) %>% 
+  as_tibble() %>%
+  transmute(fullname = "",
+            kingdom,
+            phylum,
+            class,
+            order,
+            family,
+            genus,
+            species = specificEpithet,
+            subspecies = infraspecificEpithet,
+            rank = taxonRank,
+            ref = scientificNameAuthorship,
+            species_id = referenceID,
+            source = "CoL")

 # clean data_dsmz
-data_dsmz <- data_dsmz %>%
+data_dsmz.bak <- data_dsmz
+data_dsmz_old <- data_dsmz %>%
+  # filter: correct name is not NULL
+  filter(!sapply(correct_name, is.null)) %>% 
  as_tibble() %>%
-  transmute(col_id = NA_integer_,
-            col_id_new = NA_integer_,
-            fullname = "",
-            # kingdom = "",
-            # phylum = "",
-            # class = "",
-            # order = "",
-            # family = "",
-            genus = ifelse(is.na(GENUS), "", GENUS),
-            species = ifelse(is.na(SPECIES), "", SPECIES),
-            subspecies = ifelse(is.na(SUBSPECIES), "", SUBSPECIES),
+  transmute(fullname = trimws(paste(ifelse(is.na(genus), "", genus), 
+                                    ifelse(is.na(species_epithet), "", species_epithet), 
+                                    ifelse(is.na(subspecies_epithet), "", subspecies_epithet))),
+            fullname_new = sapply(correct_name, function(x) x[2L]),
+            ref = authors,
+            prevalence = NA_integer_)
+
+data_dsmz <- data_dsmz %>%
+  # filter: correct name is NULL
+  filter(sapply(correct_name, is.null)) %>% 
+  as_tibble() %>%
+  transmute(fullname = "",
+            kingdom = regio,
+            phylum,
+            class = classis,
+            # order = "", # does not contain order, will add later based on CoL
+            family = familia,
+            genus = ifelse(is.na(genus), "", genus),
+            species = ifelse(is.na(species_epithet), "", species_epithet),
+            subspecies = ifelse(is.na(subspecies_epithet), "", subspecies_epithet),
            rank = ifelse(species == "", "genus", "species"),
-            ref = AUTHORS,
-            species_id = as.character(RECORD_NO),
+            ref = authors,
+            species_id = as.character(pnu_no),
            source = "DSMZ")

 # DSMZ only contains genus/(sub)species, try to find taxonomic properties based on genus and data_col
 ref_taxonomy <- data_col %>%
-  filter(genus %in% data_dsmz$genus,
-         kingdom %in% c("Bacteria", "Chromista", "Archaea", "Protozoa", "Fungi"),
-         family != "") %>%
-  mutate(kingdom = factor(kingdom,
-                          # in the left_join following, try Bacteria first, then Chromista, ...
-                          levels = c("Bacteria", "Chromista", "Archaea", "Protozoa", "Fungi"),
-                          ordered = TRUE)) %>% 
+  filter(family %in% data_dsmz$family & family != "") %>% 
  arrange(kingdom) %>% 
-  distinct(genus, .keep_all = TRUE) %>%
-  select(kingdom, phylum, class, order, family, genus)
+  distinct(family, .keep_all = TRUE) %>% 
+  select(family, order)

 data_dsmz <- data_dsmz %>%
-  left_join(ref_taxonomy, by = "genus") %>%
-  mutate(kingdom = "Bacteria",
-         phylum = ifelse(is.na(phylum), "(unknown phylum)", phylum),
-         class = ifelse(is.na(class), "(unknown class)", class),
-         order = ifelse(is.na(order), "(unknown order)", order),
-         family = ifelse(is.na(family), "(unknown family)", family),
-  )
+  left_join(ref_taxonomy, by = "family") # NAs will later become "(unknown ...)"

 # combine everything
 data_total <- data_col %>%
@@ -119,6 +159,8 @@ data_total <- data_col %>%
 rm(data_col)
 rm(data_dsmz)
 rm(ref_taxonomy)
+rm(data_col.bak)
+rm(data_dsmz.bak)

 mo_found_in_NL <- c("Absidia", "Acremonium", "Actinotignum", "Aedes", "Alternaria", "Anaerosalibacter", "Ancylostoma", 
                    "Angiostrongylus", "Anisakis", "Anopheles", "Apophysomyces", "Arachnia", "Ascaris", "Aspergillus", 
@@ -158,8 +200,6 @@ MOs <- data_total %>%
    )
    # or the genus has to be one of the genera we found in our hospitals last decades (Northern Netherlands, 2002-2018)
    | genus %in% mo_found_in_NL
-    # or the taxonomic entry is old - the species was renamed
-    | !is.na(col_id_new)
  ) %>%
  # really no Plantae (e.g. Dracunculus exist both as worm and as plant)
  filter(kingdom != "Plantae") %>% 
@@ -174,59 +214,56 @@ MOs <- MOs %>% bind_rows(data_total %>%
                                  | (family %in% MOs$family & rank == "family")
                                  | (genus %in% MOs$genus & rank == "genus")))

-# filter old taxonomic names so only the ones with an existing reference will be kept
-MOs <- MOs %>%
-  filter(is.na(col_id_new) | (!is.na(col_id_new) & col_id_new %in% MOs$col_id))
+get_author_year <- function(ref) {
+  # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
+  
+  authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT")
+  # remove leading and trailing brackets
+  authors2 <- gsub("^[(](.*)[)]$", "\\1", authors2)
+  # only take part after brackets if there's a name
+  authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
+                     gsub(".*[)] (.*)", "\\1", authors2),
+                     authors2)
+  # get year from last 4 digits
+  lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2))
+  # can never be later than now
+  lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
+                    NA,
+                    lastyear)
+  # get authors without last year
+  authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2)
+  # remove nonsense characters from names
+  authors <- gsub("[^a-zA-Z,'& -]", "", authors)
+  # remove trailing and leading spaces
+  authors <- trimws(authors)
+  # only keep first author and replace all others by 'et al'
+  authors <- gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors)
+  # et al. always with ending dot
+  authors <- gsub(" et al\\.?", " et al.", authors)
+  authors <- gsub(" ?,$", "", authors)
+  # don't start with 'sensu' or 'ehrenb'
+  authors <- gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE)
+  # no initials, only surname
+  authors <- gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE)
+  # combine author and year if year is available
+  ref <- ifelse(!is.na(lastyear),
+                paste0(authors, ", ", lastyear),
+                authors)
+  # fix beginning and ending
+  ref <- gsub(", $", "", ref)
+  ref <- gsub("^, ", "", ref)
+  ref <- gsub("^(emend|et al.,?)", "", ref)
+  ref <- trimws(ref)
+  
+  # a lot start with a lowercase character - fix that
+  ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE)
+  # specific one for the French that are named dOrbigny 
+  ref[grepl("^d[A-Z]", ref)] <- gsub("^d", "d'", ref[grepl("^d[A-Z]", ref)])
+  ref <- gsub(" +", " ", ref)
+  ref
+}

-MOs <- MOs %>%
-  # remove text if it contains 'Not assigned' like phylum in viruses
-  mutate_all(~gsub("(Not assigned|\\[homonym\\]|\\[mistake\\])", "", ., ignore.case = TRUE))
-
-MOs <- MOs %>%
-  # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011':
-  mutate(authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"),
-         # remove leading and trailing brackets
-         authors2 = gsub("^[(](.*)[)]$", "\\1", authors2),
-         # only take part after brackets if there's a name
-         authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
-                           gsub(".*[)] (.*)", "\\1", authors2),
-                           authors2),
-         # get year from last 4 digits
-         lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)),
-         # can never be later than now
-         lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
-                           NA,
-                           lastyear),
-         # get authors without last year
-         authors = gsub("(.*)[0-9]{4}$", "\\1", authors2),
-         # remove nonsense characters from names
-         authors = gsub("[^a-zA-Z,'& -]", "", authors),
-         # remove trailing and leading spaces
-         authors = trimws(authors),
-         # only keep first author and replace all others by 'et al'
-         authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors),
-         # et al. always with ending dot
-         authors = gsub(" et al\\.?", " et al.", authors),
-         authors = gsub(" ?,$", "", authors),
-         # don't start with 'sensu' or 'ehrenb'
-         authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE),
-         # no initials, only surname
-         authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE),
-         # combine author and year if year is available
-         ref = ifelse(!is.na(lastyear),
-                      paste0(authors, ", ", lastyear),
-                      authors),
-         # fix beginning and ending
-         ref = gsub(", $", "", ref),
-         ref = gsub("^, ", "", ref),
-         ref = gsub("^(emend|et al.,?)", "", ref),
-         ref = trimws(ref)
-  )
-# a lot start with a lowercase character - fix that
-MOs$ref[!grepl("^d[A-Z]", MOs$ref)] <- gsub("^([a-z])", "\\U\\1", MOs$ref[!grepl("^d[A-Z]", MOs$ref)], perl = TRUE)
-# specific one for the French that are named dOrbigny 
-MOs$ref[grepl("^d[A-Z]", MOs$ref)] <- gsub("^d", "d'", MOs$ref[grepl("^d[A-Z]", MOs$ref)])
-MOs <- MOs %>% mutate(ref = gsub(" +", " ", ref))
+MOs <- MOs %>% mutate(ref = get_author_year(ref))

 # Remove non-ASCII characters (these are not allowed by CRAN)
 MOs <- MOs %>%
@@ -235,53 +272,58 @@ MOs <- MOs %>%
  # remove invalid characters
  mutate_all(~gsub("[\"'`]+", "", .))

-# Split old taxonomic names - they refer in the original data to a new `taxonID` with `acceptedNameUsageID`
-MOs.old <- MOs %>%
-  filter(!is.na(col_id_new),
-         ref != "",
-         source != "DSMZ") %>%
-  transmute(col_id,
-            col_id_new,
-            fullname =
-              trimws(
-                gsub("(.*)[(].*", "\\1",
-                     stringr::str_replace(
-                       string = fullname,
-                       pattern = stringr::fixed(authors2),
-                       replacement = "")) %>%
-                  gsub(" (var|f|subsp)[.]", "", .)),
-            ref) %>%
-  filter(!is.na(fullname)) %>%
-  distinct(fullname, .keep_all = TRUE) %>%
-  arrange(col_id)
+# set new fullnames
+MOs <- MOs %>%
+  mutate(fullname = trimws(case_when(rank == "family" ~ family,
+                                     rank == "order" ~ order,
+                                     rank == "class" ~ class,
+                                     rank == "phylum" ~ phylum,
+                                     rank == "kingdom" ~ kingdom,
+                                     TRUE ~ paste(genus, species, subspecies))),
+         fullname = gsub(" (var|f|subsp)[.]", "", fullname)) %>% 
+  # remove text if it contains 'Not assigned', etc.
+  mutate_all(function(x) ifelse(x %like% "(not assigned|homonym|mistake)", NA, x)) %>% 
+  # clean taxonomy
+  mutate(kingdom = ifelse(is.na(kingdom) | trimws(kingdom) == "", "(unknown kingdom)", trimws(kingdom)),
+         phylum = ifelse(is.na(phylum) | trimws(phylum) == "", "(unknown phylum)", trimws(phylum)),
+         class = ifelse(is.na(class) | trimws(class) == "", "(unknown class)", trimws(class)),
+         order = ifelse(is.na(order) | trimws(order) == "", "(unknown order)", trimws(order)),
+         family = ifelse(is.na(family) | trimws(family) == "", "(unknown family)", trimws(family)))

-MO.bak <- MOs
+# Split old taxonomic names
+MOs.old <- data_col_old %>% 
+  filter(!gsub(" (var|f|subsp)[.]", "", fullname_new) %in% data_dsmz_old$fullname) %>% 
+  bind_rows(data_dsmz_old) %>%
+  mutate(fullname_new = gsub(" (var|f|subsp)[.]", "", fullname_new),
+         fullname = gsub(" (var|f|subsp)[.]", "", fullname)) %>% 
+  # for cases like Chlamydia pneumoniae -> Chlamydophila pneumoniae -> Chlamydia pneumoniae:
+  filter(!fullname %in% fullname_new &
+           fullname_new %in% MOs$fullname &
+           !is.na(fullname) & 
+           fullname != fullname_new) %>%
+  distinct(fullname, .keep_all = TRUE) %>%
+  arrange(fullname) %>% 
+  mutate(ref = get_author_year(ref))

 MOs <- MOs %>%
-  filter(is.na(col_id_new) | source == "DSMZ") %>%
-  transmute(col_id,
-            fullname = trimws(case_when(rank == "family" ~ family,
-                                        rank == "order" ~ order,
-                                        rank == "class" ~ class,
-                                        rank == "phylum" ~ phylum,
-                                        rank == "kingdom" ~ kingdom,
-                                        TRUE ~ paste(genus, species, subspecies))),
+  # remove entries that are old and in MOs.old
+  filter(!fullname %in% MOs.old$fullname) %>% 
+  # mark up
+  transmute(fullname,
            kingdom,
            phylum,
            class,
            order,
            family,
-            genus = gsub(":", "", genus),
+            genus,
            species,
            subspecies,
            rank,
            ref,
-            species_id = gsub(".*/([a-f0-9]+)", "\\1", species_id),
+            species_id = gsub("[^a-zA-Z0-9].*", "", species_id),
            source) %>%
-  #distinct(fullname, .keep_all = TRUE) %>%
-  filter(!grepl("unassigned", fullname, ignore.case = TRUE)) %>% 
-  # prefer DSMZ over CoL, since that's more recent
-  arrange(desc(source)) %>% 
+  # prefer known taxonomy over unknown taxonomy, then DSMZ over CoL (= desc)
+  arrange(desc(kingdom, genus, species, source)) %>%
  distinct(kingdom, fullname, .keep_all = TRUE)

 # remove all genera that have no species - they are irrelevant for microbiology and almost all from the kingdom of Animalia
@@ -296,43 +338,45 @@ to_remove <- MOs %>%
 MOs <- MOs %>% filter(!(paste(kingdom, genus) %in% to_remove))
 rm(to_remove)

-# add CoL's col_id, source and ref from MOs.bak, for the cases where DSMZ took preference
+# add all mssing genera, families and orders
 MOs <- MOs %>% 
-  mutate(kingdom_fullname = paste(kingdom, fullname)) %>% 
-  left_join(MO.bak %>%
-              filter(is.na(col_id_new), !is.na(col_id)) %>%
-              transmute(col_id, species_id, source, ref, kingdom_fullname = trimws(paste(kingdom, genus, species, subspecies))), 
-            by = "kingdom_fullname",
-            suffix = c("_dsmz", "_col")) %>% 
-  mutate(col_id = col_id_col, 
-         species_id = ifelse(!is.na(species_id_col) & ref_col == ref_dsmz, 
-                             gsub(".*/(.*)$", "\\1", species_id_col), 
-                             species_id_dsmz),
-         source = ifelse(!is.na(species_id_col) & ref_col == ref_dsmz,
-                         source_col, 
-                         source_dsmz), 
-         ref = ifelse(!is.na(species_id_col) & ref_col == ref_dsmz,
-                      ref_col, 
-                      ref_dsmz)) %>% 
-  select(-matches("(_col|_dsmz|kingdom_fullname)"))
+  bind_rows(MOs %>% 
+              arrange(genus, species) %>%
+              distinct(genus, .keep_all = TRUE) %>%
+              filter(rank == "species") %>%
+              mutate(fullname = genus, 
+                     species = "", 
+                     rank = "genus", 
+                     species_id = "",
+                     ref = NA_character_)) %>% 
+  bind_rows(MOs %>% 
+              arrange(family, genus) %>%
+              distinct(family, .keep_all = TRUE) %>%
+              filter(rank == "genus") %>%
+              mutate(fullname = family, 
+                     genus = "",
+                     rank = "family", 
+                     species_id = "",
+                     ref = NA_character_)) %>% 
+  bind_rows(MOs %>% 
+              arrange(order, family) %>%
+              distinct(family, .keep_all = TRUE) %>%
+              filter(rank == "family") %>%
+              mutate(fullname = order, 
+                     family = "",
+                     rank = "order", 
+                     species_id = "",
+                     ref = NA_character_))

-
-MOs.old <- MOs.old %>%
-  # remove the ones that are in the MOs data set
-  filter(col_id_new %in% MOs$col_id) %>% 
-  # and remove the ones that have the exact same fullname in the MOs data set, like Moraxella catarrhalis
-  left_join(MOs, by = "fullname") %>%
-  filter(col_id_new != col_id.y | is.na(col_id.y)) %>% 
-  select(col_id = col_id.x, col_id_new, fullname, ref = ref.x)
-
-# remove the records that are in MOs.old
-sum(MOs.old$fullname %in% MOs$fullname)
-MOs <- MOs %>% filter(!fullname %in% MOs.old$fullname)
-sum(MOs.old$fullname %in% MOs$fullname)
+# remove the empty ones
+MOs <- MOs %>%
+  mutate(fullname = gsub(",.*", "", fullname)) %>% 
+  distinct(kingdom, fullname, .keep_all = TRUE) %>% 
+  filter(fullname != "")

 # what characters are in the fullnames?
 table(sort(unlist(strsplit(x = paste(MOs$fullname, collapse = ""), split = ""))))
-MOs %>% filter(!fullname %like% "^[a-z ]+$") %>% View()
+MOs %>% filter(!fullname %like% "^[a-z ]+$") %>% arrange(fullname) %>% View()

 table(MOs$kingdom, MOs$rank)
 table(AMR::microorganisms$kingdom, AMR::microorganisms$rank)
@@ -436,7 +480,6 @@ MOs <- MOs %>%
  bind_rows(
    # Unknowns
    data.frame(mo = "UNKNOWN",
-               col_id = NA_integer_,
               fullname = "(unknown name)",
               kingdom = "(unknown kingdom)",
               phylum = "(unknown phylum)",
@@ -453,7 +496,6 @@ MOs <- MOs %>%
               prevalence = 1,
               stringsAsFactors = FALSE),
    data.frame(mo = "B_GRAMN",
-               col_id = NA_integer_,
               fullname = "(unknown Gram-negatives)",
               kingdom = "Bacteria",
               phylum = "(unknown phylum)",
@@ -470,7 +512,6 @@ MOs <- MOs %>%
               prevalence = 1,
               stringsAsFactors = FALSE),
    data.frame(mo = "B_GRAMP",
-               col_id = NA_integer_,
               fullname = "(unknown Gram-positives)",
               kingdom = "Bacteria",
               phylum = "(unknown phylum)",
@@ -487,7 +528,6 @@ MOs <- MOs %>%
               prevalence = 1,
               stringsAsFactors = FALSE),
    data.frame(mo = "F_YEAST",
-               col_id = NA_integer_,
               fullname = "(unknown yeast)",
               kingdom = "Fungi",
               phylum = "(unknown phylum)",
@@ -504,7 +544,6 @@ MOs <- MOs %>%
               prevalence = 2,
               stringsAsFactors = FALSE),
    data.frame(mo = "F_FUNGUS",
-               col_id = NA_integer_,
               fullname = "(unknown fungus)",
               kingdom = "Fungi",
               phylum = "(unknown phylum)",
@@ -524,7 +563,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_CONS", mo),
-             col_id = NA_integer_,
             species = "coagulase-negative",
             fullname = "Coagulase-negative Staphylococcus (CoNS)",
             ref = NA_character_,
@@ -534,7 +572,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_COPS", mo),
-             col_id = NA_integer_,
             species = "coagulase-positive",
             fullname = "Coagulase-positive Staphylococcus (CoPS)",
             ref = NA_character_,
@@ -558,7 +595,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPC", mo),
-             col_id = NA_integer_,
             species = "group C" ,
             fullname = "Streptococcus group C",
             ref = NA_character_,
@@ -567,7 +603,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPD", mo),
-             col_id = NA_integer_,
             species = "group D" ,
             fullname = "Streptococcus group D",
             ref = NA_character_,
@@ -576,7 +611,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPF", mo),
-             col_id = NA_integer_,
             species = "group F" ,
             fullname = "Streptococcus group F",
             ref = NA_character_,
@@ -585,7 +619,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPG", mo),
-             col_id = NA_integer_,
             species = "group G" ,
             fullname = "Streptococcus group G",
             ref = NA_character_,
@@ -594,7 +627,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPH", mo),
-             col_id = NA_integer_,
             species = "group H" ,
             fullname = "Streptococcus group H",
             ref = NA_character_,
@@ -603,7 +635,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_GRPK", mo),
-             col_id = NA_integer_,
             species = "group K" ,
             fullname = "Streptococcus group K",
             ref = NA_character_,
@@ -613,7 +644,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_HAEM", mo),
-             col_id = NA_integer_,
             species = "beta-haemolytic" ,
             fullname = "Beta-haemolytic Streptococcus",
             ref = NA_character_,
@@ -623,7 +653,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_VIRI", mo),
-             col_id = NA_integer_,
             species = "viridans" ,
             fullname = "Viridans Group Streptococcus (VGS)",
             ref = NA_character_,
@@ -633,7 +662,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_MILL", mo),
-             col_id = NA_integer_,
             species = "milleri" ,
             fullname = "Milleri Group Streptococcus (MGS)",
             ref = NA_character_,
@@ -646,7 +674,6 @@ MOs <- MOs %>%
      mutate(mo = paste0(mo, "_HMNS"),
             fullname = paste(fullname, "hominis"),
             species = "hominis",
-             col_id = NA,
             source = "manually added",
             ref = NA_character_,
             species_id = ""),
@@ -654,7 +681,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(fullname == "Dientamoeba") %>%
      mutate(mo = gsub("(.*?)_.*", "\\1_THMNS", mo),
-             col_id = NA,
             fullname = "Trichomonas",
             family = "Trichomonadidae",
             genus = "Trichomonas",
@@ -664,7 +690,6 @@ MOs <- MOs %>%
    MOs %>%
      filter(fullname == "Dientamoeba fragilis") %>%
      mutate(mo = gsub("(.*?)_.*", "\\1_THMNS_VAG", mo),
-             col_id = NA,
             fullname = "Trichomonas vaginalis",
             family = "Trichomonadidae",
             genus = "Trichomonas",
@@ -675,7 +700,6 @@ MOs <- MOs %>%
    MOs %>% # add family as such too
      filter(fullname == "Monocercomonadidae") %>%
      mutate(mo = gsub("(.*)_(.*)_.*", "\\1_\\2_TRCHMNDD", mo),
-             col_id = NA,
             fullname = "Trichomonadidae",
             family = "Trichomonadidae",
             rank = "family",
@@ -760,33 +784,37 @@ new_families <- MOs %>%
  filter(order == "Enterobacterales") %>%
  pull(family) %>%
  unique()
-class(MOs$mo) <- "character"
-MOs <- rbind(MOs %>% filter(!(rank == "family" & fullname %in% new_families)), 
-             AMR::microorganisms %>%
-               select(-snomed) %>% 
-               filter(family == "Enterobacteriaceae" & rank == "family") %>%
-               rbind(., ., ., ., ., ., .) %>% 
-               mutate(fullname = new_families,
-                      source = "manually added",
-                      ref = "Adeolu et al., 2016",
-                      family = fullname, mo = paste0("B_[FAM]_",
-                                                     toupper(abbreviate(new_families,
-                                                                        minlength = 8,
-                                                                        use.classes = TRUE,
-                                                                        method = "both.sides",
-                                                                        strict = FALSE)))))
+
+MOs <- MOs %>% 
+  filter(!(rank == "family" & fullname %in% new_families)) %>% 
+  bind_rows(tibble(mo = paste0("B_[FAM]_",
+                               toupper(abbreviate(new_families,
+                                                  minlength = 8,
+                                                  use.classes = TRUE,
+                                                  method = "both.sides",
+                                                  strict = FALSE))),
+                   fullname = new_families,
+                   kingdom = "Bacteria",
+                   phylum = "Proteobacteria",
+                   class = "Gammaproteobacteria",
+                   order = "Enterobacterales",
+                   family = new_families,
+                   genus = "",
+                   species = "",
+                   subspecies = "",
+                   rank = "family",
+                   ref = "Adeolu et al., 2016",
+                   species_id = NA_character_,
+                   source = "manually added",
+                   prevalence = 1))
+
 MOs[which(MOs$order == "Enterobacteriales"), "order"] <- "Enterobacterales"
 MOs[which(MOs$fullname == "Enterobacteriales"), "fullname"] <- "Enterobacterales"

-MOs <- MOs %>%
-  group_by(kingdom) %>%
-  distinct(fullname, .keep_all = TRUE) %>% 
-  ungroup() %>% 
-  filter(fullname != "")
-
 # add prevalence to old taxonomic names
 MOs.old <- MOs.old %>% 
-  left_join(MOs %>% select(col_id, prevalence), by = c("col_id_new" = "col_id"))
+  select(-prevalence) %>% 
+  left_join(MOs %>% select(fullname, prevalence), by = c("fullname_new" = "fullname"))

 # everything distinct?
 sum(duplicated(MOs$mo))
@@ -797,18 +825,105 @@ colnames(MOs)
 MOs %>% arrange(fullname) %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% View()
 MOs.old %>% arrange(fullname) %>% filter(!fullname %in% AMR::microorganisms.old$fullname) %>% View()
 # and the ones we lost:
-AMR::microorganisms %>% filter(!fullname %in% MOs$fullname) %>% View() # based on fullname
-AMR::microorganisms %>% filter(!mo %in% MOs$mo) %>% View()             # based on mo
-AMR::microorganisms %>% filter(!mo %in% MOs$mo & !fullname %in% MOs$fullname) %>% View() 
+# AMR::microorganisms %>% filter(!fullname %in% MOs$fullname) %>% View() # based on fullname
+AMR::microorganisms %>% filter(!fullname %in% c(MOs$fullname, MOs.old$fullname)) %>% View() # excluding renamed ones
+# AMR::microorganisms %>% filter(!mo %in% MOs$mo) %>% View()             # based on mo
+# AMR::microorganisms %>% filter(!mo %in% MOs$mo & !fullname %in% MOs$fullname) %>% View() 
 # and these IDs have changed:
 old_new <- MOs %>%
  mutate(kingdom_fullname = paste(kingdom, fullname)) %>% 
-  filter(kingdom_fullname %in% (AMR::microorganisms %>% mutate(kingdom_fullname = paste(kingdom, fullname)) %>% pull(kingdom_fullname))) %>%
-  left_join(AMR::microorganisms %>% mutate(kingdom_fullname = paste(kingdom, fullname)) %>% select(mo, kingdom_fullname), by = "kingdom_fullname", suffix = c("_new", "_old")) %>% 
+  filter(kingdom_fullname %in% (AMR::microorganisms %>% 
+                                  mutate(kingdom_fullname = paste(kingdom, fullname)) %>%
+                                  pull(kingdom_fullname))) %>%
+  left_join(AMR::microorganisms %>% 
+              mutate(kingdom_fullname = paste(kingdom, fullname)) %>%
+              select(mo, kingdom_fullname), by = "kingdom_fullname", suffix = c("_new", "_old")) %>% 
  filter(mo_new != mo_old) %>% 
  select(mo_old, mo_new, everything())
-
 View(old_new)
+
+# set new MO codes as names to existing data sets
+rsi_translation$mo <- mo_name(rsi_translation$mo, language = NULL)
+microorganisms.codes$mo <- mo_name(microorganisms.codes$mo, language = NULL)
+microorganisms.translation <- AMR:::microorganisms.translation %>%
+  bind_rows(tibble(mo_old = AMR:::microorganisms.translation$mo_new, mo_new = mo_old)) %>%
+  filter(!mo_old %in% MOs$mo) %>% 
+  mutate(mo_new = mo_name(mo_new, language = NULL)) %>% 
+  bind_rows(old_new %>% select(mo_old, mo_new)) %>% 
+  distinct(mo_old, .keep_all = TRUE)
+
+# arrange the data sets to save
+MOs <- MOs %>% arrange(fullname)
+MOs.old <- MOs.old %>% arrange(fullname)
+
+# transform
+MOs <- as.data.frame(MOs, stringsAsFactors = FALSE)
+MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE)
+microorganisms.codes <- as.data.frame(microorganisms.codes, stringsAsFactors = FALSE)
+class(MOs$mo) <- c("mo", "character")
+
+# SAVE
+### for same server
+microorganisms <- dataset_UTF8_to_ASCII(MOs)
+microorganisms.old <- dataset_UTF8_to_ASCII(MOs.old)
+### for other server
+saveRDS(MOs, "microorganisms.rds")
+saveRDS(MOs.old, "microorganisms.old.rds")
+saveRDS(microorganisms.codes, "microorganisms.codes.rds")
+
+# on the server, do:
+usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
+usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
+rm(microorganisms)
+rm(microorganisms.old)
+
+# load new data sets
+devtools::load_all(".")
+
+# reset previously changed mo codes
+rsi_translation$mo <- as.mo(rsi_translation$mo)
+microorganisms.codes$mo <- as.mo(microorganisms.codes$mo)
+class(microorganisms.codes$mo) <- c("mo", "character")
+microorganisms.translation <- microorganisms.translation %>%
+  left_join(microorganisms.old[, c("fullname", "fullname_new")], # microorganisms.old is now new and loaded
+            by = c("mo_new" = "fullname")) %>%
+  mutate(name = ifelse(!is.na(fullname_new), fullname_new, mo_new)) %>% 
+  left_join(microorganisms[, c("fullname", "mo")],               # as is microorganisms
+            by = c("name" = "fullname")) %>% 
+  select(mo_old, mo_new = mo) %>% 
+  filter(!is.na(mo_old), !is.na(mo_new))
+class(microorganisms.translation$mo_old) <- "character" # no class <mo> since those aren't valid MO codes
+class(microorganisms.translation$mo_new) <- c("mo", "character")
+# save those to the package
+usethis::use_data(rsi_translation, overwrite = TRUE, version = 2)
+usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2)
+saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2)
+# to save microorganisms.translation internally to the package
+source("data-raw/internals.R")
+
+# load new data sets again
+devtools::load_all(".")
+
+# and check: these codes should not be missing (will otherwise throw a unit test error):
+AMR::microorganisms.codes %>% filter(!mo %in% MOs$mo)
+AMR::rsi_translation %>% filter(!mo %in% MOs$mo)
+AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo)
+
+# update the example_isolates data set
+example_isolates$mo <- as.mo(example_isolates$mo)
+usethis::use_data(example_isolates, overwrite = TRUE)
+
+# Don't forget to add SNOMED codes! (data-raw/snomed.R)
+
+# run the unit tests
+testthat::test_file("tests/testthat/test-data.R")
+testthat::test_file("tests/testthat/test-mo.R")
+testthat::test_file("tests/testthat/test-mo_property.R")
+
+
+
+# OLD CODE ----------------------------------------------------------------
+
 # to keep all the old IDs:
 # MOs <- MOs %>% filter(!mo %in% old_new$mo_new) %>%
 #   rbind(microorganisms %>%
@@ -816,79 +931,32 @@ View(old_new)
 #           select(mo, fullname) %>%
 #           left_join(MOs %>%
 #                       select(-mo), by = "fullname"))
-
-# and these codes are now missing (which will throw a unit test error):
-AMR::microorganisms.codes %>% filter(!mo %in% MOs$mo)
-AMR::rsi_translation %>% filter(!mo %in% MOs$mo)
-AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo) %>% View()
 # this is how to fix it
-microorganisms.codes <- AMR::microorganisms.codes %>% 
-  left_join(MOs %>%
-              mutate(kingdom_fullname = paste(kingdom, fullname)) %>% 
-              left_join(AMR::microorganisms %>%
-                          transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
-                        by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
-              select(mo_old, mo_new),
-            by = c("mo" = "mo_old")) %>% 
-  select(code, mo = mo_new) %>% 
-  filter(!is.na(mo))
-microorganisms.codes %>% filter(!mo %in% MOs$mo)
-# and for microorganisms.translation:
-microorganisms.translation <- AMR:::microorganisms.translation %>% 
-  select(mo = mo_new) %>% 
-  left_join(AMR::microorganisms %>%
-              transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
-            by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
-  select(mo_old, mo_new)
-  left_join(MOs %>%
-              mutate(kingdom_fullname = paste(kingdom, fullname)) %>% 
-              left_join(AMR::microorganisms %>%
-                          transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
-                        by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
-              select(mo_old, mo_new),
-            by = c("mo" = "mo_old")) %>% 
-  select(code, mo = mo_new) %>% 
-  filter(!is.na(mo))
-microorganisms.codes %>% filter(!mo %in% MOs$mo)
-
-# arrange
-MOs <- MOs %>% arrange(fullname)
-MOs.old <- MOs.old %>% arrange(fullname)
-microorganisms.codes <- microorganisms.codes %>% arrange(code)
-
-# transform
-MOs <- as.data.frame(MOs, stringsAsFactors = FALSE)
-MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE)
-microorganisms.codes <- as.data.frame(microorganisms.codes, stringsAsFactors = FALSE)
-class(MOs$mo) <- "mo"
-class(microorganisms.codes$mo) <- "mo"
-MOs$col_id <- as.integer(MOs$col_id)
-MOs.old$col_id <- as.integer(MOs.old$col_id)
-MOs.old$col_id_new <- as.integer(MOs.old$col_id_new)
-
-# SAVE
-### for other server
-saveRDS(MOs, "microorganisms.rds")
-saveRDS(MOs.old, "microorganisms.old.rds")
-saveRDS(microorganisms.codes, "microorganisms.codes.rds")
-### for same server
-microorganisms <- MOs
-microorganisms.old <- MOs.old
-microorganisms.translation <- old_new %>% select(mo_old, mo_new) %>% as.data.frame()
-class(microorganisms.translation$mo_old) <- "mo"
-class(microorganisms.translation$mo_new) <- "mo"
-
-# on the server, do:
-usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
-usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
-usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2)
-saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2) # this one will be covered in data-raw/internals.R
-rm(microorganisms)
-rm(microorganisms.old)
-rm(microorganisms.codes)
-rm(microorganisms.translation)
-devtools::load_all(".")
-
-# TO DO AFTER THIS
-# * Rerun data-raw/reproduction_of_rsi_translation.R
-# * Run unit tests
+# microorganisms.codes <- AMR::microorganisms.codes %>% 
+#   left_join(MOs %>%
+#               mutate(kingdom_fullname = paste(kingdom, fullname)) %>% 
+#               left_join(AMR::microorganisms %>%
+#                           transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
+#                         by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
+#               select(mo_old, mo_new),
+#             by = c("mo" = "mo_old")) %>% 
+#   select(code, mo = mo_new) %>% 
+#   filter(!is.na(mo))
+# microorganisms.codes %>% filter(!mo %in% MOs$mo)
+# # and for microorganisms.translation:
+# microorganisms.translation <- AMR:::microorganisms.translation %>% 
+#   select(mo = mo_new) %>% 
+#   left_join(AMR::microorganisms %>%
+#               transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
+#             by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
+#   select(mo_old, mo_new)
+#   left_join(MOs %>%
+#               mutate(kingdom_fullname = paste(kingdom, fullname)) %>% 
+#               left_join(AMR::microorganisms %>%
+#                           transmute(mo, kingdom_fullname = paste(kingdom, fullname)),
+#                         by = "kingdom_fullname", suffix = c("_new", "_old")) %>%
+#               select(mo_old, mo_new),
+#             by = c("mo" = "mo_old")) %>% 
+#   select(code, mo = mo_new) %>% 
+#   filter(!is.na(mo))
+# microorganisms.codes %>% filter(!mo %in% MOs$mo)
--- a/data-raw/rsi_translation.txt
+++ b/data-raw/rsi_translation.txt
--- a/data-raw/snomed.R
+++ b/data-raw/snomed.R
@@ -22,16 +22,17 @@
 library(AMR)
 library(tidyverse)

-# go to https://www.nictiz.nl/standaardisatie/terminologiecentrum/referentielijsten/micro-organismen/
+# go to https://www.nictiz.nl/standaardisatie/terminologiecentrum/referentielijsten/micro-organismen/ (Ctrl/Cmd + A in table)
 # read the table from clipboard
-snomed <- clipr::read_clip_tbl()
-# snomed <- snomed %>%
-#   transmute(fullname = trimws(gsub("^genus", "", Omschrijving, ignore.case = TRUE)),
-#             snomed = as.integer(Id))
+snomed <- clipr::read_clip_tbl(skip = 2)
 snomed <- snomed %>%
-  transmute(fullname = mo_name(Omschrijving),
+  dplyr::filter(gsub("(^genus |^familie |^stam |ss.? |subsp.? |subspecies )", "", 
+              Omschrijving.,
+              ignore.case = TRUE) %in% c(microorganisms$fullname, 
+                                         microorganisms.old$fullname)) %>% 
+  dplyr::transmute(fullname = mo_name(Omschrijving.),
            snomed = as.integer(Id)) %>% 
-  filter(!fullname %like% "unknown")
+  dplyr::filter(!fullname %like% "unknown")
 snomed_trans <- snomed %>%
  group_by(fullname) %>%
  mutate(snomed_list = list(snomed)) %>%
@@ -51,59 +52,59 @@ rm(microorganisms)

 # OLD ---------------------------------------------------------------------

-baseUrl <- 'https://browser.ihtsdotools.org/snowstorm/snomed-ct'
-edition <- 'MAIN'
-version <- '2019-07-31'
-
-microorganisms.snomed <- data.frame(conceptid = character(0),
-                                    mo = character(0),
-                                    stringsAsFactors = FALSE)
-microorganisms$snomed <- ""
-
-# for (i in 1:50) {
-for (i in 1:1000) {
-  
-  if (i %% 10 == 0) {
-    cat(paste0(i, " - ", cleaner::percentage(i / nrow(microorganisms)), "\n"))
-  }
-  
-  mo_data <- microorganisms %>% 
-    filter(mo == microorganisms$mo[i]) %>% 
-    as.list()
-  
-  if (!mo_data$rank %in% c("genus", "species")) {
-    next
-  }
-  
-  searchTerm <- paste0(
-    ifelse(mo_data$rank == "genus", "Genus ", ""),
-    mo_data$fullname, 
-    " (organism)")
-  
-  url <- paste0(baseUrl, '/browser/',
-                edition, '/', 
-                version, 
-                '/descriptions?term=', curl::curl_escape(searchTerm),
-                '&mode=fullText&activeFilter=true&limit=', 250)
-  results <- url %>% 
-    httr::GET() %>%
-    httr::content(type = "text", encoding = "UTF-8") %>% 
-    jsonlite::fromJSON(flatten = TRUE) %>% 
-    .$items
-  if (NROW(results) == 0) {
-    next
-  } else {
-    message("Adding ", crayon::italic(mo_data$fullname))
-  }
-  
-  tryCatch(
-    microorganisms$snomed[i] <- results %>% filter(term == searchTerm) %>% pull(concept.conceptId),
-    error = function(e) invisible()
-  )
-  
-  if (nrow(results) > 1) {
-      microorganisms.snomed <- microorganisms.snomed %>% 
-        bind_rows(tibble(conceptid = results %>% filter(term != searchTerm) %>% pull(concept.conceptId) %>% unique(),
-                         mo = as.character(mo_data$mo)))
-  }
-}
+# baseUrl <- 'https://browser.ihtsdotools.org/snowstorm/snomed-ct'
+# edition <- 'MAIN'
+# version <- '2019-07-31'
+# 
+# microorganisms.snomed <- data.frame(conceptid = character(0),
+#                                     mo = character(0),
+#                                     stringsAsFactors = FALSE)
+# microorganisms$snomed <- ""
+# 
+# # for (i in 1:50) {
+# for (i in 1:1000) {
+#   
+#   if (i %% 10 == 0) {
+#     cat(paste0(i, " - ", cleaner::percentage(i / nrow(microorganisms)), "\n"))
+#   }
+#   
+#   mo_data <- microorganisms %>% 
+#     filter(mo == microorganisms$mo[i]) %>% 
+#     as.list()
+#   
+#   if (!mo_data$rank %in% c("genus", "species")) {
+#     next
+#   }
+#   
+#   searchTerm <- paste0(
+#     ifelse(mo_data$rank == "genus", "Genus ", ""),
+#     mo_data$fullname, 
+#     " (organism)")
+#   
+#   url <- paste0(baseUrl, '/browser/',
+#                 edition, '/', 
+#                 version, 
+#                 '/descriptions?term=', curl::curl_escape(searchTerm),
+#                 '&mode=fullText&activeFilter=true&limit=', 250)
+#   results <- url %>% 
+#     httr::GET() %>%
+#     httr::content(type = "text", encoding = "UTF-8") %>% 
+#     jsonlite::fromJSON(flatten = TRUE) %>% 
+#     .$items
+#   if (NROW(results) == 0) {
+#     next
+#   } else {
+#     message("Adding ", crayon::italic(mo_data$fullname))
+#   }
+#   
+#   tryCatch(
+#     microorganisms$snomed[i] <- results %>% filter(term == searchTerm) %>% pull(concept.conceptId),
+#     error = function(e) invisible()
+#   )
+#   
+#   if (nrow(results) > 1) {
+#       microorganisms.snomed <- microorganisms.snomed %>% 
+#         bind_rows(tibble(conceptid = results %>% filter(term != searchTerm) %>% pull(concept.conceptId) %>% unique(),
+#                          mo = as.character(mo_data$mo)))
+#   }
+# }