AMR/data-raw/reproduction_of_microorgani...

# ==================================================================== #
# TITLE                                                                #
# Antimicrobial Resistance (AMR) Data Analysis for R                   #
#                                                                      #
# SOURCE                                                               #
# https://github.com/msberends/AMR                                     #
#                                                                      #
# LICENCE                                                              #
# (c) 2018-2021 Berends MS, Luz CF et al.                              #
# Developed at the University of Groningen, the Netherlands, in        #
# collaboration with non-profit organisations Certe Medical            #
# Diagnostics & Advice, and University Medical Center Groningen.       # 
#                                                                      #
# This R package is free software; you can freely use and distribute   #
# it for both personal and commercial purposes under the terms of the  #
# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
# the Free Software Foundation.                                        #
# We created this package for both routine data analysis and academic  #
# research and it was publicly released in the hope that it will be    #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
#                                                                      #
# Visit our website for the full manual and a complete tutorial about  #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
# ==================================================================== #

# Register at List of Prokaryotic names with Standing in Nomenclature (LPSN)
# then got to https://lpsn.dsmz.de/downloads and download the latest CSV file.

library(tidyverse)
library(AMR)

# these should still work after this update
test_fullname <- microorganisms$fullname
test_mo <- microorganisms$mo


# Helper functions --------------------------------------------------------

get_author_year <- function(ref) {
  # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'
  
  authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT")
  authors2 <- gsub(" ?\\(Approved Lists [0-9]+\\) ?", " () ", authors2)
  authors2 <- gsub(" [)(]+ $", "", authors2)
  # remove leading and trailing brackets
  authors2 <- trimws(gsub("^[(](.*)[)]$", "\\1", authors2))
  # only take part after brackets if there's a name
  authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
                     gsub(".*[)] (.*)", "\\1", authors2),
                     authors2)
  # get year from last 4 digits
  lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2))
  # can never be later than now
  lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),
                    NA,
                    lastyear)
  # get authors without last year
  authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2)
  # remove nonsense characters from names
  authors <- gsub("[^a-zA-Z,'& -]", "", authors)
  # remove trailing and leading spaces
  authors <- trimws(authors)
  # only keep first author and replace all others by 'et al'
  authors <- gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors)
  # et al. always with ending dot
  authors <- gsub(" et al\\.?", " et al.", authors)
  authors <- gsub(" ?,$", "", authors)
  # don't start with 'sensu' or 'ehrenb'
  authors <- gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE)
  # no initials, only surname
  authors <- gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE)
  # combine author and year if year is available
  ref <- ifelse(!is.na(lastyear),
                paste0(authors, ", ", lastyear),
                authors)
  # fix beginning and ending
  ref <- gsub(", $", "", ref)
  ref <- gsub("^, ", "", ref)
  ref <- gsub("^(emend|et al.,?)", "", ref)
  ref <- trimws(ref)
  ref <- gsub("'", "", ref)
  
  # a lot start with a lowercase character - fix that
  ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE)
  # specific one for the French that are named dOrbigny 
  ref[grepl("^d[A-Z]", ref)] <- gsub("^d", "d'", ref[grepl("^d[A-Z]", ref)])
  ref <- gsub(" +", " ", ref)
  ref
}

df_remove_nonASCII <- function(df) {
  # Remove non-ASCII characters (these are not allowed by CRAN)
  df %>%
    mutate_if(is.character, iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% 
    # also remove invalid characters
    mutate_if(is.character, ~gsub("[\"'`]+", "", .)) %>%
    AMR:::dataset_UTF8_to_ASCII()
}

abbreviate_mo <- function(x, minlength = 5, prefix = "", ...) {
  # keep a starting Latin ae
  suppressWarnings(
    gsub("^ae", "\u00E6\u00E6", x, ignore.case = TRUE) %>% 
      abbreviate(minlength = minlength,
                 use.classes = TRUE,
                 method = "both.sides", ...) %>% 
      paste0(prefix, .) %>% 
      toupper() %>% 
      gsub("(\u00C6|\u00E6)+", "AE", .)
  )
}

# Read data ---------------------------------------------------------------

taxonomy <- read_csv("~/Downloads/taxonomy.csv")

# Create synonyms ---------------------------------------------------------

new_synonyms <- taxonomy %>%
  left_join(taxonomy,
            by = c("record_lnk" = "record_no"), 
            suffix = c("", ".new")) %>% 
  filter(!is.na(record_lnk)) %>% 
  mutate_all(~ifelse(is.na(.), "", .)) %>% 
  transmute(fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)),
            fullname_new = trimws(paste(genus_name.new, sp_epithet.new, subsp_epithet.new)),
            ref = get_author_year(authors),
            prevalence = 0) %>% 
  distinct(fullname, .keep_all = TRUE) %>% 
  filter(fullname != fullname_new) %>% 
  # this part joins this table to itself to correct for entries that had >1 renames, 
  # such as:
  # Bacteroides tectum -> Bacteroides tectus
  # Bacteroides tectus -> Bacteroides pyogenes
  left_join(., ., 
            by = c("fullname_new" = "fullname"), 
            suffix = c("", ".2")) %>%
  mutate(fullname_new = ifelse(!is.na(fullname_new.2), fullname_new.2, fullname_new),
         ref = ifelse(!is.na(ref.2), ref.2, ref)) %>%
  select(-ends_with(".2")) 

mo_became_synonym <- microorganisms %>% 
  filter(fullname %in% new_synonyms$fullname)

updated_microorganisms <- taxonomy %>%
  filter(is.na(record_lnk)) %>% 
  mutate_all(~ifelse(is.na(.), "", .)) %>% 
  transmute(mo = "",
            fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)),
            kingdom = "Bacteria",
            phylum = "",
            class = "",
            order = "",
            family = "",
            genus = trimws(genus_name),
            species = trimws(replace_na(sp_epithet, "")),
            subspecies = trimws(replace_na(subsp_epithet, "")),
            rank = case_when(subspecies == "" & species == "" ~ "genus",
                             subspecies == "" ~ "species",
                             TRUE ~ "subsp."),
            ref = get_author_year(authors),
            species_id = as.character(record_no),
            source = "LPSN",
            prevalence = 0,
            snomed = NA)

new_microorganisms <- updated_microorganisms %>% 
  filter(!fullname %in% microorganisms$fullname)

genera_with_mo_code <- updated_microorganisms %>%
  filter(genus %in% (microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% pull(genus))) %>%
  distinct(genus) %>% 
  left_join(microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% select(mo, genus),
            by = "genus")
  
genera_without_mo_code <- updated_microorganisms %>% 
  filter(!genus %in% genera_with_mo_code$genus) %>% 
  pull(genus) %>% 
  unique()

genera_without_mo_code_abbr <- genera_without_mo_code %>% 
  abbreviate_mo(5, prefix = "B_")
genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 6, prefix = "B_")
genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 7, prefix = "B_")
# all unique??
sum(genera_without_mo_code_abbr %in% microorganisms$mo) == 0

genus_abb <- tibble(genus = genera_without_mo_code,
                    abbr = genera_without_mo_code_abbr) %>% 
  bind_rows(microorganisms %>%
              filter(kingdom == "Bacteria", rank == "genus", !genus %in% genera_without_mo_code) %>%
              transmute(genus, abbr = as.character(mo))) %>% 
  arrange(genus)


# Update taxonomy ---------------------------------------------------------

# fill in the taxonomy of new genera
updated_taxonomy <- tibble(phylum = character(0),
                           class = character(0),
                           order = character(0),
                           family = character(0),
                           genus = character(0))
for (page in LETTERS) {
  message("Downloading page ", page, "... ", appendLF = FALSE)
  url <- paste0("https://lpsn.dsmz.de/genus?page=", page)
  
  x <- xml2::read_html(url) %>% 
    rvest::html_node(".main-list") %>% 
    # evety list element with a set <id> attribute
    rvest::html_nodes("li[id]")
  for (i in seq_len(length(x))) {
    txt <- x %>% 
      magrittr::extract2(i) %>% 
      rvest::html_text() %>%
      gsub("\\[[A-Za-z]+, no [a-z]+\\]", "NA", .) %>% 
      gsub("Candidatus ", "", ., fixed = TRUE) %>% 
      gsub("[ \t\r\n\"]+", "|", .) %>%
      gsub("\\|ShowHide.*", "", .) %>% 
      gsub("[\\[\\]]", "", ., fixed = TRUE) %>% 
      gsub("^\\|", "", .) %>% 
      strsplit("|", fixed = TRUE) %>% 
      unlist()
    txt[txt == "NA"] <- ""
    txt <- gsub("[^A-Za-z]+", "", txt)
    updated_taxonomy <- updated_taxonomy %>% 
      bind_rows(tibble(phylum = txt[2],
                       class = txt[3],
                       order = txt[4],
                       family = txt[5],
                       genus = txt[6]))
  }
  message(length(x), " entries (total ", nrow(updated_taxonomy), ")")
}

# Create new microorganisms -----------------------------------------------

new_microorganisms <- new_microorganisms %>% 
  left_join(genus_abb, by = "genus") %>% 
  group_by(genus) %>% 
  mutate(species_abb = abbreviate_mo(species, 4)) %>% 
  group_by(genus, species) %>% 
  mutate(subspecies_abb = abbreviate_mo(subspecies, 4)) %>% 
  ungroup() %>% 
  mutate(mo = paste(abbr, species_abb, subspecies_abb, sep = "_"),
         mo = gsub("_+$", "", mo)) %>% 
  select(-matches("abb"))

# add taxonomy new microorganisms
MOs <- microorganisms %>%
  mutate(mo = as.character(mo)) %>%
  bind_rows(new_microorganisms) %>%
  arrange(fullname)

# unique MO codes
MOs$mo[which(duplicated(MOs$mo))] <- paste0(MOs$mo[which(duplicated(MOs$mo))], 1)
# all unique?
!any(duplicated(MOs$mo))

MOs <- MOs %>% 
  # remove entries that are now a synonym
  filter(!fullname %in% new_synonyms$fullname) %>% 
  # update the taxonomy
  left_join(updated_taxonomy, by = "genus", suffix = c("", ".new")) %>%
  mutate(phylum = ifelse(!is.na(phylum.new), phylum.new, phylum),
         class = ifelse(!is.na(class.new), class.new, class),
         order = ifelse(!is.na(order.new), order.new, order),
         family = ifelse(!is.na(family.new), family.new, family)) %>% 
  select(-ends_with(".new")) %>%
  # update prevalence based on taxonomy (Berends et al., 2021)
  mutate(prevalence = case_when(
    class == "Gammaproteobacteria"
    | genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
    ~ 1,
    kingdom %in% c("Archaea", "Bacteria", "Chromista", "Fungi")
    & (phylum %in% c("Proteobacteria",
                     "Firmicutes",
                     "Actinobacteria",
                     "Sarcomastigophora")
       | genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Apophysomyces",
                      "Arachnia", "Aspergillus", "Aureobacterium", "Aureobasidium", "Bacteroides", "Basidiobolus",
                      "Beauveria", "Blastocystis", "Branhamella", "Calymmatobacterium", "Candida", "Capnocytophaga",
                      "Catabacter", "Chaetomium", "Chryseobacterium", "Chryseomonas", "Chrysonilia", "Cladophialophora",
                      "Cladosporium", "Conidiobolus", "Cryptococcus", "Curvularia", "Exophiala", "Exserohilum",
                      "Flavobacterium", "Fonsecaea", "Fusarium", "Fusobacterium", "Hendersonula", "Hypomyces",
                      "Koserella", "Lelliottia", "Leptosphaeria", "Leptotrichia", "Malassezia", "Malbranchea",
                      "Mortierella", "Mucor", "Mycocentrospora", "Mycoplasma", "Nectria", "Ochroconis",
                      "Oidiodendron", "Phoma", "Piedraia", "Pithomyces", "Pityrosporum", "Prevotella", "Pseudallescheria", 
                      "Rhizomucor", "Rhizopus", "Rhodotorula", "Scolecobasidium", "Scopulariopsis", "Scytalidium", 
                      "Sporobolomyces", "Stachybotrys", "Stomatococcus", "Treponema", "Trichoderma", "Trichophyton", 
                      "Trichosporon", "Tritirachium", "Ureaplasma")
       | rank %in% c("kingdom", "phylum", "class", "order", "family"))
    ~ 2,
    TRUE ~ 3
  ))

# add all mssing genera, families and orders
MOs <- MOs %>% 
  bind_rows(
    MOs %>% 
      arrange(genus, species) %>%
      distinct(genus, .keep_all = TRUE) %>%
      filter(rank == "species", source != "manually added") %>%
      mutate(mo = gsub("^([A-Z]_[A-Z]+)_.*", "\\1", mo),
             fullname = genus, 
             species = "", 
             subspecies = "", 
             rank = "genus", 
             species_id = "",
             snomed = NA,
             ref = NA_character_),
    MOs %>% 
      group_by(family) %>%
      filter(!any(rank == "family") & n() > 1) %>%
      ungroup() %>% 
      arrange(family) %>%
      distinct(family, .keep_all = TRUE) %>% 
      filter(!family %in% c("", NA), source != "manually added") %>%
      mutate(mo = paste0(substr(kingdom, 1, 1), "_[FAM]_",
                         abbreviate(family,
                                    minlength = 8,
                                    use.classes = TRUE,
                                    method = "both.sides",
                                    strict = FALSE)),
             mo = toupper(mo),
             fullname = family,
             genus = "",
             species = "", 
             subspecies = "", 
             rank = "family", 
             species_id = "",
             snomed = NA,
             ref = NA_character_),
    MOs %>% 
      group_by(order) %>%
      filter(!any(rank == "order") & n() > 1) %>%
      ungroup() %>% 
      arrange(order) %>%
      distinct(order, .keep_all = TRUE) %>% 
      filter(!order %in% c("", NA), source != "manually added") %>%
      mutate(mo = paste0(substr(kingdom, 1, 1), "_[ORD]_",
                         abbreviate(order,
                                    minlength = 8,
                                    use.classes = TRUE,
                                    method = "both.sides",
                                    strict = FALSE)),
             mo = toupper(mo),
             fullname = order,
             family = "",
             genus = "",
             species = "", 
             subspecies = "", 
             rank = "order", 
             species_id = "",
             snomed = NA,
             ref = NA_character_)
  ) %>%
  arrange(fullname)

# clean up
MOs <- MOs %>% 
  df_remove_nonASCII()


# Merge synonyms ----------------------------------------------------------

# remove synonyms that are now valid names
MOs.old <- microorganisms.old %>% 
  # add new synonyms
  bind_rows(new_synonyms) %>% 
  filter(!fullname %in% MOs$fullname) %>% 
  arrange(fullname) %>% 
  distinct(fullname, fullname_new, .keep_all = TRUE) %>% 
  # add prevalence to old taxonomic names
  select(-prevalence) %>% 
  left_join(MOs %>% select(fullname, prevalence), by = c("fullname_new" = "fullname")) %>% 
  # clean up
  df_remove_nonASCII()

# Keep old codes for translation ------------------------------------------

# add removed microbial IDs to the internal translation table so old package versions keep working
MOs.translation <- microorganisms %>% 
  filter(!mo %in% MOs$mo) %>%
  select(mo, fullname) %>%
  left_join(new_synonyms) %>% 
  left_join(MOs %>% transmute(fullname_new = fullname, mo2 = as.character(mo))) %>% 
  select(mo_old = mo, mo_new = mo2) %>% 
  distinct()
MOs.translation <- AMR:::microorganisms.translation %>% 
  left_join(MOs.translation %>% select(mo_new_update = mo_new, mo_new = mo_old)) %>% 
  mutate(mo_new = as.character(ifelse(!is.na(mo_new_update), mo_new_update, mo_new))) %>% 
  select(-mo_new_update) %>% 
  bind_rows(
    # old IDs used in microorganisms.codes must put in here as well
    microorganisms.codes %>%
      filter(!mo %in% MOs$mo) %>%
      transmute(mo_old = mo, fullname = mo_name(mo)) %>%
      left_join(MOs.old %>%
                  select(fullname, fullname_new)) %>%
      left_join(MOs %>%
                  select(mo_new = mo, fullname_new = fullname)) %>%
      transmute(mo_old = as.character(mo_old), mo_new)) %>% 
  arrange(mo_old) %>% 
  filter(mo_old != mo_new,
         !mo_old %in% MOs$mo) %>%
  left_join(., ., 
            by = c("mo_new" = "mo_old"), 
            suffix = c("", ".2")) %>%
  mutate(mo_new = ifelse(!is.na(mo_new.2), mo_new.2, mo_new)) %>% 
  distinct(mo_old, mo_new) %>% 
  # clean up
  df_remove_nonASCII()

message("microorganisms new:     ", sum(!MOs$fullname %in% c(microorganisms$fullname, MOs.old$fullname)))
message("microorganisms renamed: ", sum(!MOs.old$fullname %in% microorganisms.old$fullname))


# Save --------------------------------------------------------------------

# class <mo>
class(MOs$mo) <- c("mo", "character")
class(MOs.translation$mo_new) <- c("mo", "character")

microorganisms <- MOs
microorganisms.old <- MOs.old
microorganisms.translation <- MOs.translation

# on the server, do:
usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2)
rm(microorganisms)
rm(microorganisms.old)
rm(microorganisms.translation)
# to save microorganisms.translation internally to the package
devtools::load_all(".")
source("data-raw/_internals.R")

# load new data sets
devtools::load_all(".")

# reset previously changed mo codes
rsi_translation$mo <- as.mo(rsi_translation$mo, language = NULL)
usethis::use_data(rsi_translation, overwrite = TRUE, version = 2)
rm(rsi_translation)

microorganisms.codes$mo <- as.mo(microorganisms.codes$mo, language = NULL)
usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2)
rm(microorganisms.codes)

example_isolates$mo <- as.mo(example_isolates$mo, language = NULL)
usethis::use_data(example_isolates, overwrite = TRUE, version = 2)
rm(example_isolates)

intrinsic_resistant$microorganism <- suppressMessages(mo_name(intrinsic_resistant$microorganism))
usethis::use_data(intrinsic_resistant, overwrite = TRUE, version = 2)
rm(intrinsic_resistant)

# load new data sets again
devtools::load_all(".")
source("data-raw/_internals.R")
devtools::load_all(".")


# Test updates ------------------------------------------------------------

# and check: these codes should not be missing (will otherwise throw a unit test error):
AMR::microorganisms.codes %>% filter(!mo %in% MOs$mo)
AMR::rsi_translation %>% filter(!mo %in% MOs$mo)
AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo)
AMR::example_isolates %>% filter(!mo %in% MOs$mo)

# Don't forget to add SNOMED codes! (data-raw/snomed.R)

# run the unit tests
Sys.setenv(NOT_CRAN = "true")
testthat::test_file("tests/testthat/test-data.R")
testthat::test_file("tests/testthat/test-mo.R")
testthat::test_file("tests/testthat/test-mo_property.R")
(v1.5.0.9028) Updated taxonomy until March 2021 2021-03-04 23:28:32 +01:00			`# ==================================================================== #`
			`# TITLE #`
			`# Antimicrobial Resistance (AMR) Data Analysis for R #`
			`# #`
			`# SOURCE #`
			`# https://github.com/msberends/AMR #`
			`# #`
			`# LICENCE #`
			`# (c) 2018-2021 Berends MS, Luz CF et al. #`
			`# Developed at the University of Groningen, the Netherlands, in #`
			`# collaboration with non-profit organisations Certe Medical #`
			`# Diagnostics & Advice, and University Medical Center Groningen. #`
			`# #`
			`# This R package is free software; you can freely use and distribute #`
			`# it for both personal and commercial purposes under the terms of the #`
			`# GNU General Public License version 2.0 (GNU GPL-2), as published by #`
			`# the Free Software Foundation. #`
			`# We created this package for both routine data analysis and academic #`
			`# research and it was publicly released in the hope that it will be #`
			`# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #`
			`# #`
			`# Visit our website for the full manual and a complete tutorial about #`
			`# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #`
			`# ==================================================================== #`

			`# Register at List of Prokaryotic names with Standing in Nomenclature (LPSN)`
			`# then got to https://lpsn.dsmz.de/downloads and download the latest CSV file.`

			`library(tidyverse)`
			`library(AMR)`

			`# these should still work after this update`
			`test_fullname <- microorganisms$fullname`
			`test_mo <- microorganisms$mo`


			`# Helper functions --------------------------------------------------------`

			`get_author_year <- function(ref) {`
			`# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011'`

			`authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT")`
			`authors2 <- gsub(" ?\\(Approved Lists [0-9]+\\) ?", " () ", authors2)`
			`authors2 <- gsub(" [)(]+ $", "", authors2)`
			`# remove leading and trailing brackets`
			`authors2 <- trimws(gsub("^[(](.*)[)]$", "\\1", authors2))`
			`# only take part after brackets if there's a name`
			`authors2 <- ifelse(grepl(".[)] [a-zA-Z]+.", authors2),`
			`gsub(".[)] (.)", "\\1", authors2),`
			`authors2)`
			`# get year from last 4 digits`
			`lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2))`
			`# can never be later than now`
			`lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")),`
			`NA,`
			`lastyear)`
			`# get authors without last year`
			`authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2)`
			`# remove nonsense characters from names`
			`authors <- gsub("[^a-zA-Z,'& -]", "", authors)`
			`# remove trailing and leading spaces`
			`authors <- trimws(authors)`
			`# only keep first author and replace all others by 'et al'`
			`authors <- gsub("(,\| and\| et\| &\| ex\| emend\\.?) .*", " et al.", authors)`
			`# et al. always with ending dot`
			`authors <- gsub(" et al\\.?", " et al.", authors)`
			`authors <- gsub(" ?,$", "", authors)`
			`# don't start with 'sensu' or 'ehrenb'`
			`authors <- gsub("^(sensu\|Ehrenb.?) ", "", authors, ignore.case = TRUE)`
			`# no initials, only surname`
			`authors <- gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE)`
			`# combine author and year if year is available`
			`ref <- ifelse(!is.na(lastyear),`
			`paste0(authors, ", ", lastyear),`
			`authors)`
			`# fix beginning and ending`
			`ref <- gsub(", $", "", ref)`
			`ref <- gsub("^, ", "", ref)`
			`ref <- gsub("^(emend\|et al.,?)", "", ref)`
			`ref <- trimws(ref)`
			`ref <- gsub("'", "", ref)`

			`# a lot start with a lowercase character - fix that`
			`ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE)`
			`# specific one for the French that are named dOrbigny`
			`ref[grepl("^d[A-Z]", ref)] <- gsub("^d", "d'", ref[grepl("^d[A-Z]", ref)])`
			`ref <- gsub(" +", " ", ref)`
			`ref`
			`}`

			`df_remove_nonASCII <- function(df) {`
			`# Remove non-ASCII characters (these are not allowed by CRAN)`
			`df %>%`
			`mutate_if(is.character, iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%`
			`# also remove invalid characters`
			mutate_if(is.character, ~gsub("[\"'`]+", "", .)) %>%
			`AMR:::dataset_UTF8_to_ASCII()`
			`}`

			`abbreviate_mo <- function(x, minlength = 5, prefix = "", ...) {`
			`# keep a starting Latin ae`
			`suppressWarnings(`
			`gsub("^ae", "\u00E6\u00E6", x, ignore.case = TRUE) %>%`
			`abbreviate(minlength = minlength,`
			`use.classes = TRUE,`
			`method = "both.sides", ...) %>%`
			`paste0(prefix, .) %>%`
			`toupper() %>%`
			`gsub("(\u00C6\|\u00E6)+", "AE", .)`
			`)`
			`}`

			`# Read data ---------------------------------------------------------------`

			`taxonomy <- read_csv("~/Downloads/taxonomy.csv")`

			`# Create synonyms ---------------------------------------------------------`

			`new_synonyms <- taxonomy %>%`
			`left_join(taxonomy,`
			`by = c("record_lnk" = "record_no"),`
			`suffix = c("", ".new")) %>%`
			`filter(!is.na(record_lnk)) %>%`
			`mutate_all(~ifelse(is.na(.), "", .)) %>%`
			`transmute(fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)),`
			`fullname_new = trimws(paste(genus_name.new, sp_epithet.new, subsp_epithet.new)),`
			`ref = get_author_year(authors),`
			`prevalence = 0) %>%`
			`distinct(fullname, .keep_all = TRUE) %>%`
			`filter(fullname != fullname_new) %>%`
			`# this part joins this table to itself to correct for entries that had >1 renames,`
			`# such as:`
			`# Bacteroides tectum -> Bacteroides tectus`
			`# Bacteroides tectus -> Bacteroides pyogenes`
			`left_join(., .,`
			`by = c("fullname_new" = "fullname"),`
			`suffix = c("", ".2")) %>%`
			`mutate(fullname_new = ifelse(!is.na(fullname_new.2), fullname_new.2, fullname_new),`
			`ref = ifelse(!is.na(ref.2), ref.2, ref)) %>%`
			`select(-ends_with(".2"))`

			`mo_became_synonym <- microorganisms %>%`
			`filter(fullname %in% new_synonyms$fullname)`

			`updated_microorganisms <- taxonomy %>%`
			`filter(is.na(record_lnk)) %>%`
			`mutate_all(~ifelse(is.na(.), "", .)) %>%`
			`transmute(mo = "",`
			`fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)),`
			`kingdom = "Bacteria",`
			`phylum = "",`
			`class = "",`
			`order = "",`
			`family = "",`
			`genus = trimws(genus_name),`
			`species = trimws(replace_na(sp_epithet, "")),`
			`subspecies = trimws(replace_na(subsp_epithet, "")),`
			`rank = case_when(subspecies == "" & species == "" ~ "genus",`
			`subspecies == "" ~ "species",`
			`TRUE ~ "subsp."),`
			`ref = get_author_year(authors),`
			`species_id = as.character(record_no),`
(v1.6.0.9000) custom EUCAST rules 2021-04-07 08:37:42 +02:00			`source = "LPSN",`
(v1.5.0.9028) Updated taxonomy until March 2021 2021-03-04 23:28:32 +01:00			`prevalence = 0,`
			`snomed = NA)`

			`new_microorganisms <- updated_microorganisms %>%`
			`filter(!fullname %in% microorganisms$fullname)`

			`genera_with_mo_code <- updated_microorganisms %>%`
			`filter(genus %in% (microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% pull(genus))) %>%`
			`distinct(genus) %>%`
			`left_join(microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% select(mo, genus),`
			`by = "genus")`

			`genera_without_mo_code <- updated_microorganisms %>%`
			`filter(!genus %in% genera_with_mo_code$genus) %>%`
			`pull(genus) %>%`
			`unique()`

			`genera_without_mo_code_abbr <- genera_without_mo_code %>%`
			`abbreviate_mo(5, prefix = "B_")`
			`genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 6, prefix = "B_")`
			`genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 7, prefix = "B_")`
			`# all unique??`
			`sum(genera_without_mo_code_abbr %in% microorganisms$mo) == 0`

			`genus_abb <- tibble(genus = genera_without_mo_code,`
			`abbr = genera_without_mo_code_abbr) %>%`
			`bind_rows(microorganisms %>%`
			`filter(kingdom == "Bacteria", rank == "genus", !genus %in% genera_without_mo_code) %>%`
			`transmute(genus, abbr = as.character(mo))) %>%`
			`arrange(genus)`


			`# Update taxonomy ---------------------------------------------------------`

			`# fill in the taxonomy of new genera`
			`updated_taxonomy <- tibble(phylum = character(0),`
			`class = character(0),`
			`order = character(0),`
			`family = character(0),`
			`genus = character(0))`
			`for (page in LETTERS) {`
			`message("Downloading page ", page, "... ", appendLF = FALSE)`
			`url <- paste0("https://lpsn.dsmz.de/genus?page=", page)`

			`x <- xml2::read_html(url) %>%`
			`rvest::html_node(".main-list") %>%`
			`# evety list element with a set <id> attribute`
			`rvest::html_nodes("li[id]")`
			`for (i in seq_len(length(x))) {`
			`txt <- x %>%`
			`magrittr::extract2(i) %>%`
			`rvest::html_text() %>%`
			`gsub("\\[[A-Za-z]+, no [a-z]+\\]", "NA", .) %>%`
			`gsub("Candidatus ", "", ., fixed = TRUE) %>%`
			`gsub("[ \t\r\n\"]+", "\|", .) %>%`
			`gsub("\\\|ShowHide.*", "", .) %>%`
			`gsub("[\\[\\]]", "", ., fixed = TRUE) %>%`
			`gsub("^\\\|", "", .) %>%`
			`strsplit("\|", fixed = TRUE) %>%`
			`unlist()`
			`txt[txt == "NA"] <- ""`
			`txt <- gsub("[^A-Za-z]+", "", txt)`
			`updated_taxonomy <- updated_taxonomy %>%`
			`bind_rows(tibble(phylum = txt[2],`
			`class = txt[3],`
			`order = txt[4],`
			`family = txt[5],`
			`genus = txt[6]))`
			`}`
			`message(length(x), " entries (total ", nrow(updated_taxonomy), ")")`
			`}`

			`# Create new microorganisms -----------------------------------------------`

			`new_microorganisms <- new_microorganisms %>%`
			`left_join(genus_abb, by = "genus") %>%`
			`group_by(genus) %>%`
			`mutate(species_abb = abbreviate_mo(species, 4)) %>%`
			`group_by(genus, species) %>%`
			`mutate(subspecies_abb = abbreviate_mo(subspecies, 4)) %>%`
			`ungroup() %>%`
			`mutate(mo = paste(abbr, species_abb, subspecies_abb, sep = "_"),`
			`mo = gsub("_+$", "", mo)) %>%`
			`select(-matches("abb"))`

			`# add taxonomy new microorganisms`
			`MOs <- microorganisms %>%`
			`mutate(mo = as.character(mo)) %>%`
			`bind_rows(new_microorganisms) %>%`
			`arrange(fullname)`

			`# unique MO codes`
			`MOs$mo[which(duplicated(MOs$mo))] <- paste0(MOs$mo[which(duplicated(MOs$mo))], 1)`
			`# all unique?`
			`!any(duplicated(MOs$mo))`

			`MOs <- MOs %>%`
			`# remove entries that are now a synonym`
			`filter(!fullname %in% new_synonyms$fullname) %>%`
			`# update the taxonomy`
			`left_join(updated_taxonomy, by = "genus", suffix = c("", ".new")) %>%`
			`mutate(phylum = ifelse(!is.na(phylum.new), phylum.new, phylum),`
			`class = ifelse(!is.na(class.new), class.new, class),`
			`order = ifelse(!is.na(order.new), order.new, order),`
			`family = ifelse(!is.na(family.new), family.new, family)) %>%`
			`select(-ends_with(".new")) %>%`
			`# update prevalence based on taxonomy (Berends et al., 2021)`
			`mutate(prevalence = case_when(`
			`class == "Gammaproteobacteria"`
			`\| genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")`
			`~ 1,`
			`kingdom %in% c("Archaea", "Bacteria", "Chromista", "Fungi")`
			`& (phylum %in% c("Proteobacteria",`
			`"Firmicutes",`
			`"Actinobacteria",`
			`"Sarcomastigophora")`
			`\| genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Apophysomyces",`
			`"Arachnia", "Aspergillus", "Aureobacterium", "Aureobasidium", "Bacteroides", "Basidiobolus",`
			`"Beauveria", "Blastocystis", "Branhamella", "Calymmatobacterium", "Candida", "Capnocytophaga",`
			`"Catabacter", "Chaetomium", "Chryseobacterium", "Chryseomonas", "Chrysonilia", "Cladophialophora",`
			`"Cladosporium", "Conidiobolus", "Cryptococcus", "Curvularia", "Exophiala", "Exserohilum",`
			`"Flavobacterium", "Fonsecaea", "Fusarium", "Fusobacterium", "Hendersonula", "Hypomyces",`
			`"Koserella", "Lelliottia", "Leptosphaeria", "Leptotrichia", "Malassezia", "Malbranchea",`
			`"Mortierella", "Mucor", "Mycocentrospora", "Mycoplasma", "Nectria", "Ochroconis",`
			`"Oidiodendron", "Phoma", "Piedraia", "Pithomyces", "Pityrosporum", "Prevotella", "Pseudallescheria",`
			`"Rhizomucor", "Rhizopus", "Rhodotorula", "Scolecobasidium", "Scopulariopsis", "Scytalidium",`
			`"Sporobolomyces", "Stachybotrys", "Stomatococcus", "Treponema", "Trichoderma", "Trichophyton",`
			`"Trichosporon", "Tritirachium", "Ureaplasma")`
			`\| rank %in% c("kingdom", "phylum", "class", "order", "family"))`
			`~ 2,`
			`TRUE ~ 3`
(v1.5.0.9030) as.ab() speed improvement 2021-03-05 10:32:09 +01:00			`))`
(v1.5.0.9028) Updated taxonomy until March 2021 2021-03-04 23:28:32 +01:00
(v1.5.0.9029) added missing families and orders 2021-03-05 01:31:46 +01:00			`# add all mssing genera, families and orders`
			`MOs <- MOs %>%`
			`bind_rows(`
			`MOs %>%`
			`arrange(genus, species) %>%`
			`distinct(genus, .keep_all = TRUE) %>%`
			`filter(rank == "species", source != "manually added") %>%`
			`mutate(mo = gsub("^([A-Z]_[A-Z]+)_.*", "\\1", mo),`
			`fullname = genus,`
			`species = "",`
			`subspecies = "",`
			`rank = "genus",`
			`species_id = "",`
			`snomed = NA,`
			`ref = NA_character_),`
			`MOs %>%`
			`group_by(family) %>%`
			`filter(!any(rank == "family") & n() > 1) %>%`
			`ungroup() %>%`
			`arrange(family) %>%`
			`distinct(family, .keep_all = TRUE) %>%`
			`filter(!family %in% c("", NA), source != "manually added") %>%`
			`mutate(mo = paste0(substr(kingdom, 1, 1), "_[FAM]_",`
			`abbreviate(family,`
			`minlength = 8,`
			`use.classes = TRUE,`
			`method = "both.sides",`
			`strict = FALSE)),`
			`mo = toupper(mo),`
			`fullname = family,`
			`genus = "",`
			`species = "",`
			`subspecies = "",`
			`rank = "family",`
			`species_id = "",`
			`snomed = NA,`
			`ref = NA_character_),`
			`MOs %>%`
			`group_by(order) %>%`
			`filter(!any(rank == "order") & n() > 1) %>%`
			`ungroup() %>%`
			`arrange(order) %>%`
			`distinct(order, .keep_all = TRUE) %>%`
			`filter(!order %in% c("", NA), source != "manually added") %>%`
			`mutate(mo = paste0(substr(kingdom, 1, 1), "_[ORD]_",`
			`abbreviate(order,`
			`minlength = 8,`
			`use.classes = TRUE,`
			`method = "both.sides",`
			`strict = FALSE)),`
			`mo = toupper(mo),`
			`fullname = order,`
			`family = "",`
			`genus = "",`
			`species = "",`
			`subspecies = "",`
			`rank = "order",`
			`species_id = "",`
			`snomed = NA,`
			`ref = NA_character_)`
			`) %>%`
			`arrange(fullname)`

(v1.5.0.9030) as.ab() speed improvement 2021-03-05 10:32:09 +01:00			`# clean up`
			`MOs <- MOs %>%`
			`df_remove_nonASCII()`
(v1.5.0.9029) added missing families and orders 2021-03-05 01:31:46 +01:00

(v1.5.0.9028) Updated taxonomy until March 2021 2021-03-04 23:28:32 +01:00			`# Merge synonyms ----------------------------------------------------------`

			`# remove synonyms that are now valid names`
			`MOs.old <- microorganisms.old %>%`
			`# add new synonyms`
			`bind_rows(new_synonyms) %>%`
			`filter(!fullname %in% MOs$fullname) %>%`
			`arrange(fullname) %>%`
			`distinct(fullname, fullname_new, .keep_all = TRUE) %>%`
			`# add prevalence to old taxonomic names`
			`select(-prevalence) %>%`
			`left_join(MOs %>% select(fullname, prevalence), by = c("fullname_new" = "fullname")) %>%`
			`# clean up`
			`df_remove_nonASCII()`

			`# Keep old codes for translation ------------------------------------------`

			`# add removed microbial IDs to the internal translation table so old package versions keep working`
			`MOs.translation <- microorganisms %>%`
			`filter(!mo %in% MOs$mo) %>%`
			`select(mo, fullname) %>%`
			`left_join(new_synonyms) %>%`
			`left_join(MOs %>% transmute(fullname_new = fullname, mo2 = as.character(mo))) %>%`
			`select(mo_old = mo, mo_new = mo2) %>%`
			`distinct()`
			`MOs.translation <- AMR:::microorganisms.translation %>%`
			`left_join(MOs.translation %>% select(mo_new_update = mo_new, mo_new = mo_old)) %>%`
			`mutate(mo_new = as.character(ifelse(!is.na(mo_new_update), mo_new_update, mo_new))) %>%`
			`select(-mo_new_update) %>%`
			`bind_rows(`
			`# old IDs used in microorganisms.codes must put in here as well`
			`microorganisms.codes %>%`
			`filter(!mo %in% MOs$mo) %>%`
			`transmute(mo_old = mo, fullname = mo_name(mo)) %>%`
			`left_join(MOs.old %>%`
			`select(fullname, fullname_new)) %>%`
			`left_join(MOs %>%`
			`select(mo_new = mo, fullname_new = fullname)) %>%`
			`transmute(mo_old = as.character(mo_old), mo_new)) %>%`
			`arrange(mo_old) %>%`
			`filter(mo_old != mo_new,`
			`!mo_old %in% MOs$mo) %>%`
			`left_join(., .,`
			`by = c("mo_new" = "mo_old"),`
			`suffix = c("", ".2")) %>%`
			`mutate(mo_new = ifelse(!is.na(mo_new.2), mo_new.2, mo_new)) %>%`
			`distinct(mo_old, mo_new) %>%`
			`# clean up`
			`df_remove_nonASCII()`

			`message("microorganisms new: ", sum(!MOs$fullname %in% c(microorganisms$fullname, MOs.old$fullname)))`
			`message("microorganisms renamed: ", sum(!MOs.old$fullname %in% microorganisms.old$fullname))`


			`# Save --------------------------------------------------------------------`

			`# class <mo>`
			`class(MOs$mo) <- c("mo", "character")`
			`class(MOs.translation$mo_new) <- c("mo", "character")`

			`microorganisms <- MOs`
			`microorganisms.old <- MOs.old`
			`microorganisms.translation <- MOs.translation`

			`# on the server, do:`
			`usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")`
			`usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)`
			`saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2)`
			`rm(microorganisms)`
			`rm(microorganisms.old)`
			`rm(microorganisms.translation)`
			`# to save microorganisms.translation internally to the package`
			`devtools::load_all(".")`
			`source("data-raw/_internals.R")`

			`# load new data sets`
			`devtools::load_all(".")`

			`# reset previously changed mo codes`
			`rsi_translation$mo <- as.mo(rsi_translation$mo, language = NULL)`
			`usethis::use_data(rsi_translation, overwrite = TRUE, version = 2)`
			`rm(rsi_translation)`

			`microorganisms.codes$mo <- as.mo(microorganisms.codes$mo, language = NULL)`
			`usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2)`
			`rm(microorganisms.codes)`

			`example_isolates$mo <- as.mo(example_isolates$mo, language = NULL)`
			`usethis::use_data(example_isolates, overwrite = TRUE, version = 2)`
			`rm(example_isolates)`

			`intrinsic_resistant$microorganism <- suppressMessages(mo_name(intrinsic_resistant$microorganism))`
			`usethis::use_data(intrinsic_resistant, overwrite = TRUE, version = 2)`
			`rm(intrinsic_resistant)`

			`# load new data sets again`
			`devtools::load_all(".")`
			`source("data-raw/_internals.R")`
			`devtools::load_all(".")`


			`# Test updates ------------------------------------------------------------`

			`# and check: these codes should not be missing (will otherwise throw a unit test error):`
			`AMR::microorganisms.codes %>% filter(!mo %in% MOs$mo)`
			`AMR::rsi_translation %>% filter(!mo %in% MOs$mo)`
			`AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo)`
			`AMR::example_isolates %>% filter(!mo %in% MOs$mo)`

			`# Don't forget to add SNOMED codes! (data-raw/snomed.R)`

			`# run the unit tests`
			`Sys.setenv(NOT_CRAN = "true")`
			`testthat::test_file("tests/testthat/test-data.R")`
			`testthat::test_file("tests/testthat/test-mo.R")`
			`testthat::test_file("tests/testthat/test-mo_property.R")`