diff --git a/DESCRIPTION b/DESCRIPTION index f0eedddf..080dbdd5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 1.1.0.9020 -Date: 2020-05-27 +Version: 1.1.0.9021 +Date: 2020-05-28 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NAMESPACE b/NAMESPACE index 0339b4f9..2249a670 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -254,7 +254,6 @@ importFrom(graphics,arrows) importFrom(graphics,axis) importFrom(graphics,barplot) importFrom(graphics,par) -importFrom(graphics,plot) importFrom(graphics,points) importFrom(graphics,text) importFrom(stats,complete.cases) diff --git a/NEWS.md b/NEWS.md index cdeaa3de..adcd3e51 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 1.1.0.9020 -## Last updated: 27-May-2020 +# AMR 1.1.0.9021 +## Last updated: 28-May-2020 ### Breaking * Removed code dependency on all other R packages, making this package fully independent of the development process of others. This is a major code change, but will probably not be noticeable by most users. @@ -14,7 +14,7 @@ ### Changed * Taxonomy: - * Updated the taxonomy of microorganisms tot May 2020, using the Catalogue of Life (CoL), the Global Biodiversity Information Facility (GBIF) and the List of Prokaryotic names with Standing in Nomenclature (LPSN, hosted by DSMZ since February 2020) + * Updated the taxonomy of microorganisms tot May 2020, using the Catalogue of Life (CoL), the Global Biodiversity Information Facility (GBIF) and the List of Prokaryotic names with Standing in Nomenclature (LPSN, hosted by DSMZ since February 2020). **Note:** a taxonomic update may always impact determination of first isolates (using `first_isolate()`), since some bacterial names might be renamed to other genera or other (sub)species. This is expected behaviour. * Removed the Catalogue of Life IDs (like 776351), since they now work with a species ID (hexadecimal string) * EUCAST rules: * The `eucast_rules()` function no longer applies "other" rules at default that are made available by this package (like setting ampicillin = R when ampicillin + enzyme inhibitor = R). The default input value for `rules` is now `c("breakpoints", "expert")` instead of `"all"`, but this can be changed by the user. To return to the old behaviour, set `options(AMR.eucast_rules = "all")`. diff --git a/R/resistance_predict.R b/R/resistance_predict.R index b9f223d6..5d77df5a 100755 --- a/R/resistance_predict.R +++ b/R/resistance_predict.R @@ -304,7 +304,7 @@ rsi_predict <- resistance_predict #' @exportMethod plot.mic #' @export -#' @importFrom graphics plot axis arrows points +#' @importFrom graphics axis arrows points #' @rdname resistance_predict plot.resistance_predict <- function(x, main = paste("Resistance Prediction of", x_name), ...) { x_name <- paste0(ab_name(attributes(x)$ab), " (", attributes(x)$ab, ")") @@ -314,6 +314,12 @@ plot.resistance_predict <- function(x, main = paste("Resistance Prediction of", } else { ylab <- "%IR" } + # get plot() generic; this was moved from the 'graphics' pkg to the 'base' pkg in R 4.0.0 + if (as.integer(R.Version()$major) >= 4) { + plot <- get("plot", envir = asNamespace("base")) + } else { + plot <- get("plot", envir = asNamespace("graphics")) + } plot(x = x$year, y = x$value, ylim = c(0, 1), diff --git a/R/rsi.R b/R/rsi.R index 8a99d1f4..9d16b432 100755 --- a/R/rsi.R +++ b/R/rsi.R @@ -563,25 +563,20 @@ summary.rsi <- function(object, ...) { #' @exportMethod plot.rsi #' @export -#' @importFrom graphics plot text +#' @importFrom graphics text axis #' @noRd plot.rsi <- function(x, lwd = 2, ylim = NULL, ylab = "Percentage", xlab = "Antimicrobial Interpretation", - main = paste("Susceptibility Analysis of", deparse(substitute(x))), + main = paste("Resistance Overview of", deparse(substitute(x))), axes = FALSE, ...) { - suppressWarnings( - data <- data.frame(x = x, - y = 1, - stringsAsFactors = TRUE) %>% - group_by(x) %>% - summarise(n = sum(y)) %>% - filter(!is.na(x)) %>% - mutate(s = round((n / sum(n)) * 100, 1)) - ) + data <- as.data.frame(table(x), stringsAsFactors = FALSE) + colnames(data) <- c("x", "n") + data$s <- round((data$n / sum(data$n)) * 100, 1) + if (!"S" %in% data$x) { data <- rbind(data, data.frame(x = "S", n = 0, s = 0)) } @@ -592,10 +587,17 @@ plot.rsi <- function(x, data <- rbind(data, data.frame(x = "R", n = 0, s = 0)) } + # don't use as.rsi() here, it will confuse plot() data$x <- factor(data$x, levels = c("S", "I", "R"), ordered = TRUE) ymax <- if_else(max(data$s) > 95, 105, 100) + # get plot() generic; this was moved from the 'graphics' pkg to the 'base' pkg in R 4.0.0 + if (as.integer(R.Version()$major) >= 4) { + plot <- get("plot", envir = asNamespace("base")) + } else { + plot <- get("plot", envir = asNamespace("graphics")) + } plot(x = data$x, y = data$s, lwd = lwd, @@ -623,7 +625,7 @@ plot.rsi <- function(x, barplot.rsi <- function(height, col = c("chartreuse4", "chartreuse3", "brown3"), xlab = ifelse(beside, "Antimicrobial Interpretation", ""), - main = paste("Antimicrobial resistance of", deparse(substitute(height))), + main = paste("Resistance Overview of", deparse(substitute(height))), ylab = "Frequency", beside = TRUE, axes = beside, diff --git a/data-raw/country_analysis.R b/data-raw/country_analysis.R index 34cff779..44a22bf1 100644 --- a/data-raw/country_analysis.R +++ b/data-raw/country_analysis.R @@ -90,9 +90,29 @@ countries_geometry <- sf::st_as_sf(map('world', plot = FALSE, fill = TRUE)) %>% not_antarctica = as.integer(ID != "Antarctica"), countries_name = ifelse(included == 1, as.character(ID), NA)) +# add countries not in the list +countries_missing <- unique(ip_tbl$country[!ip_tbl$country %in% countries_geometry$countries_code]) +for (i in seq_len(length(countries_missing))) { + countries_geometry <- countries_geometry %>% + rbind(countries_geometry %>% + filter(ID == "Netherlands") %>% + mutate(ID = countrycode::countrycode(countries_missing[i], + origin = 'iso2c', + destination = 'country.name'), + countries_code = countries_missing[i], + included = 1, + not_antarctica = 1, + countries_name = countrycode::countrycode(countries_missing[i], + origin = 'iso2c', + destination = 'country.name'))) +} + # how many? countries_geometry %>% filter(included == 1) %>% nrow() +countries_geometry$countries_name <- gsub("UK", "United Kingdom", countries_geometry$countries_name, fixed = TRUE) +countries_geometry$countries_name <- gsub("USA", "United States", countries_geometry$countries_name, fixed = TRUE) + countries_plot <- ggplot(countries_geometry) + geom_sf(aes(fill = included, colour = not_antarctica), size = 0.25, @@ -101,9 +121,9 @@ countries_plot <- ggplot(countries_geometry) + theme(panel.grid = element_blank(), axis.title = element_blank(), axis.text = element_blank()) + - scale_fill_gradient(low = "white", high = "#CAD6EA", ) + + scale_fill_gradient(low = "white", high = "#128f7645") + # this makes the border Antarctica turn white (invisible): - scale_colour_gradient(low = "white", high = "#81899B") + scale_colour_gradient(low = "white", high = "#128f76") countries_plot_mini <- countries_plot countries_plot_mini$data <- countries_plot_mini$data %>% filter(ID != "Antarctica") diff --git a/data-raw/microorganisms.txt b/data-raw/microorganisms.txt index 6bc5e2c3..32009e55 100644 --- a/data-raw/microorganisms.txt +++ b/data-raw/microorganisms.txt @@ -37251,6 +37251,7 @@ "B_MYCBC_TKNS" "Mycobacterium tokaiense" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "tokaiense" "" "species" "Tsukamura, 1981" "c457ca4ae3a404100c8ce8c82a6100cc" "CoL" 2 "72477006" "B_MYCBC_TRPL" "Mycobacterium triplex" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "triplex" "" "species" "Floyd et al., 1997" "f23c2b6cad7a0e20374cdf3d3ff55dce" "CoL" 2 "113860005" "B_MYCBC_TRVL" "Mycobacterium triviale" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "triviale" "" "species" "Kubica, 1970" "9cb8b676cce27952821e173b12bfff3f" "CoL" 2 "40333002" +"B_MYCBC_TBRC" "Mycobacterium tuberculosis" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "tuberculosis" "" "species" "Lehmann et al., 2018" "778540" "DSMZ" 2 "c(\"113861009\", \"113858008\")" "B_MYCBC_TUSC" "Mycobacterium tusciae" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "tusciae" "" "species" "Tortoli et al., 1999" "7a8ff8f5a2b16131366fe6e8dfb6b570" "CoL" 2 "B_MYCBC_ULCR" "Mycobacterium ulcerans" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "ulcerans" "" "species" "MacCallum et al., 1950" "96b3a2e207e76f4725132034d7d0bde1" "CoL" 2 "40713003" "B_MYCBC_VACC" "Mycobacterium vaccae" "Bacteria" "Actinobacteria" "(unknown class)" "Actinomycetales" "Mycobacteriaceae" "Mycobacterium" "vaccae" "" "species" "Bonicke et al., 1964" "adbc928aba39beadc25b2ba7e8214c91" "CoL" 2 "54925005" diff --git a/data-raw/reproduction_of_microorganisms.R b/data-raw/reproduction_of_microorganisms.R index a4529ebe..4b2cfa95 100644 --- a/data-raw/reproduction_of_microorganisms.R +++ b/data-raw/reproduction_of_microorganisms.R @@ -920,6 +920,22 @@ testthat::test_file("tests/testthat/test-data.R") testthat::test_file("tests/testthat/test-mo.R") testthat::test_file("tests/testthat/test-mo_property.R") +# edit 2020-05-28 +# Not sure why it now says M. tuberculosis was renamed to M. africanum (B_MYCBC_AFRC), but that's not true +microorganisms <- microorganisms %>% + bind_rows(microorganisms %>% + filter(mo == "B_MYCBC_AFRC") %>% + mutate(mo = "B_MYCBC_TBRC", snomed = list(c("113861009", "113858008")), + ref = "Lehmann et al., 2018",species_id = "778540", + source = "DSMZ", species = "tuberculosis", + fullname = "Mycobacterium tuberculosis")) %>% + arrange(fullname) +class(microorganisms$mo) <- c("mo", "character") +microorganisms.old <- microorganisms.old %>% filter(fullname != "Mycobacterium tuberculosis") + +usethis::use_data(microorganisms, overwrite = TRUE, version = 2) +usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2) + # OLD CODE ---------------------------------------------------------------- diff --git a/data-raw/reproduction_of_microorganisms_new.R b/data-raw/reproduction_of_microorganisms_new.R deleted file mode 100644 index 3d9b9140..00000000 --- a/data-raw/reproduction_of_microorganisms_new.R +++ /dev/null @@ -1,682 +0,0 @@ -# ==================================================================== # -# TITLE # -# Antimicrobial Resistance (AMR) Analysis # -# # -# SOURCE # -# https://gitlab.com/msberends/AMR # -# # -# LICENCE # -# (c) 2018-2020 Berends MS, Luz CF et al. # -# # -# This R package is free software; you can freely use and distribute # -# it for both personal and commercial purposes under the terms of the # -# GNU General Public License version 2.0 (GNU GPL-2), as published by # -# the Free Software Foundation. # -# # -# We created this package for both routine data analysis and academic # -# research and it was publicly released in the hope that it will be # -# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # -# Visit our website for more info: https://msberends.gitlab.io/AMR. # -# ==================================================================== # - -# --------------------------------------------------------------------------------- -# Reproduction of the `microorganisms` data set -# --------------------------------------------------------------------------------- -# Data retrieved from: -# -# [1] Catalogue of Life (CoL) through the Encyclopaedia of Life -# https://opendata.eol.org/dataset/catalogue-of-life/ -# * Download the resource file with a name like "Catalogue of Life yyyy-mm-dd" -# * Extract "taxon.tab" -# -# [2] Global Biodiversity Information Facility (GBIF) -# https://doi.org/10.15468/39omei -# * Extract "Taxon.tsv" -# -# [3] Deutsche Sammlung von Mikroorganismen und Zellkulturen (DSMZ) -# https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html -# * Download the latest "Complete List" as xlsx file (DSMZ_bactnames.xlsx) -# --------------------------------------------------------------------------------- - -library(dplyr) -library(AMR) - -data_col <- data.table::fread("Documents/taxon.tab") -data_gbif <- data.table::fread("Documents/Taxon.tsv") - -# read the xlsx file from DSMZ (only around 2.5 MB): -data_dsmz <- readxl::read_xlsx("Downloads/DSMZ_bactnames.xlsx") - -# the CoL data is over 3.7M rows: -data_col %>% freq(kingdom) -# Item Count Percent Cum. Count Cum. Percent -# --- ---------- ---------- -------- ----------- ------------- -# 1 Animalia 2,225,627 59.1% 2,225,627 59.1% -# 2 Plantae 1,177,412 31.3% 3,403,039 90.4% -# 3 Fungi 290,145 7.7% 3,693,184 98.1% -# 4 Chromista 47,126 1.3% 3,740,310 99.3% -# 5 Bacteria 14,478 0.4% 3,754,788 99.7% -# 6 Protozoa 6,060 0.2% 3,760,848 99.9% -# 7 Viruses 3,827 0.1% 3,764,675 100.0% -# 8 Archaea 610 0.0% 3,765,285 100.0% - -# the GBIF data is over 5.8M rows: -data_gbif %>% freq(kingdom) -# Item Count Percent Cum. Count Cum. Percent -# --- --------------- ---------- -------- ----------- ------------- -# 1 Animalia 3,264,138 55.7% 3,264,138 55.7% -# 2 Plantae 1,814,962 31.0% 5,079,100 86.7% -# 3 Fungi 538,086 9.2% 5,617,186 95.9% -# 4 Chromista 181,374 3.1% 5,798,560 99.0% -# 5 Bacteria 24,048 0.4% 5,822,608 99.4% -# 6 Protozoa 15,138 0.3% 5,837,746 99.7% -# 7 incertae sedis 9,995 0.2% 5,847,741 99.8% -# 8 Viruses 9,630 0.2% 5,857,371 100.0% -# 9 Archaea 771 0.0% 5,858,142 100.0% - - -# Clean up helper function ------------------------------------------------ -clean_new <- function(new) { - new %>% - # only the ones that have no new ID to refer to a newer name - filter(is.na(col_id_new)) %>% - filter( - ( - # we only want all MICROorganisms and no viruses - !kingdom %in% c("Animalia", "Chromista", "Plantae", "Viruses") - # and not all fungi: Aspergillus, Candida, Trichphyton and Pneumocystis are the most important, - # so only keep these orders from the fungi: - & !(kingdom == "Fungi" - & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales")) - ) - # or the family has to contain a genus we found in our hospitals last decades (Northern Netherlands, 2002-2018) - | genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces", - "Arachnia", "Ascaris", "Aureobacterium", "Aureobasidium", "Balantidum", "Bilophilia", "Branhamella", "Brochontrix", - "Brugia", "Calymmatobacterium", "Catabacter", "Chilomastix", "Chryseomonas", "Cladophialophora", "Cladosporium", - "Clonorchis", "Cordylobia", "Curvularia", "Demodex", "Dermatobia", "Diphyllobothrium", "Dracunculus", "Echinococcus", - "Enterobius", "Euascomycetes", "Exophiala", "Fasciola", "Fusarium", "Hendersonula", "Hymenolepis", "Kloeckera", - "Koserella", "Larva", "Leishmania", "Lelliottia", "Loa", "Lumbricus", "Malassezia", "Metagonimus", "Molonomonas", - "Mucor", "Nattrassia", "Necator", "Novospingobium", "Onchocerca", "Opistorchis", "Paragonimus", "Paramyxovirus", - "Pediculus", "Phoma", "Phthirus", "Pityrosporum", "Pseudallescheria", "Pulex", "Rhizomucor", "Rhizopus", "Rhodotorula", - "Salinococcus", "Sanguibacteroides", "Schistosoma", "Scopulariopsis", "Scytalidium", "Sporobolomyces", "Stomatococcus", - "Strongyloides", "Syncephalastraceae", "Taenia", "Torulopsis", "Trichinella", "Trichobilharzia", "Trichomonas", - "Trichosporon", "Trichuris", "Trypanosoma", "Wuchereria")) %>% - mutate( - authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"), - # remove leading and trailing brackets - authors2 = gsub("^[(](.*)[)]$", "\\1", authors2), - # only take part after brackets if there's a name - authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), - gsub(".*[)] (.*)", "\\1", authors2), - authors2), - # get year from last 4 digits - lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)), - # can never be later than now - lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), - NA, - lastyear), - # get authors without last year - authors = gsub("(.*)[0-9]{4}$", "\\1", authors2), - # remove nonsense characters from names - authors = gsub("[^a-zA-Z,'& -]", "", authors), - # remove trailing and leading spaces - authors = trimws(authors), - # only keep first author and replace all others by 'et al' - authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors), - # et al. always with ending dot - authors = gsub(" et al\\.?", " et al.", authors), - authors = gsub(" ?,$", "", authors), - # don't start with 'sensu' or 'ehrenb' - authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE), - # no initials, only surname - authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE), - # combine author and year if year is available - ref = ifelse(!is.na(lastyear), - paste0(authors, ", ", lastyear), - authors), - # fix beginning and ending - ref = gsub(", $", "", ref), - ref = gsub("^, ", "", ref)) %>% - # remove text if it contains 'Not assigned' like phylum in viruses - mutate_all(~gsub("Not assigned", "", .)) %>% - # Remove non-ASCII characters (these are not allowed by CRAN) - lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% - as_tibble(stringsAsFactors = FALSE) %>% - mutate(fullname = trimws(case_when(rank == "family" ~ family, - rank == "order" ~ order, - rank == "class" ~ class, - rank == "phylum" ~ phylum, - rank == "kingdom" ~ kingdom, - TRUE ~ paste(genus, species, subspecies)))) -} -clean_old <- function(old, new) { - old %>% - # only the ones that exist in the new data set - filter(col_id_new %in% new$col_id) %>% - mutate( - authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"), - # remove leading and trailing brackets - authors2 = gsub("^[(](.*)[)]$", "\\1", authors2), - # only take part after brackets if there's a name - authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), - gsub(".*[)] (.*)", "\\1", authors2), - authors2), - # get year from last 4 digits - lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)), - # can never be later than now - lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), - NA, - lastyear), - # get authors without last year - authors = gsub("(.*)[0-9]{4}$", "\\1", authors2), - # remove nonsense characters from names - authors = gsub("[^a-zA-Z,'& -]", "", authors), - # remove trailing and leading spaces - authors = trimws(authors), - # only keep first author and replace all others by 'et al' - authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors), - # et al. always with ending dot - authors = gsub(" et al\\.?", " et al.", authors), - authors = gsub(" ?,$", "", authors), - # don't start with 'sensu' or 'ehrenb' - authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE), - # no initials, only surname - authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE), - # combine author and year if year is available - ref = ifelse(!is.na(lastyear), - paste0(authors, ", ", lastyear), - authors), - # fix beginning and ending - ref = gsub(", $", "", ref), - ref = gsub("^, ", "", ref)) %>% - # remove text if it contains 'Not assigned' like phylum in viruses - mutate_all(~gsub("Not assigned", "", .)) %>% - # Remove non-ASCII characters (these are not allowed by CRAN) - lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% - as_tibble(stringsAsFactors = FALSE) %>% - select(col_id_new, fullname, ref, authors2) %>% - left_join(new %>% select(col_id, fullname_new = fullname), by = c(col_id_new = "col_id")) %>% - mutate(fullname = trimws( - gsub("(.*)[(].*", "\\1", - stringr::str_replace( - string = fullname, - pattern = stringr::fixed(authors2), - replacement = "")) %>% - gsub(" (var|f|subsp)[.]", "", .))) %>% - select(-c("col_id_new", "authors2")) %>% - filter(!is.na(fullname), !is.na(fullname_new)) %>% - filter(fullname != fullname_new, !fullname %like% "^[?]") -} - -# clean CoL and GBIF ---- -# clean data_col -data_col <- data_col %>% - as_tibble() %>% - select(col_id = taxonID, - col_id_new = acceptedNameUsageID, - fullname = scientificName, - kingdom, - phylum, - class, - order, - family, - genus, - species = specificEpithet, - subspecies = infraspecificEpithet, - rank = taxonRank, - ref = scientificNameAuthorship, - species_id = furtherInformationURL) %>% - mutate(source = "CoL") -# split into old and new -data_col.new <- data_col %>% clean_new() -data_col.old <- data_col %>% clean_old(new = data_col.new) -rm(data_col) - -# clean data_gbif -data_gbif <- data_gbif %>% - as_tibble() %>% - filter( - # no uncertain taxonomic placements - taxonRemarks != "doubtful", - kingdom != "incertae sedis", - taxonRank != "unranked") %>% - transmute(col_id = taxonID, - col_id_new = acceptedNameUsageID, - fullname = scientificName, - kingdom, - phylum, - class, - order, - family, - genus, - species = specificEpithet, - subspecies = infraspecificEpithet, - rank = taxonRank, - ref = scientificNameAuthorship, - species_id = as.character(parentNameUsageID)) %>% - mutate(source = "GBIF") -# split into old and new -data_gbif.new <- data_gbif %>% clean_new() -data_gbif.old <- data_gbif %>% clean_old(new = data_gbif.new) -rm(data_gbif) - -# put CoL and GBIF together ---- -MOs.new <- bind_rows(data_col.new, - data_gbif.new) %>% - mutate(taxonomic_tree_length = nchar(trimws(paste(kingdom, phylum, class, order, family, genus, species, subspecies)))) %>% - arrange(desc(taxonomic_tree_length)) %>% - distinct(fullname, .keep_all = TRUE) %>% - select(-c("col_id_new", "authors2", "authors", "lastyear", "taxonomic_tree_length")) %>% - arrange(fullname) -MOs.old <- bind_rows(data_col.old, - data_gbif.old) %>% - distinct(fullname, .keep_all = TRUE) %>% - arrange(fullname) - -# clean up DSMZ --- -data_dsmz <- data_dsmz %>% - as_tibble() %>% - transmute(col_id = NA_integer_, - col_id_new = NA_integer_, - fullname = "", - # kingdom = "", - # phylum = "", - # class = "", - # order = "", - # family = "", - genus = ifelse(is.na(GENUS), "", GENUS), - species = ifelse(is.na(SPECIES), "", SPECIES), - subspecies = ifelse(is.na(SUBSPECIES), "", SUBSPECIES), - rank = ifelse(species == "", "genus", "species"), - ref = AUTHORS, - species_id = as.character(RECORD_NO), - source = "DSMZ") - -# DSMZ only contains genus/(sub)species, try to find taxonomic properties based on genus and data_col -ref_taxonomy <- MOs.new %>% - distinct(genus, .keep_all = TRUE) %>% - filter(family != "") %>% - filter(genus %in% data_dsmz$genus) %>% - distinct(genus, .keep_all = TRUE) %>% - select(kingdom, phylum, class, order, family, genus) - -data_dsmz <- data_dsmz %>% - left_join(ref_taxonomy, by = "genus") %>% - mutate(kingdom = "Bacteria") - -data_dsmz.new <- data_dsmz %>% - clean_new() %>% - distinct(fullname, .keep_all = TRUE) %>% - select(colnames(MOs.new)) %>% - arrange(fullname) - -# combine everything ---- -MOs <- bind_rows(MOs.new, - data_dsmz.new) %>% - distinct(fullname, .keep_all = TRUE) %>% - # not the ones that are old - filter(!fullname %in% MOs.old$fullname) %>% - arrange(fullname) %>% - mutate(col_id = ifelse(source != "CoL", NA_integer_, col_id)) %>% - filter(fullname != "") - -rm(data_col.new) -rm(data_col.old) -rm(data_gbif.new) -rm(data_gbif.old) -rm(data_dsmz) -rm(data_dsmz.new) -rm(ref_taxonomy) -rm(MOs.new) - -MOs.bak <- MOs - -# Trichomonas trick ---- -# for species in Trypanosoma and Trichomonas we observe al lot of taxonomic info missing -MOs %>% filter(genus %in% c("Trypanosoma", "Trichomonas")) %>% View() -MOs[which(MOs$genus == "Trypanosoma"), "kingdom"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$kingdom -MOs[which(MOs$genus == "Trypanosoma"), "phylum"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$phylum -MOs[which(MOs$genus == "Trypanosoma"), "class"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$class -MOs[which(MOs$genus == "Trypanosoma"), "order"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$order -MOs[which(MOs$genus == "Trypanosoma"), "family"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$family -MOs[which(MOs$genus == "Trichomonas"), "kingdom"] <- MOs[which(MOs$fullname == "Trichomonas"),]$kingdom -MOs[which(MOs$genus == "Trichomonas"), "phylum"] <- MOs[which(MOs$fullname == "Trichomonas"),]$phylum -MOs[which(MOs$genus == "Trichomonas"), "class"] <- MOs[which(MOs$fullname == "Trichomonas"),]$class -MOs[which(MOs$genus == "Trichomonas"), "order"] <- MOs[which(MOs$fullname == "Trichomonas"),]$order -MOs[which(MOs$genus == "Trichomonas"), "family"] <- MOs[which(MOs$fullname == "Trichomonas"),]$family - -# fill taxonomic properties that are missing -MOs <- MOs %>% - mutate(phylum = ifelse(phylum %in% c(NA, ""), "(unknown phylum)", phylum), - class = ifelse(class %in% c(NA, ""), "(unknown class)", class), - order = ifelse(order %in% c(NA, ""), "(unknown order)", order), - family = ifelse(family %in% c(NA, ""), "(unknown family)", family)) - -# Abbreviations ---- -# Add abbreviations so we can easily know which ones are which ones. -# These will become valid and unique microbial IDs for the AMR package. -MOs <- MOs %>% - arrange(kingdom, fullname) %>% - group_by(kingdom) %>% - mutate(abbr_other = case_when( - rank == "family" ~ paste0("[FAM]_", - abbreviate(family, - minlength = 8, - use.classes = TRUE, - method = "both.sides", - strict = FALSE)), - rank == "order" ~ paste0("[ORD]_", - abbreviate(order, - minlength = 8, - use.classes = TRUE, - method = "both.sides", - strict = FALSE)), - rank == "class" ~ paste0("[CLS]_", - abbreviate(class, - minlength = 8, - use.classes = TRUE, - method = "both.sides", - strict = FALSE)), - rank == "phylum" ~ paste0("[PHL]_", - abbreviate(phylum, - minlength = 8, - use.classes = TRUE, - method = "both.sides", - strict = FALSE)), - rank == "kingdom" ~ paste0("[KNG]_", kingdom), - TRUE ~ NA_character_ - )) %>% - # abbreviations determined per kingdom and family - # becuase they are part of the abbreviation - mutate(abbr_genus = abbreviate(genus, - minlength = 7, - use.classes = TRUE, - method = "both.sides", - strict = FALSE)) %>% - ungroup() %>% - group_by(genus) %>% - # species abbreviations may be the same between genera - # because the genus abbreviation is part of the abbreviation - mutate(abbr_species = abbreviate(stringr::str_to_title(species), - minlength = 3, - use.classes = FALSE, - method = "both.sides")) %>% - ungroup() %>% - group_by(genus, species) %>% - mutate(abbr_subspecies = abbreviate(stringr::str_to_title(subspecies), - minlength = 3, - use.classes = FALSE, - method = "both.sides")) %>% - ungroup() %>% - # remove trailing underscores - mutate(mo = gsub("_+$", "", - toupper(paste( - # first character: kingdom - ifelse(kingdom %in% c("Animalia", "Plantae"), - substr(kingdom, 1, 2), - substr(kingdom, 1, 1)), - # next: genus, species, subspecies - ifelse(is.na(abbr_other), - paste(abbr_genus, - abbr_species, - abbr_subspecies, - sep = "_"), - abbr_other), - sep = "_")))) %>% - mutate(mo = ifelse(duplicated(.$mo), - # these one or two must be unique too - paste0(mo, "1"), - mo), - fullname = ifelse(fullname == "", - trimws(paste(genus, species, subspecies)), - fullname)) %>% - # put `mo` in front, followed by the rest - select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies) - -# add non-taxonomic entries -MOs <- MOs %>% - bind_rows( - # Unknowns - data.frame(mo = "UNKNOWN", - col_id = NA_integer_, - fullname = "(unknown name)", - kingdom = "(unknown kingdom)", - phylum = "(unknown phylum)", - class = "(unknown class)", - order = "(unknown order)", - family = "(unknown family)", - genus = "(unknown genus)", - species = "(unknown species)", - subspecies = "(unknown subspecies)", - rank = "(unknown rank)", - ref = NA_character_, - species_id = "", - source = "manually added", - stringsAsFactors = FALSE), - data.frame(mo = "B_GRAMN", - col_id = NA_integer_, - fullname = "(unknown Gram-negatives)", - kingdom = "Bacteria", - phylum = "(unknown phylum)", - class = "(unknown class)", - order = "(unknown order)", - family = "(unknown family)", - genus = "(unknown Gram-negatives)", - species = "(unknown species)", - subspecies = "(unknown subspecies)", - rank = "species", - ref = NA_character_, - species_id = "", - source = "manually added", - stringsAsFactors = FALSE), - data.frame(mo = "B_GRAMP", - col_id = NA_integer_, - fullname = "(unknown Gram-positives)", - kingdom = "Bacteria", - phylum = "(unknown phylum)", - class = "(unknown class)", - order = "(unknown order)", - family = "(unknown family)", - genus = "(unknown Gram-positives)", - species = "(unknown species)", - subspecies = "(unknown subspecies)", - rank = "species", - ref = NA_character_, - species_id = "", - source = "manually added", - stringsAsFactors = FALSE), - # CoNS - MOs %>% - filter(genus == "Staphylococcus", species == "") %>% .[1,] %>% - mutate(mo = paste(mo, "CNS", sep = "_"), - rank = "species", - col_id = NA_integer_, - species = "coagulase-negative", - fullname = "Coagulase-negative Staphylococcus (CoNS)", - ref = NA_character_, - species_id = "", - source = "manually added"), - # CoPS - MOs %>% - filter(genus == "Staphylococcus", species == "") %>% .[1,] %>% - mutate(mo = paste(mo, "CPS", sep = "_"), - rank = "species", - col_id = NA_integer_, - species = "coagulase-positive", - fullname = "Coagulase-positive Staphylococcus (CoPS)", - ref = NA_character_, - species_id = "", - source = "manually added"), - # Streptococci groups A, B, C, F, H, K - MOs %>% - filter(genus == "Streptococcus", species == "pyogenes") %>% .[1,] %>% - # we can keep all other details, since S. pyogenes is the only member of group A - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRA", sep = "_"), - species = "group A" , - fullname = "Streptococcus group A"), - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - # we can keep all other details, since S. agalactiae is the only member of group B - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRB", sep = "_"), - species = "group B" , - fullname = "Streptococcus group B"), - MOs %>% - filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRC", sep = "_"), - col_id = NA_integer_, - species = "group C" , - fullname = "Streptococcus group C", - ref = NA_character_, - species_id = "", - source = "manually added"), - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRD", sep = "_"), - col_id = NA_integer_, - species = "group D" , - fullname = "Streptococcus group D", - ref = NA_character_, - species_id = "", - source = "manually added"), - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRF", sep = "_"), - col_id = NA_integer_, - species = "group F" , - fullname = "Streptococcus group F", - ref = NA_character_, - species_id = "", - source = "manually added"), - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRG", sep = "_"), - col_id = NA_integer_, - species = "group G" , - fullname = "Streptococcus group G", - ref = NA_character_, - species_id = "", - source = "manually added"), - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRH", sep = "_"), - col_id = NA_integer_, - species = "group H" , - fullname = "Streptococcus group H", - ref = NA_character_, - species_id = "", - source = "manually added"), - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRK", sep = "_"), - col_id = NA_integer_, - species = "group K" , - fullname = "Streptococcus group K", - ref = NA_character_, - species_id = "", - source = "manually added"), - # Beta-haemolytic Streptococci - MOs %>% - filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% - mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "HAE", sep = "_"), - col_id = NA_integer_, - species = "beta-haemolytic" , - fullname = "Beta-haemolytic Streptococcus", - ref = NA_character_, - species_id = "", - source = "manually added") - ) - - -# everything distinct? -sum(duplicated(MOs$mo)) -colnames(MOs) - -# set prevalence per species -MOs <- MOs %>% - mutate(prevalence = case_when( - class == "Gammaproteobacteria" - | genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus") - | mo %in% c("UNKNOWN", "B_GRAMN", "B_GRAMP") - ~ 1, - phylum %in% c("Proteobacteria", - "Firmicutes", - "Actinobacteria", - "Sarcomastigophora") - | genus %in% c("Aspergillus", - "Bacteroides", - "Candida", - "Capnocytophaga", - "Chryseobacterium", - "Cryptococcus", - "Elisabethkingia", - "Flavobacterium", - "Fusobacterium", - "Giardia", - "Leptotrichia", - "Mycoplasma", - "Prevotella", - "Rhodotorula", - "Treponema", - "Trichophyton", - "Trichomonas", - "Ureaplasma") - | rank %in% c("kingdom", "phylum", "class", "order", "family") - ~ 2, - TRUE ~ 3 - )) - -# arrange -MOs <- MOs %>% arrange(fullname) - -# transform -MOs <- as.data.frame(MOs, stringsAsFactors = FALSE) -MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE) -class(MOs$mo) <- "mo" -MOs$col_id <- as.integer(MOs$col_id) - -# get differences in MO codes between this data and the package version -MO_diff <- AMR::microorganisms %>% - mutate(pastedtext = paste(mo, fullname)) %>% - filter(!pastedtext %in% (MOs %>% mutate(pastedtext = paste(mo, fullname)) %>% pull(pastedtext))) %>% - select(mo_old = mo, fullname, pastedtext) %>% - left_join(MOs %>% - transmute(mo_new = mo, fullname_new = fullname, pastedtext = paste(mo, fullname)), "pastedtext") %>% - select(mo_old, mo_new, fullname_new) - -mo_diff2 <- AMR::microorganisms %>% - select(mo, fullname) %>% - left_join(MOs %>% - select(mo, fullname), - by = "fullname", - suffix = c("_old", "_new")) %>% - filter(mo_old != mo_new, - #!mo_new %in% mo_old, - !mo_old %like% "\\[") - -mo_diff3 <- tibble(previous_old = names(AMR:::make_trans_tbl()), - previous_new = AMR:::make_trans_tbl()) %>% - left_join(AMR::microorganisms %>% select(mo, fullname), by = c(previous_new = "mo")) %>% - left_join(MOs %>% select(mo_new = mo, fullname), by = "fullname") - -# what did we win most? -MOs %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% freq(genus) -# what did we lose most? -AMR::microorganisms %>% - filter(kingdom != "Chromista" & !fullname %in% MOs$fullname & !fullname %in% MOs.old$fullname) %>% - freq(genus) - - -# save -saveRDS(MOs, "microorganisms.rds") -saveRDS(MOs.old, "microorganisms.old.rds") - -# on the server, do: -usethis::use_data(microorganisms, overwrite = TRUE, version = 2) -usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2) -rm(microorganisms) -rm(microorganisms.old) - -# TO DO AFTER THIS -# * Update the year and dim()s in R/data.R -# * Rerun data-raw/reproduction_of_rsi_translation.R -# * Run unit tests diff --git a/data/microorganisms.old.rda b/data/microorganisms.old.rda index 02aabf5a..f0910ad1 100644 Binary files a/data/microorganisms.old.rda and b/data/microorganisms.old.rda differ diff --git a/data/microorganisms.rda b/data/microorganisms.rda index cedc2d0d..db8f7a99 100644 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/docs/404.html b/docs/404.html index 44b54f27..40d22457 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 5aa5c5bf..070e9fa7 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ diff --git a/docs/articles/AMR.html b/docs/articles/AMR.html index 1d459a7a..fd671271 100644 --- a/docs/articles/AMR.html +++ b/docs/articles/AMR.html @@ -39,7 +39,7 @@ @@ -186,7 +186,7 @@vignettes/AMR.Rmd
AMR.Rmd
Note: values on this page will change with every website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was generated on 25 May 2020.
+Note: values on this page will change with every website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was generated on 28 May 2020.
Now, let’s start the cleaning and the analysis!
@@ -432,16 +432,16 @@ Longest: 1