diff --git a/DESCRIPTION b/DESCRIPTION index 1bda40fc7..5b13cd75d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR Version: 0.5.0.9018 -Date: 2019-02-21 +Date: 2019-02-22 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NAMESPACE b/NAMESPACE index 23c2c8f5c..a4708cc3f 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -120,6 +120,7 @@ export(mo_kingdom) export(mo_order) export(mo_phylum) export(mo_property) +export(mo_rank) export(mo_ref) export(mo_renamed) export(mo_shortname) @@ -268,5 +269,6 @@ importFrom(stats,mad) importFrom(stats,pchisq) importFrom(stats,predict) importFrom(stats,sd) +importFrom(utils,browseURL) importFrom(utils,browseVignettes) importFrom(utils,installed.packages) diff --git a/NEWS.md b/NEWS.md index fca8315bd..79a22b84a 100755 --- a/NEWS.md +++ b/NEWS.md @@ -18,8 +18,10 @@ We've got a new website: [https://msberends.gitlab.io/AMR](https://msberends.git * All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed * The responsible author(s) and year of scientific publication - This data is updated annually - check the included version with `catalogue_of_life_version()`. + This data is updated annually - check the included version with `catalogue_of_life_version()`. * Due to this change, some `mo` codes changed (e.g. *Streptococcus* changed from `B_STRPTC` to `B_STRPT`). A translation table is used internally to support older microorganism IDs, so users will not notice this difference. +* New function `mo_rank()` for the taxonomic rank (genus, species, infraspecies, etc.) +* New function `mo_url()` to get the URL to the Catalogue of Life * Support for data from [WHONET](https://whonet.org/) and [EARS-Net](https://ecdc.europa.eu/en/about-us/partnerships-and-networks/disease-and-laboratory-networks/ears-net) (European Antimicrobial Resistance Surveillance Network): * Exported files from WHONET can be read and used in this package. 
For functions like `first_isolate()` and `eucast_rules()`, all parameters will be filled in automatically. * This package now knows all antibiotic abbrevations by EARS-Net (which are also being used by WHONET) - the `antibiotics` data set now contains a column `ears_net`. diff --git a/R/catalogue_of_life.R b/R/catalogue_of_life.R index 480ab4969..89fa65076 100755 --- a/R/catalogue_of_life.R +++ b/R/catalogue_of_life.R @@ -29,7 +29,7 @@ #' Included are: #' \itemize{ #' \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} -#' \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} +#' \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. 
By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} #' \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} #' \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} #' \item{The responsible author(s) and year of scientific publication} diff --git a/R/data.R b/R/data.R index 6e8d1840f..45d43fcac 100755 --- a/R/data.R +++ b/R/data.R @@ -134,7 +134,7 @@ #' #' A data set containing the microbial taxonomy of six kingdoms from the Catalogue of Life. MO codes can be looked up using \code{\link{as.mo}}. #' @inheritSection catalogue_of_life Catalogue of Life -#' @format A \code{\link{data.frame}} with 56,672 observations and 14 variables: +#' @format A \code{\link{data.frame}} with 57,158 observations and 14 variables: #' \describe{ #' \item{\code{mo}}{ID of microorganism as used by this package} #' \item{\code{col_id}}{Catalogue of Life ID} @@ -163,6 +163,7 @@ "microorganisms" catalogue_of_life <- list( + year = 2018, version = "Catalogue of Life: 2018 Annual Checklist", url = "http://www.catalogueoflife.org/annual-checklist/2018" ) @@ -175,6 +176,8 @@ catalogue_of_life <- list( catalogue_of_life_version <- function() { list(version = catalogue_of_life$version, url = catalogue_of_life$url, + # annual release always somewhere in March + is_latest_annual_release = Sys.Date() < as.Date(paste0(catalogue_of_life$year + 1, "-04-01")), no_of_species = nrow(AMR::microorganisms), no_of_synonyms = nrow(AMR::microorganisms.old)) } @@ -183,7 +186,7 @@ catalogue_of_life_version <- function() { #' #' A data set containing old (previously valid or accepted) taxonomic names according to the Catalogue of Life. This data set is used internally by \code{\link{as.mo}}. 
#' @inheritSection catalogue_of_life Catalogue of Life -#' @format A \code{\link{data.frame}} with 14,506 observations and 4 variables: +#' @format A \code{\link{data.frame}} with 14,487 observations and 4 variables: #' \describe{ #' \item{\code{col_id}}{Catalogue of Life ID} #' \item{\code{tsn_new}}{New Catalogue of Life ID} diff --git a/R/mo_property.R b/R/mo_property.R index cc938e1f6..49c4624f0 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -26,6 +26,7 @@ #' @param property one of the column names of one of the \code{\link{microorganisms}} data set or \code{"shortname"} #' @param language language of the returned text, defaults to system language (see \code{\link{get_locale}}) and can also be set with \code{\link{getOption}("AMR_locale")}. Use \code{language = NULL} or \code{language = ""} to prevent translation. #' @param ... other parameters passed on to \code{\link{as.mo}} +#' @param open browse the URL using \code{\link[utils]{browseURL}} #' @details All functions will return the most recently known taxonomic property according to the Catalogue of Life, except for \code{mo_ref}, \code{mo_authors} and \code{mo_year}. This leads to the following results: #' \itemize{ #' \item{\code{mo_fullname("Chlamydia psittaci")} will return \code{"Chlamydophila psittaci"} (with a warning about the renaming)} @@ -44,14 +45,14 @@ #' @return \itemize{ #' \item{An \code{integer} in case of \code{mo_year}} #' \item{A \code{list} in case of \code{mo_taxonomy}} +#' \item{A named \code{character} in case of \code{mo_url}} #' \item{A \code{character} in all other cases} #' } #' @export #' @seealso \code{\link{microorganisms}} #' @inheritSection AMR Read more on our website! #' @examples -#' # All properties of Escherichia coli -#' ## taxonomic properties +#' ## taxonomic tree #' mo_kingdom("E. coli") # "Bacteria" #' mo_phylum("E. coli") # "Proteobacteria" #' mo_class("E. coli") # "Gammaproteobacteria" @@ -68,10 +69,12 @@ #' ## other properties #' mo_gramstain("E. 
coli") # "Gram negative" #' mo_type("E. coli") # "Bacteria" (equal to kingdom) +#' mo_rank("E. coli") # "species" +#' mo_url("E. coli") # get the direct url to the Catalogue of Life #' #' ## scientific reference -#' mo_ref("E. coli") # "Castellani and Chalmers, 1919" -#' mo_authors("E. coli") # "Castellani and Chalmers" +#' mo_ref("E. coli") # "Castellani et al., 1919" +#' mo_authors("E. coli") # "Castellani et al." #' mo_year("E. coli") # 1919 #' #' @@ -107,7 +110,7 @@ #' mo_shortname("S. pyo", Lancefield = TRUE) # "GAS" ('Group A streptococci') #' #' -#' # Language support for German, Dutch, Spanish, Portuguese, Italian and French +#' # language support for German, Dutch, Spanish, Portuguese, Italian and French #' mo_gramstain("E. coli", language = "de") # "Gramnegativ" #' mo_gramstain("E. coli", language = "nl") # "Gram-negatief" #' mo_gramstain("E. coli", language = "es") # "Gram negativo" @@ -125,7 +128,7 @@ #' language = "nl") # "Streptococcus groep A" #' #' -#' # Get a list with the complete taxonomy (kingdom to subspecies) +#' # get a list with the complete taxonomy (kingdom to subspecies) #' mo_taxonomy("E. coli") mo_fullname <- function(x, language = get_locale(), ...) { x <- mo_validate(x = x, property = "fullname", ...) @@ -259,9 +262,9 @@ mo_ref <- function(x, ...) { #' @export mo_authors <- function(x, ...) { x <- mo_validate(x = x, property = "ref", ...) - # remove last 4 digits and presumably the comma and space that preceed them + # remove last 4 digits and presumably the comma and space that preceeds them x[!is.na(x)] <- gsub(",? ?[0-9]{4}", "", x[!is.na(x)]) - x + suppressWarnings(x) } #' @rdname mo_property @@ -270,7 +273,13 @@ mo_year <- function(x, ...) { x <- mo_validate(x = x, property = "ref", ...) # get last 4 digits x[!is.na(x)] <- gsub(".*([0-9]{4})$", "\\1", x[!is.na(x)]) - as.integer(x) + suppressWarnings(as.integer(x)) +} + +#' @rdname mo_property +#' @export +mo_rank <- function(x, ...) { + mo_validate(x = x, property = "rank", ...) 
} #' @rdname mo_property @@ -288,10 +297,18 @@ mo_taxonomy <- function(x, ...) { } #' @rdname mo_property +#' @importFrom utils browseURL #' @export -mo_url <- function(x, ...) { +mo_url <- function(x, open = FALSE, ...) { u <- mo_validate(x = x, property = "species_id", ...) u[u != ""] <- paste0(catalogue_of_life$url, "/details/species/id/", u) + names(u) <- mo_fullname(x = x, ... = ...) + if (open == TRUE) { + if (length(u) > 1) { + warning("only the first URL will be opened, as `browseURL` only suports one string.") + } + browseURL(u[1L]) + } u } diff --git a/data/microorganisms.old.rda b/data/microorganisms.old.rda index 027d5334e..b35d6827e 100644 Binary files a/data/microorganisms.old.rda and b/data/microorganisms.old.rda differ diff --git a/data/microorganisms.rda b/data/microorganisms.rda index 50db4fbea..b1cd62666 100755 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/docs/articles/AMR.html b/docs/articles/AMR.html index 891998751..f9e3bbcb7 100644 --- a/docs/articles/AMR.html +++ b/docs/articles/AMR.html @@ -192,7 +192,7 @@

How to conduct AMR analysis

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

@@ -201,7 +201,7 @@ -

Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 21 February 2019.

+

Note: values on this page will change with every website update since they are based on randomly created values and the page was written in RMarkdown. However, the methodology remains unchanged. This page was generated on 22 February 2019.

Introduction

@@ -217,21 +217,21 @@ -2019-02-21 +2019-02-22 abcd Escherichia coli S S -2019-02-21 +2019-02-22 abcd Escherichia coli S R -2019-02-21 +2019-02-22 efgh Escherichia coli R @@ -327,65 +327,65 @@ -2010-03-19 -Z1 -Hospital A +2014-11-25 +O2 +Hospital D Escherichia coli R S -S +R S F -2012-12-24 -Z8 -Hospital A -Klebsiella pneumoniae +2016-11-18 +I10 +Hospital B +Escherichia coli R S +R S -S -F +M -2013-12-12 -Z1 -Hospital A +2014-08-15 +G9 +Hospital D Staphylococcus aureus R S S S +M + + +2017-07-26 +S2 +Hospital B +Staphylococcus aureus +S +R +S +S F - -2014-08-13 -J4 -Hospital A -Escherichia coli -S -S -S -S -M - -2012-04-09 -F5 -Hospital A +2017-01-25 +H5 +Hospital C Escherichia coli -S +R S S S M -2010-08-11 -N1 -Hospital A -Klebsiella pneumoniae +2017-03-12 +B9 +Hospital C +Escherichia coli S S S @@ -411,8 +411,8 @@ #> #> Item Count Percent Cum. Count Cum. Percent #> --- ----- ------- -------- ----------- ------------- -#> 1 M 10,436 52.2% 10,436 52.2% -#> 2 F 9,564 47.8% 20,000 100.0% +#> 1 M 10,377 51.9% 10,377 51.9% +#> 2 F 9,623 48.1% 20,000 100.0%

So, we can draw at least two conclusions immediately. From a data scientist’s perspective, the data looks clean: only values M and F. From a researcher’s perspective: there are slightly more men. Nothing we didn’t already know.

The data is already quite clean, but we still need to transform some variables. The bacteria column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate() function of the dplyr package makes this really easy:

data <- data %>%
@@ -443,10 +443,10 @@
 #> Kingella kingae (no changes)
 #> 
 #> EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
-#> Table 1:  Intrinsic resistance in Enterobacteriaceae (1333 changes)
+#> Table 1:  Intrinsic resistance in Enterobacteriaceae (1284 changes)
 #> Table 2:  Intrinsic resistance in non-fermentative Gram-negative bacteria (no changes)
 #> Table 3:  Intrinsic resistance in other Gram-negative bacteria (no changes)
-#> Table 4:  Intrinsic resistance in Gram-positive bacteria (2733 changes)
+#> Table 4:  Intrinsic resistance in Gram-positive bacteria (2790 changes)
 #> Table 8:  Interpretive rules for B-lactam agents and Gram-positive cocci (no changes)
 #> Table 9:  Interpretive rules for B-lactam agents and Gram-negative rods (no changes)
 #> Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria (no changes)
@@ -462,9 +462,9 @@
 #> Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no changes)
 #> Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no changes)
 #> 
-#> => EUCAST rules affected 7,452 out of 20,000 rows
+#> => EUCAST rules affected 7,321 out of 20,000 rows
 #>    -> added 0 test results
-#>    -> changed 4,066 test results (0 to S; 0 to I; 4,066 to R)
+#> -> changed 4,074 test results (0 to S; 0 to I; 4,074 to R)

@@ -489,8 +489,8 @@ #> NOTE: Using column `bacteria` as input for `col_mo`. #> NOTE: Using column `date` as input for `col_date`. #> NOTE: Using column `patient_id` as input for `col_patient_id`. -#> => Found 5,692 first isolates (28.5% of total)

-

So only 28.5% is suitable for resistance analysis! We can now filter on it with the filter() function, also from the dplyr package:

+#> => Found 5,680 first isolates (28.4% of total) +

So only 28.4% is suitable for resistance analysis! We can now filter on it with the filter() function, also from the dplyr package:

data_1st <- data %>% 
   filter(first == TRUE)

For future use, the above two syntaxes can be shortened with the filter_first_isolate() function:

@@ -516,32 +516,32 @@ 1 -2010-01-14 -O6 +2010-01-10 +X9 B_ESCHR_COL R -I -R +S +S S TRUE 2 -2010-02-22 -O6 +2010-04-18 +X9 B_ESCHR_COL -S -S +R +I S S FALSE 3 -2010-04-01 -O6 +2010-07-02 +X9 B_ESCHR_COL -S +R S S S @@ -549,21 +549,21 @@ 4 -2010-04-25 -O6 +2010-09-21 +X9 B_ESCHR_COL +R S -S -S +R S FALSE 5 -2010-05-09 -O6 +2010-09-22 +X9 B_ESCHR_COL -S +R S S S @@ -571,21 +571,21 @@ 6 -2010-05-29 -O6 +2010-10-06 +X9 B_ESCHR_COL S -I -R +S +S S FALSE 7 -2010-06-27 -O6 +2010-10-14 +X9 B_ESCHR_COL -S +R S S S @@ -593,19 +593,19 @@ 8 -2010-06-27 -O6 +2011-01-09 +X9 B_ESCHR_COL S +I S -S -S +R FALSE 9 -2011-02-01 -O6 +2011-03-31 +X9 B_ESCHR_COL R S @@ -615,12 +615,12 @@ 10 -2011-03-20 -O6 +2011-03-31 +X9 B_ESCHR_COL S S -S +R S FALSE @@ -637,7 +637,7 @@ #> NOTE: Using column `patient_id` as input for `col_patient_id`. #> NOTE: Using column `keyab` as input for `col_keyantibiotics`. Use col_keyantibiotics = FALSE to prevent this. #> [Criterion] Inclusion based on key antibiotics, ignoring I. -#> => Found 15,851 first weighted isolates (79.3% of total) +#> => Found 15,854 first weighted isolates (79.3% of total) @@ -654,34 +654,34 @@ - - + + - - + + - - + + - - + + - + - - + + - + @@ -690,44 +690,32 @@ - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + @@ -736,22 +724,34 @@ + + + + + + + + + + + + - - + + + - - - + + - - + + @@ -762,23 +762,23 @@ - - + + - +
isolate
12010-01-14O62010-01-10X9 B_ESCHR_COL RIRSS S TRUE TRUE
22010-02-22O62010-04-18X9 B_ESCHR_COLSSRI S S FALSETRUEFALSE
32010-04-01O62010-07-02X9 B_ESCHR_COLSR S S S
42010-04-25O62010-09-21X9 B_ESCHR_COLR SSSSFALSEFALSE
52010-05-09O6B_ESCHR_COLSSSSFALSEFALSE
62010-05-29O6B_ESCHR_COLSI R S FALSE TRUE
72010-06-27O652010-09-22X9B_ESCHR_COLRSSSFALSETRUE
62010-10-06X9 B_ESCHR_COL S SFALSE TRUE
72010-10-14X9B_ESCHR_COLRSSSFALSETRUE
82010-06-27O62011-01-09X9 B_ESCHR_COL SI SSSFALSER FALSETRUE
92011-02-01O62011-03-31X9 B_ESCHR_COL R S
102011-03-20O62011-03-31X9 B_ESCHR_COL S SSR S FALSE TRUE
-

Instead of 2, now 6 isolates are flagged. In total, 79.3% of all isolates are marked ‘first weighted’ - 50.8% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

+

Instead of 2, now 8 isolates are flagged. In total, 79.3% of all isolates are marked ‘first weighted’ - 50.9% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

As with filter_first_isolate(), there’s a shortcut for this new algorithm too:

data_1st <- data %>% 
   filter_first_weighted_isolate()
-

So we end up with 15,851 isolates for analysis.

+

So we end up with 15,854 isolates for analysis.

We can remove unneeded columns:

data_1st <- data_1st %>% 
   select(-c(first, keyab))
@@ -803,10 +803,58 @@ -3 -2013-12-12 -Z1 -Hospital A +2 +2016-11-18 +I10 +Hospital B +B_ESCHR_COL +R +S +R +S +M +Gram negative +Escherichia +coli +TRUE + + +5 +2017-01-25 +H5 +Hospital C +B_ESCHR_COL +R +S +S +S +M +Gram negative +Escherichia +coli +TRUE + + +6 +2017-03-12 +B9 +Hospital C +B_ESCHR_COL +S +S +S +S +M +Gram negative +Escherichia +coli +TRUE + + +7 +2015-08-12 +Y4 +Hospital B B_STPHY_AUR R S @@ -818,26 +866,10 @@ aureus TRUE - -4 -2014-08-13 -J4 -Hospital A -B_ESCHR_COL -S -S -S -S -M -Gram negative -Escherichia -coli -TRUE - -5 -2012-04-09 -F5 +9 +2016-01-24 +L10 Hospital A B_ESCHR_COL S @@ -851,44 +883,12 @@ TRUE -6 -2010-08-11 -N1 -Hospital A -B_KLBSL_PNE -R -S -S -S -M -Gram negative -Klebsiella -pneumoniae -TRUE - - -7 -2010-02-25 -E9 -Hospital A -B_KLBSL_PNE -R -R -S -S -M -Gram negative -Klebsiella -pneumoniae -TRUE - - -8 -2013-04-09 -N1 -Hospital A +12 +2013-09-11 +H6 +Hospital B B_STPHY_AUR -I +S S R S @@ -915,9 +915,9 @@
freq(paste(data_1st$genus, data_1st$species))

Or it can be used the dplyr way, which is easier to read:

data_1st %>% freq(genus, species)
-

Frequency table of genus and species from a data.frame (15,851 x 13)

+

Frequency table of genus and species from a data.frame (15,854 x 13)

Columns: 2
-Length: 15,851 (of which NA: 0 = 0.00%)
+Length: 15,854 (of which NA: 0 = 0.00%)
Unique: 4

Shortest: 16
Longest: 24

@@ -934,33 +934,33 @@ Longest: 24

1 Escherichia coli -7,853 -49.5% -7,853 -49.5% +7,918 +49.9% +7,918 +49.9% 2 Staphylococcus aureus -3,943 -24.9% -11,796 -74.4% +3,930 +24.8% +11,848 +74.7% 3 Streptococcus pneumoniae -2,432 -15.3% -14,228 -89.8% +2,498 +15.8% +14,346 +90.5% 4 Klebsiella pneumoniae -1,623 -10.2% -15,851 +1,508 +9.5% +15,854 100.0% @@ -971,7 +971,7 @@ Longest: 24

Resistance percentages

The functions portion_R, portion_RI, portion_I, portion_IS and portion_S can be used to determine the portion of a specific antimicrobial outcome. They can be used on their own:

data_1st %>% portion_IR(amox)
-#> [1] 0.4764368
+#> [1] 0.4726883

Or they can be used in conjunction with group_by() and summarise(), both from the dplyr package:

data_1st %>% 
   group_by(hospital) %>% 
@@ -984,19 +984,19 @@ Longest: 24

Hospital A -0.4722165 +0.4737395 Hospital B -0.4788707 +0.4763709 Hospital C -0.4670535 +0.4739257 Hospital D -0.4859994 +0.4636854 @@ -1014,23 +1014,23 @@ Longest: 24

Hospital A -0.4722165 -4841 +0.4737395 +4760 Hospital B -0.4788707 -5490 +0.4763709 +5544 Hospital C -0.4670535 -2413 +0.4739257 +2397 Hospital D -0.4859994 -3107 +0.4636854 +3153 @@ -1050,27 +1050,27 @@ Longest: 24

Escherichia -0.7324589 -0.9016936 -0.9759328 +0.7350341 +0.9051528 +0.9761303 Klebsiella -0.7221195 -0.9081947 -0.9821319 +0.7274536 +0.9177719 +0.9781167 Staphylococcus -0.7420746 -0.9163074 -0.9792037 +0.7432570 +0.9216285 +0.9788804 Streptococcus -0.7203947 +0.7273819 0.0000000 -0.7203947 +0.7273819 diff --git a/docs/articles/AMR_files/figure-html/plot 1-1.png b/docs/articles/AMR_files/figure-html/plot 1-1.png index c531478d2..8765e8733 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 1-1.png and b/docs/articles/AMR_files/figure-html/plot 1-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 3-1.png b/docs/articles/AMR_files/figure-html/plot 3-1.png index fe3810704..139f69f67 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 3-1.png and b/docs/articles/AMR_files/figure-html/plot 3-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 4-1.png b/docs/articles/AMR_files/figure-html/plot 4-1.png index 9919bc8c7..25e894984 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 4-1.png and b/docs/articles/AMR_files/figure-html/plot 4-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 5-1.png b/docs/articles/AMR_files/figure-html/plot 5-1.png index 5154ee144..cb7bf92f7 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 5-1.png and b/docs/articles/AMR_files/figure-html/plot 5-1.png differ diff --git a/docs/articles/EUCAST.html b/docs/articles/EUCAST.html index 1ac9719f4..082df7834 100644 --- a/docs/articles/EUCAST.html +++ b/docs/articles/EUCAST.html @@ -192,7 +192,7 @@

How to apply EUCAST rules

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/articles/G_test.html b/docs/articles/G_test.html index 7298828d3..0742e808c 100644 --- a/docs/articles/G_test.html +++ b/docs/articles/G_test.html @@ -192,7 +192,7 @@

How to use the G-test

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/articles/WHONET.html b/docs/articles/WHONET.html index eb202bad0..e6aea648f 100644 --- a/docs/articles/WHONET.html +++ b/docs/articles/WHONET.html @@ -192,7 +192,7 @@

How to work with WHONET data

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/articles/atc_property.html b/docs/articles/atc_property.html index 61e7491bf..54772c9d7 100644 --- a/docs/articles/atc_property.html +++ b/docs/articles/atc_property.html @@ -192,7 +192,7 @@

How to get properties of an antibiotic

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index 8095f07d8..7557f953c 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -192,7 +192,7 @@

Benchmarks

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

@@ -214,29 +214,18 @@ as.mo("S. aureus"), as.mo("STAAUR"), as.mo("Staphylococcus aureus"), - as.mo("B_STPHY_AUR"), - times = 10) -print(S.aureus, unit = "ms", signif = 2) -#> Unit: milliseconds -#> expr min lq mean median uq max -#> as.mo("sau") 100.00 100.00 110.00 100.00 100.00 160.00 -#> as.mo("stau") 140.00 140.00 170.00 160.00 190.00 200.00 -#> as.mo("staaur") 99.00 100.00 100.00 100.00 100.00 110.00 -#> as.mo("S. aureus") 64.00 64.00 65.00 65.00 66.00 67.00 -#> as.mo("S. aureus") 65.00 65.00 70.00 66.00 66.00 110.00 -#> as.mo("STAAUR") 97.00 98.00 100.00 100.00 100.00 100.00 -#> as.mo("Staphylococcus aureus") 35.00 35.00 36.00 36.00 37.00 38.00 -#> as.mo("B_STPHY_AUR") 0.34 0.47 0.52 0.48 0.56 0.89 -#> neval -#> 10 -#> 10 -#> 10 -#> 10 -#> 10 -#> 10 -#> 10 -#> 10
-

In the table above, all measurements are shown in milliseconds (thousands of seconds). A value of 10 milliseconds means it can determine 100 input values per second. It case of 50 milliseconds, this is only 20 input values per second. The more an input value resembles a full name, the faster the result will be found. In case of as.mo("B_STPHY_AUR"), the input is already a valid MO code, so it only almost takes no time at all (476 millionths of a second).

+ times = 10) +print(S.aureus, unit = "ms", signif = 3) +#> Unit: milliseconds +#> expr min lq mean median uq max neval +#> as.mo("sau") 42.9 43.2 43.9 44.0 44.2 45.1 10 +#> as.mo("stau") 86.8 87.0 88.9 87.3 88.2 101.0 10 +#> as.mo("staaur") 42.6 43.6 51.5 43.8 44.5 82.8 10 +#> as.mo("S. aureus") 23.2 23.3 31.0 23.5 23.6 61.8 10 +#> as.mo("S. aureus") 23.1 23.3 26.4 23.7 24.4 51.2 10 +#> as.mo("STAAUR") 42.8 43.4 44.5 44.3 44.5 47.8 10 +#> as.mo("Staphylococcus aureus") 14.3 14.5 20.4 14.8 16.0 64.6 10 +

In the table above, all measurements are shown in milliseconds (thousandths of a second). A value of 10 milliseconds means it can determine 100 input values per second. In the case of 50 milliseconds, this is only 20 input values per second. The more an input value resembles a full name, the faster the result will be found.

To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Mycoplasma leonicaptivi (B_MYCPL_LEO), a bug probably never found before in humans:

M.leonicaptivi <- microbenchmark(as.mo("myle"),
                                  as.mo("mycleo"),
@@ -244,134 +233,127 @@
                                  as.mo("M.  leonicaptivi"),
                                  as.mo("MYCLEO"),
                                  as.mo("Mycoplasma leonicaptivi"),
-                                 as.mo("B_MYCPL_LEO"),
-                                 times = 10)
-print(M.leonicaptivi, unit = "ms", signif = 2)
-#> Unit: milliseconds
-#>                              expr    min     lq  mean median     uq max
-#>                     as.mo("myle") 210.00 220.00 240.0 230.00 260.00 310
-#>                   as.mo("mycleo") 610.00 630.00 680.0 680.00 720.00 770
-#>          as.mo("M. leonicaptivi") 370.00 370.00 390.0 390.00 410.00 410
-#>         as.mo("M.  leonicaptivi") 350.00 350.00 390.0 390.00 410.00 480
-#>                   as.mo("MYCLEO") 630.00 650.00 680.0 670.00 680.00 880
-#>  as.mo("Mycoplasma leonicaptivi") 250.00 250.00 260.0 250.00 260.00 290
-#>              as.mo("B_MYCPL_LEO")   0.35   0.43   5.6   0.69   0.75  50
-#>  neval
-#>     10
-#>     10
-#>     10
-#>     10
-#>     10
-#>     10
-#>     10
-

That takes 4.7 times as much time on average! A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance:

+ times = 10) +print(M.leonicaptivi, unit = "ms", signif = 3) +#> Unit: milliseconds +#> expr min lq mean median uq max neval +#> as.mo("myle") 141 142 162 142 142 299 10 +#> as.mo("mycleo") 479 481 520 525 530 634 10 +#> as.mo("M. leonicaptivi") 241 242 273 263 281 382 10 +#> as.mo("M. leonicaptivi") 239 241 268 282 283 299 10 +#> as.mo("MYCLEO") 487 520 525 524 528 601 10 +#> as.mo("Mycoplasma leonicaptivi") 152 156 183 174 200 261 10 +

That takes 7.3 times as much time on average! A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance.

+

In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Mycoplasma leonicaptivi (which is very uncommon):

par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16)
 
-# highest value on y axis
-max_y_axis <- max(S.aureus$time, M.leonicaptivi$time, na.rm = TRUE) / 1e6
-
-boxplot(S.aureus, horizontal = TRUE, las = 1, unit = "ms", log = FALSE, xlab = "", ylim = c(0, max_y_axis),
-        main = expression(paste("Benchmark of ", italic("Staphylococcus aureus"))))
+boxplot(microbenchmark(as.mo("M. leonicaptivi"), + as.mo("Mycoplasma leonicaptivi"), + as.mo("P. brevis"), + as.mo("Prevotella brevis"), + as.mo("E. coli"), + as.mo("Escherichia coli"), + times = 50), + horizontal = TRUE, las = 1, unit = "s", log = FALSE, + xlab = "", ylab = "Time in seconds", + main = "Benchmarks per prevalence")

-
boxplot(M.leonicaptivi, horizontal = TRUE, las = 1, unit = "ms", log = FALSE, xlab = "", ylim = c(0, max_y_axis),
-        main = expression(paste("Benchmark of ", italic("Mycoplasma leonicaptivi"))))
-

-

To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.

+

Uncommon microorganisms take a lot more time than common microorganisms. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.

Repetitive results

Repetitive results mean that unique values are present more than once. Unique values will only be calculated once by as.mo(). We will use mo_fullname() for this test - a helper function that returns the full microbial name (genus, species and possibly subspecies) which uses as.mo() internally.

-
library(dplyr)
-# take 500,000 random MO codes from the septic_patients data set
-x = septic_patients %>%
-  sample_n(500000, replace = TRUE) %>%
-  pull(mo)
-  
-# got the right length?
-length(x)
-#> [1] 500000
-
-# and how many unique values do we have?
-n_distinct(x)
-#> [1] 95
-
-# now let's see:
-run_it <- microbenchmark(mo_fullname(x),
-                         times = 10)
-print(run_it, unit = "ms", signif = 3)
-#> Unit: milliseconds
-#>            expr min  lq mean median  uq max neval
-#>  mo_fullname(x) 487 499  527    535 538 573    10
-

So transforming 500,000 values (!) of 95 unique values only takes 0.54 seconds (535 ms). You only lose time on your unique input values.

+
library(dplyr)
+# take 500,000 random MO codes from the septic_patients data set
+x = septic_patients %>%
+  sample_n(500000, replace = TRUE) %>%
+  pull(mo)
+  
+# got the right length?
+length(x)
+#> [1] 500000
+
+# and how many unique values do we have?
+n_distinct(x)
+#> [1] 95
+
+# now let's see:
+run_it <- microbenchmark(mo_fullname(x),
+                         times = 10)
+print(run_it, unit = "ms", signif = 3)
+#> Unit: milliseconds
+#>            expr min  lq mean median  uq max neval
+#>  mo_fullname(x) 400 405  463    441 533 558    10
+

So transforming 500,000 values (!) of 95 unique values only takes 0.44 seconds (441 ms). You only lose time on your unique input values.

Precalculated results

What about precalculated results? If the input is an already precalculated result of a helper function like mo_fullname(), it almost doesn’t take any time at all (see ‘C’ below):

-
run_it <- microbenchmark(A = mo_fullname("B_STPHY_AUR"),
-                         B = mo_fullname("S. aureus"),
+
+

So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0004 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

+ -

So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0005 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

- + D = mo_family("Staphylococcaceae"), + E = mo_order("Bacillales"), + F = mo_class("Bacilli"), + G = mo_phylum("Firmicutes"), + H = mo_kingdom("Bacteria"), + times = 10) +print(run_it, unit = "ms", signif = 3) +#> Unit: milliseconds +#> expr min lq mean median uq max neval +#> A 0.298 0.327 0.398 0.400 0.452 0.535 10 +#> B 0.251 0.287 0.339 0.344 0.377 0.436 10 +#> C 0.293 0.403 0.451 0.487 0.500 0.537 10 +#> D 0.250 0.262 0.300 0.277 0.336 0.395 10 +#> E 0.249 0.261 0.306 0.313 0.344 0.384 10 +#> F 0.273 0.283 0.325 0.326 0.338 0.420 10 +#> G 0.238 0.293 0.312 0.325 0.342 0.356 10 +#> H 0.250 0.262 0.304 0.316 0.337 0.358 10

Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.

Results in other languages

When the system language is non-English and supported by this AMR package, some functions will have a translated result. This takes almost no extra time:

-
mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
-#> [1] "Coagulase Negative Staphylococcus (CoNS)"
-
-mo_fullname("CoNS", language = "es") # or just mo_fullname("CoNS") on a Spanish system
-#> [1] "Staphylococcus coagulasa negativo (CoNS)"
-
-mo_fullname("CoNS", language = "nl") # or just mo_fullname("CoNS") on a Dutch system
-#> [1] "Coagulase-negatieve Staphylococcus (CNS)"
-
-run_it <- microbenchmark(en = mo_fullname("CoNS", language = "en"),
-                         de = mo_fullname("CoNS", language = "de"),
-                         nl = mo_fullname("CoNS", language = "nl"),
-                         es = mo_fullname("CoNS", language = "es"),
-                         it = mo_fullname("CoNS", language = "it"),
-                         fr = mo_fullname("CoNS", language = "fr"),
-                         pt = mo_fullname("CoNS", language = "pt"),
-                         times = 10)
-print(run_it, unit = "ms", signif = 4)
-#> Unit: milliseconds
-#>  expr   min    lq  mean median    uq   max neval
-#>    en 24.41 25.27 26.34  25.41 26.92 30.60    10
-#>    de 35.53 35.76 36.76  35.98 37.20 41.19    10
-#>    nl 34.51 35.55 39.93  35.60 40.15 69.76    10
-#>    es 34.36 35.98 44.29  37.46 39.98 73.16    10
-#>    it 35.78 36.22 37.44  36.75 38.70 40.78    10
-#>    fr 35.45 35.71 36.09  35.79 36.15 37.93    10
-#>    pt 35.10 35.44 44.61  35.76 39.68 77.27    10
+
mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
+#> [1] "Coagulase Negative Staphylococcus (CoNS)"
+
+mo_fullname("CoNS", language = "es") # or just mo_fullname("CoNS") on a Spanish system
+#> [1] "Staphylococcus coagulasa negativo (CoNS)"
+
+mo_fullname("CoNS", language = "nl") # or just mo_fullname("CoNS") on a Dutch system
+#> [1] "Coagulase-negatieve Staphylococcus (CNS)"
+
+run_it <- microbenchmark(en = mo_fullname("CoNS", language = "en"),
+                         de = mo_fullname("CoNS", language = "de"),
+                         nl = mo_fullname("CoNS", language = "nl"),
+                         es = mo_fullname("CoNS", language = "es"),
+                         it = mo_fullname("CoNS", language = "it"),
+                         fr = mo_fullname("CoNS", language = "fr"),
+                         pt = mo_fullname("CoNS", language = "pt"),
+                         times = 10)
+print(run_it, unit = "ms", signif = 4)
+#> Unit: milliseconds
+#>  expr   min    lq  mean median    uq   max neval
+#>    en 10.78 11.11 11.15  11.14 11.30 11.41    10
+#>    de 19.60 19.65 23.24  19.76 20.61 52.47    10
+#>    nl 19.14 19.71 19.75  19.72 19.87 20.22    10
+#>    es 19.64 19.73 28.36  20.60 25.91 64.67    10
+#>    it 19.33 19.49 23.13  19.68 19.97 52.72    10
+#>    fr 19.43 19.54 20.08  19.72 20.60 21.46    10
+#>    pt 19.34 19.66 23.15  19.80 20.48 52.40    10

Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png index b49419111..49660c8ee 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/freq.html b/docs/articles/freq.html index 6ed329913..0e5b2b519 100644 --- a/docs/articles/freq.html +++ b/docs/articles/freq.html @@ -192,7 +192,7 @@

How to create frequency tables

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/articles/mo_property.html b/docs/articles/mo_property.html index 060e5ade1..3e17c75c9 100644 --- a/docs/articles/mo_property.html +++ b/docs/articles/mo_property.html @@ -192,7 +192,7 @@

How to get properties of a microorganism

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/articles/resistance_predict.html b/docs/articles/resistance_predict.html index c551ac81d..a67a62aeb 100644 --- a/docs/articles/resistance_predict.html +++ b/docs/articles/resistance_predict.html @@ -192,7 +192,7 @@

How to predict antimicrobial resistance

Matthijs S. Berends

-

21 February 2019

+

22 February 2019

diff --git a/docs/index.html b/docs/index.html index a59d6d19d..e4c42d8a5 100644 --- a/docs/index.html +++ b/docs/index.html @@ -250,7 +250,7 @@

This package is available on the official R network (CRAN), which has a peer-reviewed submission process. Install this package in R with:

install.packages("AMR")

It will be downloaded and installed automatically. For RStudio, click on the menu Tools > Install Packages… and then type in “AMR” and press Install.

-

Note: Not all functions on this website may be available in this latest release. To use all functions and data sets mentioned on this website, install the latest development version.

+

Note: Not all functions on this website may be available in this latest release. To use all functions and data sets mentioned on this website, install the latest development version.

@@ -277,8 +277,8 @@ diff --git a/docs/news/index.html b/docs/news/index.html index cc6240ae1..9c4989bdc 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -256,13 +256,14 @@

All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales.

The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).
  • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed
  • -
  • The responsible author(s) and year of scientific publication

  • - -This data is updated annually - check the included version with catalogue_of_life_version(). - +
  • New function mo_rank() for the taxonomic rank (genus, species, infraspecies, etc.)
  • +
  • New function mo_url() to get the URL to the Catalogue of Life
  • Support for data from WHONET and EARS-Net (European Antimicrobial Resistance Surveillance Network):
    • Exported files from WHONET can be read and used in this package. For functions like first_isolate() and eucast_rules(), all parameters will be filled in automatically.
    • diff --git a/docs/reference/as.mo.html b/docs/reference/as.mo.html index 25d08178a..eb6a5aab4 100644 --- a/docs/reference/as.mo.html +++ b/docs/reference/as.mo.html @@ -344,7 +344,7 @@ When using allow_uncertain = TRUE (which is the default setting), i This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

      Included are:

      • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

      • -
      • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

      • +
      • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

      • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

      • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

      • The responsible author(s) and year of scientific publication

      • diff --git a/docs/reference/catalogue_of_life.html b/docs/reference/catalogue_of_life.html index f29870a8a..493bc4de0 100644 --- a/docs/reference/catalogue_of_life.html +++ b/docs/reference/catalogue_of_life.html @@ -249,7 +249,7 @@ This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

        Included are:

        • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

        • -
        • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

        • +
        • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

        • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

        • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

        • The responsible author(s) and year of scientific publication

        • diff --git a/docs/reference/catalogue_of_life_version.html b/docs/reference/catalogue_of_life_version.html index b95895767..810eaf0bf 100644 --- a/docs/reference/catalogue_of_life_version.html +++ b/docs/reference/catalogue_of_life_version.html @@ -250,7 +250,7 @@ This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

          Included are:

          • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

          • -
          • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

          • +
          • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

          • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

          • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

          • The responsible author(s) and year of scientific publication

          • diff --git a/docs/reference/index.html b/docs/reference/index.html index 21dffb674..a4673d17a 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -355,7 +355,7 @@ -

            mo_fullname() mo_shortname() mo_subspecies() mo_species() mo_genus() mo_family() mo_order() mo_class() mo_phylum() mo_kingdom() mo_type() mo_gramstain() mo_ref() mo_authors() mo_year() mo_taxonomy() mo_url() mo_property()

            +

            mo_fullname() mo_shortname() mo_subspecies() mo_species() mo_genus() mo_family() mo_order() mo_class() mo_phylum() mo_kingdom() mo_type() mo_gramstain() mo_ref() mo_authors() mo_year() mo_rank() mo_taxonomy() mo_url() mo_property()

            Property of a microorganism

            diff --git a/docs/reference/microorganisms.codes.html b/docs/reference/microorganisms.codes.html index 8f1d5e9e6..70b51b939 100644 --- a/docs/reference/microorganisms.codes.html +++ b/docs/reference/microorganisms.codes.html @@ -257,7 +257,7 @@ This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

            Included are:

            • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

            • -
            • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

            • +
            • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

            • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

            • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

            • The responsible author(s) and year of scientific publication

            • diff --git a/docs/reference/microorganisms.html b/docs/reference/microorganisms.html index fdfd2c827..546afd18a 100644 --- a/docs/reference/microorganisms.html +++ b/docs/reference/microorganisms.html @@ -245,7 +245,7 @@

              Format

              -

              A data.frame with 56,672 observations and 14 variables:

              +

              A data.frame with 57,158 observations and 14 variables:

              mo

              ID of microorganism as used by this package

              col_id

              Catalogue of Life ID

              fullname

              Full name, like "Escherichia coli"

              @@ -281,7 +281,7 @@ This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

              Included are:

              • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

              • -
              • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

              • +
              • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

              • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

              • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

              • The responsible author(s) and year of scientific publication

              • diff --git a/docs/reference/microorganisms.old.html b/docs/reference/microorganisms.old.html index 65fd7cf61..f28ed60a2 100644 --- a/docs/reference/microorganisms.old.html +++ b/docs/reference/microorganisms.old.html @@ -245,7 +245,7 @@

                Format

                -

                A data.frame with 14,506 observations and 4 variables:

                +

                A data.frame with 14,487 observations and 4 variables:

                col_id

                Catalogue of Life ID

                tsn_new

                New Catalogue of Life ID

                fullname

                Old taxonomic name of the microorganism

                @@ -263,7 +263,7 @@ This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

                Included are:

                • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

                • -
                • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

                • +
                • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

                • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

                • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

                • The responsible author(s) and year of scientific publication

                • diff --git a/docs/reference/mo_property.html b/docs/reference/mo_property.html index 6bf3ae404..b6f525ca2 100644 --- a/docs/reference/mo_property.html +++ b/docs/reference/mo_property.html @@ -271,9 +271,11 @@ mo_year(x, ...) +mo_rank(x, ...) + mo_taxonomy(x, ...) -mo_url(x, ...) +mo_url(x, open = FALSE, ...) mo_property(x, property = "fullname", language = get_locale(), ...) @@ -292,6 +294,10 @@ ...

                  other parameters passed on to as.mo

                  + + open +

                  browse the URL using browseURL

                  + property

                  one of the column names of one of the microorganisms data set or "shortname"

                  @@ -304,6 +310,7 @@
                  • An integer in case of mo_year

                  • A list in case of mo_taxonomy

                  • +
                  • A named character in case of mo_url

                  • A character in all other cases

                  @@ -330,7 +337,7 @@ This package contains the complete taxonomic tree of almost all microorganisms from the authoritative and comprehensive Catalogue of Life (http://www.catalogueoflife.org). This data is updated annually - check the included version with catalogue_of_life_version.

                  Included are:

                  • All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses

                  • -
                  • All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of Aspergillus, Candida, Pneumocystis, Saccharomyces and Trichophyton).

                  • +
                  • All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of Aspergillus, Candida, Cryptococcus, Histoplasma, Pneumocystis, Saccharomyces and Trichophyton).

                  • All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed

                  • The complete taxonomic tree of all included (sub)species: from kingdom to subspecies

                  • The responsible author(s) and year of scientific publication

                  • @@ -357,8 +364,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f

                    Examples

                    # NOT RUN {
                    -# All properties of Escherichia coli
                    -## taxonomic properties
                    +## taxonomic tree
                     mo_kingdom("E. coli")         # "Bacteria"
                     mo_phylum("E. coli")          # "Proteobacteria"
                     mo_class("E. coli")           # "Gammaproteobacteria"
                    @@ -375,10 +381,12 @@ This package contains the complete taxonomic tree of almost all microorganisms f
                     ## other properties
                     mo_gramstain("E. coli")       # "Gram negative"
                     mo_type("E. coli")            # "Bacteria" (equal to kingdom)
                    +mo_rank("E. coli")            # "species"
                    +mo_url("E. coli")             # get the direct url to the Catalogue of Life
                     
                     ## scientific reference
                    -mo_ref("E. coli")             # "Castellani and Chalmers, 1919"
                    -mo_authors("E. coli")         # "Castellani and Chalmers"
                    +mo_ref("E. coli")             # "Castellani et al., 1919"
                    +mo_authors("E. coli")         # "Castellani et al."
                     mo_year("E. coli")            # 1919
                     
                     
                    @@ -414,7 +422,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f
                     mo_shortname("S. pyo", Lancefield = TRUE) # "GAS" ('Group A streptococci')
                     
                     
                    -# Language support for German, Dutch, Spanish, Portuguese, Italian and French
                    +# language support for German, Dutch, Spanish, Portuguese, Italian and French
                     mo_gramstain("E. coli", language = "de")  # "Gramnegativ"
                     mo_gramstain("E. coli", language = "nl")  # "Gram-negatief"
                     mo_gramstain("E. coli", language = "es")  # "Gram negativo"
                    @@ -432,7 +440,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f
                                 language = "nl")              # "Streptococcus groep A"
                     
                     
                    -# Get a list with the complete taxonomy (kingdom to subspecies)
                    +# get a list with the complete taxonomy (kingdom to subspecies)
                     mo_taxonomy("E. coli")
                     # }
  • diff --git a/index.md b/index.md index 88ba4901a..7ea44bbc9 100644 --- a/index.md +++ b/index.md @@ -66,7 +66,7 @@ install.packages("AMR") It will be downloaded and installed automatically. For RStudio, click on the menu *Tools* > *Install Packages...* and then type in "AMR" and press Install. -*Note:* Not all functions on this website may be available in this latest release. To use all functions and data sets mentioned on this website, install the latest development version. +**Note:** Not all functions on this website may be available in this latest release. To use all functions and data sets mentioned on this website, install the latest development version. #### Latest development version @@ -92,9 +92,9 @@ Included are: * All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses -* All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. +* All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. - The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of *Aspergillus*, *Candida*, *Pneumocystis*, *Saccharomyces* and *Trichophyton*). + The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. 
By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of *Aspergillus*, *Candida*, *Cryptococcus*, *Histoplasma*, *Pneumocystis*, *Saccharomyces* and *Trichophyton*). * All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed diff --git a/man/as.mo.Rd b/man/as.mo.Rd index 99e3005e2..0f7bce558 100644 --- a/man/as.mo.Rd +++ b/man/as.mo.Rd @@ -118,7 +118,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. 
By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} diff --git a/man/catalogue_of_life.Rd b/man/catalogue_of_life.Rd index 91b8fb964..a581c1572 100644 --- a/man/catalogue_of_life.Rd +++ b/man/catalogue_of_life.Rd @@ -14,7 +14,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). 
Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} diff --git a/man/catalogue_of_life_version.Rd b/man/catalogue_of_life_version.Rd index 992798ed1..3e9627839 100644 --- a/man/catalogue_of_life_version.Rd +++ b/man/catalogue_of_life_version.Rd @@ -17,7 +17,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. 
The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} diff --git a/man/microorganisms.Rd b/man/microorganisms.Rd index 223f27951..86b59567c 100755 --- a/man/microorganisms.Rd +++ b/man/microorganisms.Rd @@ -4,7 +4,7 @@ \name{microorganisms} \alias{microorganisms} \title{Data set with ~60,000 microorganisms} -\format{A \code{\link{data.frame}} with 56,672 observations and 14 variables: +\format{A \code{\link{data.frame}} with 57,158 observations and 14 variables: \describe{ \item{\code{mo}}{ID of microorganism as used by this package} \item{\code{col_id}}{Catalogue of Life ID} @@ -46,7 +46,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). 
Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} diff --git a/man/microorganisms.codes.Rd b/man/microorganisms.codes.Rd index ff79772b4..0d8f97cfb 100644 --- a/man/microorganisms.codes.Rd +++ b/man/microorganisms.codes.Rd @@ -23,7 +23,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. 
The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} diff --git a/man/microorganisms.old.Rd b/man/microorganisms.old.Rd index 9901406dd..5336ea641 100644 --- a/man/microorganisms.old.Rd +++ b/man/microorganisms.old.Rd @@ -4,7 +4,7 @@ \name{microorganisms.old} \alias{microorganisms.old} \title{Data set with previously accepted taxonomic names} -\format{A \code{\link{data.frame}} with 14,506 observations and 4 variables: +\format{A \code{\link{data.frame}} with 14,487 observations and 4 variables: \describe{ 
\item{\code{col_id}}{Catalogue of Life ID} \item{\code{tsn_new}}{New Catalogue of Life ID} @@ -28,7 +28,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. 
By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} diff --git a/man/mo_property.Rd b/man/mo_property.Rd index db0e42de3..216424d81 100644 --- a/man/mo_property.Rd +++ b/man/mo_property.Rd @@ -17,6 +17,7 @@ \alias{mo_ref} \alias{mo_authors} \alias{mo_year} +\alias{mo_rank} \alias{mo_taxonomy} \alias{mo_url} \title{Property of a microorganism} @@ -51,9 +52,11 @@ mo_authors(x, ...) mo_year(x, ...) +mo_rank(x, ...) + mo_taxonomy(x, ...) -mo_url(x, ...) +mo_url(x, open = FALSE, ...) mo_property(x, property = "fullname", language = get_locale(), ...) } @@ -64,12 +67,15 @@ mo_property(x, property = "fullname", language = get_locale(), ...) \item{...}{other parameters passed on to \code{\link{as.mo}}} +\item{open}{browse the URL using \code{\link[utils]{browseURL}}} + \item{property}{one of the column names of one of the \code{\link{microorganisms}} data set or \code{"shortname"}} } \value{ \itemize{ \item{An \code{integer} in case of \code{mo_year}} \item{A \code{list} in case of \code{mo_taxonomy}} + \item{A named \code{character} in case of \code{mo_url}} \item{A \code{character} in all other cases} } } @@ -101,7 +107,7 @@ This package contains the complete taxonomic tree of almost all microorganisms f Included are: \itemize{ \item{All ~55,000 (sub)species from the kingdoms of Archaea, Bacteria, Protozoa and Viruses} - \item{All ~3,000 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales and Schizosaccharomycetales. 
The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant (sub)species are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} + \item{All ~3,500 (sub)species from these orders of the kingdom of Fungi: Eurotiales, Onygenales, Pneumocystales, Saccharomycetales, Schizosaccharomycetales and Tremellales. The kingdom of Fungi is a very large taxon with almost 300,000 different (sub)species, of which most are not microbial (but rather macroscopic, like mushrooms). Because of this, not all fungi fit the scope of this package and including everything would tremendously slow down our algorithms too. By only including the aforementioned taxonomic orders, the most relevant fungi are covered (like all species of \emph{Aspergillus}, \emph{Candida}, \emph{Cryptococcus}, \emph{Histoplasma}, \emph{Pneumocystis}, \emph{Saccharomyces} and \emph{Trichophyton}).} \item{All ~15,000 previously accepted names of included (sub)species that have been taxonomically renamed} \item{The complete taxonomic tree of all included (sub)species: from kingdom to subspecies} \item{The responsible author(s) and year of scientific publication} @@ -127,8 +133,7 @@ On our website \url{https://msberends.gitlab.io/AMR} you can find \href{https:// } \examples{ -# All properties of Escherichia coli -## taxonomic properties +## taxonomic tree mo_kingdom("E. coli") # "Bacteria" mo_phylum("E. coli") # "Proteobacteria" mo_class("E. coli") # "Gammaproteobacteria" @@ -145,10 +150,12 @@ mo_shortname("E. coli") # "E. coli" ## other properties mo_gramstain("E. coli") # "Gram negative" mo_type("E. 
coli") # "Bacteria" (equal to kingdom) +mo_rank("E. coli") # "species" +mo_url("E. coli") # get the direct url to the Catalogue of Life ## scientific reference -mo_ref("E. coli") # "Castellani and Chalmers, 1919" -mo_authors("E. coli") # "Castellani and Chalmers" +mo_ref("E. coli") # "Castellani et al., 1919" +mo_authors("E. coli") # "Castellani et al." mo_year("E. coli") # 1919 @@ -184,7 +191,7 @@ mo_shortname("S. pyo") # "S. pyogenes" mo_shortname("S. pyo", Lancefield = TRUE) # "GAS" ('Group A streptococci') -# Language support for German, Dutch, Spanish, Portuguese, Italian and French +# language support for German, Dutch, Spanish, Portuguese, Italian and French mo_gramstain("E. coli", language = "de") # "Gramnegativ" mo_gramstain("E. coli", language = "nl") # "Gram-negatief" mo_gramstain("E. coli", language = "es") # "Gram negativo" @@ -202,7 +209,7 @@ mo_fullname("S. pyogenes", language = "nl") # "Streptococcus groep A" -# Get a list with the complete taxonomy (kingdom to subspecies) +# get a list with the complete taxonomy (kingdom to subspecies) mo_taxonomy("E. 
coli") } \seealso{ diff --git a/reproduction_of_microorganisms.R b/reproduction_of_microorganisms.R index e95a1a3d0..3722ca099 100644 --- a/reproduction_of_microorganisms.R +++ b/reproduction_of_microorganisms.R @@ -2,8 +2,8 @@ # Data retrieved from Encyclopaedia of Life: # https://opendata.eol.org/dataset/catalogue-of-life/ -# unzip and extract taxon.tab, then: -taxon <- data.table::fread("taxon.tab") +# unzip and extract taxon.tab (around 1.5 GB), then: +taxon <- data.table::fread("Downloads/taxon.tab") # result is over 3.7M rows: library(dplyr) library(AMR) @@ -29,25 +29,45 @@ MOs <- taxon %>% !taxonRank %in% c("kingdom", "phylum", "superfamily", "class", "order", "family"), # not all fungi: Aspergillus, Candida, Trichphyton and Pneumocystis are the most important, # so only keep these orders from the fungi: - !(kingdom == "Fungi" & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Onygenales", "Pneumocystales"))) %>% + !(kingdom == "Fungi" & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales"))) %>% # remove text if it contains 'Not assigned' like phylum in viruses mutate_all(funs(gsub("Not assigned", "", .))) %>% - # only latest ref, not original authors - mutate(scientificNameAuthorship = trimws(gsub(".*[)] ", "", scientificNameAuthorship)), - scientificNameAuthorship = ifelse(grepl(" emend[. ]", scientificNameAuthorship, ignore.case = TRUE), - gsub("(.*)emend[. ]+(.*)", "\\2", scientificNameAuthorship, ignore.case = TRUE), - scientificNameAuthorship), - scientificNameAuthorship = gsub(".", "", scientificNameAuthorship, fixed = TRUE), - scientificNameAuthorship = gsub(",? 
et al", " et al.", scientificNameAuthorship, fixed = FALSE, ignore.case = TRUE), - scientificNameAuthorship = gsub("[()]", "", scientificNameAuthorship), - # year always preceded by comma - scientificNameAuthorship = gsub(" ([0-9]{4})$", ", \\1", scientificNameAuthorship), - scientificNameAuthorship = gsub(",,", ",", scientificNameAuthorship, fixed = TRUE), - # only first author with *et al.* - scientificNameAuthorship = gsub(",.*,", " et al.,", scientificNameAuthorship), - scientificNameAuthorship = gsub(" (and|&) .*,", " et al.,", scientificNameAuthorship), - scientificNameAuthorship = gsub(", [^0-9]+", ", ", scientificNameAuthorship), - scientificNameAuthorship = gsub(", $", "", scientificNameAuthorship) + # Transform 'Smith, Jones, 2011' to 'Smith et al., 2011': + mutate(authors2 = iconv(scientificNameAuthorship, from = "UTF-8", to = "ASCII//TRANSLIT"), + # remove leading and trailing brackets + authors2 = gsub("^[(](.*)[)]$", "\\1", authors2), + # only take part after brackets if there's a name + authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), + gsub(".*[)] (.*)", "\\1", authors2), + authors2), + # get year from last 4 digits + lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)), + # can never be later than now + lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), + NA, + lastyear), + # get authors without last year + authors = gsub("(.*)[0-9]{4}$", "\\1", authors2), + # remove nonsense characters from names + authors = gsub("[^a-zA-Z,'& -]", "", authors), + # remove trailing and leading spaces + authors = trimws(authors), + # only keep first author and replace all others by 'et al' + authors = gsub("(,| and| &| ex| emend\\.?) .*", " et al.", authors), + # et al. always with ending dot + authors = gsub(" et al\\.?", " et al.", authors), + authors = gsub(" ?,$", "", authors), + # don't start with 'sensu' or 'ehrenb' + authors = gsub("^(sensu|Ehrenb.?) 
", "", authors, ignore.case = TRUE), + # no initials, only surname + authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE), + # combine author and year if year is available + ref = ifelse(!is.na(lastyear), + paste0(authors, ", ", lastyear), + authors), + # fix beginning and ending + ref = gsub(", $", "", ref), + ref = gsub("^, ", "", ref) ) # remove non-ASCII characters (not allowed by CRAN) @@ -58,7 +78,7 @@ MOs <- MOs %>% # split old taxonomic names - they refer to a new `taxonID` with `acceptedNameUsageID` MOs.old <- MOs %>% filter(!is.na(acceptedNameUsageID), - scientificNameAuthorship != "") %>% + ref != "") %>% transmute(col_id = taxonID, col_id_new = acceptedNameUsageID, fullname = @@ -66,9 +86,9 @@ MOs.old <- MOs %>% gsub("(.*)[(].*", "\\1", stringr::str_replace( string = scientificName, - pattern = stringr::fixed(scientificNameAuthorship), + pattern = stringr::fixed(ref), replacement = ""))), - ref = scientificNameAuthorship) %>% + ref = ref) %>% filter(!is.na(fullname)) %>% distinct(fullname, .keep_all = TRUE) %>% arrange(col_id) @@ -88,7 +108,7 @@ MOs <- MOs %>% species = specificEpithet, subspecies = infraspecificEpithet, rank = taxonRank, - ref = scientificNameAuthorship, + ref = ref, species_id = gsub(".*/([a-f0-9]+)", "\\1", furtherInformationURL)) %>% distinct(fullname, .keep_all = TRUE) %>% filter(!grepl("unassigned", fullname, ignore.case = TRUE)) @@ -254,6 +274,7 @@ class(MOs$mo) <- "mo" saveRDS(MOs, "microorganisms.rds") saveRDS(MOs.old, "microorganisms.old.rds") + # on the server: # usethis::use_data(microorganisms, overwrite = TRUE) # usethis::use_data(microorganisms.old, overwrite = TRUE) diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd index 8dc398f86..7fea7540b 100755 --- a/vignettes/benchmarks.Rmd +++ b/vignettes/benchmarks.Rmd @@ -48,12 +48,11 @@ S.aureus <- microbenchmark(as.mo("sau"), as.mo("S. 
aureus"), as.mo("STAAUR"), as.mo("Staphylococcus aureus"), - as.mo("B_STPHY_AUR"), times = 10) -print(S.aureus, unit = "ms", signif = 2) +print(S.aureus, unit = "ms", signif = 3) ``` -In the table above, all measurements are shown in milliseconds (thousands of seconds). A value of 10 milliseconds means it can determine 100 input values per second. It case of 50 milliseconds, this is only 20 input values per second. The more an input value resembles a full name, the faster the result will be found. In case of `as.mo("B_STPHY_AUR")`, the input is already a valid MO code, so it only almost takes no time at all (`r as.integer(S.aureus %>% filter(expr == 'as.mo("B_STPHY_AUR")') %>% pull(time) %>% median(na.rm = TRUE) / 1000)` millionths of a second). +In the table above, all measurements are shown in milliseconds (thousandths of a second). A value of 10 milliseconds means it can determine 100 input values per second. In case of 50 milliseconds, this is only 20 input values per second. The more an input value resembles a full name, the faster the result will be found. To achieve this speed, the `as.mo` function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of *Mycoplasma leonicaptivi* (`B_MYCPL_LEO`), a bug probably never found before in humans: @@ -64,26 +63,30 @@ M.leonicaptivi <- microbenchmark(as.mo("myle"), as.mo("M. leonicaptivi"), as.mo("MYCLEO"), as.mo("Mycoplasma leonicaptivi"), - as.mo("B_MYCPL_LEO"), times = 10) -print(M.leonicaptivi, unit = "ms", signif = 2) +print(M.leonicaptivi, unit = "ms", signif = 3) ``` -That takes `r round(mean(M.leonicaptivi$time, na.rm = TRUE) / mean(S.aureus$time, na.rm = TRUE), 1)` times as much time on average! A value of 100 milliseconds means it can only determine ~10 different input values per second. 
We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance: +That takes `r round(mean(M.leonicaptivi$time, na.rm = TRUE) / mean(S.aureus$time, na.rm = TRUE), 1)` times as much time on average! A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. + +In the figure below, we compare *Escherichia coli* (which is very common) with *Prevotella brevis* (which is moderately common) and with *Mycoplasma leonicaptivi* (which is very uncommon): ```{r} par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16) -# highest value on y axis -max_y_axis <- max(S.aureus$time, M.leonicaptivi$time, na.rm = TRUE) / 1e6 - -boxplot(S.aureus, horizontal = TRUE, las = 1, unit = "ms", log = FALSE, xlab = "", ylim = c(0, max_y_axis), - main = expression(paste("Benchmark of ", italic("Staphylococcus aureus")))) -boxplot(M.leonicaptivi, horizontal = TRUE, las = 1, unit = "ms", log = FALSE, xlab = "", ylim = c(0, max_y_axis), - main = expression(paste("Benchmark of ", italic("Mycoplasma leonicaptivi")))) +boxplot(microbenchmark(as.mo("M. leonicaptivi"), + as.mo("Mycoplasma leonicaptivi"), + as.mo("P. brevis"), + as.mo("Prevotella brevis"), + as.mo("E. coli"), + as.mo("Escherichia coli"), + times = 50), + horizontal = TRUE, las = 1, unit = "s", log = FALSE, + xlab = "", ylab = "Time in seconds", + main = "Benchmarks per prevalence") ``` -To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetitive results** and **already precalculated results**. +Uncommon microorganisms take a lot more time than common microorganisms. 
To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetitive results** and **already precalculated results**. ### Repetitive results