diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7695a26a..7ffd1306 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -51,7 +51,7 @@ before_script: R-release: stage: build - allow_failure: true + allow_failure: false script: - Rscript -e 'sessionInfo()' # install missing and outdated packages diff --git a/DESCRIPTION b/DESCRIPTION index a5c20726..fe9d7878 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR Version: 0.6.1.9002 -Date: 2019-04-06 +Date: 2019-04-09 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NAMESPACE b/NAMESPACE index 4ec62f58..8efd6b60 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,7 +71,6 @@ export(atc_umcg) export(availability) export(brmo) export(catalogue_of_life_version) -export(clean_mo_history) export(count_I) export(count_IR) export(count_R) @@ -293,4 +292,3 @@ importFrom(stats,sd) importFrom(utils,browseURL) importFrom(utils,browseVignettes) importFrom(utils,installed.packages) -importFrom(utils,menu) diff --git a/NEWS.md b/NEWS.md index 8eea15c4..5126e70e 100755 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ * Added ~5,000 more old taxonomic names to the `microorganisms.old` data set, which leads to better results finding when using the `as.mo()` function * Frequency tables of microbial IDs speed improvement * Removed all hardcoded EUCAST rules and replaced them with a new reference file: `./inst/eucast/eucast.tsv`. +* Added ceftazidim intrinsic resistance to *Streptococci* +* Changed default settings for `age_groups()`, to let groups of fives and tens end with 100+ instead of 120+. #### Other * Prevented [staged install](https://developer.r-project.org/Blog/public/2019/02/14/staged-install/index.html) in R 3.6.0 and later by adding `StagedInstall: false` to the DESCRIPTION file diff --git a/R/age.R b/R/age.R index 0eca8995..51f65865 100755 --- a/R/age.R +++ b/R/age.R @@ -72,9 +72,9 @@ age <- function(x, reference = Sys.Date()) { #' \item{A character:} #' \itemize{ #' \item{\code{"children"}, equivalent of: \code{c(0, 1, 2, 4, 6, 13, 18)}. This will split on 0, 1, 2-3, 4-5, 6-12, 13-17 and 18+.} -#' \item{\code{"elderly"} or \code{"seniors"}, equivalent of: \code{c(65, 75, 85, 95)}. This will split on 0-64, 65-74, 75-84, 85-94 and 95+.} -#' \item{\code{"fives"}, equivalent of: \code{1:24 * 5}. This will split on 0-4, 5-9, 10-14, 15-19 and so forth, until 120.} -#' \item{\code{"tens"}, equivalent of: \code{1:12 * 10}. This will split on 0-9, 10-19, 20-29 and so forth, until 120.} +#' \item{\code{"elderly"} or \code{"seniors"}, equivalent of: \code{c(65, 75, 85)}. This will split on 0-64, 65-74, 75-84, 85+.} +#' \item{\code{"fives"}, equivalent of: \code{1:20 * 5}. This will split on 0-4, 5-9, 10-14, ..., 90-94, 95-99, 100+.} +#' \item{\code{"tens"}, equivalent of: \code{1:10 * 10}. This will split on 0-9, 10-19, 20-29, ... 80-89, 90-99, 100+.} #' } #' } #' @keywords age_group age @@ -92,11 +92,11 @@ age <- function(x, reference = Sys.Date()) { #' age_groups(ages, c(20, 50)) #' #' # split into groups of ten years -#' age_groups(ages, 1:12 * 10) +#' age_groups(ages, 1:10 * 10) #' age_groups(ages, split_at = "tens") #' #' # split into groups of five years -#' age_groups(ages, 1:24 * 5) +#' age_groups(ages, 1:20 * 5) #' age_groups(ages, split_at = "fives") #' #' # split specifically for children @@ -117,14 +117,14 @@ age <- function(x, reference = Sys.Date()) { age_groups <- function(x, split_at = c(12, 25, 55, 75)) { if (is.character(split_at)) { split_at <- split_at[1L] - if (split_at %like% "^child") { + if (split_at %like% "^(child|kid)") { split_at <- c(0, 1, 2, 4, 6, 13, 18) } else if (split_at %like% "^(elder|senior)") { - split_at <- c(65, 75, 85, 95) + split_at <- c(65, 75, 85) } else if (split_at %like% "^five") { - split_at <- 1:24 * 5 + split_at <- 1:20 * 5 } else if (split_at %like% "^ten") { - split_at <- 1:12 * 10 + split_at <- 1:10 * 10 } } split_at <- as.integer(split_at) diff --git a/R/mo_history.R b/R/mo_history.R index 0274e5c5..c0c50de7 100644 --- a/R/mo_history.R +++ b/R/mo_history.R @@ -25,50 +25,50 @@ set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) { # disable function return(base::invisible()) - if (base::interactive() | force == TRUE) { - mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force) - df <- data.frame(x, mo, stringsAsFactors = FALSE) %>% - distinct(x, .keep_all = TRUE) %>% - filter(!is.na(x) & !is.na(mo)) - if (nrow(df) == 0) { - return(base::invisible()) - } - x <- toupper(df$x) - mo <- df$mo - for (i in 1:length(x)) { - # save package version too, as both the as.mo() algorithm and the reference data set may change - if (NROW(mo_hist[base::which(mo_hist$x == x[i] & - mo_hist$uncertainty_level >= uncertainty_level & - mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) { - tryCatch( - assign(x = "mo_history", - value = rbind(mo_hist, - data.frame( - x = x[i], - mo = mo[i], - uncertainty_level = uncertainty_level, - package_v = base::as.character(utils::packageVersion("AMR")), - stringsAsFactors = FALSE)), - envir = asNamespace("AMR")), - error = function(e) invisible()) - } - } - } - return(base::invisible()) + # if (base::interactive() | force == TRUE) { + # mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force) + # df <- data.frame(x, mo, stringsAsFactors = FALSE) %>% + # distinct(x, .keep_all = TRUE) %>% + # filter(!is.na(x) & !is.na(mo)) + # if (nrow(df) == 0) { + # return(base::invisible()) + # } + # x <- toupper(df$x) + # mo <- df$mo + # for (i in 1:length(x)) { + # # save package version too, as both the as.mo() algorithm and the reference data set may change + # if (NROW(mo_hist[base::which(mo_hist$x == x[i] & + # mo_hist$uncertainty_level >= uncertainty_level & + # mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) { + # tryCatch( + # assign(x = "mo_history", + # value = rbind(mo_hist, + # data.frame( + # x = x[i], + # mo = mo[i], + # uncertainty_level = uncertainty_level, + # package_v = base::as.character(utils::packageVersion("AMR")), + # stringsAsFactors = FALSE)), + # envir = asNamespace("AMR")), + # error = function(e) invisible()) + # } + # } + # } + # return(base::invisible()) } get_mo_history <- function(x, uncertainty_level, force = FALSE) { # disable function return(NA) - history <- read_mo_history(uncertainty_level = uncertainty_level, force = force) - if (base::is.null(history)) { - NA - } else { - data.frame(x = toupper(x), stringsAsFactors = FALSE) %>% - left_join(history, by = "x") %>% - pull(mo) - } + # history <- read_mo_history(uncertainty_level = uncertainty_level, force = force) + # if (base::is.null(history)) { + # NA + # } else { + # data.frame(x = toupper(x), stringsAsFactors = FALSE) %>% + # left_join(history, by = "x") %>% + # pull(mo) + # } } #' @importFrom dplyr %>% filter distinct @@ -76,59 +76,59 @@ read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = F # disable function return(NULL) - if ((!base::interactive() & force == FALSE)) { - return(NULL) - } - uncertainty_level_param <- uncertainty_level - - history <- tryCatch(get("mo_history", envir = asNamespace("AMR")), - error = function(e) NULL) - if (is.null(history)) { - return(NULL) - } - # Below: filter on current package version. - # Even current fullnames may be replaced by new taxonomic names, so new versions of - # the Catalogue of Life must not lead to data corruption. - - if (unfiltered == FALSE) { - history <- history %>% - filter(package_v == as.character(utils::packageVersion("AMR")), - # only take unknowns if uncertainty_level_param is higher - ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) | - (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>% - arrange(desc(uncertainty_level)) %>% - distinct(x, mo, .keep_all = TRUE) - } - - if (nrow(history) == 0) { - NULL - } else { - history - } + # if ((!base::interactive() & force == FALSE)) { + # return(NULL) + # } + # uncertainty_level_param <- uncertainty_level + # + # history <- tryCatch(get("mo_history", envir = asNamespace("AMR")), + # error = function(e) NULL) + # if (is.null(history)) { + # return(NULL) + # } + # # Below: filter on current package version. + # # Even current fullnames may be replaced by new taxonomic names, so new versions of + # # the Catalogue of Life must not lead to data corruption. + # + # if (unfiltered == FALSE) { + # history <- history %>% + # filter(package_v == as.character(utils::packageVersion("AMR")), + # # only take unknowns if uncertainty_level_param is higher + # ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) | + # (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>% + # arrange(desc(uncertainty_level)) %>% + # distinct(x, mo, .keep_all = TRUE) + # } + # + # if (nrow(history) == 0) { + # NULL + # } else { + # history + # } } -#' @rdname as.mo -#' @importFrom crayon red -#' @importFrom utils menu -#' @export +# @rdname as.mo +# @importFrom crayon red +# @importFrom utils menu +# @export clean_mo_history <- function(...) { - if (!is.null(read_mo_history())) { - if (interactive() & !isTRUE(list(...)$force)) { - q <- menu(title = paste("This will remove all", - format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","), - "microbial IDs determined previously in this session. Are you sure?"), - choices = c("Yes", "No"), - graphics = FALSE) - if (q != 1) { - return(invisible()) - } - } - tryCatch( - assign(x = "mo_history", - value = NULL, - envir = asNamespace("AMR")), - error = function(e) invisible()) - cat(red("History removed.")) - } + # if (!is.null(read_mo_history())) { + # if (interactive() & !isTRUE(list(...)$force)) { + # q <- menu(title = paste("This will remove all", + # format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","), + # "microbial IDs determined previously in this session. Are you sure?"), + # choices = c("Yes", "No"), + # graphics = FALSE) + # if (q != 1) { + # return(invisible()) + # } + # } + # tryCatch( + # assign(x = "mo_history", + # value = NULL, + # envir = asNamespace("AMR")), + # error = function(e) invisible()) + # cat(red("History removed.")) + # } } diff --git a/R/mo_property.R b/R/mo_property.R index 1cf4e6e3..3ee619b2 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -247,10 +247,13 @@ mo_phylum <- function(x, language = get_locale(), ...) { #' @rdname mo_property #' @export mo_kingdom <- function(x, language = get_locale(), ...) { + if (all(x %in% AMR::microorganisms$kingdom)) { + return(x) + } + x <- as.mo(x, language = "en", ...) kngdm <- mo_validate(x = x, property = "kingdom", ...) if (language != "en") { - unknowns <- as.mo(x, ...) == "UNKOWN" - kngdm[unknowns] <- mo_translate(kngdm[unknowns], language = language) + kngdm[x == "UNKNOWN"] <- mo_translate(kngdm[x == "UNKNOWN"], language = language) } kngdm } @@ -264,7 +267,6 @@ mo_type <- function(x, language = get_locale(), ...) { #' @rdname mo_property #' @export mo_gramstain <- function(x, language = get_locale(), ...) { - x.bak <- x x.mo <- as.mo(x, language = "en", ...) x.phylum <- mo_phylum(x.mo, language = "en") x[x.phylum %in% c("Actinobacteria", diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index c8ea0422..e982224f 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -192,7 +192,7 @@
benchmarks.Rmd
In the table above, all measurements are shown in milliseconds (thousands of seconds). A value of 5 milliseconds means it can determine 200 input values per second. It case of 100 milliseconds, this is only 10 input values per second. The second input is the only one that has to be looked up thoroughly. All the others are known codes (the first one is a WHONET code) or common laboratory codes, or common full organism names like the last one. Full organism names are always preferred.
To achieve this speed, the as.mo
function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Thermus islandicus (B_THERMS_ISL
), a bug probably never found before in humans:
T.islandicus <- microbenchmark(as.mo("theisl"),
@@ -236,11 +236,11 @@
print(T.islandicus, unit = "ms", signif = 2)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
-#> as.mo("theisl") 460 460 480 470 510 510 10
-#> as.mo("THEISL") 460 470 490 490 510 540 10
-#> as.mo("T. islandicus") 73 73 84 73 77 130 10
-#> as.mo("T. islandicus") 73 73 88 75 120 120 10
-#> as.mo("Thermus islandicus") 73 73 80 73 74 130 10
That takes 8 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.
In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Thermus islandicus (which is very uncommon):
par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16)
@@ -256,10 +256,7 @@
xlab = "", ylab = "Time in seconds", ylim = c(0, 0.5),
main = "Benchmarks per prevalence")
In reality, the as.mo()
functions learns from its own output to speed up determinations for next times. In above figure, this effect was disabled to show the difference with the boxplot below - when you would use as.mo()
yourself:
The highest outliers are the first times. All next determinations were done in only thousands of seconds. For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a next version.
-Still, uncommon microorganisms take a lot more time than common microorganisms, especially the first time. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.
+Uncommon microorganisms take a lot more time than common microorganisms. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.
So transforming 500,000 values (!!) of 50 unique values only takes 0.75 seconds (751 ms). You only lose time on your unique input values.
+#> mo_fullname(x) 774 777 811 783 829 947 10 +So transforming 500,000 values (!!) of 50 unique values only takes 0.78 seconds (783 ms). You only lose time on your unique input values.
So going from mo_fullname("Staphylococcus aureus")
to "Staphylococcus aureus"
takes 0.0008 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:
run_it <- microbenchmark(A = mo_species("aureus"),
B = mo_genus("Staphylococcus"),
@@ -320,14 +317,14 @@
print(run_it, unit = "ms", signif = 3)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
-#> A 0.376 0.386 0.436 0.436 0.479 0.516 10
-#> B 0.467 0.501 0.561 0.567 0.599 0.700 10
-#> C 0.503 0.782 0.850 0.918 0.971 1.040 10
-#> D 0.403 0.471 0.488 0.491 0.525 0.588 10
-#> E 0.343 0.429 0.456 0.445 0.485 0.638 10
-#> F 0.380 0.403 0.447 0.453 0.491 0.520 10
-#> G 0.385 0.421 0.458 0.447 0.487 0.575 10
-#> H 0.396 0.455 0.484 0.491 0.515 0.549 10
Of course, when running mo_phylum("Firmicutes")
the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes"
too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.
Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.
diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png index d84e92d1..683d4169 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/news/index.html b/docs/news/index.html index bd2e78a3..746e5533 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -243,9 +243,19 @@guess_mo()
, guess_atc()
, EUCAST_rules()
, interpretive_reading()
microorganisms.old
data set, which leads to better results finding when using the as.mo()
functionmicroorganisms.old
data set, which leads to better results finding when using the as.mo()
function./inst/eucast/eucast.tsv
.age_groups()
, to let groups of fives and tens end with 100+ instead of 120+.StagedInstall: false
to the DESCRIPTION fileas.atc()
Using as.mo(..., allow_uncertain = 3)
could lead to very unreliable results.
~/.Rhistory_mo
. Use the new function clean_mo_history()
to delete this file, which resets the algorithms.~/.Rhistory_mo
. Use the new function clean_mo_history()
to delete this file, which resets the algorithms.Incoercible results will now be considered ‘unknown’, MO code UNKNOWN
. On foreign systems, properties of these will be translated to all languages already previously supported: German, Dutch, French, Italian, Spanish and Portuguese:
mo_genus("qwerty", language = "es")
@@ -488,9 +498,9 @@ Using as.mo(..., allow_uncertain = 3)if using different lengths of pattern and x in %like%
, it will now return the call
as.mo(..., allow_uncertain = 3)Percentages will now will rounded more logically (e.g. in freq
function)
crayon
, to support formatted text in the consoletidyr
is now mandatory (went to Import
field) since portion_df
and count_df
rely on itas.mo(..., allow_uncertain = 3)
as.mo(..., allow_uncertain = 3)Other small fixes
as.mo(..., allow_uncertain = 3)Functions as.rsi
and as.mic
now add the package name and version as attributes
README.md
with more examplesA character:
"children"
, equivalent of: c(0, 1, 2, 4, 6, 13, 18)
. This will split on 0, 1, 2-3, 4-5, 6-12, 13-17 and 18+.
"elderly"
or "seniors"
, equivalent of: c(65, 75, 85, 95)
. This will split on 0-64, 65-74, 75-84, 85-94 and 95+.
"fives"
, equivalent of: 1:24 * 5
. This will split on 0-4, 5-9, 10-14, 15-19 and so forth, until 120.
"tens"
, equivalent of: 1:12 * 10
. This will split on 0-9, 10-19, 20-29 and so forth, until 120.
"elderly"
or "seniors"
, equivalent of: c(65, 75, 85)
. This will split on 0-64, 65-74, 75-84, 85+.
"fives"
, equivalent of: 1:20 * 5
. This will split on 0-4, 5-9, 10-14, ..., 90-94, 95-99, 100+.
"tens"
, equivalent of: 1:10 * 10
. This will split on 0-9, 10-19, 20-29, ... 80-89, 90-99, 100+.
-
|
Transform to microorganism ID |