diff --git a/DESCRIPTION b/DESCRIPTION index f080051c..c44161f3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.7.1.9069 -Date: 2019-09-01 +Version: 0.7.1.9070 +Date: 2019-09-02 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 94d56390..375f3054 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 0.7.1.9069 -Last updated: 01-Sep-2019 +# AMR 0.7.1.9070 +Last updated: 02-Sep-2019 ### Breaking * Determination of first isolates now **excludes** all 'unknown' microorganisms at default, i.e. microbial code `"UNKNOWN"`. They can be included with the new parameter `include_unknown`: @@ -97,6 +97,7 @@ * Speed improvement for `guess_ab_col()` which is now 30 times faster for antibiotic abbreviations * Improved `filter_ab_class()` to be more reliable and to support 5th generation cephalosporins * Function `availability()` now uses `portion_R()` instead of `portion_IR()`, to comply with EUCAST insights +* Functions `age()` and `age_groups()` now have a `na.rm` parameter to remove empty values #### Other * Added Prof Dr Casper Albers as doctoral advisor and Dr Bart Meijer, Dr Dennis Souverein and Annick Lenglet as contributors diff --git a/R/age.R b/R/age.R index b11ea31d..4a7f35d4 100755 --- a/R/age.R +++ b/R/age.R @@ -25,8 +25,9 @@ #' @param x date(s), will be coerced with \code{\link{as.POSIXlt}} #' @param reference reference date(s) (defaults to today), will be coerced with \code{\link{as.POSIXlt}} and cannot be lower than \code{x} #' @param exact a logical to indicate whether age calculation should be exact, i.e. with decimals. It divides the number of days of \href{https://en.wikipedia.org/wiki/Year-to-date}{year-to-date} (YTD) of \code{x} by the number of days in a year of \code{reference} (either 365 or 366). 
+#' @param na.rm a logical to indicate whether missing values should be removed #' @return An integer (no decimals) if \code{exact = FALSE}, a double (with decimals) otherwise -#' @seealso \code{\link{age_groups}} to split age into age groups +#' @seealso To split ages into groups, use the \code{\link{age_groups}} function. #' @importFrom dplyr if_else #' @inheritSection AMR Read more on our website! #' @export @@ -39,7 +40,7 @@ #' df$age_exact <- age(df$birth_date, exact = TRUE) #' #' df -age <- function(x, reference = Sys.Date(), exact = FALSE) { +age <- function(x, reference = Sys.Date(), exact = FALSE, na.rm = FALSE) { if (length(x) != length(reference)) { if (length(reference) == 1) { reference <- rep(reference, length(x)) @@ -79,6 +80,10 @@ age <- function(x, reference = Sys.Date(), exact = FALSE) { if (any(ages > 120, na.rm = TRUE)) { warning("Some ages are above 120.") } + + if (isTRUE(na.rm)) { + ages <- ages[!is.na(ages)] + } ages } @@ -88,6 +93,7 @@ age <- function(x, reference = Sys.Date(), exact = FALSE) { #' Split ages into age groups defined by the \code{split} parameter. This allows for easier demographic (antimicrobial resistance) analysis. #' @param x age, e.g. calculated with \code{\link{age}} #' @param split_at values to split \code{x} at, defaults to age groups 0-11, 12-24, 25-54, 55-74 and 75+. See Details. +#' @param na.rm a logical to indicate whether missing values should be removed #' @details To split ages, the input can be: #' \itemize{ #' \item{A numeric vector. A vector of e.g. \code{c(10, 20)} will split on 0-9, 10-19 and 20+. A value of only \code{50} will split on 0-49 and 50+. @@ -102,7 +108,7 @@ age <- function(x, reference = Sys.Date(), exact = FALSE) { #' } #' @keywords age_group age #' @return Ordered \code{\link{factor}} -#' @seealso \code{\link{age}} to determine ages based on one or more reference dates +#' @seealso To determine ages, based on one or more reference dates, use the \code{\link{age}} function. 
#' @export #' @inheritSection AMR Read more on our website! #' @examples @@ -135,7 +141,7 @@ age <- function(x, reference = Sys.Date(), exact = FALSE) { #' group_by(age_group = age_groups(age)) %>% #' select(age_group, CIP) %>% #' ggplot_rsi(x = "age_group") -age_groups <- function(x, split_at = c(12, 25, 55, 75)) { +age_groups <- function(x, split_at = c(12, 25, 55, 75), na.rm = FALSE) { if (!is.numeric(x)) { stop("`x` and must be numeric, not a ", paste0(class(x), collapse = "/"), ".") } @@ -174,5 +180,11 @@ age_groups <- function(x, split_at = c(12, 25, 55, 75)) { # last category labs[length(labs)] <- paste0(split_at[length(split_at)], "+") - factor(labs[y], levels = labs, ordered = TRUE) + agegroups <- factor(labs[y], levels = labs, ordered = TRUE) + + if (isTRUE(na.rm)) { + agegroups <- agegroups[!is.na(agegroups)] + } + + agegroups } diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index eef454d1..e9ad1f32 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070 diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index f8b66330..d845543e 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070 @@ -185,7 +185,7 @@

Benchmarks

Matthijs S. Berends

-

01 September 2019

+

02 September 2019

@@ -220,35 +220,35 @@ print(S.aureus, unit = "ms", signif = 2) # Unit: milliseconds # expr min lq mean median uq -# as.mo("sau") 8.6 8.9 12.0 9.6 10.0 -# as.mo("stau") 33.0 33.0 39.0 34.0 50.0 -# as.mo("STAU") 33.0 33.0 40.0 35.0 49.0 -# as.mo("staaur") 9.0 9.2 11.0 9.6 9.8 -# as.mo("STAAUR") 8.8 9.2 11.0 9.6 9.9 -# as.mo("S. aureus") 23.0 25.0 29.0 26.0 27.0 -# as.mo("S aureus") 24.0 24.0 26.0 25.0 25.0 -# as.mo("Staphylococcus aureus") 4.0 4.2 4.4 4.2 4.6 -# as.mo("Staphylococcus aureus (MRSA)") 1500.0 1500.0 1600.0 1600.0 1700.0 -# as.mo("Sthafilokkockus aaureuz") 530.0 540.0 550.0 550.0 560.0 -# as.mo("MRSA") 8.2 8.7 12.0 9.1 9.5 -# as.mo("VISA") 19.0 20.0 35.0 21.0 37.0 -# as.mo("VRSA") 19.0 19.0 23.0 20.0 21.0 -# as.mo(22242419) 18.0 19.0 20.0 19.0 20.0 +# as.mo("sau") 8.4 8.8 11.0 9.1 9.9 +# as.mo("stau") 32.0 33.0 37.0 34.0 35.0 +# as.mo("STAU") 32.0 33.0 40.0 34.0 52.0 +# as.mo("staaur") 9.1 9.2 9.4 9.3 9.4 +# as.mo("STAAUR") 8.6 9.1 11.0 9.3 9.7 +# as.mo("S. aureus") 24.0 25.0 33.0 26.0 45.0 +# as.mo("S aureus") 24.0 25.0 27.0 25.0 26.0 +# as.mo("Staphylococcus aureus") 4.0 4.1 4.5 4.3 4.6 +# as.mo("Staphylococcus aureus (MRSA)") 1500.0 1600.0 1700.0 1700.0 1700.0 +# as.mo("Sthafilokkockus aaureuz") 550.0 550.0 560.0 560.0 580.0 +# as.mo("MRSA") 8.7 9.0 9.4 9.2 10.0 +# as.mo("VISA") 19.0 20.0 26.0 21.0 37.0 +# as.mo("VRSA") 19.0 19.0 20.0 20.0 20.0 +# as.mo(22242419) 18.0 18.0 24.0 20.0 21.0 # max neval -# 25.0 10 -# 52.0 10 -# 53.0 10 -# 29.0 10 -# 28.0 10 -# 50.0 10 -# 40.0 10 -# 5.2 10 +# 27.0 10 +# 61.0 10 +# 58.0 10 +# 10.0 10 +# 26.0 10 +# 48.0 10 +# 42.0 10 +# 5.9 10 # 1700.0 10 -# 580.0 10 -# 36.0 10 -# 110.0 10 -# 35.0 10 -# 34.0 10 +# 590.0 10 +# 10.0 10 +# 41.0 10 +# 21.0 10 +# 45.0 10

In the table above, all measurements are shown in milliseconds (thousandths of a second). A value of 5 milliseconds means it can determine 200 input values per second. In case of 100 milliseconds, this is only 10 input values per second. The second input is the only one that has to be looked up thoroughly. All the others are known codes (the first one is a WHONET code) or common laboratory codes, or common full organism names like the last one. Full organism names are always preferred.

To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Thermus islandicus (B_THERMS_ISL), a bug probably never found before in humans:

@@ -261,12 +261,12 @@ print(T.islandicus, unit = "ms", signif = 2) # Unit: milliseconds # expr min lq mean median uq max neval -# as.mo("theisl") 300 300 310 320 320 330 10 -# as.mo("THEISL") 290 300 300 300 320 320 10 -# as.mo("T. islandicus") 130 140 150 140 150 160 10 -# as.mo("T. islandicus") 140 160 160 160 170 170 10 -# as.mo("Thermus islandicus") 49 52 57 55 56 72 10 -

That takes 1.1 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

+# as.mo("theisl") 300 300 320 320 330 340 10 +# as.mo("THEISL") 290 320 330 320 330 400 10 +# as.mo("T. islandicus") 150 150 160 170 170 170 10 +# as.mo("T. islandicus") 140 150 150 150 160 170 10 +# as.mo("Thermus islandicus") 53 55 65 59 76 80 10 +

That takes 1.2 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are reasonably fast - these are the most probable input from most data sets.

In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Thermus islandicus (which is uncommon):

Uncommon microorganisms take a lot more time than common microorganisms. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.

@@ -300,8 +300,8 @@ print(run_it, unit = "ms", signif = 3) # Unit: milliseconds # expr min lq mean median uq max neval -# mo_name(x) 582 614 622 620 624 677 10 -

So transforming 500,000 values (!!) of 50 unique values only takes 0.62 seconds (619 ms). You only lose time on your unique input values.

+# mo_name(x) 625 659 686 689 698 746 10 +

So transforming 500,000 values (!!) of 50 unique values only takes 0.69 seconds (688 ms). You only lose time on your unique input values.

@@ -313,11 +313,11 @@ times = 10) print(run_it, unit = "ms", signif = 3) # Unit: milliseconds -# expr min lq mean median uq max neval -# A 6.520 6.600 6.790 6.800 6.930 7.080 10 -# B 22.100 22.300 25.600 23.200 23.500 42.800 10 -# C 0.705 0.772 0.816 0.806 0.854 0.977 10

-

So going from mo_name("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0008 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

+# expr min lq mean median uq max neval +# A 6.930 7.380 7.87 7.580 8.20 9.54 10 +# B 23.700 25.700 30.50 26.000 29.60 59.00 10 +# C 0.859 0.912 0.99 0.958 1.05 1.24 10 +

So going from mo_name("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.001 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

run_it <- microbenchmark(A = mo_species("aureus"),
                          B = mo_genus("Staphylococcus"),
                          C = mo_name("Staphylococcus aureus"),
@@ -330,14 +330,14 @@
 print(run_it, unit = "ms", signif = 3)
 # Unit: milliseconds
 #  expr   min    lq  mean median    uq   max neval
-#     A 0.470 0.477 0.506  0.492 0.498 0.674    10
-#     B 0.623 0.634 0.674  0.669 0.708 0.744    10
-#     C 0.778 0.819 0.845  0.844 0.866 0.915    10
-#     D 0.462 0.471 0.483  0.482 0.492 0.515    10
-#     E 0.462 0.464 0.476  0.472 0.479 0.518    10
-#     F 0.460 0.466 0.475  0.470 0.482 0.510    10
-#     G 0.456 0.465 0.478  0.477 0.486 0.513    10
-#     H 0.459 0.464 0.472  0.470 0.475 0.509    10
+# A 0.478 0.490 0.512 0.509 0.530 0.579 10 +# B 0.644 0.670 0.711 0.673 0.740 0.930 10 +# C 0.697 0.815 0.887 0.840 0.992 1.150 10 +# D 0.456 0.486 0.511 0.503 0.521 0.643 10 +# E 0.465 0.477 0.534 0.497 0.556 0.804 10 +# F 0.459 0.472 0.490 0.477 0.509 0.556 10 +# G 0.466 0.488 0.526 0.522 0.544 0.637 10 +# H 0.455 0.455 0.476 0.465 0.485 0.535 10

Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.

@@ -364,13 +364,13 @@ print(run_it, unit = "ms", signif = 4) # Unit: milliseconds # expr min lq mean median uq max neval -# en 17.64 17.88 20.31 18.36 19.05 37.62 10 -# de 19.15 19.25 21.73 19.75 20.40 37.72 10 -# nl 24.34 24.48 36.33 24.88 26.21 121.60 10 -# es 18.95 19.09 19.29 19.28 19.53 19.61 10 -# it 19.02 19.10 22.80 19.24 20.42 36.65 10 -# fr 19.01 19.04 19.21 19.15 19.20 19.94 10 -# pt 18.94 19.08 22.28 19.55 20.36 38.81 10
+# en 19.02 19.15 20.50 19.34 21.18 27.84 10 +# de 19.56 20.25 28.34 22.84 39.77 50.78 10 +# nl 26.12 26.25 27.73 27.17 28.60 31.65 10 +# es 20.02 20.58 22.36 22.26 23.95 25.71 10 +# it 20.30 20.47 23.21 21.23 24.56 34.71 10 +# fr 20.14 20.56 31.97 21.47 23.47 125.00 10 +# pt 19.83 20.24 22.74 20.46 22.65 38.93 10

Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png index 4af1a60a..8c266414 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png index 1f98e6b0..b30d5299 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-2.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-2.png index 2af94b17..c1668a12 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-2.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-2.png differ diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-3.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-3.png index de8d0f45..54a3f705 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-3.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-3.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index 85faa3b2..1152cf27 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070 diff --git a/docs/authors.html b/docs/authors.html index 8d099023..68e409ca 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070 diff --git a/docs/index.html b/docs/index.html index 2e1a49c2..cd8a60b0 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070 diff --git a/docs/news/index.html b/docs/news/index.html index 
75058886..48ad3e9b 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070 @@ -225,11 +225,11 @@ -
+

-AMR 0.7.1.9069 Unreleased +AMR 0.7.1.9070 Unreleased

-

Last updated: 01-Sep-2019

+

Last updated: 02-Sep-2019

Breaking

@@ -340,6 +340,7 @@ Since this is a major change, usage of the old also_single_tested w
  • Speed improvement for guess_ab_col() which is now 30 times faster for antibiotic abbreviations
  • Improved filter_ab_class() to be more reliable and to support 5th generation cephalosporins
  • Function availability() now uses portion_R() instead of portion_IR(), to comply with EUCAST insights
  • +
  • Functions age() and age_groups() now have a na.rm parameter to remove empty values
  • @@ -1259,7 +1260,7 @@ Using as.mo(..., allow_uncertain = 3)

    Contents

    @@ -234,7 +234,7 @@
    -
    age(x, reference = Sys.Date(), exact = FALSE)
    +
    age(x, reference = Sys.Date(), exact = FALSE, na.rm = FALSE)

    Arguments

    @@ -251,6 +251,10 @@ + + + +
    exact

    a logical to indicate whether age calculation should be exact, i.e. with decimals. It divides the number of days of year-to-date (YTD) of x by the number of days in a year of reference (either 365 or 366).

    na.rm

    a logical to indicate whether missing values should be removed

    Value

    @@ -264,7 +268,7 @@

    See also

    -

    age_groups to split age into age groups

    +

    To split ages into groups, use the age_groups function.

    Examples

    @@ -298,7 +302,7 @@
    -
    age_groups(x, split_at = c(12, 25, 55, 75))
    +
    age_groups(x, split_at = c(12, 25, 55, 75), na.rm = FALSE)

    Arguments

    @@ -247,6 +247,10 @@ + + + +
    split_at

    values to split x at, defaults to age groups 0-11, 12-24, 25-54, 55-74 and 75+. See Details.

    na.rm

    a logical to indicate whether missing values should be removed

    Value

    @@ -273,7 +277,7 @@

    See also

    -

    age to determine ages based on one or more reference dates

    +

    To determine ages, based on one or more reference dates, use the age function.

    Examples

    diff --git a/docs/reference/index.html b/docs/reference/index.html index 68485f24..add9c1ba 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9069 + 0.7.1.9070
    diff --git a/man/age.Rd b/man/age.Rd index 4c969c82..e68e56f3 100644 --- a/man/age.Rd +++ b/man/age.Rd @@ -4,7 +4,7 @@ \alias{age} \title{Age in years of individuals} \usage{ -age(x, reference = Sys.Date(), exact = FALSE) +age(x, reference = Sys.Date(), exact = FALSE, na.rm = FALSE) } \arguments{ \item{x}{date(s), will be coerced with \code{\link{as.POSIXlt}}} @@ -12,6 +12,8 @@ age(x, reference = Sys.Date(), exact = FALSE) \item{reference}{reference date(s) (defaults to today), will be coerced with \code{\link{as.POSIXlt}} and cannot be lower than \code{x}} \item{exact}{a logical to indicate whether age calculation should be exact, i.e. with decimals. It divides the number of days of \href{https://en.wikipedia.org/wiki/Year-to-date}{year-to-date} (YTD) of \code{x} by the number of days in a year of \code{reference} (either 365 or 366).} + +\item{na.rm}{a logical to indicate whether missing values should be removed} } \value{ An integer (no decimals) if \code{exact = FALSE}, a double (with decimals) otherwise @@ -35,5 +37,5 @@ df$age_exact <- age(df$birth_date, exact = TRUE) df } \seealso{ -\code{\link{age_groups}} to split age into age groups +To split ages into groups, use the \code{\link{age_groups}} function. } diff --git a/man/age_groups.Rd b/man/age_groups.Rd index d02d5893..e3b6bad2 100644 --- a/man/age_groups.Rd +++ b/man/age_groups.Rd @@ -4,12 +4,14 @@ \alias{age_groups} \title{Split ages into age groups} \usage{ -age_groups(x, split_at = c(12, 25, 55, 75)) +age_groups(x, split_at = c(12, 25, 55, 75), na.rm = FALSE) } \arguments{ \item{x}{age, e.g. calculated with \code{\link{age}}} \item{split_at}{values to split \code{x} at, defaults to age groups 0-11, 12-24, 25-54, 55-74 and 75+. 
See Details.} + +\item{na.rm}{a logical to indicate whether missing values should be removed} } \value{ Ordered \code{\link{factor}} @@ -68,7 +70,7 @@ example_isolates \%>\% ggplot_rsi(x = "age_group") } \seealso{ -\code{\link{age}} to determine ages based on one or more reference dates +To determine ages, based on one or more reference dates, use the \code{\link{age}} function. } \keyword{age} \keyword{age_group} diff --git a/tests/testthat/test-age.R b/tests/testthat/test-age.R index 7165759c..106b8d3c 100644 --- a/tests/testthat/test-age.R +++ b/tests/testthat/test-age.R @@ -40,6 +40,10 @@ test_that("age works", { expect_warning(age(x = c("1800-01-01", "1805-01-01", "1810-01-01"), reference = "2019-01-01")) + + expect_equal(length(age(x = c("2019-01-01", NA), na.rm = TRUE)), + 1) + }) test_that("age_groups works", { @@ -60,5 +64,8 @@ test_that("age_groups works", { expect_identical(class(age_groups(ages, "fives")), c("ordered", "factor")) + + expect_equal(length(age_groups(c(10, 20, 30, NA), na.rm = TRUE)), + 3) })