From 363218da7e87e548ef13e7fde2acb4b701e1385b Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Tue, 17 Nov 2020 16:57:41 +0100 Subject: [PATCH] (v1.4.0.9024) is_new_episode() --- .github/workflows/check.yaml | 8 +- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 10 +- R/aa_helper_functions.R | 4 + R/first_isolate.R | 157 ++++++++++++++++++++++++---- docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 9 +- docs/pkgdown.yml | 2 +- docs/reference/first_isolate.html | 76 +++++++++++--- docs/reference/index.html | 4 +- docs/survey.html | 2 +- man/first_isolate.Rd | 71 +++++++++++-- tests/testthat/test-_misc.R | 37 ------- tests/testthat/test-first_isolate.R | 11 ++ tests/testthat/test-zzz.R | 69 ++++++++++++ 20 files changed, 379 insertions(+), 94 deletions(-) create mode 100644 tests/testthat/test-zzz.R diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 1088df41..50f6771e 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -64,6 +64,7 @@ jobs: - {os: ubuntu-16.04, r: '3.5', allowfail: false, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} - {os: ubuntu-16.04, r: '3.4', allowfail: true, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} - {os: ubuntu-16.04, r: '3.3', allowfail: true, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} + - {os: ubuntu-16.04, r: '3.2', allowfail: true, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} # older R versions cannot be tested, since tidyverse only supports last 4 R x.x versions env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true @@ -120,14 +121,15 @@ jobs: shell: Rscript {0} - name: Check on older R versions - if: matrix.config.r == '3.3' + # no vignettes here, since they rely on R 3.3 and higher + if: matrix.config.r == '3.2' env: _R_CHECK_CRAN_INCOMING_: false - run: 
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran", "--no-build-vignettes" , "--ignore-vignettes"), error_on = "warning", check_dir = "check") + run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran", "--ignore-vignettes"), build_args = "--no-build-vignettes" , error_on = "warning", check_dir = "check") shell: Rscript {0} - name: Check on newer R versions - if: matrix.config.r != '3.3' + if: matrix.config.r != '3.2' env: _R_CHECK_CRAN_INCOMING_: false run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") diff --git a/DESCRIPTION b/DESCRIPTION index f30c3e96..45d1100a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: AMR -Version: 1.4.0.9023 +Version: 1.4.0.9024 Date: 2020-11-17 Title: Antimicrobial Resistance Analysis Authors@R: c( diff --git a/NAMESPACE b/NAMESPACE index 7605a88e..1467837b 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -152,6 +152,7 @@ export(is.mic) export(is.mo) export(is.rsi) export(is.rsi.eligible) +export(is_new_episode) export(key_antibiotics) export(key_antibiotics_equal) export(kurtosis) diff --git a/NEWS.md b/NEWS.md index a80c8cb2..5b99ed0a 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,15 @@ -# AMR 1.4.0.9023 +# AMR 1.4.0.9024 ## Last updated: 17 November 2020 ### New +* Function `is_new_episode()` to determine patient episodes which are not necessarily based on microorganisms. It also supports grouped variables with e.g. `mutate()` and `summarise()` of the `dplyr` package: + ```r + example_isolates %>% + group_by(hospital_id) %>% + summarise(patients = n_distinct(patient_id), + n_episodes_365 = sum(is_new_episode(episode_days = 365)), + n_episodes_60 = sum(is_new_episode(episode_days = 60))) + ``` * Functions `mo_is_gram_negative()` and `mo_is_gram_positive()` as wrappers around `mo_gramstain()`. 
They always return `TRUE` or `FALSE` (except when the input is `NA` or the MO code is `UNKNOWN`), thus always return `FALSE` for species outside the taxonomic kingdom of Bacteria. If you have the `dplyr` package installed, they can even determine the column with microorganisms themselves when used inside `dplyr` verbs: ```r example_isolates %>% diff --git a/R/aa_helper_functions.R b/R/aa_helper_functions.R index e53e1650..2c70d9eb 100755 --- a/R/aa_helper_functions.R +++ b/R/aa_helper_functions.R @@ -139,9 +139,13 @@ check_dataset_integrity <- function() { } search_type_in_df <- function(x, type, info = TRUE) { + meet_criteria(x, allow_class = "data.frame") + meet_criteria(type, allow_class = "character", has_length = 1) + # try to find columns based on type found <- NULL + # remove attributes from other packages x <- as.data.frame(x, stringsAsFactors = FALSE) colnames(x) <- trimws(colnames(x)) diff --git a/R/first_isolate.R b/R/first_isolate.R index 7f537074..99557b25 100755 --- a/R/first_isolate.R +++ b/R/first_isolate.R @@ -25,10 +25,10 @@ #' Determine first (weighted) isolates #' -#' Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. +#' Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. To determine patient episodes not necessarily based on microorganisms, use [is_new_episode()] that also supports grouping with the `dplyr` package, see *Examples*. #' @inheritSection lifecycle Stable lifecycle -#' @param x a [data.frame] containing isolates. -#' @param col_date column name of the result date (or date that is was received on the lab), defaults to the first column of with a date class +#' @param x,.data a [data.frame] containing isolates. 
+#' @param col_date column name of the result date (or date that it was received at the lab), defaults to the first column with a date class #' @param col_patient_id column name of the unique IDs of the patients, defaults to the first column that starts with 'patient' or 'patid' (case insensitive) #' @param col_mo column name of the IDs of the microorganisms (see [as.mo()]), defaults to the first column of class [`mo`]. Values will be coerced using [as.mo()]. #' @param col_testcode column name of the test codes. Use `col_testcode = NULL` to **not** exclude certain test codes (like test codes for screening). In that case `testcodes_exclude` will be ignored. @@ -45,17 +45,26 @@ #' @param info print progress #' @param include_unknown logical to determine whether 'unknown' microorganisms should be included too, i.e. microbial code `"UNKNOWN"`, which defaults to `FALSE`. For WHONET users, this means that all records with organism code `"con"` (*contamination*) will be excluded at default. Isolates with a microbial ID of `NA` will always be excluded as first isolate. #' @param ... parameters passed on to the [first_isolate()] function -#' @details **WHY THIS IS SO IMPORTANT** \cr +#' @details The [is_new_episode()] function is a wrapper around the [first_isolate()] function and can be used for data sets without isolates to just determine patient episodes based on any combination of grouping variables (using `dplyr`), please see *Examples*. Since it runs [first_isolate()] for every group, it is quite slow. +#' +#' All isolates with a microbial ID of `NA` will be excluded as first isolate. +#' +#' ### Why this is so important #' To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode [(ref)](https://pubmed.ncbi.nlm.nih.gov/17304462/). If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic.
Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all *S. aureus* isolates would be overestimated, because you included this MRSA more than once. It would be [selection bias](https://en.wikipedia.org/wiki/Selection_bias). #' -#' All isolates with a microbial ID of `NA` will be excluded as first isolate. +#' ### `filter_*()` shortcuts #' -#' The functions [filter_first_isolate()] and [filter_first_weighted_isolate()] are helper functions to quickly filter on first isolates. The function [filter_first_isolate()] is essentially equal to either: +#' The functions [filter_first_isolate()] and [filter_first_weighted_isolate()] are helper functions to quickly filter on first isolates. +#' +#' The function [filter_first_isolate()] is essentially equal to either: +#' #' ``` #' x[first_isolate(x, ...), ] #' x %>% filter(first_isolate(x, ...)) #' ``` +#' #' The function [filter_first_weighted_isolate()] is essentially equal to: +#' #' ``` #' x %>% #' mutate(keyab = key_antibiotics(.)) %>% @@ -89,21 +98,22 @@ #' # basic filtering on first isolates #' example_isolates[first_isolate(example_isolates), ] #' +#' # filtering based on isolates ---------------------------------------------- #' \donttest{ #' if (require("dplyr")) { -#' # Filter on first isolates: +#' # filter on first isolates: #' example_isolates %>% #' mutate(first_isolate = first_isolate(.)) %>% #' filter(first_isolate == TRUE) #' -#' # Short-hand versions: +#' # short-hand versions: #' example_isolates %>% #' filter_first_isolate() #' #' example_isolates %>% #' filter_first_weighted_isolate() #' -#' # Now let's see if first isolates matter: +#' # now let's see if first isolates matter: #' A <- example_isolates %>% #' group_by(hospital_id) %>% #' summarise(count = n_rsi(GEN), # gentamicin availability @@ -120,6 +130,42 @@ #' # Gentamicin resistance in hospital D appears to be 3.7% higher than #' # when 
you (erroneously) would have used all isolates for analysis. #' } +#' +#' # filtering based on any other condition ----------------------------------- +#' +#' if (require("dplyr")) { +#' # is_new_episode() can be used in dplyr verbs to determine patient +#' # episodes based on any (combination of) grouping variables: +#' example_isolates %>% +#' mutate(condition = sample(x = c("A", "B", "C"), +#' size = 2000, +#' replace = TRUE)) %>% +#' group_by(condition) %>% +#' mutate(new_episode = is_new_episode()) +#' +#' example_isolates %>% +#' group_by(hospital_id) %>% +#' summarise(patients = n_distinct(patient_id), +#' n_episodes_365 = sum(is_new_episode(episode_days = 365)), +#' n_episodes_60 = sum(is_new_episode(episode_days = 60)), +#' n_episodes_30 = sum(is_new_episode(episode_days = 30))) +#' +#' +#' # grouping on microorganisms leads to the same results as first_isolate(): +#' x <- example_isolates %>% +#' filter_first_isolate(include_unknown = TRUE) +#' +#' y <- example_isolates %>% +#' group_by(mo) %>% +#' filter(is_new_episode()) +#' +#' identical(x$patient_id, y$patient_id) +#' +#' # but now you can group on isolates and many more: +#' example_isolates %>% +#' group_by(mo, hospital_id, ward_icu) %>% +#' mutate(flag_episode = is_new_episode()) +#' } #' } first_isolate <- function(x, col_date = NULL, @@ -139,7 +185,7 @@ first_isolate <- function(x, info = interactive(), include_unknown = FALSE, ...) 
{ - meet_criteria(x, allow_class = "data.frame") + meet_criteria(x, allow_class = "data.frame") # also checks dimensions to be >0 meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x)) meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x)) meet_criteria(col_mo, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x)) @@ -175,13 +221,10 @@ first_isolate <- function(x, } } - stop_ifnot(is.data.frame(x), "`x` must be a data.frame") - stop_if(any(dim(x) == 0), "`x` must contain rows and columns") - # remove data.table, grouping from tibbles, etc. x <- as.data.frame(x, stringsAsFactors = FALSE) - # try to find columns based on type + # try to find columns based on type # -- mo if (is.null(col_mo)) { col_mo <- search_type_in_df(x = x, type = "mo") @@ -299,13 +342,32 @@ first_isolate <- function(x, ) } - # no isolates found + # speed up - return immediately if obvious if (abs(row.start) == Inf | abs(row.end) == Inf) { if (info == TRUE) { - message_("=> Found ", font_bold("no isolates"), as_note = FALSE) + message_("=> Found ", font_bold("no isolates"), + add_fn = font_black, + as_note = FALSE) } return(rep(FALSE, nrow(x))) } + if (row.start == row.end) { + if (info == TRUE) { + message_("=> Found ", font_bold("1 isolate"), ", as the data only contained 1 row", + add_fn = font_black, + as_note = FALSE) + } + return(TRUE) + } + if (length(c(row.start:row.end)) == pm_n_distinct(x[c(row.start:row.end), col_mo, drop = TRUE])) { + if (info == TRUE) { + message_("=> Found ", font_bold(paste(length(c(row.start:row.end)), "isolates")), + ", as all isolates were different microorganisms", + add_fn = font_black, + as_note = FALSE) + } + return(rep(TRUE, length(c(row.start:row.end)))) + } # did find some isolates - add new index numbers of rows x$newvar_row_index_sorted <- seq_len(nrow(x)) @@ -511,7 +573,66 @@ filter_first_weighted_isolate <- 
function(x, subset(x, first_isolate(x = y, col_date = col_date, col_patient_id = col_patient_id, - col_mo = col_mo, - col_keyantibiotics = col_keyantibiotics, ...)) }
+
+#' @rdname first_isolate
+#' @export
+is_new_episode <- function(.data,
+                           episode_days = 365,
+                           col_date = NULL,
+                           col_patient_id = NULL) {
+  # Flags, per row of `.data`, whether that row starts a new patient episode.
+  # Implemented by adding a constant dummy microorganism column and delegating
+  # to first_isolate(), so the result is whatever first_isolate() returns.
+  if (missing(.data)) {
+    # look it up - this also supports grouping variables
+    cur_data <- import_fn("cur_data", "dplyr", error_on_fail = FALSE)
+    if (is.null(cur_data)) {
+      stop_("parameter '.data' not set.")
+    }
+    .data <- cur_data()
+  }
+  meet_criteria(.data, allow_class = "data.frame") # also checks dimensions to be >0
+  # validate the column names against `.data`; there is no `x` in this function
+  meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(.data))
+  meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(.data))
+  meet_criteria(episode_days, allow_class = c("numeric", "integer"), has_length = 1)
+
+  # get i'th ID of group, so notices will only be thrown once;
+  # cur_group_id() may fail when not called from within a dplyr verb, in which
+  # case this call is treated as the first (and only) group
+  cur_group_id <- import_fn("cur_group_id", "dplyr", error_on_fail = FALSE)
+  first_group <- tryCatch(is.null(cur_group_id) || cur_group_id() == 1,
+                          error = function(e) TRUE)
+
+  # try to find columns based on type
+  # -- date
+  if (is.null(col_date)) {
+    col_date <- search_type_in_df(x = .data,
+                                  type = "date",
+                                  info = first_group)
+    stop_if(is.null(col_date), "`col_date` must be set")
+  }
+
+  # -- patient id
+  if (is.null(col_patient_id)) {
+    if (all(c("First name", "Last name", "Sex") %in% colnames(.data))) {
+      # WHONET support
+      .data$patient_id <- paste(.data$`First name`, .data$`Last name`, .data$Sex)
+      col_patient_id <- "patient_id"
+      # reuse the guarded flag instead of calling cur_group_id() unprotected
+      if (first_group) {
+        message_("Using combined columns `", font_bold("First name"), "`, `", font_bold("Last name"), "` and `", font_bold("Sex"), "` as input for `col_patient_id`")
+      }
+    } else {
+      col_patient_id <- search_type_in_df(x = .data,
+                                          type = "patient_id",
+                                          info = first_group)
+    }
+    stop_if(is.null(col_patient_id), "`col_patient_id` must be set")
+  }
+
+  # create any random mo, so first isolates can be calculated
+  .data$a94a8fe5 <- as.mo("Escherichia coli")
+
+  first_isolate(.data,
+                col_date = col_date,
+                col_patient_id = col_patient_id,
+                episode_days = episode_days,
+                col_mo = "a94a8fe5",
+                info = FALSE)
+}
diff --git a/docs/404.html b/docs/404.html index 366719f6..edc34440 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 05ebc80c..8c61edb9 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 diff --git a/docs/articles/index.html b/docs/articles/index.html index b815de3d..0e812de6 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 diff --git a/docs/authors.html b/docs/authors.html index c8229f1c..cf09397b 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 diff --git a/docs/index.html b/docs/index.html index 07cdd4a8..60f4a7ff 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 diff --git a/docs/news/index.html b/docs/news/index.html index fc57574e..13e22ebe 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 @@ -236,9 +236,9 @@ Source: NEWS.md -
-

-AMR 1.4.0.9023 Unreleased +
+

+AMR 1.4.0.9024 Unreleased

@@ -248,6 +248,7 @@

New

    +
  • Function is_new_episode() to determine patient episodes which are not necessarily based on microorganisms. It also supports grouped variables with e.g. mutate() and summarise() of the dplyr package: r example_isolates %>% group_by(hospital_id) %>% summarise(patients = n_distinct(patient_id), n_episodes_365 = sum(is_new_episode(episode_days = 365)), n_episodes_60 = sum(is_new_episode(episode_days = 60)))

  • Functions mo_is_gram_negative() and mo_is_gram_positive() as wrappers around mo_gramstain(). They always return TRUE or FALSE (except when the input is NA or the MO code is UNKNOWN), thus always return FALSE for species outside the taxonomic kingdom of Bacteria. If you have the dplyr package installed, they can even determine the column with microorganisms themselves when used inside dplyr verbs:

    diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
    index 3618acfe..4884f74a 100644
    --- a/docs/pkgdown.yml
    +++ b/docs/pkgdown.yml
    @@ -12,7 +12,7 @@ articles:
       datasets: datasets.html
       resistance_predict: resistance_predict.html
       welcome_to_AMR: welcome_to_AMR.html
    -last_built: 2020-11-17T10:53Z
    +last_built: 2020-11-17T15:56Z
     urls:
       reference: https://msberends.github.io/AMR//reference
       article: https://msberends.github.io/AMR//articles
    diff --git a/docs/reference/first_isolate.html b/docs/reference/first_isolate.html
    index a8c060b9..1dcba621 100644
    --- a/docs/reference/first_isolate.html
    +++ b/docs/reference/first_isolate.html
    @@ -49,7 +49,7 @@
       
     
     
    -
    +
     
     
     
    @@ -82,7 +82,7 @@
           
           
             AMR (for R)
    -        1.4.0.9000
    +        1.4.0.9024
           
         
    @@ -239,7 +239,7 @@
-

Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type.

+

Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. To determine patient episodes not necessarily based on microorganisms, use is_new_episode() that also supports grouping with the dplyr package, see Examples.

first_isolate(
@@ -278,18 +278,25 @@
   col_mo = NULL,
   col_keyantibiotics = NULL,
   ...
+)
+
+is_new_episode(
+  .data,
+  episode_days = 365,
+  col_date = NULL,
+  col_patient_id = NULL
 )

Arguments

- + - + @@ -366,10 +373,17 @@

A logical vector

Details

-

WHY THIS IS SO IMPORTANT
-To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode (ref). If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all S. aureus isolates would be overestimated, because you included this MRSA more than once. It would be selection bias.

-

All isolates with a microbial ID of NA will be excluded as first isolate.

-

The functions filter_first_isolate() and filter_first_weighted_isolate() are helper functions to quickly filter on first isolates. The function filter_first_isolate() is essentially equal to either:

  x[first_isolate(x, ...), ]
+    

The is_new_episode() function is a wrapper around the first_isolate() function and can be used for data sets without isolates to just determine patient episodes based on any combination of grouping variables (using dplyr), please see Examples. Since it runs first_isolate() for every group, it is quite slow.

+

All isolates with a microbial ID of NA will be excluded as first isolate.

Why this is so important

+ + +

To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode (ref). If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all S. aureus isolates would be overestimated, because you included this MRSA more than once. It would be selection bias.

+ +

filter_*() shortcuts

+ + +

The functions filter_first_isolate() and filter_first_weighted_isolate() are helper functions to quickly filter on first isolates.

+

The function filter_first_isolate() is essentially equal to either:

  x[first_isolate(x, ...), ]
   x %>% filter(first_isolate(x, ...))
 
@@ -381,6 +395,7 @@ To conduct an analysis of antimicrobial resistance, you should only include the select(-only_weighted_firsts, -keyab)
+

Key antibiotics

@@ -415,21 +430,22 @@ The lifecycle of this function is stable# basic filtering on first isolates example_isolates[first_isolate(example_isolates), ] +# filtering based on isolates ---------------------------------------------- # \donttest{ if (require("dplyr")) { - # Filter on first isolates: + # filter on first isolates: example_isolates %>% mutate(first_isolate = first_isolate(.)) %>% filter(first_isolate == TRUE) - # Short-hand versions: + # short-hand versions: example_isolates %>% filter_first_isolate() example_isolates %>% filter_first_weighted_isolate() - # Now let's see if first isolates matter: + # now let's see if first isolates matter: A <- example_isolates %>% group_by(hospital_id) %>% summarise(count = n_rsi(GEN), # gentamicin availability @@ -446,6 +462,42 @@ The lifecycle of this function is stable# Gentamicin resistance in hospital D appears to be 3.7% higher than # when you (erroneously) would have used all isolates for analysis. } + +# filtering based on any other condition ----------------------------------- + +if (require("dplyr")) { + # is_new_episode() can be used in dplyr verbs to determine patient + # episodes based on any (combination of) grouping variables: + example_isolates %>% + mutate(condition = sample(x = c("A", "B", "C"), + size = 2000, + replace = TRUE)) %>% + group_by(condition) %>% + mutate(new_episode = is_new_episode()) + + example_isolates %>% + group_by(hospital_id) %>% + summarise(patients = n_distinct(patient_id), + n_episodes_365 = sum(is_new_episode(episode_days = 365)), + n_episodes_60 = sum(is_new_episode(episode_days = 60)), + n_episodes_30 = sum(is_new_episode(episode_days = 30))) + + + # grouping on microorganisms leads to the same results as first_isolate(): + x <- example_isolates %>% + filter_first_isolate(include_unknown = TRUE) + + y <- example_isolates %>% + group_by(mo) %>% + filter(is_new_episode()) + + identical(x$patient_id, y$patient_id) + + # but now you can group on isolates and many more: + 
example_isolates %>% + group_by(mo, hospital_id, ward_icu) %>% + mutate(flag_episode = is_new_episode()) +} # } diff --git a/docs/reference/index.html b/docs/reference/index.html index 05da0206..d6a81d37 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 @@ -478,7 +478,7 @@ diff --git a/docs/survey.html b/docs/survey.html index 2ec33762..2d50307e 100644 --- a/docs/survey.html +++ b/docs/survey.html @@ -81,7 +81,7 @@ AMR (for R) - 1.4.0.9023 + 1.4.0.9024 diff --git a/man/first_isolate.Rd b/man/first_isolate.Rd index 18c0c9a3..4e0f18f3 100755 --- a/man/first_isolate.Rd +++ b/man/first_isolate.Rd @@ -4,6 +4,7 @@ \alias{first_isolate} \alias{filter_first_isolate} \alias{filter_first_weighted_isolate} +\alias{is_new_episode} \title{Determine first (weighted) isolates} \source{ Methodology of this function is strictly based on: @@ -48,11 +49,18 @@ filter_first_weighted_isolate( col_keyantibiotics = NULL, ... ) + +is_new_episode( + .data, + episode_days = 365, + col_date = NULL, + col_patient_id = NULL +) } \arguments{ -\item{x}{a \link{data.frame} containing isolates.} +\item{x, .data}{a \link{data.frame} containing isolates.} -\item{col_date}{column name of the result date (or date that is was received on the lab), defaults to the first column of with a date class} +\item{col_date}{column name of the result date (or date that is was received on the lab), defaults to the first column with a date class} \item{col_patient_id}{column name of the unique IDs of the patients, defaults to the first column that starts with 'patient' or 'patid' (case insensitive)} @@ -90,15 +98,22 @@ filter_first_weighted_isolate( A \code{\link{logical}} vector } \description{ -Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. +Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. 
To determine patient episodes not necessarily based on microorganisms, use \code{\link[=is_new_episode]{is_new_episode()}} that also supports grouping with the \code{dplyr} package, see \emph{Examples}. } \details{ -\strong{WHY THIS IS SO IMPORTANT} \cr -To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode \href{https:/pubmed.ncbi.nlm.nih.gov/17304462/}{(ref)}. If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all \emph{S. aureus} isolates would be overestimated, because you included this MRSA more than once. It would be \href{https://en.wikipedia.org/wiki/Selection_bias}{selection bias}. +The \code{\link[=is_new_episode]{is_new_episode()}} function is a wrapper around the \code{\link[=first_isolate]{first_isolate()}} function and can be used for data sets without isolates to just determine patient episodes based on any combination of grouping variables (using \code{dplyr}), please see \emph{Examples}. Since it runs \code{\link[=first_isolate]{first_isolate()}} for every group, it is quite slow. All isolates with a microbial ID of \code{NA} will be excluded as first isolate. +\subsection{Why this is so important}{ -The functions \code{\link[=filter_first_isolate]{filter_first_isolate()}} and \code{\link[=filter_first_weighted_isolate]{filter_first_weighted_isolate()}} are helper functions to quickly filter on first isolates. The function \code{\link[=filter_first_isolate]{filter_first_isolate()}} is essentially equal to either:\preformatted{ x[first_isolate(x, ...), ] +To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode \href{https:/pubmed.ncbi.nlm.nih.gov/17304462/}{(ref)}. 
If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all \emph{S. aureus} isolates would be overestimated, because you included this MRSA more than once. It would be \href{https://en.wikipedia.org/wiki/Selection_bias}{selection bias}. +} + +\subsection{\verb{filter_*()} shortcuts}{ + +The functions \code{\link[=filter_first_isolate]{filter_first_isolate()}} and \code{\link[=filter_first_weighted_isolate]{filter_first_weighted_isolate()}} are helper functions to quickly filter on first isolates. + +The function \code{\link[=filter_first_isolate]{filter_first_isolate()}} is essentially equal to either:\preformatted{ x[first_isolate(x, ...), ] x \%>\% filter(first_isolate(x, ...)) } @@ -110,6 +125,7 @@ The function \code{\link[=filter_first_weighted_isolate]{filter_first_weighted_i select(-only_weighted_firsts, -keyab) } } +} \section{Key antibiotics}{ There are two ways to determine whether isolates can be included as first \emph{weighted} isolates which will give generally the same results: @@ -143,21 +159,22 @@ On our website \url{https://msberends.github.io/AMR/} you can find \href{https:/ # basic filtering on first isolates example_isolates[first_isolate(example_isolates), ] +# filtering based on isolates ---------------------------------------------- \donttest{ if (require("dplyr")) { - # Filter on first isolates: + # filter on first isolates: example_isolates \%>\% mutate(first_isolate = first_isolate(.)) \%>\% filter(first_isolate == TRUE) - # Short-hand versions: + # short-hand versions: example_isolates \%>\% filter_first_isolate() example_isolates \%>\% filter_first_weighted_isolate() - # Now let's see if first isolates matter: + # now let's see if first isolates matter: A <- example_isolates \%>\% group_by(hospital_id) \%>\% 
summarise(count = n_rsi(GEN), # gentamicin availability @@ -174,6 +191,42 @@ if (require("dplyr")) { # Gentamicin resistance in hospital D appears to be 3.7\% higher than # when you (erroneously) would have used all isolates for analysis. } + +# filtering based on any other condition ----------------------------------- + +if (require("dplyr")) { + # is_new_episode() can be used in dplyr verbs to determine patient + # episodes based on any (combination of) grouping variables: + example_isolates \%>\% + mutate(condition = sample(x = c("A", "B", "C"), + size = 2000, + replace = TRUE)) \%>\% + group_by(condition) \%>\% + mutate(new_episode = is_new_episode()) + + example_isolates \%>\% + group_by(hospital_id) \%>\% + summarise(patients = n_distinct(patient_id), + n_episodes_365 = sum(is_new_episode(episode_days = 365)), + n_episodes_60 = sum(is_new_episode(episode_days = 60)), + n_episodes_30 = sum(is_new_episode(episode_days = 30))) + + + # grouping on microorganisms leads to the same results as first_isolate(): + x <- example_isolates \%>\% + filter_first_isolate(include_unknown = TRUE) + + y <- example_isolates \%>\% + group_by(mo) \%>\% + filter(is_new_episode()) + + identical(x$patient_id, y$patient_id) + + # but now you can group on isolates and many more: + example_isolates \%>\% + group_by(mo, hospital_id, ward_icu) \%>\% + mutate(flag_episode = is_new_episode()) +} } } \seealso{ diff --git a/tests/testthat/test-_misc.R b/tests/testthat/test-_misc.R index 9d1ef03d..b93097ce 100755 --- a/tests/testthat/test-_misc.R +++ b/tests/testthat/test-_misc.R @@ -54,40 +54,3 @@ test_that("looking up ab columns works", { expect_warning(get_column_abx(dplyr::rename(example_isolates, thisone = AMX), amox = "thisone", tmp = "thisone", verbose = TRUE)) expect_warning(get_column_abx(dplyr::rename(example_isolates, thisone = AMX), amox = "thisone", tmp = "thisone", verbose = FALSE)) }) - -test_that("imports work", { - skip_on_cran() - - import_functions <- c( - "anti_join" = 
"dplyr", - "cur_column" = "dplyr", - "freq.default" = "cleaner", - "full_join" = "dplyr", - "has_internet" = "curl", - "html_attr" = "rvest", - "html_children" = "rvest", - "html_node" = "rvest", - "html_nodes" = "rvest", - "html_table" = "rvest", - "html_text" = "rvest", - "inline_hist" = "skimr", - "inner_join" = "dplyr", - "insertText" = "rstudioapi", - "left_join" = "dplyr", - "new_pillar_shaft_simple" = "pillar", - "peek_mask" = "dplyr", - "peek_vars" = "tidyselect", - "read_excel" = "readxl", - "read_html" = "xml2", - "right_join" = "dplyr", - "semi_join" = "dplyr", - "sfl" = "skimr", - "showQuestion" = "rstudioapi") - - for (i in seq_len(length(import_functions))) { - fn <- names(import_functions)[i] - pkg <- unname(import_functions[i]) - expect(!is.null(import_fn(name = fn, pkg = pkg, error_on_fail = FALSE)), - failure_message = paste0("Function ", pkg, "::", fn, "() does not exist")) - } -}) diff --git a/tests/testthat/test-first_isolate.R b/tests/testthat/test-first_isolate.R index 27f9009a..3245f93c 100755 --- a/tests/testthat/test-first_isolate.R +++ b/tests/testthat/test-first_isolate.R @@ -200,4 +200,15 @@ test_that("first isolates work", { expect_identical(filter_first_weighted_isolate(example_isolates), subset(example_isolates, first_isolate(ex))) + # notice that all mo's are distinct, so all are TRUE + expect_true(all(example_isolates %pm>% + pm_distinct(mo, .keep_all = TRUE) %pm>% + first_isolate() == TRUE)) + + library(dplyr) + # is_new_episode + old <- example_isolates %>% mutate(out = first_isolate(., include_unknown = TRUE)) + new <- example_isolates %>% group_by(mo) %>% mutate(out = is_new_episode()) + expect_identical(which(old$out), which(new$out)) + }) diff --git a/tests/testthat/test-zzz.R b/tests/testthat/test-zzz.R new file mode 100644 index 00000000..2ea56e8f --- /dev/null +++ b/tests/testthat/test-zzz.R @@ -0,0 +1,69 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance 
(AMR) Analysis for R # +# # +# SOURCE # +# https://github.com/msberends/AMR # +# # +# LICENCE # +# (c) 2018-2020 Berends MS, Luz CF et al. # +# Developed at the University of Groningen, the Netherlands, in # +# collaboration with non-profit organisations Certe Medical # +# Diagnostics & Advice, and University Medical Center Groningen. # +# # +# This R package is free software; you can freely use and distribute # +# it for both personal and commercial purposes under the terms of the # +# GNU General Public License version 2.0 (GNU GPL-2), as published by # +# the Free Software Foundation. # +# We created this package for both routine data analysis and academic # +# research and it was publicly released in the hope that it will be # +# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # +# # +# Visit our website for the full manual and a complete tutorial about # +# how to conduct AMR analysis: https://msberends.github.io/AMR/ # +# ==================================================================== # + +context("zzz.R") + +test_that("imports work", { + skip_on_cran() + + import_functions <- c( + "anti_join" = "dplyr", + "cur_column" = "dplyr", + "cur_data" = "dplyr", + "document_position" = "rstudioapi", + "document_range" = "rstudioapi", + "freq.default" = "cleaner", + "full_join" = "dplyr", + "getSourceEditorContext" = "rstudioapi", + "has_internet" = "curl", + "html_attr" = "rvest", + "html_children" = "rvest", + "html_node" = "rvest", + "html_nodes" = "rvest", + "html_table" = "rvest", + "html_text" = "rvest", + "inline_hist" = "skimr", + "inner_join" = "dplyr", + "insertText" = "rstudioapi", + "insertText" = "rstudioapi", + "insertText" = "rstudioapi", + "left_join" = "dplyr", + "new_pillar_shaft_simple" = "pillar", + "peek_mask" = "dplyr", + "peek_vars" = "tidyselect", + "read_excel" = "readxl", + "read_html" = "xml2", + "right_join" = "dplyr", + "semi_join" = "dplyr", + "sfl" = "skimr", + "showQuestion" = "rstudioapi") + + for (i in 
seq_len(length(import_functions))) { + fn <- names(import_functions)[i] + pkg <- unname(import_functions[i]) + expect(!is.null(import_fn(name = fn, pkg = pkg, error_on_fail = FALSE)), + failure_message = paste0("Function ", pkg, "::", fn, "() does not exist")) + } +})
x, .data

a data.frame containing isolates.

col_date

column name of the result date (or date that is was received on the lab), defaults to the first column of with a date class

column name of the result date (or the date that it was received at the lab), defaults to the first column with a date class

col_patient_id
-

first_isolate() filter_first_isolate() filter_first_weighted_isolate()

+

first_isolate() filter_first_isolate() filter_first_weighted_isolate() is_new_episode()

Determine first (weighted) isolates