diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml
index 1088df41..50f6771e 100644
--- a/.github/workflows/check.yaml
+++ b/.github/workflows/check.yaml
@@ -64,6 +64,7 @@ jobs:
- {os: ubuntu-16.04, r: '3.5', allowfail: false, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
- {os: ubuntu-16.04, r: '3.4', allowfail: true, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
- {os: ubuntu-16.04, r: '3.3', allowfail: true, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
+ - {os: ubuntu-16.04, r: '3.2', allowfail: true, rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
# older R versions cannot be tested, since tidyverse only supports last 4 R x.x versions
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
@@ -120,14 +121,15 @@ jobs:
shell: Rscript {0}
- name: Check on older R versions
- if: matrix.config.r == '3.3'
+ # no vignettes here, since they rely on R 3.3 and higher
+ if: matrix.config.r == '3.2'
env:
_R_CHECK_CRAN_INCOMING_: false
- run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran", "--no-build-vignettes" , "--ignore-vignettes"), error_on = "warning", check_dir = "check")
+ run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran", "--ignore-vignettes"), build_args = "--no-build-vignettes" , error_on = "warning", check_dir = "check")
shell: Rscript {0}
- name: Check on newer R versions
- if: matrix.config.r != '3.3'
+ if: matrix.config.r != '3.2'
env:
_R_CHECK_CRAN_INCOMING_: false
run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
diff --git a/DESCRIPTION b/DESCRIPTION
index f30c3e96..45d1100a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
Package: AMR
-Version: 1.4.0.9023
+Version: 1.4.0.9024
Date: 2020-11-17
Title: Antimicrobial Resistance Analysis
Authors@R: c(
diff --git a/NAMESPACE b/NAMESPACE
index 7605a88e..1467837b 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -152,6 +152,7 @@ export(is.mic)
export(is.mo)
export(is.rsi)
export(is.rsi.eligible)
+export(is_new_episode)
export(key_antibiotics)
export(key_antibiotics_equal)
export(kurtosis)
diff --git a/NEWS.md b/NEWS.md
index a80c8cb2..5b99ed0a 100755
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,15 @@
-# AMR 1.4.0.9023
+# AMR 1.4.0.9024
## Last updated: 17 November 2020
### New
+* Function `is_new_episode()` to determine patient episodes which are not necessarily based on microorganisms. It also supports grouped variables with e.g. `mutate()` and `summarise()` of the `dplyr` package:
+ ```r
+ example_isolates %>%
+ group_by(hospital_id) %>%
+ summarise(patients = n_distinct(patient_id),
+ n_episodes_365 = sum(is_new_episode(episode_days = 365)),
+ n_episodes_60 = sum(is_new_episode(episode_days = 60)))
+ ```
* Functions `mo_is_gram_negative()` and `mo_is_gram_positive()` as wrappers around `mo_gramstain()`. They always return `TRUE` or `FALSE` (except when the input is `NA` or the MO code is `UNKNOWN`), thus always return `FALSE` for species outside the taxonomic kingdom of Bacteria. If you have the `dplyr` package installed, they can even determine the column with microorganisms themselves when used inside `dplyr` verbs:
```r
example_isolates %>%
diff --git a/R/aa_helper_functions.R b/R/aa_helper_functions.R
index e53e1650..2c70d9eb 100755
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -139,9 +139,13 @@ check_dataset_integrity <- function() {
}
search_type_in_df <- function(x, type, info = TRUE) {
+ meet_criteria(x, allow_class = "data.frame")
+ meet_criteria(type, allow_class = "character", has_length = 1)
+
# try to find columns based on type
found <- NULL
+ # remove attributes from other packages
x <- as.data.frame(x, stringsAsFactors = FALSE)
colnames(x) <- trimws(colnames(x))
diff --git a/R/first_isolate.R b/R/first_isolate.R
index 7f537074..99557b25 100755
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -25,10 +25,10 @@
#' Determine first (weighted) isolates
#'
-#' Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type.
+#' Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. To determine patient episodes not necessarily based on microorganisms, use [is_new_episode()], which also supports grouping with the `dplyr` package; see *Examples*.
#' @inheritSection lifecycle Stable lifecycle
-#' @param x a [data.frame] containing isolates.
-#' @param col_date column name of the result date (or date that is was received on the lab), defaults to the first column of with a date class
+#' @param x,.data a [data.frame] containing isolates.
+#' @param col_date column name of the result date (or the date it was received at the lab), defaults to the first column with a date class
#' @param col_patient_id column name of the unique IDs of the patients, defaults to the first column that starts with 'patient' or 'patid' (case insensitive)
#' @param col_mo column name of the IDs of the microorganisms (see [as.mo()]), defaults to the first column of class [`mo`]. Values will be coerced using [as.mo()].
#' @param col_testcode column name of the test codes. Use `col_testcode = NULL` to **not** exclude certain test codes (like test codes for screening). In that case `testcodes_exclude` will be ignored.
@@ -45,17 +45,26 @@
#' @param info print progress
#' @param include_unknown logical to determine whether 'unknown' microorganisms should be included too, i.e. microbial code `"UNKNOWN"`, which defaults to `FALSE`. For WHONET users, this means that all records with organism code `"con"` (*contamination*) will be excluded at default. Isolates with a microbial ID of `NA` will always be excluded as first isolate.
#' @param ... parameters passed on to the [first_isolate()] function
-#' @details **WHY THIS IS SO IMPORTANT** \cr
+#' @details The [is_new_episode()] function is a wrapper around the [first_isolate()] function and can be used for data sets without isolates to just determine patient episodes based on any combination of grouping variables (using `dplyr`), please see *Examples*. Since it runs [first_isolate()] for every group, it is quite slow.
+#'
+#' All isolates with a microbial ID of `NA` will be excluded as first isolate.
+#'
+#' ### Why this is so important
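-#' To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode [(ref)](https:/pubmed.ncbi.nlm.nih.gov/17304462/). If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all *S. aureus* isolates would be overestimated, because you included this MRSA more than once. It would be [selection bias](https://en.wikipedia.org/wiki/Selection_bias).
+#' To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode [(ref)](https://pubmed.ncbi.nlm.nih.gov/17304462/). If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all *S. aureus* isolates would be overestimated, because you included this MRSA more than once. It would be [selection bias](https://en.wikipedia.org/wiki/Selection_bias).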
#'
-#' All isolates with a microbial ID of `NA` will be excluded as first isolate.
+#' ### `filter_*()` shortcuts
#'
-#' The functions [filter_first_isolate()] and [filter_first_weighted_isolate()] are helper functions to quickly filter on first isolates. The function [filter_first_isolate()] is essentially equal to either:
+#' The functions [filter_first_isolate()] and [filter_first_weighted_isolate()] are helper functions to quickly filter on first isolates.
+#'
+#' The function [filter_first_isolate()] is essentially equal to either:
+#'
#' ```
#' x[first_isolate(x, ...), ]
#' x %>% filter(first_isolate(x, ...))
#' ```
+#'
#' The function [filter_first_weighted_isolate()] is essentially equal to:
+#'
#' ```
#' x %>%
#' mutate(keyab = key_antibiotics(.)) %>%
@@ -89,21 +98,22 @@
#' # basic filtering on first isolates
#' example_isolates[first_isolate(example_isolates), ]
#'
+#' # filtering based on isolates ----------------------------------------------
#' \donttest{
#' if (require("dplyr")) {
-#' # Filter on first isolates:
+#' # filter on first isolates:
#' example_isolates %>%
#' mutate(first_isolate = first_isolate(.)) %>%
#' filter(first_isolate == TRUE)
#'
-#' # Short-hand versions:
+#' # short-hand versions:
#' example_isolates %>%
#' filter_first_isolate()
#'
#' example_isolates %>%
#' filter_first_weighted_isolate()
#'
-#' # Now let's see if first isolates matter:
+#' # now let's see if first isolates matter:
#' A <- example_isolates %>%
#' group_by(hospital_id) %>%
#' summarise(count = n_rsi(GEN), # gentamicin availability
@@ -120,6 +130,42 @@
#' # Gentamicin resistance in hospital D appears to be 3.7% higher than
#' # when you (erroneously) would have used all isolates for analysis.
#' }
+#'
+#' # filtering based on any other condition -----------------------------------
+#'
+#' if (require("dplyr")) {
+#' # is_new_episode() can be used in dplyr verbs to determine patient
+#' # episodes based on any (combination of) grouping variables:
+#' example_isolates %>%
+#' mutate(condition = sample(x = c("A", "B", "C"),
+#' size = 2000,
+#' replace = TRUE)) %>%
+#' group_by(condition) %>%
+#' mutate(new_episode = is_new_episode())
+#'
+#' example_isolates %>%
+#' group_by(hospital_id) %>%
+#' summarise(patients = n_distinct(patient_id),
+#' n_episodes_365 = sum(is_new_episode(episode_days = 365)),
+#' n_episodes_60 = sum(is_new_episode(episode_days = 60)),
+#' n_episodes_30 = sum(is_new_episode(episode_days = 30)))
+#'
+#'
+#' # grouping on microorganisms leads to the same results as first_isolate():
+#' x <- example_isolates %>%
+#' filter_first_isolate(include_unknown = TRUE)
+#'
+#' y <- example_isolates %>%
+#' group_by(mo) %>%
+#' filter(is_new_episode())
+#'
+#' identical(x$patient_id, y$patient_id)
+#'
+#' # but now you can group on isolates and many more:
+#' example_isolates %>%
+#' group_by(mo, hospital_id, ward_icu) %>%
+#' mutate(flag_episode = is_new_episode())
+#' }
#' }
first_isolate <- function(x,
col_date = NULL,
@@ -139,7 +185,7 @@ first_isolate <- function(x,
info = interactive(),
include_unknown = FALSE,
...) {
- meet_criteria(x, allow_class = "data.frame")
+ meet_criteria(x, allow_class = "data.frame") # also checks dimensions to be >0
meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_mo, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
@@ -175,13 +221,10 @@ first_isolate <- function(x,
}
}
- stop_ifnot(is.data.frame(x), "`x` must be a data.frame")
- stop_if(any(dim(x) == 0), "`x` must contain rows and columns")
-
# remove data.table, grouping from tibbles, etc.
x <- as.data.frame(x, stringsAsFactors = FALSE)
- # try to find columns based on type
+ # try to find columns based on type
# -- mo
if (is.null(col_mo)) {
col_mo <- search_type_in_df(x = x, type = "mo")
@@ -299,13 +342,32 @@ first_isolate <- function(x,
)
}
- # no isolates found
+ # speed up - return immediately if obvious
if (abs(row.start) == Inf | abs(row.end) == Inf) {
if (info == TRUE) {
- message_("=> Found ", font_bold("no isolates"), as_note = FALSE)
+ message_("=> Found ", font_bold("no isolates"),
+ add_fn = font_black,
+ as_note = FALSE)
}
return(rep(FALSE, nrow(x)))
}
+ if (row.start == row.end) {
+ if (info == TRUE) {
+ message_("=> Found ", font_bold("1 isolate"), ", as the data only contained 1 row",
+ add_fn = font_black,
+ as_note = FALSE)
+ }
+ return(TRUE)
+ }
+ if (length(c(row.start:row.end)) == pm_n_distinct(x[c(row.start:row.end), col_mo, drop = TRUE])) {
+ if (info == TRUE) {
+ message_("=> Found ", font_bold(paste(length(c(row.start:row.end)), "isolates")),
+ ", as all isolates were different microorganisms",
+ add_fn = font_black,
+ as_note = FALSE)
+ }
+ return(rep(TRUE, length(c(row.start:row.end))))
+ }
# did find some isolates - add new index numbers of rows
x$newvar_row_index_sorted <- seq_len(nrow(x))
@@ -511,7 +573,66 @@ filter_first_weighted_isolate <- function(x,
subset(x, first_isolate(x = y,
col_date = col_date,
col_patient_id = col_patient_id,
- col_mo = col_mo,
- col_keyantibiotics = col_keyantibiotics,
...))
}
+
+#' @rdname first_isolate
+#' @export
+is_new_episode <- function(.data,
+                           episode_days = 365,
+                           col_date = NULL,
+                           col_patient_id = NULL) {
+  # Returns the result of first_isolate() run on `.data` with one placeholder
+  # microorganism for all rows and the given `episode_days`. When `.data` is
+  # missing, it is taken from dplyr's cur_data(), if that can be imported.
+  if (missing(.data)) {
+    # look it up - this also supports grouping variables
+    cur_data <- import_fn("cur_data", "dplyr", error_on_fail = FALSE)
+    if (is.null(cur_data)) {
+      stop_("parameter '.data' not set.")
+    }
+    .data <- cur_data()
+  }
+  meet_criteria(.data, allow_class = "data.frame") # also checks dimensions to be >0
+  # column names are validated against `.data`; this function has no `x`
+  meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(.data))
+  meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(.data))
+  meet_criteria(episode_days, allow_class = c("numeric", "integer"), has_length = 1)
+
+  # TRUE for the first group, and also when the group ID cannot be retrieved
+  # (the call errors or dplyr is unavailable), so notices are only thrown once
+  cur_group_id <- import_fn("cur_group_id", "dplyr", error_on_fail = FALSE)
+  first_group <- tryCatch(is.null(cur_group_id) || cur_group_id() == 1,
+                          error = function(e) TRUE)
+
+  # try to find columns based on type
+  # -- date
+  if (is.null(col_date)) {
+    col_date <- search_type_in_df(x = .data, type = "date", info = first_group)
+    stop_if(is.null(col_date), "`col_date` must be set")
+  }
+
+  # -- patient id
+  if (is.null(col_patient_id)) {
+    if (all(c("First name", "Last name", "Sex") %in% colnames(.data))) {
+      # WHONET support
+      .data$patient_id <- paste(.data$`First name`, .data$`Last name`, .data$Sex)
+      col_patient_id <- "patient_id"
+      # reuse the guarded flag instead of an unprotected cur_group_id() call
+      if (first_group) {
+        message_("Using combined columns `", font_bold("First name"), "`, `", font_bold("Last name"), "` and `", font_bold("Sex"), "` as input for `col_patient_id`")
+      }
+    } else {
+      col_patient_id <- search_type_in_df(x = .data, type = "patient_id", info = first_group)
+    }
+    stop_if(is.null(col_patient_id), "`col_patient_id` must be set")
+  }
+
+  # create any random mo, so first isolates can be calculated
+  .data$a94a8fe5 <- as.mo("Escherichia coli")
+
+  first_isolate(.data,
+                col_date = col_date, col_patient_id = col_patient_id,
+                episode_days = episode_days, col_mo = "a94a8fe5", info = FALSE)
+}
diff --git a/docs/404.html b/docs/404.html
index 366719f6..edc34440 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -81,7 +81,7 @@
NEWS.md
-
Function is_new_episode()
to determine patient episodes which are not necessarily based on microorganisms. It also supports grouped variables with e.g. mutate()
and summarise()
of the dplyr
package: r example_isolates %>% group_by(hospital_id) %>% summarise(patients = n_distinct(patient_id), n_episodes_365 = sum(is_new_episode(episode_days = 365)), n_episodes_60 = sum(is_new_episode(episode_days = 60)))
Functions mo_is_gram_negative()
and mo_is_gram_positive()
as wrappers around mo_gramstain()
. They always return TRUE
or FALSE
(except when the input is NA
or the MO code is UNKNOWN
), thus always return FALSE
for species outside the taxonomic kingdom of Bacteria. If you have the dplyr
package installed, they can even determine the column with microorganisms themselves when used inside dplyr
verbs:
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index 3618acfe..4884f74a 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -12,7 +12,7 @@ articles:
datasets: datasets.html
resistance_predict: resistance_predict.html
welcome_to_AMR: welcome_to_AMR.html
-last_built: 2020-11-17T10:53Z
+last_built: 2020-11-17T15:56Z
urls:
reference: https://msberends.github.io/AMR//reference
article: https://msberends.github.io/AMR//articles
diff --git a/docs/reference/first_isolate.html b/docs/reference/first_isolate.html
index a8c060b9..1dcba621 100644
--- a/docs/reference/first_isolate.html
+++ b/docs/reference/first_isolate.html
@@ -49,7 +49,7 @@
-
+
@@ -82,7 +82,7 @@
Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type.
+Determine first (weighted) isolates of all microorganisms of every patient per episode and (if needed) per specimen type. To determine patient episodes not necessarily based on microorganisms, use is_new_episode()
that also supports grouping with the dplyr
package, see Examples.
first_isolate( @@ -278,18 +278,25 @@ col_mo = NULL, col_keyantibiotics = NULL, ... +) + +is_new_episode( + .data, + episode_days = 365, + col_date = NULL, + col_patient_id = NULL )
x | +x, .data | a data.frame containing isolates. |
---|---|---|
col_date | -column name of the result date (or date that is was received on the lab), defaults to the first column of with a date class |
+ column name of the result date (or date that is was received on the lab), defaults to the first column with a date class |
col_patient_id | @@ -366,10 +373,17 @@||
-
|
Determine first (weighted) isolates |
|