1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-15 02:01:37 +01:00
AMR/R/first_isolate.R

712 lines
35 KiB
R
Raw Normal View History

2018-02-21 11:52:31 +01:00
# ==================================================================== #
# TITLE: #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2018-02-21 11:52:31 +01:00
# #
# SOURCE CODE: #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2018-02-21 11:52:31 +01:00
# #
# PLEASE CITE THIS SOFTWARE AS: #
2022-10-05 09:12:22 +02:00
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
2023-05-27 10:39:22 +02:00
# https://doi.org/10.18637/jss.v104.i03 #
2022-10-05 09:12:22 +02:00
# #
2022-12-27 15:16:15 +01:00
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
2018-02-21 11:52:31 +01:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2018-02-21 11:52:31 +01:00
# ==================================================================== #
#' Determine First Isolates
2018-02-21 11:52:31 +01:00
#'
#' Determine first isolates of all microorganisms of every patient per episode and (if needed) per specimen type. These functions support all four methods as summarised by Hindler *et al.* in 2007 (\doi{10.1086/511864}). To determine patient episodes not necessarily based on microorganisms, use [is_new_episode()] that also supports grouping with the `dplyr` package.
#' @param x a [data.frame] containing isolates. Can be left blank for automatic determination, see *Examples*.
#' @param col_date column name of the result date (or date that is was received on the lab) - the default is the first column with a date class
#' @param col_patient_id column name of the unique IDs of the patients - the default is the first column that starts with 'patient' or 'patid' (case insensitive)
#' @param col_mo column name of the names or codes of the microorganisms (see [as.mo()]) - the default is the first column of class [`mo`]. Values will be coerced using [as.mo()].
2020-12-17 16:22:25 +01:00
#' @param col_testcode column name of the test codes. Use `col_testcode = NULL` to **not** exclude certain test codes (such as test codes for screening). In that case `testcodes_exclude` will be ignored.
2018-04-02 11:11:21 +02:00
#' @param col_specimen column name of the specimen type or group
2022-08-27 20:49:37 +02:00
#' @param col_icu column name of the logicals (`TRUE`/`FALSE`) whether a ward or department is an Intensive Care Unit (ICU). This can also be a [logical] vector with the same length as rows in `x`.
#' @param col_keyantimicrobials (only useful when `method = "phenotype-based"`) column name of the key antimicrobials to determine first isolates, see [key_antimicrobials()]. The default is the first column that starts with 'key' followed by 'ab' or 'antibiotics' or 'antimicrobials' (case insensitive). Use `col_keyantimicrobials = FALSE` to prevent this. Can also be the output of [key_antimicrobials()].
2022-08-28 10:31:50 +02:00
#' @param episode_days episode in days after which a genus/species combination will be determined as 'first isolate' again. The default of 365 days is based on the guideline by CLSI, see *Source*.
2021-05-12 18:15:03 +02:00
#' @param testcodes_exclude a [character] vector with test codes that should be excluded (case-insensitive)
#' @param icu_exclude a [logical] to indicate whether ICU isolates should be excluded (rows with value `TRUE` in the column set with `col_icu`)
2020-12-22 00:51:17 +01:00
#' @param specimen_group value in the column set with `col_specimen` to filter on
#' @param type type to determine weighed isolates; can be `"keyantimicrobials"` or `"points"`, see *Details*
2021-04-29 17:16:30 +02:00
#' @param method the method to apply, either `"phenotype-based"`, `"episode-based"`, `"patient-based"` or `"isolate-based"` (can be abbreviated), see *Details*. The default is `"phenotype-based"` if antimicrobial test results are present in the data, and `"episode-based"` otherwise.
2021-05-12 18:15:03 +02:00
#' @param ignore_I [logical] to indicate whether antibiotic interpretations with `"I"` will be ignored when `type = "keyantimicrobials"`, see *Details*
#' @param points_threshold minimum number of points to require before differences in the antibiogram will lead to inclusion of an isolate when `type = "points"`, see *Details*
#' @param info a [logical] to indicate info should be printed - the default is `TRUE` only in interactive mode
2021-05-12 18:15:03 +02:00
#' @param include_unknown a [logical] to indicate whether 'unknown' microorganisms should be included too, i.e. microbial code `"UNKNOWN"`, which defaults to `FALSE`. For WHONET users, this means that all records with organism code `"con"` (*contamination*) will be excluded at default. Isolates with a microbial ID of `NA` will always be excluded as first isolate.
2023-01-21 23:47:20 +01:00
#' @param include_untested_sir a [logical] to indicate whether also rows without antibiotic results are still eligible for becoming a first isolate. Use `include_untested_sir = FALSE` to always return `FALSE` for such rows. This checks the data set for columns of class `sir` and consequently requires transforming columns with antibiotic results using [as.sir()] first.
2021-04-29 17:16:30 +02:00
#' @param ... arguments passed on to [first_isolate()] when using [filter_first_isolate()], otherwise arguments passed on to [key_antimicrobials()] (such as `universal`, `gram_negative`, `gram_positive`)
2022-08-28 10:31:50 +02:00
#' @details
2021-04-29 17:16:30 +02:00
#' To conduct epidemiological analyses on antimicrobial resistance data, only so-called first isolates should be included to prevent overestimation and underestimation of antimicrobial resistance. Different methods can be used to do so, see below.
2022-08-28 10:31:50 +02:00
#'
2021-05-17 19:43:01 +02:00
#' These functions are context-aware. This means that the `x` argument can be left blank if used inside a [data.frame] call, see *Examples*.
2022-08-28 10:31:50 +02:00
#'
#' The [first_isolate()] function is a wrapper around the [is_new_episode()] function, but more efficient for data sets containing microorganism codes or names.
2022-08-28 10:31:50 +02:00
#'
2020-11-17 16:57:41 +01:00
#' All isolates with a microbial ID of `NA` will be excluded as first isolate.
2022-08-28 10:31:50 +02:00
#'
2022-10-10 15:44:59 +02:00
#' ### Different methods
2022-08-28 10:31:50 +02:00
#'
#' According to Hindler *et al.* (2007, \doi{10.1086/511864}), there are different methods (algorithms) to select first isolates with increasing reliability: isolate-based, patient-based, episode-based and phenotype-based. All methods select on a combination of the taxonomic genus and species (not subspecies).
#'
2021-04-29 17:16:30 +02:00
#' All mentioned methods are covered in the [first_isolate()] function:
2022-08-28 10:31:50 +02:00
#'
#'
2021-04-29 17:16:30 +02:00
#' | **Method** | **Function to apply** |
#' |--------------------------------------------------|-------------------------------------------------------|
2021-04-29 17:16:30 +02:00
#' | **Isolate-based** | `first_isolate(x, method = "isolate-based")` |
#' | *(= all isolates)* | |
#' | | |
2023-02-09 13:07:39 +01:00
#' | | |
2021-04-29 17:16:30 +02:00
#' | **Patient-based** | `first_isolate(x, method = "patient-based")` |
#' | *(= first isolate per patient)* | |
#' | | |
2023-02-09 13:07:39 +01:00
#' | | |
2021-04-29 17:16:30 +02:00
#' | **Episode-based** | `first_isolate(x, method = "episode-based")`, or: |
#' | *(= first isolate per episode)* | |
#' | - 7-Day interval from initial isolate | - `first_isolate(x, method = "e", episode_days = 7)` |
#' | - 30-Day interval from initial isolate | - `first_isolate(x, method = "e", episode_days = 30)` |
#' | | |
2023-02-09 13:07:39 +01:00
#' | | |
2021-04-29 17:16:30 +02:00
#' | **Phenotype-based** | `first_isolate(x, method = "phenotype-based")`, or: |
#' | *(= first isolate per phenotype)* | |
#' | - Major difference in any antimicrobial result | - `first_isolate(x, type = "points")` |
#' | - Any difference in key antimicrobial results | - `first_isolate(x, type = "keyantimicrobials")` |
2022-08-28 10:31:50 +02:00
#'
#' ### Isolate-based
2022-08-28 10:31:50 +02:00
#'
2021-04-29 17:16:30 +02:00
#' This method does not require any selection, as all isolates should be included. It does, however, respect all arguments set in the [first_isolate()] function. For example, the default setting for `include_unknown` (`FALSE`) will omit selection of rows without a microbial ID.
2022-08-28 10:31:50 +02:00
#'
#' ### Patient-based
2022-08-28 10:31:50 +02:00
#'
2021-04-29 17:16:30 +02:00
#' To include every genus-species combination per patient once, set the `episode_days` to `Inf`. Although often inappropriate, this method makes sure that no duplicate isolates are selected from the same patient. In a large longitudinal data set, this could mean that isolates are *excluded* that were found years after the initial isolate.
2022-08-28 10:31:50 +02:00
#'
#' ### Episode-based
2022-08-28 10:31:50 +02:00
#'
#' To include every genus-species combination per patient episode once, set the `episode_days` to a sensible number of days. Depending on the type of analysis, this could be 14, 30, 60 or 365. Short episodes are common for analysing specific hospital or ward data, long episodes are common for analysing regional and national data.
2022-08-28 10:31:50 +02:00
#'
2021-04-29 17:16:30 +02:00
#' This is the most common method to correct for duplicate isolates. Patients are categorised into episodes based on their ID and dates (e.g., the date of specimen receipt or laboratory result). While this is a common method, it does not take into account antimicrobial test results. This means that e.g. a methicillin-resistant *Staphylococcus aureus* (MRSA) isolate cannot be differentiated from a wildtype *Staphylococcus aureus* isolate.
2022-08-28 10:31:50 +02:00
#'
#' ### Phenotype-based
2022-08-28 10:31:50 +02:00
#'
2021-04-29 17:16:30 +02:00
#' This is a more reliable method, since it also *weighs* the antibiogram (antimicrobial test results) yielding so-called 'first weighted isolates'. There are two different methods to weigh the antibiogram:
2022-08-28 10:31:50 +02:00
#'
#' 1. Using `type = "points"` and argument `points_threshold` (default)
2022-08-28 10:31:50 +02:00
#'
2022-11-13 13:44:25 +01:00
#' This method weighs *all* antimicrobial drugs available in the data set. Any difference from I to S or R (or vice versa) counts as `0.5` points, a difference from S to R (or vice versa) counts as `1` point. When the sum of points exceeds `points_threshold`, which defaults to `2`, an isolate will be selected as a first weighted isolate.
2022-08-28 10:31:50 +02:00
#'
#' All antimicrobials are internally selected using the [all_antimicrobials()] function. The output of this function does not need to be passed to the [first_isolate()] function.
2022-08-28 10:31:50 +02:00
#'
#'
#' 2. Using `type = "keyantimicrobials"` and argument `ignore_I`
2022-08-28 10:31:50 +02:00
#'
2022-11-13 13:44:25 +01:00
#' This method only weighs specific antimicrobial drugs, called *key antimicrobials*. Any difference from S to R (or vice versa) in these key antimicrobials will select an isolate as a first weighted isolate. With `ignore_I = FALSE`, also differences from I to S or R (or vice versa) will lead to this.
2022-08-28 10:31:50 +02:00
#'
#' Key antimicrobials are internally selected using the [key_antimicrobials()] function, but can also be added manually as a variable to the data and set in the `col_keyantimicrobials` argument. Another option is to pass the output of the [key_antimicrobials()] function directly to the `col_keyantimicrobials` argument.
2022-08-28 10:31:50 +02:00
#'
#'
2021-04-29 17:16:30 +02:00
#' The default method is phenotype-based (using `type = "points"`) and episode-based (using `episode_days = 365`). This makes sure that every genus-species combination is selected per patient once per year, while taking into account all antimicrobial test results. If no antimicrobial test results are available in the data set, only the episode-based method is applied at default.
2018-12-22 22:39:34 +01:00
#' @rdname first_isolate
#' @seealso [key_antimicrobials()]
2018-02-26 12:15:52 +01:00
#' @export
2022-08-27 20:49:37 +02:00
#' @return A [logical] vector
2020-05-16 13:05:47 +02:00
#' @source Methodology of this function is strictly based on:
2022-08-28 10:31:50 +02:00
#'
#' - **M39 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 5th Edition**, 2022, *Clinical and Laboratory Standards Institute (CLSI)*. <https://clsi.org/standards/products/microbiology/documents/m39/>.
2022-08-28 10:31:50 +02:00
#'
#' - Hindler JF and Stelling J (2007). **Analysis and Presentation of Cumulative Antibiograms: A New Consensus Guideline from the Clinical and Laboratory Standards Institute.** Clinical Infectious Diseases, 44(6), 867-873. \doi{10.1086/511864}
2018-02-21 11:52:31 +01:00
#' @examples
#' # `example_isolates` is a data set available in the AMR package.
#' # See ?example_isolates.
2022-08-28 10:31:50 +02:00
#'
2023-02-10 13:13:17 +01:00
#' example_isolates[first_isolate(info = TRUE), ]
#' \donttest{
#' # get all first Gram-negatives
2022-08-21 16:37:20 +02:00
#' example_isolates[which(first_isolate(info = FALSE) & mo_is_gram_negative()), ]
#'
#' if (require("dplyr")) {
#' # filter on first isolates using dplyr:
#' example_isolates %>%
2023-02-10 13:13:17 +01:00
#' filter(first_isolate(info = TRUE))
2022-08-27 20:49:37 +02:00
#' }
#' if (require("dplyr")) {
2021-04-29 17:16:30 +02:00
#' # short-hand version:
#' example_isolates %>%
2022-08-21 16:37:20 +02:00
#' filter_first_isolate(info = FALSE)
2022-08-27 20:49:37 +02:00
#' }
#' if (require("dplyr")) {
2022-08-28 10:31:50 +02:00
#' # flag the first isolates per group:
#' example_isolates %>%
#' group_by(ward) %>%
2023-02-10 13:13:17 +01:00
#' mutate(first = first_isolate(info = TRUE)) %>%
2022-08-28 10:31:50 +02:00
#' select(ward, date, patient, mo, first)
#' }
2018-02-21 11:52:31 +01:00
#' }
first_isolate <- function(x = NULL,
2018-10-23 11:15:05 +02:00
col_date = NULL,
col_patient_id = NULL,
col_mo = NULL,
col_testcode = NULL,
col_specimen = NULL,
col_icu = NULL,
col_keyantimicrobials = NULL,
2018-02-21 11:52:31 +01:00
episode_days = 365,
2018-10-23 11:15:05 +02:00
testcodes_exclude = NULL,
2018-02-21 11:52:31 +01:00
icu_exclude = FALSE,
2018-12-22 22:39:34 +01:00
specimen_group = NULL,
type = "points",
method = c("phenotype-based", "episode-based", "patient-based", "isolate-based"),
ignore_I = TRUE,
2018-02-27 20:01:02 +01:00
points_threshold = 2,
info = interactive(),
include_unknown = FALSE,
2023-01-21 23:47:20 +01:00
include_untested_sir = TRUE,
2018-12-22 22:39:34 +01:00
...) {
2021-02-09 12:28:15 +01:00
if (is_null_or_grouped_tbl(x)) {
2023-02-15 17:02:10 +01:00
# when `x` is left blank, auto determine it (get_current_data() searches underlying data within call)
2021-02-09 12:28:15 +01:00
# is also fix for using a grouped df as input (a dot as first argument)
x <- tryCatch(get_current_data(arg_name = "x", call = -2), error = function(e) x)
2021-02-09 12:28:15 +01:00
}
meet_criteria(x, allow_class = "data.frame") # also checks dimensions to be >0
meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_mo, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_testcode, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
2020-10-21 15:28:48 +02:00
if (isFALSE(col_specimen)) {
col_specimen <- NULL
}
meet_criteria(col_specimen, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
2022-08-27 20:49:37 +02:00
if (is.logical(col_icu)) {
2023-05-24 15:55:53 +02:00
meet_criteria(col_icu, allow_class = "logical", has_length = c(1, nrow(x)), allow_NA = TRUE, allow_NULL = TRUE)
x$newvar_is_icu <- col_icu
} else if (!is.null(col_icu)) {
2023-05-24 15:55:53 +02:00
# add "logical" to the allowed classes here, since it may give an error in certain user input, and should then also say that logicals can be used too
2022-08-27 20:49:37 +02:00
meet_criteria(col_icu, allow_class = c("character", "logical"), has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
x$newvar_is_icu <- x[, col_icu, drop = TRUE]
} else {
2023-05-24 15:55:53 +02:00
x$newvar_is_icu <- NA
2022-08-27 20:49:37 +02:00
}
# method
2021-04-29 17:16:30 +02:00
method <- coerce_method(method)
meet_criteria(method, allow_class = "character", has_length = 1, is_in = c("phenotype-based", "episode-based", "patient-based", "isolate-based"))
# key antimicrobials
if (length(col_keyantimicrobials) > 1) {
meet_criteria(col_keyantimicrobials, allow_class = "character", has_length = nrow(x))
x$keyabcol <- col_keyantimicrobials
col_keyantimicrobials <- "keyabcol"
2021-04-12 12:35:13 +02:00
} else {
if (isFALSE(col_keyantimicrobials)) {
col_keyantimicrobials <- NULL
# method cannot be phenotype-based anymore
2021-04-29 17:16:30 +02:00
if (method == "phenotype-based") {
method <- "episode-based"
}
2021-04-12 12:35:13 +02:00
}
meet_criteria(col_keyantimicrobials, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
2020-10-21 15:28:48 +02:00
}
2021-04-12 12:35:13 +02:00
meet_criteria(episode_days, allow_class = c("numeric", "integer"), has_length = 1, is_positive = TRUE, is_finite = FALSE)
meet_criteria(testcodes_exclude, allow_class = "character", allow_NULL = TRUE)
meet_criteria(icu_exclude, allow_class = "logical", has_length = 1)
meet_criteria(specimen_group, allow_class = "character", has_length = 1, allow_NULL = TRUE)
meet_criteria(type, allow_class = "character", has_length = 1, is_in = c("points", "keyantimicrobials"))
meet_criteria(ignore_I, allow_class = "logical", has_length = 1)
meet_criteria(points_threshold, allow_class = c("numeric", "integer"), has_length = 1, is_positive = TRUE, is_finite = TRUE)
meet_criteria(info, allow_class = "logical", has_length = 1)
meet_criteria(include_unknown, allow_class = "logical", has_length = 1)
2023-01-21 23:47:20 +01:00
meet_criteria(include_untested_sir, allow_class = "logical", has_length = 1)
2022-08-28 10:31:50 +02:00
2021-04-29 17:16:30 +02:00
# remove data.table, grouping from tibbles, etc.
x <- as.data.frame(x, stringsAsFactors = FALSE)
2022-08-28 10:31:50 +02:00
2023-01-21 23:47:20 +01:00
any_col_contains_sir <- any(vapply(
2022-08-28 10:31:50 +02:00
FUN.VALUE = logical(1),
X = x,
# check only first 10,000 rows
2023-01-21 23:47:20 +01:00
FUN = function(x) any(as.character(x[1:10000]) %in% c("S", "I", "R"), na.rm = TRUE),
2022-08-28 10:31:50 +02:00
USE.NAMES = FALSE
))
2023-01-21 23:47:20 +01:00
if (method == "phenotype-based" && !any_col_contains_sir) {
2021-04-29 17:16:30 +02:00
method <- "episode-based"
}
2022-11-14 15:20:39 +01:00
if (isTRUE(info) && message_not_thrown_before("first_isolate", "method")) {
2023-01-23 15:01:21 +01:00
message_(
paste0(
"Determining first isolates ",
ifelse(method %in% c("episode-based", "phenotype-based"),
ifelse(is.infinite(episode_days),
paste(font_bold("without"), " a specified episode length"),
paste("using an episode length of", font_bold(paste(episode_days, "days")))
2023-01-23 15:01:21 +01:00
),
""
)
),
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
}
# try to find columns based on type
2018-10-23 11:15:05 +02:00
# -- mo
2019-01-15 12:45:24 +01:00
if (is.null(col_mo)) {
2021-05-24 15:29:17 +02:00
col_mo <- search_type_in_df(x = x, type = "mo", info = info)
stop_if(is.null(col_mo), "`col_mo` must be set")
2018-10-23 11:15:05 +02:00
}
2022-08-28 10:31:50 +02:00
# methods ----
if (method == "isolate-based") {
episode_days <- Inf
col_keyantimicrobials <- NULL
x$dummy_dates <- Sys.Date()
col_date <- "dummy_dates"
x$dummy_patients <- paste("dummy", seq_len(nrow(x))) # all 'patients' must be unique
col_patient_id <- "dummy_patients"
} else if (method == "patient-based") {
episode_days <- Inf
col_keyantimicrobials <- NULL
} else if (method == "episode-based") {
col_keyantimicrobials <- NULL
} else if (method == "phenotype-based") {
2022-10-05 09:12:22 +02:00
if (missing(type) && !is.null(col_keyantimicrobials)) {
# type = "points" is default, but not set explicitly, while col_keyantimicrobials is
type <- "keyantimicrobials"
}
if (type == "points") {
2023-01-21 23:47:20 +01:00
x$keyantimicrobials <- all_antimicrobials(x, only_sir_columns = FALSE)
col_keyantimicrobials <- "keyantimicrobials"
2022-10-05 09:12:22 +02:00
} else if (type == "keyantimicrobials" && is.null(col_keyantimicrobials)) {
2021-05-24 15:29:17 +02:00
col_keyantimicrobials <- search_type_in_df(x = x, type = "keyantimicrobials", info = info)
if (is.null(col_keyantimicrobials)) {
# still not found as a column, create it ourselves
2023-01-21 23:47:20 +01:00
x$keyantimicrobials <- key_antimicrobials(x, only_sir_columns = FALSE, col_mo = col_mo, ...)
col_keyantimicrobials <- "keyantimicrobials"
}
}
}
2022-08-28 10:31:50 +02:00
2018-10-23 11:15:05 +02:00
# -- date
2018-12-10 15:14:29 +01:00
if (is.null(col_date)) {
2021-05-24 15:29:17 +02:00
col_date <- search_type_in_df(x = x, type = "date", info = info)
stop_if(is.null(col_date), "`col_date` must be set")
2018-08-31 13:36:19 +02:00
}
2022-08-28 10:31:50 +02:00
2018-10-23 11:15:05 +02:00
# -- patient id
2019-01-15 12:45:24 +01:00
if (is.null(col_patient_id)) {
2019-10-11 17:21:02 +02:00
if (all(c("First name", "Last name", "Sex") %in% colnames(x))) {
2019-01-29 20:20:09 +01:00
# WHONET support
2020-05-16 13:05:47 +02:00
x$patient_id <- paste(x$`First name`, x$`Last name`, x$Sex)
2019-01-29 20:20:09 +01:00
col_patient_id <- "patient_id"
message_("Using combined columns '", font_bold("First name"), "', '", font_bold("Last name"), "' and '", font_bold("Sex"), "' as input for `col_patient_id`")
2019-01-29 20:20:09 +01:00
} else {
2021-05-24 15:29:17 +02:00
col_patient_id <- search_type_in_df(x = x, type = "patient_id", info = info)
2019-01-29 20:20:09 +01:00
}
stop_if(is.null(col_patient_id), "`col_patient_id` must be set")
2018-12-10 15:14:29 +01:00
}
2020-10-21 15:28:48 +02:00
2019-01-29 00:06:50 +01:00
# -- specimen
2022-10-05 09:12:22 +02:00
if (is.null(col_specimen) && !is.null(specimen_group)) {
2021-05-24 15:29:17 +02:00
col_specimen <- search_type_in_df(x = x, type = "specimen", info = info)
2019-01-29 00:06:50 +01:00
}
2022-08-28 10:31:50 +02:00
# check if columns exist
2019-05-13 14:56:23 +02:00
check_columns_existance <- function(column, tblname = x) {
2018-10-23 11:15:05 +02:00
if (!is.null(column)) {
stop_ifnot(column %in% colnames(tblname),
2022-08-28 10:31:50 +02:00
"Column '", column, "' not found.",
call = FALSE
)
2018-02-21 11:52:31 +01:00
}
}
2022-08-28 10:31:50 +02:00
2018-02-21 11:52:31 +01:00
check_columns_existance(col_date)
2018-02-26 14:06:31 +01:00
check_columns_existance(col_patient_id)
2018-08-31 13:36:19 +02:00
check_columns_existance(col_mo)
2018-02-21 11:52:31 +01:00
check_columns_existance(col_testcode)
check_columns_existance(col_keyantimicrobials)
2022-08-28 10:31:50 +02:00
# convert dates to Date
dates <- as.Date(x[, col_date, drop = TRUE])
dates[is.na(dates)] <- as.Date("1970-01-01")
x[, col_date] <- dates
2022-08-28 10:31:50 +02:00
2020-05-16 13:05:47 +02:00
# create original row index
x$newvar_row_index <- seq_len(nrow(x))
2023-10-20 14:51:48 +02:00
x$newvar_mo <- as.mo(x[, col_mo, drop = TRUE], keep_synonyms = TRUE, info = FALSE)
x$newvar_genus_species <- paste(mo_genus(x$newvar_mo, keep_synonyms = TRUE, info = FALSE), mo_species(x$newvar_mo, keep_synonyms = TRUE, info = FALSE))
x$newvar_date <- x[, col_date, drop = TRUE]
2023-03-11 17:13:19 +01:00
x$newvar_patient_id <- as.character(x[, col_patient_id, drop = TRUE])
2022-08-28 10:31:50 +02:00
2018-10-23 11:15:05 +02:00
if (is.null(col_testcode)) {
testcodes_exclude <- NULL
2018-02-21 11:52:31 +01:00
}
# remove testcodes
2022-11-14 15:20:39 +01:00
if (!is.null(testcodes_exclude) && isTRUE(info) && message_not_thrown_before("first_isolate", "excludingtestcodes")) {
message_("Excluding test codes: ", vector_and(testcodes_exclude, quotes = TRUE),
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
2018-02-21 11:52:31 +01:00
}
2022-08-28 10:31:50 +02:00
2018-10-23 11:15:05 +02:00
if (is.null(col_specimen)) {
2018-12-22 22:39:34 +01:00
specimen_group <- NULL
}
2022-08-28 10:31:50 +02:00
# filter on specimen group and keyantibiotics when they are filled in
2018-12-22 22:39:34 +01:00
if (!is.null(specimen_group)) {
2019-05-13 14:56:23 +02:00
check_columns_existance(col_specimen, x)
2022-11-14 15:20:39 +01:00
if (isTRUE(info) && message_not_thrown_before("first_isolate", "excludingspecimen")) {
message_("Excluding other than specimen group '", specimen_group, "'",
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
2018-02-21 11:52:31 +01:00
}
}
if (!is.null(col_keyantimicrobials)) {
2023-03-11 17:13:19 +01:00
x$newvar_key_ab <- as.character(x[, col_keyantimicrobials, drop = TRUE])
2018-02-21 11:52:31 +01:00
}
2022-08-28 10:31:50 +02:00
2018-10-23 11:15:05 +02:00
if (is.null(testcodes_exclude)) {
2019-10-11 17:21:02 +02:00
testcodes_exclude <- ""
2018-02-21 11:52:31 +01:00
}
2022-08-28 10:31:50 +02:00
# arrange data to the right sorting
2018-12-22 22:39:34 +01:00
if (is.null(specimen_group)) {
2022-08-28 10:31:50 +02:00
x <- x[order(
x$newvar_patient_id,
x$newvar_genus_species,
x$newvar_date
), ]
rownames(x) <- NULL
row.start <- 1
row.end <- nrow(x)
2018-02-21 11:52:31 +01:00
} else {
2020-05-16 13:05:47 +02:00
# filtering on specimen and only analyse these rows to save time
2022-08-28 10:31:50 +02:00
x <- x[order(
2023-02-09 13:07:39 +01:00
pm_pull(x, col_specimen),
2022-08-28 10:31:50 +02:00
x$newvar_patient_id,
x$newvar_genus_species,
x$newvar_date
), ]
rownames(x) <- NULL
suppressWarnings(
2023-02-09 13:07:39 +01:00
row.start <- which(x %pm>% pm_pull(col_specimen) == specimen_group) %pm>% min(na.rm = TRUE)
)
suppressWarnings(
2023-02-09 13:07:39 +01:00
row.end <- which(x %pm>% pm_pull(col_specimen) == specimen_group) %pm>% max(na.rm = TRUE)
)
2018-02-21 11:52:31 +01:00
}
2022-08-28 10:31:50 +02:00
2020-11-17 16:57:41 +01:00
# speed up - return immediately if obvious
2022-10-05 09:12:22 +02:00
if (abs(row.start) == Inf || abs(row.end) == Inf) {
2022-11-14 15:20:39 +01:00
if (isTRUE(info)) {
2020-11-17 16:57:41 +01:00
message_("=> Found ", font_bold("no isolates"),
2022-08-28 10:31:50 +02:00
add_fn = font_black,
as_note = FALSE
)
2018-02-21 11:52:31 +01:00
}
return(rep(FALSE, nrow(x)))
2018-02-21 11:52:31 +01:00
}
2020-11-17 16:57:41 +01:00
if (row.start == row.end) {
2022-11-14 15:20:39 +01:00
if (isTRUE(info)) {
2022-08-28 10:31:50 +02:00
message_("=> Found ", font_bold("1 first isolate"), ", as the data only contained 1 row",
add_fn = font_black,
as_note = FALSE
)
2020-11-17 16:57:41 +01:00
}
return(TRUE)
}
2023-02-09 13:07:39 +01:00
if (length(c(row.start:row.end)) == pm_n_distinct(x[c(row.start:row.end), col_mo, drop = TRUE])) {
2022-11-14 15:20:39 +01:00
if (isTRUE(info)) {
message_("=> Found ", font_bold(paste(length(c(row.start:row.end)), "first isolates")),
2022-08-28 10:31:50 +02:00
", as all isolates were different microbial species",
add_fn = font_black,
as_note = FALSE
)
2020-11-17 16:57:41 +01:00
}
return(rep(TRUE, length(c(row.start:row.end))))
}
2022-08-28 10:31:50 +02:00
# did find some isolates - add new index numbers of rows
2020-05-16 13:05:47 +02:00
x$newvar_row_index_sorted <- seq_len(nrow(x))
2022-08-28 10:31:50 +02:00
2023-10-20 14:51:48 +02:00
scope.size <- nrow(x[which(x$newvar_row_index_sorted %in% seq(row.start, row.end, 1) &
2022-08-28 10:31:50 +02:00
!is.na(x$newvar_mo)), , drop = FALSE])
# Analysis of first isolate ----
if (!is.null(col_keyantimicrobials)) {
2022-11-14 15:20:39 +01:00
if (isTRUE(info) && message_not_thrown_before("first_isolate", "type")) {
if (type == "keyantimicrobials") {
message_("Basing inclusion on key antimicrobials, ",
2022-08-28 10:31:50 +02:00
ifelse(ignore_I == FALSE, "not ", ""),
"ignoring I",
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
}
2019-10-11 17:21:02 +02:00
if (type == "points") {
2021-05-30 22:14:38 +02:00
message_("Basing inclusion on all antimicrobial results, using a points threshold of ",
2022-08-28 10:31:50 +02:00
points_threshold,
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
}
2018-02-21 11:52:31 +01:00
}
}
2022-08-28 10:31:50 +02:00
2023-02-09 13:07:39 +01:00
x$other_pat_or_mo <- !(x$newvar_patient_id == pm_lag(x$newvar_patient_id) & x$newvar_genus_species == pm_lag(x$newvar_genus_species))
2022-08-28 10:31:50 +02:00
2023-10-20 14:51:48 +02:00
x$newvar_episode_group <- paste(x$newvar_patient_id, x$newvar_genus_species)
2023-01-23 15:01:21 +01:00
x$more_than_episode_ago <- unlist(
lapply(
split(
x$newvar_date,
2023-10-20 14:51:48 +02:00
x$newvar_episode_group
2023-01-23 15:01:21 +01:00
),
2023-02-10 13:13:17 +01:00
is_new_episode,
2023-10-20 14:51:48 +02:00
episode_days = episode_days,
drop = FALSE
2023-01-23 15:01:21 +01:00
),
use.names = FALSE
2022-08-28 10:31:50 +02:00
)
2023-10-20 14:51:48 +02:00
if (!is.null(col_keyantimicrobials)) {
2023-10-20 14:51:48 +02:00
# using phenotypes
x$different_antibiogram <- !unlist(
lapply(
split(
x$newvar_key_ab,
x$newvar_episode_group
),
duplicated_antibiogram,
points_threshold = points_threshold,
ignore_I = ignore_I,
type = type
),
use.names = FALSE
2022-08-28 10:31:50 +02:00
)
2018-02-21 11:52:31 +01:00
} else {
2023-10-20 14:51:48 +02:00
x$different_antibiogram <- FALSE
2022-08-28 10:31:50 +02:00
}
2023-10-20 14:51:48 +02:00
x$newvar_first_isolate <- x$newvar_row_index_sorted >= row.start &
x$newvar_row_index_sorted <= row.end &
x$newvar_genus_species != "" &
(x$other_pat_or_mo | x$more_than_episode_ago | x$different_antibiogram)
decimal.mark <- getOption("OutDec")
big.mark <- ifelse(decimal.mark != ",", ",", " ")
# first one as TRUE
2020-05-16 13:05:47 +02:00
x[row.start, "newvar_first_isolate"] <- TRUE
# no tests that should be included, or ICU
2018-10-23 11:15:05 +02:00
if (!is.null(col_testcode)) {
2020-05-16 13:05:47 +02:00
x[which(x[, col_testcode] %in% tolower(testcodes_exclude)), "newvar_first_isolate"] <- FALSE
2018-02-21 11:52:31 +01:00
}
if (any(!is.na(x$newvar_is_icu)) && any(x$newvar_is_icu == TRUE, na.rm = TRUE)) {
2020-05-16 13:05:47 +02:00
if (icu_exclude == TRUE) {
2023-02-10 13:13:17 +01:00
if (isTRUE(info)) {
message_("Excluding ", format(sum(x$newvar_is_icu, na.rm = TRUE), decimal.mark = decimal.mark, big.mark = big.mark), " isolates from ICU.",
add_fn = font_red)
2023-02-10 13:13:17 +01:00
}
x[which(x$newvar_is_icu), "newvar_first_isolate"] <- FALSE
2023-02-10 13:13:17 +01:00
} else if (isTRUE(info)) {
message_("Including isolates from ICU.")
2020-05-16 13:05:47 +02:00
}
2018-02-21 11:52:31 +01:00
}
2022-08-28 10:31:50 +02:00
2022-11-14 15:20:39 +01:00
if (isTRUE(info)) {
# print group name if used in dplyr::group_by()
cur_group <- import_fn("cur_group", "dplyr", error_on_fail = FALSE)
if (!is.null(cur_group)) {
group_df <- tryCatch(cur_group(), error = function(e) data.frame())
if (NCOL(group_df) > 0) {
# transform factors to characters
group <- vapply(FUN.VALUE = character(1), group_df, function(x) {
if (is.numeric(x)) {
format(x)
} else if (is.logical(x)) {
as.character(x)
} else {
paste0('"', x, '"')
}
})
message_("\nGroup: ", paste0(names(group), " = ", group, collapse = ", "), "\n",
2022-08-28 10:31:50 +02:00
as_note = FALSE,
add_fn = font_red
)
}
}
}
2022-08-28 10:31:50 +02:00
# handle empty microorganisms
2022-11-14 15:20:39 +01:00
if (any(x$newvar_mo == "UNKNOWN", na.rm = TRUE) && isTRUE(info)) {
2022-08-28 10:31:50 +02:00
message_(
ifelse(include_unknown == TRUE, "Including ", "Excluding "),
2022-08-28 10:31:50 +02:00
format(sum(x$newvar_mo == "UNKNOWN", na.rm = TRUE),
decimal.mark = decimal.mark, big.mark = big.mark
),
" isolates with a microbial ID 'UNKNOWN' (in column '", font_bold(col_mo), "')",
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
}
2020-05-16 13:05:47 +02:00
x[which(x$newvar_mo == "UNKNOWN"), "newvar_first_isolate"] <- include_unknown
2022-08-28 10:31:50 +02:00
# exclude all NAs
2022-11-14 15:20:39 +01:00
if (anyNA(x$newvar_mo) && isTRUE(info)) {
2022-08-28 10:31:50 +02:00
message_(
"Excluding ", format(sum(is.na(x$newvar_mo), na.rm = TRUE),
2022-08-28 10:31:50 +02:00
decimal.mark = decimal.mark, big.mark = big.mark
),
" isolates with a microbial ID `NA` (in column '", font_bold(col_mo), "')",
add_fn = font_red
2022-08-28 10:31:50 +02:00
)
}
2020-05-16 13:05:47 +02:00
x[which(is.na(x$newvar_mo)), "newvar_first_isolate"] <- FALSE
2022-08-28 10:31:50 +02:00
# handle isolates without antibiogram
2023-01-21 23:47:20 +01:00
if (include_untested_sir == FALSE && any(is.sir(x))) {
sir_all_NA <- which(unname(vapply(
2022-08-28 10:31:50 +02:00
FUN.VALUE = logical(1),
2023-01-21 23:47:20 +01:00
as.data.frame(t(x[, is.sir(x), drop = FALSE])),
function(sir_values) all(is.na(sir_values))
2022-08-28 10:31:50 +02:00
)))
2023-01-21 23:47:20 +01:00
x[sir_all_NA, "newvar_first_isolate"] <- FALSE
}
2022-08-28 10:31:50 +02:00
# arrange back according to original sorting again
x <- x[order(x$newvar_row_index), , drop = FALSE]
2020-05-16 13:05:47 +02:00
rownames(x) <- NULL
2022-08-28 10:31:50 +02:00
2022-11-14 15:20:39 +01:00
if (isTRUE(info)) {
n_found <- sum(x$newvar_first_isolate, na.rm = TRUE)
p_found_total <- percentage(n_found / nrow(x[which(!is.na(x$newvar_mo)), , drop = FALSE]), digits = 1)
p_found_scope <- percentage(n_found / scope.size, digits = 1)
if (p_found_total %unlike% "[.]") {
p_found_total <- gsub("%", ".0%", p_found_total, fixed = TRUE)
}
if (p_found_scope %unlike% "[.]") {
p_found_scope <- gsub("%", ".0%", p_found_scope, fixed = TRUE)
}
2018-12-22 22:39:34 +01:00
# mark up number of found
n_found <- format(n_found, big.mark = big.mark, decimal.mark = decimal.mark)
2023-01-23 15:01:21 +01:00
message_(
paste0(
"=> Found ",
font_bold(paste0(
n_found,
ifelse(method == "isolate-based", "", paste0(" '", method, "'")),
" first isolates"
)),
" (",
ifelse(p_found_total != p_found_scope,
paste0(p_found_scope, " within scope and "),
""
),
p_found_total, " of total where a microbial ID was available)"
2022-08-28 10:31:50 +02:00
),
2023-01-23 15:01:21 +01:00
add_fn = font_black, as_note = FALSE
2022-08-28 10:31:50 +02:00
)
}
2020-05-16 13:05:47 +02:00
x$newvar_first_isolate
2018-02-21 11:52:31 +01:00
}
2018-12-22 22:39:34 +01:00
#' @rdname first_isolate
#' @export
filter_first_isolate <- function(x = NULL,
2018-12-22 22:39:34 +01:00
col_date = NULL,
col_patient_id = NULL,
col_mo = NULL,
2021-04-29 17:16:30 +02:00
episode_days = 365,
method = c("phenotype-based", "episode-based", "patient-based", "isolate-based"),
2018-12-22 22:39:34 +01:00
...) {
if (is_null_or_grouped_tbl(x)) {
2023-02-15 17:02:10 +01:00
# when `x` is left blank, auto determine it (get_current_data() searches underlying data within call)
# is also fix for using a grouped df as input (a dot as first argument)
x <- tryCatch(get_current_data(arg_name = "x", call = -2), error = function(e) x)
}
2021-02-09 12:28:15 +01:00
meet_criteria(x, allow_class = "data.frame") # also checks dimensions to be >0
meet_criteria(col_date, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_patient_id, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
meet_criteria(col_mo, allow_class = "character", has_length = 1, allow_NULL = TRUE, is_in = colnames(x))
2021-04-29 17:16:30 +02:00
meet_criteria(episode_days, allow_class = c("numeric", "integer"), has_length = 1, is_positive = TRUE, is_finite = FALSE)
method <- coerce_method(method)
meet_criteria(method, allow_class = "character", has_length = 1, is_in = c("phenotype-based", "episode-based", "patient-based", "isolate-based"))
2022-08-28 10:31:50 +02:00
subset(x, first_isolate(
x = x,
col_date = col_date,
col_patient_id = col_patient_id,
col_mo = col_mo,
episode_days = episode_days,
method = method,
...
))
2018-12-22 22:39:34 +01:00
}
2021-04-29 17:16:30 +02:00
coerce_method <- function(method) {
if (is.null(method)) {
return(method)
}
method <- tolower(as.character(method[1L]))
method[method %like% "^(p$|pheno)"] <- "phenotype-based"
method[method %like% "^(e$|episode)"] <- "episode-based"
method[method %like% "^pat"] <- "patient-based"
2021-04-29 17:16:30 +02:00
method[method %like% "^(i$|iso)"] <- "isolate-based"
method
2020-11-17 16:57:41 +01:00
}
2023-10-20 14:51:48 +02:00
duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type) {
if (length(antibiogram) == 1) {
# fast return, only 1 isolate
return(FALSE)
}
out <- rep(NA, length(antibiogram))
out[1] <- FALSE
out[2] <- antimicrobials_equal(antibiogram[1], antibiogram[2],
ignore_I = ignore_I, points_threshold = points_threshold,
type = type)
if (length(antibiogram) == 2) {
# fast return, no further check required
return(out)
}
# sort after the second one (since we already determined AB equality of the first two)
original_sort <- c(1, 2, rank(antibiogram[3:length(antibiogram)]) + 2)
antibiogram.bak <- antibiogram
antibiogram <- c(antibiogram[1:2], sort(antibiogram[3:length(antibiogram)]))
# we can skip the duplicates - they are never unique antibiograms of course
duplicates <- duplicated(antibiogram)
out[3:length(out)][duplicates[3:length(out)] == TRUE] <- TRUE
if (all(duplicates[3:length(out)] == TRUE, na.rm = TRUE)) {
# fast return, no further check required
return(c(out[1:2], rep(TRUE, length(out) - 2)))
}
for (na in antibiogram[is.na(out)]) {
# check if this antibiogram has any change with other antibiograms
out[which(antibiogram == na)] <- all(
vapply(FUN.VALUE = logical(1),
antibiogram[!is.na(out) & antibiogram != na],
function(y) antimicrobials_equal(y = y, z = na,
ignore_I = ignore_I, points_threshold = points_threshold,
type = type)))
}
out <- out[original_sort]
# rerun duplicated again
duplicates <- duplicated(antibiogram.bak)
out[duplicates == TRUE] <- TRUE
out
}