1
0
mirror of https://github.com/msberends/AMR.git synced 2024-12-27 08:06:13 +01:00
AMR/R/resistance.R

735 lines
26 KiB
R
Raw Normal View History

2018-02-21 11:52:31 +01:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# AUTHORS #
# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #
# #
# LICENCE #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License version 2.0, #
# as published by the Free Software Foundation. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# ==================================================================== #
2018-07-13 17:23:46 +02:00
#' Calculate resistance of isolates
2018-02-21 11:52:31 +01:00
#'
2018-07-29 22:14:51 +02:00
#' These functions can be used to calculate the (co-)resistance of microbial isolates (i.e. percentage S, SI, IR or R). All functions can be used in \code{dplyr}s \code{\link[dplyr]{summarise}} and support grouped variables, see \emph{Examples}. \cr\cr
#' \code{R} and \code{IR} can be used to calculate resistance, \code{S} and \code{SI} can be used to calculate susceptibility.\cr
#' \code{n_rsi} counts all cases where antimicrobial interpretations are available.
#' @param ab1 vector of antibiotic interpretations, they will be transformed internally with \code{\link{as.rsi}}
#' @param ab2 like \code{ab}, a vector of antibiotic interpretations. Use this to calculate (the lack of) co-resistance: the probability where one of two drugs have a susceptible result. See Examples.
2018-07-13 17:23:46 +02:00
#' @param include_I logical to indicate whether antimicrobial interpretations of "I" should be included
2018-07-29 22:14:51 +02:00
#' @param minimum minimal amount of available isolates. Any number lower than \code{minimum} will return \code{NA}. The default number of \code{30} isolates is advised by the CLSI as best practice, see Source.
2018-07-13 17:23:46 +02:00
#' @param as_percent logical to indicate whether the output must be returned as percent (text), will else be a double
#' @param interpretation antimicrobial interpretation
2018-07-15 22:56:41 +02:00
#' @param info \emph{DEPRECATED} calculate the amount of available isolates and print it, like \code{n = 423}
#' @param warning \emph{DEPRECATED} show a warning when the available amount of isolates is below \code{minimum}
2018-07-13 17:23:46 +02:00
#' @details \strong{Remember that you should filter your table to let it contain only first isolates!} Use \code{\link{first_isolate}} to determine them in your data set.
#'
2018-07-29 22:14:51 +02:00
#' The functions \code{resistance} and \code{susceptibility} are wrappers around \code{IR} and \code{S}, respectively. All functions except \code{rsi} use hybrid evaluation (i.e. using C++), which makes these functions 20-30 times faster than the old \code{rsi} function. This latter function is still available for backwards compatibility but is deprecated.
2018-05-02 14:56:25 +02:00
#' \if{html}{
2018-07-29 22:14:51 +02:00
#' \cr\cr
2018-05-03 10:19:02 +02:00
#' To calculate the probability (\emph{p}) of susceptibility of one antibiotic, we use this formula:
2018-05-02 14:56:25 +02:00
#' \out{<div style="text-align: center">}\figure{mono_therapy.png}\out{</div>}
2018-05-03 10:19:02 +02:00
#' To calculate the probability (\emph{p}) of susceptibility of more antibiotics (i.e. combination therapy), we need to check whether one of them has a susceptible result (as numerator) and count all cases where all antibiotics were tested (as denominator). \cr
2018-07-13 17:23:46 +02:00
#' \cr
2018-05-03 10:19:02 +02:00
#' For two antibiotics:
2018-05-02 14:56:25 +02:00
#' \out{<div style="text-align: center">}\figure{combi_therapy_2.png}\out{</div>}
2018-05-03 10:19:02 +02:00
#' \cr
2018-07-13 17:23:46 +02:00
#' Theoretically for three antibiotics:
2018-05-02 14:56:25 +02:00
#' \out{<div style="text-align: center">}\figure{combi_therapy_3.png}\out{</div>}
#' }
2018-07-29 22:14:51 +02:00
#' @source \strong{M39 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition}, 2014, \emph{Clinical and Laboratory Standards Institute (CLSI)}. \url{https://clsi.org/standards/products/microbiology/documents/m39/}.
#' @keywords resistance susceptibility rsi_df rsi antibiotics isolate isolates
2018-05-02 14:56:25 +02:00
#' @return Double or, when \code{as_percent = TRUE}, a character.
2018-07-13 17:23:46 +02:00
#' @rdname resistance
2018-02-21 11:52:31 +01:00
#' @export
#' @examples
2018-07-13 17:23:46 +02:00
#' # Calculate resistance
2018-07-29 22:14:51 +02:00
#' R(septic_patients$amox)
#' IR(septic_patients$amox)
2018-07-13 17:23:46 +02:00
#'
#' # Or susceptibility
2018-07-29 22:14:51 +02:00
#' S(septic_patients$amox)
#' SI(septic_patients$amox)
2018-07-13 17:23:46 +02:00
#'
2018-07-29 22:14:51 +02:00
#' # Since n_rsi counts available isolates (and is used as denominator),
#' # you can calculate back to e.g. count resistant isolates:
#' IR(septic_patients$amox) * n_rsi(septic_patients$amox)
#'
#' library(dplyr)
#' septic_patients %>%
#' group_by(hospital_id) %>%
#' summarise(p = S(cipr),
#' n = n_rsi(cipr)) # n_rsi works like n_distinct in dplyr
2018-07-13 17:23:46 +02:00
#'
#' # Calculate co-resistance between amoxicillin/clav acid and gentamicin,
#' # so we can see that combination therapy does a lot more than mono therapy:
2018-07-29 22:14:51 +02:00
#' S(septic_patients$amcl) # p = 67.3%
#' n_rsi(septic_patients$amcl) # n = 1570
2018-07-13 17:23:46 +02:00
#'
2018-07-29 22:14:51 +02:00
#' S(septic_patients$gent) # p = 74.0%
#' n_rsi(septic_patients$gent) # n = 1842
2018-07-13 17:23:46 +02:00
#'
#' with(septic_patients,
2018-07-29 22:14:51 +02:00
#' S(amcl, gent)) # p = 92.1%
2018-07-13 17:23:46 +02:00
#' with(septic_patients,
2018-07-29 22:14:51 +02:00
#' n_rsi(amcl, gent)) # n = 1504
#'
#' septic_patients %>%
#' group_by(hospital_id) %>%
#' summarise(cipro_p = S(cipr, as_percent = TRUE),
#' cipro_n = n_rsi(cipr),
#' genta_p = S(gent, as_percent = TRUE),
#' genta_n = n_rsi(gent),
#' combination_p = S(cipr, gent, as_percent = TRUE),
#' combination_n = n_rsi(cipr, gent))
2018-05-02 14:56:25 +02:00
#'
#' \dontrun{
2018-07-29 22:14:51 +02:00
#'
2018-05-02 14:56:25 +02:00
#' # calculate current empiric combination therapy of Helicobacter gastritis:
2018-02-21 11:52:31 +01:00
#' my_table %>%
2018-04-02 16:05:09 +02:00
#' filter(first_isolate == TRUE,
2018-02-21 11:52:31 +01:00
#' genus == "Helicobacter") %>%
2018-07-29 22:14:51 +02:00
#' summarise(p = S(amox, metr), # amoxicillin with metronidazole
2018-07-13 17:23:46 +02:00
#' n = n_rsi(amox, metr))
2018-02-21 11:52:31 +01:00
#' }
2018-07-29 22:14:51 +02:00
#' @rdname resistance
#' @name resistance
#' @export
#' @rdname resistance
#' @export
S <- function(ab1,
ab2 = NULL,
minimum = 30,
as_percent = FALSE) {
susceptibility(ab1 = ab1,
ab2 = ab2,
include_I = FALSE,
minimum = minimum,
as_percent = as_percent)
}
#' @rdname resistance
#' @export
SI <- function(ab1,
ab2 = NULL,
minimum = 30,
as_percent = FALSE) {
susceptibility(ab1 = ab1,
ab2 = ab2,
include_I = TRUE,
minimum = minimum,
as_percent = as_percent)
}
#' @rdname resistance
#' @export
IR <- function(ab1,
minimum = 30,
as_percent = FALSE) {
resistance(ab1 = ab1,
include_I = TRUE,
minimum = minimum,
as_percent = as_percent)
}
#' @rdname resistance
#' @export
R <- function(ab1,
minimum = 30,
as_percent = FALSE) {
resistance(ab1 = ab1,
include_I = FALSE,
minimum = minimum,
as_percent = as_percent)
}
#' @rdname resistance
#' @export
n_rsi <- function(ab1, ab2 = NULL) {
if (NCOL(ab1) > 1) {
stop('`ab` must be a vector of antimicrobial interpretations', call. = FALSE)
}
if (!is.rsi(ab1)) {
ab1 <- as.rsi(ab1)
}
if (!is.null(ab2)) {
if (NCOL(ab2) > 1) {
stop('`ab2` must be a vector of antimicrobial interpretations', call. = FALSE)
}
if (!is.rsi(ab2)) {
ab2 <- as.rsi(ab2)
}
sum(!is.na(ab1) & !is.na(ab2))
} else {
sum(!is.na(ab1))
}
}
#' @rdname resistance
#' @export
resistance <- function(ab1,
2018-07-13 17:23:46 +02:00
include_I = TRUE,
minimum = 30,
as_percent = FALSE) {
2018-07-29 22:14:51 +02:00
if (NCOL(ab1) > 1) {
stop('`ab1` must be a vector of antimicrobial interpretations', call. = FALSE)
2018-05-02 14:56:25 +02:00
}
2018-07-13 17:23:46 +02:00
if (!is.logical(include_I)) {
stop('`include_I` must be logical', call. = FALSE)
2018-05-02 14:56:25 +02:00
}
2018-07-13 17:23:46 +02:00
if (!is.numeric(minimum)) {
stop('`minimum` must be numeric', call. = FALSE)
2018-05-02 14:56:25 +02:00
}
2018-07-13 17:23:46 +02:00
if (!is.logical(as_percent)) {
stop('`as_percent` must be logical', call. = FALSE)
2018-05-02 14:56:25 +02:00
}
2018-07-29 22:14:51 +02:00
# ab_name <- deparse(substitute(ab))
if (!is.rsi(ab1)) {
x <- as.rsi(ab1)
2018-07-17 10:32:26 +02:00
warning("Increase speed by transforming to class `rsi` on beforehand: df %>% mutate_at(vars(col10:col20), as.rsi)")
2018-07-15 22:56:41 +02:00
} else {
2018-07-29 22:14:51 +02:00
x <- ab1
2018-07-15 22:56:41 +02:00
}
2018-07-17 10:32:26 +02:00
total <- length(x) - sum(is.na(x)) # faster than C++
2018-07-13 17:23:46 +02:00
if (total < minimum) {
2018-07-29 22:14:51 +02:00
# warning("Too few isolates available for ", ab_name, ": ", total, " < ", minimum, "; returning NA.", call. = FALSE)
2018-07-13 17:23:46 +02:00
return(NA)
2018-05-02 14:56:25 +02:00
}
2018-07-13 17:23:46 +02:00
found <- .Call(`_AMR_rsi_calc_R`, x, include_I)
2018-05-02 14:56:25 +02:00
if (as_percent == TRUE) {
2018-07-13 17:23:46 +02:00
percent(found / total, force_zero = TRUE)
2018-05-02 14:56:25 +02:00
} else {
2018-07-13 17:23:46 +02:00
found / total
2018-05-02 14:56:25 +02:00
}
}
2018-07-13 17:23:46 +02:00
#' @rdname resistance
2018-05-02 14:56:25 +02:00
#' @export
2018-07-13 17:23:46 +02:00
susceptibility <- function(ab1,
ab2 = NULL,
include_I = FALSE,
minimum = 30,
as_percent = FALSE) {
if (NCOL(ab1) > 1) {
stop('`ab1` must be a vector of antimicrobial interpretations', call. = FALSE)
2018-02-21 11:52:31 +01:00
}
2018-07-13 17:23:46 +02:00
if (!is.logical(include_I)) {
stop('`include_I` must be logical', call. = FALSE)
2018-02-21 11:52:31 +01:00
}
2018-07-13 17:23:46 +02:00
if (!is.numeric(minimum)) {
stop('`minimum` must be numeric', call. = FALSE)
2018-02-21 11:52:31 +01:00
}
2018-07-13 17:23:46 +02:00
if (!is.logical(as_percent)) {
stop('`as_percent` must be logical', call. = FALSE)
2018-02-21 11:52:31 +01:00
}
2018-07-17 10:32:26 +02:00
print_warning <- FALSE
2018-07-15 22:56:41 +02:00
if (!is.rsi(ab1)) {
ab1 <- as.rsi(ab1)
2018-07-17 10:32:26 +02:00
print_warning <- TRUE
2018-07-15 22:56:41 +02:00
}
2018-07-13 17:23:46 +02:00
if (!is.null(ab2)) {
2018-07-29 22:14:51 +02:00
# ab_name <- paste(deparse(substitute(ab1)), "and", deparse(substitute(ab2)))
2018-07-13 17:23:46 +02:00
if (NCOL(ab2) > 1) {
stop('`ab2` must be a vector of antimicrobial interpretations', call. = FALSE)
2018-05-02 14:56:25 +02:00
}
2018-07-15 22:56:41 +02:00
if (!is.rsi(ab2)) {
ab2 <- as.rsi(ab2)
2018-07-17 10:32:26 +02:00
print_warning <- TRUE
2018-07-15 22:56:41 +02:00
}
x <- apply(X = data.frame(ab1 = as.integer(ab1),
ab2 = as.integer(ab2)),
2018-07-13 17:23:46 +02:00
MARGIN = 1,
FUN = min)
2018-02-21 11:52:31 +01:00
} else {
2018-07-15 22:56:41 +02:00
x <- ab1
2018-07-29 22:14:51 +02:00
# ab_name <- deparse(substitute(ab1))
2018-02-21 11:52:31 +01:00
}
2018-07-17 10:32:26 +02:00
total <- length(x) - sum(is.na(x))
2018-07-13 17:23:46 +02:00
if (total < minimum) {
2018-07-29 22:14:51 +02:00
# warning("Too few isolates available for ", ab_name, ": ", total, " < ", minimum, "; returning NA.", call. = FALSE)
2018-07-13 17:23:46 +02:00
return(NA)
2018-02-21 11:52:31 +01:00
}
2018-07-13 17:23:46 +02:00
found <- .Call(`_AMR_rsi_calc_S`, x, include_I)
2018-04-02 16:05:09 +02:00
2018-07-17 10:32:26 +02:00
if (print_warning == TRUE) {
warning("Increase speed by transforming to class `rsi` on beforehand: df %>% mutate_at(vars(col10:col20), as.rsi)")
}
2018-05-02 14:56:25 +02:00
if (as_percent == TRUE) {
2018-07-13 17:23:46 +02:00
percent(found / total, force_zero = TRUE)
} else {
found / total
2018-02-21 11:52:31 +01:00
}
2018-07-13 17:23:46 +02:00
}
2018-05-02 14:56:25 +02:00
2018-07-13 17:23:46 +02:00
#' @rdname resistance
#' @export
rsi <- function(ab1,
2018-07-15 22:56:41 +02:00
ab2 = NA,
interpretation = 'IR',
2018-07-13 17:23:46 +02:00
minimum = 30,
2018-07-15 22:56:41 +02:00
as_percent = FALSE,
info = FALSE,
warning = TRUE) {
2018-07-29 22:14:51 +02:00
.Deprecated()
2018-07-15 22:56:41 +02:00
ab1.name <- deparse(substitute(ab1))
if (ab1.name %like% '.[$].') {
ab1.name <- unlist(strsplit(ab1.name, "$", fixed = TRUE))
ab1.name <- ab1.name[length(ab1.name)]
}
if (!ab1.name %like% '^[a-z]{3,4}$') {
ab1.name <- 'rsi1'
}
if (length(ab1) == 1 & is.character(ab1)) {
stop('`ab1` must be a vector of antibiotic interpretations.',
'\n Try rsi(', ab1, ', ...) instead of rsi("', ab1, '", ...)', call. = FALSE)
}
ab2.name <- deparse(substitute(ab2))
if (ab2.name %like% '.[$].') {
ab2.name <- unlist(strsplit(ab2.name, "$", fixed = TRUE))
ab2.name <- ab2.name[length(ab2.name)]
}
if (!ab2.name %like% '^[a-z]{3,4}$') {
ab2.name <- 'rsi2'
}
if (length(ab2) == 1 & is.character(ab2)) {
stop('`ab2` must be a vector of antibiotic interpretations.',
'\n Try rsi(', ab2, ', ...) instead of rsi("', ab2, '", ...)', call. = FALSE)
}
interpretation <- paste(interpretation, collapse = "")
ab1 <- as.rsi(ab1)
ab2 <- as.rsi(ab2)
tbl <- tibble(rsi1 = ab1, rsi2 = ab2)
colnames(tbl) <- c(ab1.name, ab2.name)
if (length(ab2) == 1) {
r <- rsi_df(tbl = tbl,
ab = ab1.name,
interpretation = interpretation,
minimum = minimum,
as_percent = FALSE,
info = info,
warning = warning)
} else {
if (length(ab1) != length(ab2)) {
stop('`ab1` (n = ', length(ab1), ') and `ab2` (n = ', length(ab2), ') must be of same length.', call. = FALSE)
}
if (!interpretation %in% c('S', 'IS', 'SI')) {
warning('`interpretation` not set to S or I/S, albeit analysing a combination therapy.', call. = FALSE)
}
r <- rsi_df(tbl = tbl,
ab = c(ab1.name, ab2.name),
interpretation = interpretation,
minimum = minimum,
as_percent = FALSE,
info = info,
warning = warning)
}
if (as_percent == TRUE) {
percent(r, force_zero = TRUE)
} else {
r
}
}
#' @importFrom dplyr %>% filter_at vars any_vars all_vars
#' @noRd
rsi_df <- function(tbl,
ab,
interpretation = 'IR',
minimum = 30,
as_percent = FALSE,
info = TRUE,
warning = TRUE) {
# in case tbl$interpretation already exists:
interpretations_to_check <- paste(interpretation, collapse = "")
# validate:
if (min(grepl('^[a-z]{3,4}$', ab)) == 0 &
min(grepl('^rsi[1-2]$', ab)) == 0) {
for (i in 1:length(ab)) {
ab[i] <- paste0('rsi', i)
}
}
if (!grepl('^(S|SI|IS|I|IR|RI|R){1}$', interpretations_to_check)) {
stop('Invalid `interpretation`; must be "S", "SI", "I", "IR", or "R".')
}
if ('is_ic' %in% colnames(tbl)) {
if (n_distinct(tbl$is_ic) > 1 & warning == TRUE) {
warning('Dataset contains isolates from the Intensive Care. Exclude them from proper epidemiological analysis.')
}
}
# transform when checking for different results
if (interpretations_to_check %in% c('SI', 'IS')) {
for (i in 1:length(ab)) {
tbl[which(tbl[, ab[i]] == 'I'), ab[i]] <- 'S'
}
interpretations_to_check <- 'S'
}
if (interpretations_to_check %in% c('RI', 'IR')) {
for (i in 1:length(ab)) {
tbl[which(tbl[, ab[i]] == 'I'), ab[i]] <- 'R'
}
interpretations_to_check <- 'R'
}
# get fraction
if (length(ab) == 1) {
numerator <- tbl %>%
filter(pull(., ab[1]) == interpretations_to_check) %>%
nrow()
denominator <- tbl %>%
filter(pull(., ab[1]) %in% c("S", "I", "R")) %>%
nrow()
} else if (length(ab) == 2) {
if (interpretations_to_check != 'S') {
warning('`interpretation` not set to S or I/S, albeit analysing a combination therapy.', call. = FALSE)
}
numerator <- tbl %>%
filter_at(vars(ab[1], ab[2]),
any_vars(. == interpretations_to_check)) %>%
filter_at(vars(ab[1], ab[2]),
all_vars(. %in% c("S", "R", "I"))) %>%
nrow()
denominator <- tbl %>%
filter_at(vars(ab[1], ab[2]),
all_vars(. %in% c("S", "R", "I"))) %>%
nrow()
2018-02-21 11:52:31 +01:00
} else {
2018-07-17 10:32:26 +02:00
stop('Maximum of 2 drugs allowed.')
2018-07-15 22:56:41 +02:00
}
# build text part
if (info == TRUE) {
cat('n =', denominator)
info.txt1 <- percent(denominator / nrow(tbl))
if (denominator == 0) {
info.txt1 <- 'none'
}
info.txt2 <- gsub(',', ' and',
ab %>%
abname(tolower = TRUE) %>%
toString(), fixed = TRUE)
info.txt2 <- gsub('rsi1 and rsi2', 'these two drugs', info.txt2, fixed = TRUE)
info.txt2 <- gsub('rsi1', 'this drug', info.txt2, fixed = TRUE)
cat(paste0(' (of ', nrow(tbl), ' in total; ', info.txt1, ' tested on ', info.txt2, ')\n'))
2018-02-21 11:52:31 +01:00
}
2018-07-15 22:56:41 +02:00
# calculate and format
y <- numerator / denominator
if (as_percent == TRUE) {
y <- percent(y, force_zero = TRUE)
}
if (denominator < minimum) {
if (warning == TRUE) {
warning(paste0('TOO FEW ISOLATES OF ', toString(ab), ' (n = ', denominator, ', n < ', minimum, '); NO RESULT.'))
}
y <- NA
}
# output
y
2018-02-21 11:52:31 +01:00
}
2018-07-15 22:56:41 +02:00
2018-02-21 11:52:31 +01:00
#' Predict antimicrobial resistance
#'
2018-02-27 20:01:02 +01:00
#' Create a prediction model to predict antimicrobial resistance for the next years on statistical solid ground. Standard errors (SE) will be returned as columns \code{se_min} and \code{se_max}. See Examples for a real live example.
2018-07-26 16:30:42 +02:00
#' @inheritParams first_isolate
#' @param col_ab column name of \code{tbl} with antimicrobial interpretations (\code{R}, \code{I} and \code{S})
#' @param col_date column name of the date, will be used to calculate years if this column doesn't consist of years already
#' @param year_min lowest year to use in the prediction model, dafaults the lowest year in \code{col_date}
#' @param year_max highest year to use in the prediction model, defaults to 15 years after today
2018-02-21 11:52:31 +01:00
#' @param year_every unit of sequence between lowest year found in the data and \code{year_max}
2018-07-26 16:30:42 +02:00
#' @param minimum minimal amount of available isolates per year to include. Years containing less observations will be estimated by the model.
2018-02-21 11:52:31 +01:00
#' @param model the statistical model of choice. Valid values are \code{"binomial"} (or \code{"binom"} or \code{"logit"}) or \code{"loglin"} or \code{"linear"} (or \code{"lin"}).
#' @param I_as_R treat \code{I} as \code{R}
2018-07-26 16:30:42 +02:00
#' @param preserve_measurements logical to indicate whether predictions of years that are actually available in the data should be overwritten with the original data. The standard errors of those years will be \code{NA}.
2018-02-21 11:52:31 +01:00
#' @param info print textual analysis with the name and \code{\link{summary}} of the model.
2018-07-26 16:30:42 +02:00
#' @return \code{data.frame} with columns:
#' \itemize{
#' \item{\code{year}}
2018-07-28 09:34:03 +02:00
#' \item{\code{value}, the same as \code{estimated} when \code{preserve_measurements = FALSE}, and a combination of \code{observed} and \code{estimated} otherwise}
2018-07-26 16:30:42 +02:00
#' \item{\code{se_min}, the lower bound of the standard error with a minimum of \code{0}}
#' \item{\code{se_max} the upper bound of the standard error with a maximum of \code{1}}
#' \item{\code{observations}, the total number of observations, i.e. S + I + R}
#' \item{\code{observed}, the original observed values}
#' \item{\code{estimated}, the estimated values, calculated by the model}
#' }
2018-07-13 17:23:46 +02:00
#' @seealso \code{\link{resistance}} \cr \code{\link{lm}} \code{\link{glm}}
#' @rdname resistance_predict
2018-02-21 11:52:31 +01:00
#' @export
2018-07-26 16:30:42 +02:00
#' @importFrom stats predict glm lm
2018-07-28 09:34:03 +02:00
#' @importFrom dplyr %>% pull mutate group_by_at summarise filter n_distinct arrange case_when
2018-07-26 16:30:42 +02:00
# @importFrom tidyr spread
2018-02-21 11:52:31 +01:00
#' @examples
#' \dontrun{
2018-07-26 16:30:42 +02:00
#' # use it with base R:
#' resistance_predict(tbl = tbl[which(first_isolate == TRUE & genus == "Haemophilus"),],
#' col_ab = "amcl", col_date = "date")
2018-04-02 16:05:09 +02:00
#'
2018-07-26 16:30:42 +02:00
#' # or use dplyr so you can actually read it:
2018-02-22 20:48:48 +01:00
#' library(dplyr)
2018-02-21 11:52:31 +01:00
#' tbl %>%
#' filter(first_isolate == TRUE,
#' genus == "Haemophilus") %>%
2018-07-26 16:30:42 +02:00
#' resistance_predict(amcl, date)
2018-02-27 20:01:02 +01:00
#' }
2018-02-21 11:52:31 +01:00
#'
#'
2018-02-27 20:01:02 +01:00
#' # real live example:
#' library(dplyr)
#' septic_patients %>%
#' # get bacteria properties like genus and species
2018-04-02 16:05:09 +02:00
#' left_join_microorganisms("bactid") %>%
2018-02-27 20:01:02 +01:00
#' # calculate first isolates
2018-04-02 16:05:09 +02:00
#' mutate(first_isolate =
2018-02-27 20:01:02 +01:00
#' first_isolate(.,
#' "date",
#' "patient_id",
2018-03-27 17:43:42 +02:00
#' "bactid",
2018-02-27 20:01:02 +01:00
#' col_specimen = NA,
2018-04-02 16:05:09 +02:00
#' col_icu = NA)) %>%
2018-02-27 20:01:02 +01:00
#' # filter on first E. coli isolates
2018-04-02 16:05:09 +02:00
#' filter(genus == "Escherichia",
#' species == "coli",
2018-02-27 20:01:02 +01:00
#' first_isolate == TRUE) %>%
#' # predict resistance of cefotaxime for next years
2018-07-26 16:30:42 +02:00
#' resistance_predict(col_ab = "cfot",
#' col_date = "date",
#' year_max = 2025,
#' preserve_measurements = TRUE,
#' minimum = 0)
#'
#' # create nice plots with ggplot
#' if (!require(ggplot2)) {
2018-02-27 20:01:02 +01:00
#'
2018-07-26 16:30:42 +02:00
#' data <- septic_patients %>%
#' filter(bactid == "ESCCOL") %>%
#' resistance_predict(col_ab = "amox",
#' col_date = "date",
#' info = FALSE,
#' minimum = 15)
#'
#' ggplot(data,
#' aes(x = year)) +
2018-07-28 09:34:03 +02:00
#' geom_col(aes(y = value),
2018-07-26 16:30:42 +02:00
#' fill = "grey75") +
#' geom_errorbar(aes(ymin = se_min,
#' ymax = se_max),
#' colour = "grey50") +
#' scale_y_continuous(limits = c(0, 1),
#' breaks = seq(0, 1, 0.1),
#' labels = paste0(seq(0, 100, 10), "%")) +
#' labs(title = expression(paste("Forecast of amoxicillin resistance in ",
#' italic("E. coli"))),
#' y = "%IR",
#' x = "Year") +
#' theme_minimal(base_size = 13)
#' }
2018-07-13 17:23:46 +02:00
resistance_predict <- function(tbl,
2018-07-26 16:30:42 +02:00
col_ab,
col_date,
year_min = NULL,
year_max = NULL,
year_every = 1,
minimum = 30,
model = 'binomial',
I_as_R = TRUE,
preserve_measurements = TRUE,
info = TRUE) {
2018-04-02 16:05:09 +02:00
2018-03-27 17:43:42 +02:00
if (nrow(tbl) == 0) {
stop('This table does not contain any observations.')
}
2018-04-02 16:05:09 +02:00
2018-02-27 20:01:02 +01:00
if (!col_ab %in% colnames(tbl)) {
stop('Column ', col_ab, ' not found.')
}
2018-04-02 16:05:09 +02:00
2018-02-27 20:01:02 +01:00
if (!col_date %in% colnames(tbl)) {
stop('Column ', col_date, ' not found.')
}
if ('grouped_df' %in% class(tbl)) {
# no grouped tibbles please, mutate will throw errors
tbl <- base::as.data.frame(tbl, stringsAsFactors = FALSE)
}
2018-02-21 11:52:31 +01:00
if (I_as_R == TRUE) {
tbl[, col_ab] <- gsub('I', 'R', tbl %>% pull(col_ab))
}
2018-02-27 20:01:02 +01:00
2018-07-26 16:30:42 +02:00
if (!tbl %>% pull(col_ab) %>% is.rsi()) {
tbl[, col_ab] <- tbl %>% pull(col_ab) %>% as.rsi()
2018-02-27 20:01:02 +01:00
}
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
year <- function(x) {
2018-02-27 20:01:02 +01:00
if (all(grepl('^[0-9]{4}$', x))) {
x
} else {
as.integer(format(as.Date(x), '%Y'))
}
2018-02-21 11:52:31 +01:00
}
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
df <- tbl %>%
2018-07-26 16:30:42 +02:00
mutate(year = tbl %>% pull(col_date) %>% year()) %>%
2018-02-21 11:52:31 +01:00
group_by_at(c('year', col_ab)) %>%
summarise(n())
2018-07-26 16:30:42 +02:00
if (df %>% pull(col_ab) %>% n_distinct(na.rm = TRUE) < 2) {
stop("No variety in antimicrobial interpretations - all isolates are '",
df %>% pull(col_ab) %>% unique() %>% .[!is.na(.)], "'.",
call. = FALSE)
}
colnames(df) <- c('year', 'antibiotic', 'observations')
2018-02-21 11:52:31 +01:00
df <- df %>%
2018-07-26 16:30:42 +02:00
filter(!is.na(antibiotic)) %>%
tidyr::spread(antibiotic, observations, fill = 0) %>%
mutate(total = R + S) %>%
filter(total >= minimum)
if (NROW(df) == 0) {
stop('There are no observations.')
}
year_lowest <- min(df$year)
if (is.null(year_min)) {
year_min <- year_lowest
} else {
year_min <- max(year_min, year_lowest, na.rm = TRUE)
}
if (is.null(year_max)) {
year_max <- year(Sys.Date()) + 15
}
years_predict <- seq(from = year_min, to = year_max, by = year_every)
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
if (model %in% c('binomial', 'binom', 'logit')) {
logitmodel <- with(df, glm(cbind(R, S) ~ year, family = binomial))
if (info == TRUE) {
cat('\nLogistic regression model (logit) with binomial distribution')
cat('\n------------------------------------------------------------\n')
print(summary(logitmodel))
}
2018-04-02 16:05:09 +02:00
2018-07-26 16:30:42 +02:00
predictmodel <- predict(logitmodel, newdata = with(df, list(year = years_predict)), type = "response", se.fit = TRUE)
2018-02-21 11:52:31 +01:00
prediction <- predictmodel$fit
se <- predictmodel$se.fit
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
} else if (model == 'loglin') {
loglinmodel <- with(df, glm(R ~ year, family = poisson))
if (info == TRUE) {
cat('\nLog-linear regression model (loglin) with poisson distribution')
cat('\n--------------------------------------------------------------\n')
print(summary(loglinmodel))
}
2018-04-02 16:05:09 +02:00
2018-07-26 16:30:42 +02:00
predictmodel <- predict(loglinmodel, newdata = with(df, list(year = years_predict)), type = "response", se.fit = TRUE)
2018-02-21 11:52:31 +01:00
prediction <- predictmodel$fit
se <- predictmodel$se.fit
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
} else if (model %in% c('lin', 'linear')) {
linmodel <- with(df, lm((R / (R + S)) ~ year))
if (info == TRUE) {
cat('\nLinear regression model')
cat('\n-----------------------\n')
print(summary(linmodel))
}
2018-04-02 16:05:09 +02:00
2018-07-26 16:30:42 +02:00
predictmodel <- predict(linmodel, newdata = with(df, list(year = years_predict)), se.fit = TRUE)
2018-02-21 11:52:31 +01:00
prediction <- predictmodel$fit
se <- predictmodel$se.fit
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
} else {
stop('No valid model selected.')
}
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
# prepare the output dataframe
2018-07-28 09:34:03 +02:00
prediction <- data.frame(year = years_predict, value = prediction, stringsAsFactors = FALSE)
2018-04-02 16:05:09 +02:00
2018-07-28 09:34:03 +02:00
prediction$se_min <- prediction$value - se
prediction$se_max <- prediction$value + se
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
if (model == 'loglin') {
2018-07-28 09:34:03 +02:00
prediction$value <- prediction$value %>%
2018-02-21 11:52:31 +01:00
format(scientific = FALSE) %>%
as.integer()
prediction$se_min <- prediction$se_min %>% as.integer()
prediction$se_max <- prediction$se_max %>% as.integer()
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
colnames(prediction) <- c('year', 'amountR', 'se_max', 'se_min')
} else {
prediction$se_max[which(prediction$se_max > 1)] <- 1
}
prediction$se_min[which(prediction$se_min < 0)] <- 0
2018-07-26 16:30:42 +02:00
prediction$observations = NA
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
total <- prediction
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
if (preserve_measurements == TRUE) {
2018-07-26 16:30:42 +02:00
# replace estimated data by observed data
2018-02-21 11:52:31 +01:00
if (I_as_R == TRUE) {
if (!'I' %in% colnames(df)) {
df$I <- 0
}
2018-07-28 09:34:03 +02:00
df$value <- df$R / rowSums(df[, c('R', 'S', 'I')])
2018-02-21 11:52:31 +01:00
} else {
2018-07-28 09:34:03 +02:00
df$value <- df$R / rowSums(df[, c('R', 'S')])
2018-02-21 11:52:31 +01:00
}
measurements <- data.frame(year = df$year,
2018-07-28 09:34:03 +02:00
value = df$value,
2018-07-26 16:30:42 +02:00
se_min = NA,
se_max = NA,
observations = df$total,
stringsAsFactors = FALSE)
2018-02-21 11:52:31 +01:00
colnames(measurements) <- colnames(prediction)
2018-04-02 16:05:09 +02:00
2018-07-26 16:30:42 +02:00
total <- rbind(measurements,
prediction %>% filter(!year %in% df$year))
if (model %in% c('binomial', 'binom', 'logit')) {
2018-07-28 09:34:03 +02:00
total <- total %>% mutate(observed = ifelse(is.na(observations), NA, value),
estimated = prediction$value)
2018-07-26 16:30:42 +02:00
}
2018-02-21 11:52:31 +01:00
}
2018-04-02 16:05:09 +02:00
2018-07-28 09:34:03 +02:00
if ("value" %in% colnames(total)) {
total <- total %>%
mutate(value = case_when(value > 1 ~ 1,
value < 0 ~ 0,
TRUE ~ value))
}
2018-07-26 16:30:42 +02:00
total %>% arrange(year)
2018-04-02 16:05:09 +02:00
2018-02-21 11:52:31 +01:00
}
2018-07-13 17:23:46 +02:00
#' @rdname resistance_predict
#' @export
rsi_predict <- resistance_predict