AMR/R/mean_amr_distance.R

# ==================================================================== #
# TITLE:                                                               #
# AMR: An R Package for Working with Antimicrobial Resistance Data     #
#                                                                      #
# SOURCE CODE:                                                         #
# https://github.com/msberends/AMR                                     #
#                                                                      #
# PLEASE CITE THIS SOFTWARE AS:                                        #
# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
# Journal of Statistical Software, 104(3), 1-31.                       #
# https://doi.org/10.18637/jss.v104.i03                                #
#                                                                      #
# Developed at the University of Groningen and the University Medical  #
# Center Groningen in The Netherlands, in collaboration with many      #
# colleagues from around the world, see our website.                   #
#                                                                      #
# This R package is free software; you can freely use and distribute   #
# it for both personal and commercial purposes under the terms of the  #
# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
# the Free Software Foundation.                                        #
# We created this package for both routine data analysis and academic  #
# research and it was publicly released in the hope that it will be    #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
#                                                                      #
# Visit our website for the full manual and a complete tutorial about  #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
# ==================================================================== #

#' Calculate the Mean AMR Distance
#'
#' Calculates a normalised mean for antimicrobial resistance between multiple observations, to help to identify similar isolates without comparing antibiograms by hand.
#' @param x a vector of class [sir][as.sir()], [mic][as.mic()] or [disk][as.disk()], or a [data.frame] containing columns of any of these classes
#' @param ... variables to select (supports [tidyselect language][tidyselect::language] such as `column1:column4` and `where(is.mic)`, and can thus also be [antibiotic selectors][ab_selector()]
#' @param combine_SI 	a [logical] to indicate whether all values of S, SDD, and I must be merged into one, so the input only consists of S+I vs. R (susceptible vs. resistant) - the default is `TRUE`
#' @details The mean AMR distance is effectively [the Z-score](https://en.wikipedia.org/wiki/Standard_score); a normalised numeric value to compare AMR test results which can help to identify similar isolates, without comparing antibiograms by hand.
#'
#' MIC values (see [as.mic()]) are transformed with [log2()] first; their distance is thus calculated as `(log2(x) - mean(log2(x))) / sd(log2(x))`.
#'
#' SIR values (see [as.sir()]) are transformed using `"S"` = 1, `"I"` = 2, and `"R"` = 3. If `combine_SI` is `TRUE` (default), the `"I"` will be considered to be 1.
#'
#' For data sets, the mean AMR distance will be calculated per column, after which the mean per row will be returned, see *Examples*.
#'
#' Use [amr_distance_from_row()] to subtract distances from the distance of one row, see *Examples*.
#' @section Interpretation:
#' Isolates with distances less than 0.01 difference from each other should be considered similar. Differences lower than 0.025 should be considered suspicious.
#' @export
#' @examples
#' sir <- random_sir(10)
#' sir
#' mean_amr_distance(sir)
#'
#' mic <- random_mic(10)
#' mic
#' mean_amr_distance(mic)
#' # equal to the Z-score of their log2:
#' (log2(mic) - mean(log2(mic))) / sd(log2(mic))
#'
#' disk <- random_disk(10)
#' disk
#' mean_amr_distance(disk)
#'
#' y <- data.frame(
#'   id = LETTERS[1:10],
#'   amox = random_sir(10, ab = "amox", mo = "Escherichia coli"),
#'   cipr = random_disk(10, ab = "cipr", mo = "Escherichia coli"),
#'   gent = random_mic(10, ab = "gent", mo = "Escherichia coli"),
#'   tobr = random_mic(10, ab = "tobr", mo = "Escherichia coli")
#' )
#' y
#' mean_amr_distance(y)
#' y$amr_distance <- mean_amr_distance(y, where(is.mic))
#' y[order(y$amr_distance), ]
#'
#' if (require("dplyr")) {
#'   y %>%
#'     mutate(
#'       amr_distance = mean_amr_distance(y),
#'       check_id_C = amr_distance_from_row(amr_distance, id == "C")
#'     ) %>%
#'     arrange(check_id_C)
#' }
#' if (require("dplyr")) {
#'   # support for groups
#'   example_isolates %>%
#'     filter(mo_genus() == "Enterococcus" & mo_species() != "") %>%
#'     select(mo, TCY, carbapenems()) %>%
#'     group_by(mo) %>%
#'     mutate(dist = mean_amr_distance(.)) %>%
#'     arrange(mo, dist)
#' }
mean_amr_distance <- function(x, ...) {
  UseMethod("mean_amr_distance")
}

#' @noRd
#' @export
mean_amr_distance.default <- function(x, ...) {
  x <- as.double(x)
  # calculate z-score
  (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE)
}

#' @noRd
#' @export
mean_amr_distance.mic <- function(x, ...) {
  mean_amr_distance(log2(x))
}

#' @noRd
#' @export
mean_amr_distance.disk <- function(x, ...) {
  mean_amr_distance(as.double(x))
}

#' @rdname mean_amr_distance
#' @export
mean_amr_distance.sir <- function(x, ..., combine_SI = TRUE) {
  meet_criteria(combine_SI, allow_class = "logical", has_length = 1, .call_depth = -1)
  if (isTRUE(combine_SI)) {
    x[x %in% c("I", "SDD")] <- "S"
  }
  mean_amr_distance(as.double(x))
}

#' @rdname mean_amr_distance
#' @export
mean_amr_distance.data.frame <- function(x, ..., combine_SI = TRUE) {
  meet_criteria(combine_SI, allow_class = "logical", has_length = 1, .call_depth = -1)
  df <- x
  if (is_null_or_grouped_tbl(df)) {
    df <- get_current_data("x", -2)
  }
  df <- as.data.frame(df, stringsAsFactors = FALSE)
  if (tryCatch(length(list(...)) > 0, error = function(e) TRUE)) {
    out <- tryCatch(suppressWarnings(c(...)), error = function(e) NULL)
    if (!is.null(out)) {
      df <- df[, out, drop = FALSE]
    } else {
      df <- pm_select(df, ...)
    }
  }
  df_classes <- colnames(df)[vapply(FUN.VALUE = logical(1), df, function(x) is.disk(x) | is.mic(x) | is.disk(x), USE.NAMES = FALSE)]
  df_antibiotics <- unname(get_column_abx(df, info = FALSE))
  df <- df[, colnames(df)[colnames(df) %in% union(df_classes, df_antibiotics)], drop = FALSE]

  stop_if(ncol(df) < 2,
    "data set must contain at least two variables",
    call = -2
  )
  if (message_not_thrown_before("mean_amr_distance", "groups")) {
    message_("Calculating mean AMR distance based on columns ", vector_and(colnames(df), sort = FALSE))
  }

  res <- vapply(
    FUN.VALUE = double(nrow(df)),
    df,
    mean_amr_distance,
    combine_SI = combine_SI
  )
  if (is.null(dim(res))) {
    if (all(is.na(res))) {
      return(NA_real_)
    } else {
      return(mean(res, na.rm = TRUE))
    }
  }
  res <- rowMeans(res, na.rm = TRUE)
  res[is.infinite(res) | is.nan(res)] <- 0
  res
}

#' @rdname mean_amr_distance
#' @param amr_distance the outcome of [mean_amr_distance()]
#' @param row an index, such as a row number
#' @export
amr_distance_from_row <- function(amr_distance, row) {
  meet_criteria(amr_distance, allow_class = "numeric", is_finite = TRUE)
  meet_criteria(row, allow_class = c("logical", "numeric", "integer"))
  if (is.logical(row)) {
    row <- which(row)
  }
  abs(amr_distance[row] - amr_distance)
}