# AMR/R/guess_ab_col.R

# ==================================================================== #
# TITLE                                                                #
# Antimicrobial Resistance (AMR) Analysis                              #
#                                                                      #
# SOURCE                                                               #
# https://gitlab.com/msberends/AMR                                     #
#                                                                      #
# LICENCE                                                              #
# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl)  #
#                                                                      #
# This R package is free software; you can freely use and distribute   #
# it for both personal and commercial purposes under the terms of the  #
# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
# the Free Software Foundation.                                        #
#                                                                      #
# This R package was created for academic research and was publicly    #
# released in the hope that it will be useful, but it comes WITHOUT    #
# ANY WARRANTY OR LIABILITY.                                           #
# Visit our website for more info: https://msberends.gitlab.io/AMR.    #
# ==================================================================== #

#' Guess antibiotic column
#'
#' This tries to find a column name in a data set based on information from the \code{\link{antibiotics}} data set. Also supports WHONET abbreviations.
#' @param x a \code{data.frame}
#' @param search_string a text to search \code{x} for, will be checked with \code{\link{as.ab}} if this value is not a column in \code{x}
#' @param verbose a logical to indicate whether additional info should be printed
#' @details You can look for an antibiotic (trade) name or abbreviation and it will search \code{x} and the \code{\link{antibiotics}} data set for any column containing a name or code of that antibiotic. \strong{Longer column names take precedence over shorter column names.}
#'
#' The search is done in this order: (1) \code{search_string} is itself a column name of \code{x}, (2) the antibiotic code returned by \code{\link{as.ab}} is a column name of \code{x}, (3) a column name equals (case-insensitive) one of the abbreviations of that antibiotic, (4) a column name is translated by \code{\link{as.ab}} to the same antibiotic code, trying the longest column names first.
#'
#' When both \code{x} and \code{search_string} are missing, the name of this function is returned as a symbol.
#' @importFrom dplyr %>% select filter_all any_vars
#' @importFrom crayon blue bold
#' @return A column name of \code{x}, or \code{NULL} when no result is found.
#' @export
#' @inheritSection AMR Read more on our website!
#' @examples
#' df <- data.frame(amox = "S",
#'                  tetr = "R")
#'
#' guess_ab_col(df, "amoxicillin")
#' # [1] "amox"
#' guess_ab_col(df, "J01AA07") # ATC code of tetracycline
#' # [1] "tetr"
#'
#' guess_ab_col(df, "J01AA07", verbose = TRUE)
#' # NOTE: Using column `tetr` as input for `J01AA07` (tetracycline).
#' # [1] "tetr"
#'
#' # WHONET codes
#' df <- data.frame(AMP_ND10 = "R",
#'                  AMC_ED20 = "S")
#' guess_ab_col(df, "ampicillin")
#' # [1] "AMP_ND10"
#' guess_ab_col(df, "J01CR02")
#' # [1] "AMC_ED20"
#' guess_ab_col(df, as.ab("augmentin"))
#' # [1] "AMC_ED20"
#'
#' # Longer names take precedence:
#' df <- data.frame(AMP_ED2 = "S",
#'                  AMP_ED20 = "S")
#' guess_ab_col(df, "ampicillin")
#' # [1] "AMP_ED20"
guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE) {
  # called without any input: return the function name as a symbol
  if (is.null(x) && is.null(search_string)) {
    return(as.name("guess_ab_col"))
  }
  if (!is.data.frame(x)) {
    stop("`x` must be a data.frame")
  }
  # an empty search string would otherwise fail below with the cryptic
  # 'argument is of length zero'
  if (length(search_string) == 0) {
    stop("`search_string` must be a text of length 1")
  }

  if (length(search_string) > 1) {
    warning("argument 'search_string' has length > 1 and only the first element will be used")
    search_string <- search_string[1]
  }
  search_string <- as.character(search_string)
  x_names <- colnames(x)

  if (search_string %in% x_names) {
    # 1. the search string is a column name itself
    ab_result <- search_string
  } else {
    search_string.ab <- suppressWarnings(as.ab(search_string))

    if (length(search_string.ab) == 0 || is.na(search_string.ab)) {
      # not a known antibiotic, so no column can be matched on its code
      ab_result <- character(0)

    } else if (search_string.ab %in% x_names) {
      # 2. the antibiotic code is a column name
      ab_result <- x_names[x_names == search_string.ab][1L]

    } else {
      # 3. a column name equals one of the abbreviations (case-insensitive);
      # matching on trade names is intentionally not done here
      abbreviations <- tolower(unlist(ab_property(search_string.ab, "abbreviations")))
      is_abbreviation <- tolower(x_names) %in% abbreviations

      if (any(is_abbreviation)) {
        ab_result <- x_names[is_abbreviation][1L]
      } else {
        # 4. translate every column name to an antibiotic code and compare;
        # sort colnames on length - longest first. The names are sorted
        # directly (not via x[, ...]) so a data.frame with a single column
        # is not dropped to a vector.
        cols <- x_names[rev(order(nchar(x_names)))]
        cols_ab <- suppressWarnings(as.ab(cols))
        ab_result <- cols[which(cols_ab == search_string.ab)]
        ab_result <- ab_result[!is.na(ab_result)]
        # only take the first hit when there is one: indexing an empty
        # vector with [1L] yields NA, which would hide the 'not found' case
        if (length(ab_result) > 0) {
          ab_result <- ab_result[1L]
        }
      }
    }
  }

  if (length(ab_result) == 0) {
    if (isTRUE(verbose)) {
      message(paste0("No column found as input for `", search_string,
                     "` (", ab_name(search_string, language = "en", tolower = TRUE), ")."))
    }
    return(NULL)
  }

  if (isTRUE(verbose)) {
    message(blue(paste0("NOTE: Using column `", bold(ab_result), "` as input for `", search_string,
                        "` (", ab_name(search_string, language = "en", tolower = TRUE), ").")))
  }
  ab_result
}


# Determine which columns of `x` hold antibiotic test results.
#
# x                  data set whose column names are translated with as.ab()
# soft_dependencies  antibiotic codes that are nice to have; a message is
#                    printed when some of them could not be found
# hard_dependencies  antibiotic codes that are required; when some of them
#                    could not be found a warning is given and NA is returned
# verbose            print a note for every column that will be used
# ...                manual overrides as `<antibiotic> = "<column name>"`; use
#                    `<antibiotic> = NULL` to prevent an antibiotic from being used
#
# Returns a character vector of column names, named with their antibiotic
# code and sorted on that code, or NA when a hard dependency is missing.
#' @importFrom crayon blue bold red
#' @importFrom dplyr %>% mutate arrange pull
get_column_abx <- function(x,
                           soft_dependencies = NULL,
                           hard_dependencies = NULL,
                           verbose = FALSE,
                           ...) {

  # determine from given data set; `x` is reused below for the result vector
  x_bak <- x
  # coerce once instead of once per column
  x_df <- as.data.frame(x_bak)
  df_trans <- data.frame(colnames = colnames(x),
                         abcode = suppressWarnings(as.ab(colnames(x))),
                         stringsAsFactors = FALSE)
  df_trans <- df_trans[!is.na(df_trans$abcode), ]
  x <- as.character(df_trans$colnames)
  names(x) <- as.character(df_trans$abcode)

  # remove the ones that are not a valid AB code, ATC code, name, abbreviation or synonym,
  # and do not already have the rsi class (as.rsi)
  # and that have >50% invalid values
  vectr_antibiotics <- unique(toupper(unlist(AMR::antibiotics[, c("ab", "atc", "name", "abbreviations", "synonyms")])))
  vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]
  # vapply() keeps `x` a (named) character vector, also when it is empty
  x <- vapply(x, function(col) {
    values <- x_df[, col]
    keep <- toupper(col) %in% vectr_antibiotics ||
      isTRUE(is.rsi(values)) ||
      isTRUE(is.rsi.eligible(values, threshold = 0.5))
    if (keep) col else NA_character_
  }, FUN.VALUE = character(1))
  x <- x[!is.na(x)]

  # add from self-defined dots (...):
  # get_column_abx(example_isolates %>% rename(thisone = AMX), amox = "thisone")
  dots <- list(...)
  if (length(dots) > 0) {
    newnames <- suppressWarnings(as.ab(names(dots)))
    if (any(is.na(newnames))) {
      warning("Invalid antibiotic reference(s): ", toString(names(dots)[is.na(newnames)]),
              call. = FALSE, immediate. = TRUE)
    }
    # turn all NULLs to NAs
    dots <- unlist(lapply(dots, function(value) if (is.null(value)) NA else value))
    names(dots) <- newnames
    dots <- dots[!is.na(names(dots))]
    # merge, but overwrite automatically determined ones by 'dots'
    x <- c(x[!x %in% dots & !names(x) %in% names(dots)], dots)
    # delete NAs, this will make e.g. eucast_rules(... TMP = NULL) work to prevent TMP from being used
    x <- x[!is.na(x)]
  }

  # sort on name
  if (length(x) > 0) {
    x <- x[order(names(x), x)]
  }
  # a column that was matched more than once is only kept for the first
  # antibiotic code (in sorted order)
  duplicates <- x[base::duplicated(x)]
  x <- x[!names(x) %in% names(duplicates)]

  if (verbose == TRUE) {
    # seq_along() so nothing is printed when no column was found
    for (i in seq_along(x)) {
      message(blue(paste0("NOTE: Using column `", bold(x[i]), "` as input for `", names(x)[i],
                          "` (", ab_name(names(x)[i], tolower = TRUE), ").")))
    }
  } else if (length(duplicates) > 0) {
    for (i in seq_along(duplicates)) {
      # the antibiotic code this column is kept for; looked up per duplicate,
      # so the code and the name in the warning always belong together
      used_for <- names(x[which(x == duplicates[i])])
      warning(red(paste0("Using column `", bold(duplicates[i]), "` as input for `", used_for,
                         "` (", ab_name(used_for, tolower = TRUE),
                         "), although it was matched for multiple antibiotics or columns.")), call. = FALSE)
    }
  }

  if (!is.null(hard_dependencies)) {
    if (!all(hard_dependencies %in% names(x))) {
      # missing a hard dependency will return NA and consequently the data will not be analysed
      missing <- hard_dependencies[!hard_dependencies %in% names(x)]
      generate_warning_abs_missing(missing, any = FALSE)
      return(NA)
    }
  }
  if (!is.null(soft_dependencies)) {
    if (!all(soft_dependencies %in% names(x))) {
      # missing a soft dependency may lower the reliability
      missing <- soft_dependencies[!soft_dependencies %in% names(x)]
      missing_txt <- data.frame(missing = missing,
                                missing_names = AMR::ab_name(missing, tolower = TRUE),
                                stringsAsFactors = FALSE) %>%
        mutate(txt = paste0(bold(missing), " (", missing_names, ")")) %>%
        arrange(missing_names) %>%
        pull(txt)
      message(blue('NOTE: Reliability might be improved if these antimicrobial results would be available too:',
                   paste(missing_txt, collapse = ", ")))
    }
  }
  x
}

# Warn that NAs will be introduced because required antimicrobials are missing.
#
# missing  antibiotic codes that could not be found; each one is printed
#          together with the name returned by ab_name()
# any      TRUE when any one of the antimicrobials would have sufficed,
#          FALSE when all of them are required; this only changes the wording
generate_warning_abs_missing <- function(missing, any = FALSE) {
  labelled <- paste0(missing, " (", ab_name(missing, tolower = TRUE), ")")

  # default wording: all listed antimicrobials are required
  quantifier <- ""
  verb <- "are"
  if (any == TRUE) {
    quantifier <- " any of"
    verb <- "is"
  }

  warning_txt <- paste0("Introducing NAs since", quantifier,
                        " these antimicrobials ", verb, " required: ",
                        paste(labelled, collapse = ", "))
  # shown immediately and without the call that triggered it
  warning(warning_txt, immediate. = TRUE, call. = FALSE)
}
guess_ab 2019-01-03 23:56:19 +01:00			`# ==================================================================== #`
			`# TITLE #`
			`# Antimicrobial Resistance (AMR) Analysis #`
			`# #`
			`# SOURCE #`
			`# https://gitlab.com/msberends/AMR #`
			`# #`
			`# LICENCE #`
			`# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #`
			`# #`
			`# This R package is free software; you can freely use and distribute #`
			`# it for both personal and commercial purposes under the terms of the #`
			`# GNU General Public License version 2.0 (GNU GPL-2), as published by #`
			`# the Free Software Foundation. #`
			`# #`
			`# This R package was created for academic research and was publicly #`
			`# released in the hope that it will be useful, but it comes WITHOUT #`
			`# ANY WARRANTY OR LIABILITY. #`
new EUCAST rules algorithm 2019-04-05 18:47:39 +02:00			`# Visit our website for more info: https://msberends.gitlab.io/AMR. #`
guess_ab 2019-01-03 23:56:19 +01:00			`# ==================================================================== #`

			`#' Guess antibiotic column`
			`#'`
(v0.6.1.9045) age test fix 2019-05-31 14:40:15 +02:00			`#' This tries to find a column name in a data set based on information from the \code{\link{antibiotics}} data set. Also supports WHONET abbreviations.`
CI tests 2019-05-13 10:10:16 +02:00			`#' @param x a \code{data.frame}`
(v0.7.1.9032) eucast_rules() improvements 2019-08-09 14:28:46 +02:00			`#' @param search_string a text to search \code{x} for, will be checked with \code{\link{as.ab}} if this value is not a column in \code{x}`
guess_ab 2019-01-03 23:56:19 +01:00			`#' @param verbose a logical to indicate whether additional info should be printed`
(v0.7.1.9032) eucast_rules() improvements 2019-08-09 14:28:46 +02:00			`#' @details You can look for an antibiotic (trade) name or abbreviation and it will search \code{x} and the \code{\link{antibiotics}} data set for any column containing a name or code of that antibiotic. \strong{Longer columns names take precendence over shorter column names.}`
guess_ab 2019-01-03 23:56:19 +01:00			`#' @importFrom dplyr %>% select filter_all any_vars`
cfta streptococci, codecov.yml 2019-04-09 14:59:17 +02:00			`#' @importFrom crayon blue`
CI tests 2019-05-13 10:10:16 +02:00			`#' @return A column name of \code{x}, or \code{NULL} when no result is found.`
guess_ab 2019-01-03 23:56:19 +01:00			`#' @export`
			`#' @inheritSection AMR Read more on our website!`
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00			`#' @examples`
			`#' df <- data.frame(amox = "S",`
			`#' tetr = "R")`
			`#'`
			`#' guess_ab_col(df, "amoxicillin")`
			`#' # [1] "amox"`
new antibiotics 2019-05-10 16:44:59 +02:00			`#' guess_ab_col(df, "J01AA07") # ATC code of tetracycline`
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00			`#' # [1] "tetr"`
			`#'`
			`#' guess_ab_col(df, "J01AA07", verbose = TRUE)`
CI tests 2019-05-13 10:10:16 +02:00			#' # Note: Using column `tetr` as input for "J01AA07".
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00			`#' # [1] "tetr"`
WHONET/EARS-Net support 2019-01-29 00:06:50 +01:00			`#'`
			`#' # WHONET codes`
			`#' df <- data.frame(AMP_ND10 = "R",`
			`#' AMC_ED20 = "S")`
			`#' guess_ab_col(df, "ampicillin")`
			`#' # [1] "AMP_ND10"`
			`#' guess_ab_col(df, "J01CR02")`
			`#' # [1] "AMC_ED20"`
new antibiotics 2019-05-10 16:44:59 +02:00			`#' guess_ab_col(df, as.ab("augmentin"))`
WHONET/EARS-Net support 2019-01-29 00:06:50 +01:00			`#' # [1] "AMC_ED20"`
(v0.6.1.9045) age test fix 2019-05-31 14:40:15 +02:00			`#'`
			`#' # Longer names take precendence:`
			`#' df <- data.frame(AMP_ED2 = "S",`
			`#' AMP_ED20 = "S")`
			`#' guess_ab_col(df, "ampicillin")`
			`#' # [1] "AMP_ED20"`
CI tests 2019-05-13 10:10:16 +02:00			`guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE) {`
			`if (is.null(x) & is.null(search_string)) {`
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00			`return(as.name("guess_ab_col"))`
guess_ab 2019-01-03 23:56:19 +01:00			`}`
speed improvement eucast_rules(), support more old MO codes 2019-05-20 12:00:18 +02:00			`if (!is.data.frame(x)) {`
			stop("`x` must be a data.frame")
			`}`
v0.6.1 2019-03-28 21:33:28 +01:00
CI tests 2019-05-13 10:10:16 +02:00			`if (length(search_string) > 1) {`
			`warning("argument 'search_string' has length > 1 and only the first element will be used")`
			`search_string <- search_string[1]`
guess_ab 2019-01-03 23:56:19 +01:00			`}`
CI tests 2019-05-13 10:10:16 +02:00			`search_string <- as.character(search_string)`
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00
CI tests 2019-05-13 10:10:16 +02:00			`if (search_string %in% colnames(x)) {`
			`ab_result <- search_string`
new antibiotics 2019-05-10 16:44:59 +02:00			`} else {`
documentation fix 2019-05-13 20:16:51 +02:00			`search_string.ab <- suppressWarnings(as.ab(search_string))`
			`if (search_string.ab %in% colnames(x)) {`
			`ab_result <- colnames(x)[colnames(x) == search_string.ab][1L]`
(v0.7.1.9013) guess_mo_ab speed improvement 2019-07-11 13:39:18 +02:00
			`} else if (any(tolower(colnames(x)) %in% tolower(unlist(ab_property(search_string.ab, "abbreviations"))))) {`
			`ab_result <- colnames(x)[tolower(colnames(x)) %in% tolower(unlist(ab_property(search_string.ab, "abbreviations")))][1L]`

			`# } else if (any(tolower(colnames(x)) %in% tolower(ab_tradenames(search_string.ab)))) {`
			`# ab_result <- colnames(x)[tolower(colnames(x)) %in% tolower(ab_tradenames(search_string.ab))][1L]`

documentation fix 2019-05-13 20:16:51 +02:00			`} else {`
			`# sort colnames on length - longest first`
			`cols <- colnames(x[, x %>% colnames() %>% nchar() %>% order() %>% rev()])`
			`df_trans <- data.frame(cols = cols,`
			`abs = suppressWarnings(as.ab(cols)),`
			`stringsAsFactors = FALSE)`
			`ab_result <- df_trans[which(df_trans$abs == search_string.ab), "cols"]`
			`ab_result <- ab_result[!is.na(ab_result)][1L]`
			`}`
guess_ab 2019-01-03 23:56:19 +01:00			`}`
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00
			`if (length(ab_result) == 0) {`
guess_ab 2019-01-03 23:56:19 +01:00			`if (verbose == TRUE) {`
(v0.6.1.9044) first_isolate fix for species 2019-05-31 14:25:11 +02:00			message(paste0("No column found as input for `", search_string,
			"` (", ab_name(search_string, language = "en", tolower = TRUE), ")."))
guess_ab 2019-01-03 23:56:19 +01:00			`}`
			`return(NULL)`
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00			`} else {`
			`if (verbose == TRUE) {`
(v0.6.1.9044) first_isolate fix for species 2019-05-31 14:25:11 +02:00			message(blue(paste0("NOTE: Using column `", bold(ab_result), "` as input for `", search_string,
			"` (", ab_name(search_string, language = "en", tolower = TRUE), ").")))
guess_ab_col, benchmarks 2019-01-11 20:37:23 +01:00			`}`
new antibiotics 2019-05-10 16:44:59 +02:00			`return(ab_result)`
guess_ab 2019-01-03 23:56:19 +01:00			`}`
			`}`
(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00

			`#' @importFrom crayon blue bold`
			`#' @importFrom dplyr %>% mutate arrange pull`
			`get_column_abx <- function(x,`
			`soft_dependencies = NULL,`
			`hard_dependencies = NULL,`
			`verbose = FALSE,`
			`...) {`

			`# determine from given data set`
(v0.7.1.9094) get_column_abx() improvement 2019-10-06 21:44:08 +02:00			`x_bak <- x`
(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00			`df_trans <- data.frame(colnames = colnames(x),`
			`abcode = suppressWarnings(as.ab(colnames(x))))`
			`df_trans <- df_trans[!is.na(df_trans$abcode),]`
			`x <- as.character(df_trans$colnames)`
			`names(x) <- df_trans$abcode`
(v0.7.1.9094) get_column_abx() improvement 2019-10-06 21:44:08 +02:00
(v0.7.1.9095) get_column_abx() fix 2019-10-06 22:19:26 +02:00			`# remove the ones that are not a valid AB code, ATC code, name, abbreviation or synonym,`
			`# and do not already have the rsi class (as.rsi)`
			`# and that have >50% invalid values`
			`vectr_antibiotics <- unique(toupper(unlist(AMR::antibiotics[,c("ab", "atc", "name", "abbreviations", "synonyms")])))`
			`vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]`
			`x <- sapply(x, function(col = x, df = x_bak) {`
			`ifelse(toupper(col) %in% vectr_antibiotics \|`
			`is.rsi(as.data.frame(df)[, col]) \|`
(v0.7.1.9094) get_column_abx() improvement 2019-10-06 21:44:08 +02:00			`is.rsi.eligible(as.data.frame(df)[, col], threshold = 0.5),`
			`col,`
			`NA)`
			`})`
			`x <- x[!is.na(x)]`
(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00
			`# add from self-defined dots (...):`
(v0.7.1.9063) septic_patients -> example_isolates 2019-08-27 16:45:42 +02:00			`# get_column_abx(example_isolates %>% rename(thisone = AMX), amox = "thisone")`
(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00			`dots <- list(...)`
			`if (length(dots) > 0) {`
			`newnames <- suppressWarnings(as.ab(names(dots)))`
			`if (any(is.na(newnames))) {`
			`warning("Invalid antibiotic reference(s): ", toString(names(dots)[is.na(newnames)]),`
			`call. = FALSE, immediate. = TRUE)`
			`}`
			`# turn all NULLs to NAs`
			`dots <- unlist(lapply(dots, function(x) if (is.null(x)) NA else x))`
			`names(dots) <- newnames`
			`dots <- dots[!is.na(names(dots))]`
			`# merge, but overwrite automatically determined ones by 'dots'`
			`x <- c(x[!x %in% dots & !names(x) %in% names(dots)], dots)`
			`# delete NAs, this will make e.g. eucast_rules(... TMP = NULL) work to prevent TMP from being used`
			`x <- x[!is.na(x)]`
			`}`

			`# sort on name`
(v0.7.1.9057) get_column_abx() improvement 2019-08-15 17:09:27 +02:00			`x <- x[order(names(x), x)]`
			`duplicates <- x[base::duplicated(x)]`
			`x <- x[!names(x) %in% names(duplicates)]`

(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00			`if (verbose == TRUE) {`
			`for (i in 1:length(x)) {`
(v0.7.1.9057) get_column_abx() improvement 2019-08-15 17:09:27 +02:00			message(blue(paste0("NOTE: Using column `", bold(x[i]), "` as input for `", names(x)[i],
			"` (", ab_name(names(x)[i], tolower = TRUE), ").")))
(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00			`}`
(v0.7.1.9057) get_column_abx() improvement 2019-08-15 17:09:27 +02:00			`} else if (length(duplicates) > 0) {`
			`for (i in 1:length(duplicates)) {`
			warning(red(paste0("Using column `", bold(duplicates[i]), "` as input for `", names(x[which(x == duplicates[i])]),
			"` (", ab_name(names(x[names(which(x == duplicates))[i]]), tolower = TRUE),
			`"), although it was matched for multiple antibiotics or columns.")), call. = FALSE)`
(v0.7.1.9004) atc class removal 2019-06-27 11:57:45 +02:00			`}`
			`}`

			`if (!is.null(hard_dependencies)) {`
			`if (!all(hard_dependencies %in% names(x))) {`
			`# missing a hard dependency will return NA and consequently the data will not be analysed`
			`missing <- hard_dependencies[!hard_dependencies %in% names(x)]`
			`generate_warning_abs_missing(missing, any = FALSE)`
			`return(NA)`
			`}`
			`}`
			`if (!is.null(soft_dependencies)) {`
			`if (!all(soft_dependencies %in% names(x))) {`
			`# missing a soft dependency may lower the reliability`
			`missing <- soft_dependencies[!soft_dependencies %in% names(x)]`
			`missing_txt <- data.frame(missing = missing,`
			`missing_names = AMR::ab_name(missing, tolower = TRUE),`
			`stringsAsFactors = FALSE) %>%`
			`mutate(txt = paste0(bold(missing), " (", missing_names, ")")) %>%`
			`arrange(missing_names) %>%`
			`pull(txt)`
			`message(blue('NOTE: Reliability might be improved if these antimicrobial results would be available too:',`
			`paste(missing_txt, collapse = ", ")))`
			`}`
			`}`
			`x`
			`}`

			`generate_warning_abs_missing <- function(missing, any = FALSE) {`
			`missing <- paste0(missing, " (", ab_name(missing, tolower = TRUE), ")")`
			`if (any == TRUE) {`
			`any_txt <- c(" any of", "is")`
			`} else {`
			`any_txt <- c("", "are")`
			`}`
			`warning(paste0("Introducing NAs since", any_txt[1], " these antimicrobials ", any_txt[2], " required: ",`
			`paste(missing, collapse = ", ")),`
			`immediate. = TRUE,`
			`call. = FALSE)`
			`}`