1
0
mirror of https://github.com/msberends/AMR.git synced 2024-12-27 08:06:13 +01:00
AMR/R/guess_ab_col.R

257 lines
11 KiB
R
Raw Normal View History

2019-01-03 23:56:19 +01:00
# ==================================================================== #
# TITLE #
2020-10-08 11:16:03 +02:00
# Antimicrobial Resistance (AMR) Analysis for R #
2019-01-03 23:56:19 +01:00
# #
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2019-01-03 23:56:19 +01:00
# #
# LICENCE #
2020-12-27 00:30:28 +01:00
# (c) 2018-2021 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
# Diagnostics & Advice, and University Medical Center Groningen. #
2019-01-03 23:56:19 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR analysis: https://msberends.github.io/AMR/ #
2019-01-03 23:56:19 +01:00
# ==================================================================== #
#' Guess antibiotic column
#'
#' This tries to find a column name in a data set based on information from the [antibiotics] data set. Also supports WHONET abbreviations.
#' @inheritSection lifecycle Stable lifecycle
#' @param x a [data.frame]
#' @param search_string a text to search `x` for, will be checked with [as.ab()] if this value is not a column in `x`
2019-01-03 23:56:19 +01:00
#' @param verbose a logical to indicate whether additional info should be printed
#' @details You can look for an antibiotic (trade) name or abbreviation and it will search `x` and the [antibiotics] data set for any column containing a name or code of that antibiotic. **Longer column names take precedence over shorter column names.**
#' @return A column name of `x`, or `NULL` when no result is found.
2019-01-03 23:56:19 +01:00
#' @export
#' @inheritSection AMR Read more on our website!
2019-01-11 20:37:23 +01:00
#' @examples
#' df <- data.frame(amox = "S",
#' tetr = "R")
#'
#' guess_ab_col(df, "amoxicillin")
#' # [1] "amox"
2019-05-10 16:44:59 +02:00
#' guess_ab_col(df, "J01AA07") # ATC code of tetracycline
2019-01-11 20:37:23 +01:00
#' # [1] "tetr"
#'
#' guess_ab_col(df, "J01AA07", verbose = TRUE)
#' # NOTE: Using column 'tetr' as input for J01AA07 (tetracycline).
2019-01-11 20:37:23 +01:00
#' # [1] "tetr"
2019-01-29 00:06:50 +01:00
#'
#' # WHONET codes
#' df <- data.frame(AMP_ND10 = "R",
#' AMC_ED20 = "S")
#' guess_ab_col(df, "ampicillin")
#' # [1] "AMP_ND10"
#' guess_ab_col(df, "J01CR02")
#' # [1] "AMC_ED20"
2019-05-10 16:44:59 +02:00
#' guess_ab_col(df, as.ab("augmentin"))
2019-01-29 00:06:50 +01:00
#' # [1] "AMC_ED20"
2019-05-31 14:40:15 +02:00
#'
#' # Longer names take precedence:
#' df <- data.frame(AMP_ED2 = "S",
#' AMP_ED20 = "S")
#' guess_ab_col(df, "ampicillin")
#' # [1] "AMP_ED20"
2019-05-13 10:10:16 +02:00
guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE) {
  meet_criteria(x, allow_class = "data.frame", allow_NULL = TRUE)
  meet_criteria(search_string, allow_class = "character", has_length = 1, allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = "logical", has_length = 1)

  if (is.null(x) && is.null(search_string)) {
    # called without any input: hand back the function name as a symbol
    return(as.name("guess_ab_col"))
  }
  if (is.null(search_string)) {
    # `if (NULL %in% ...)` would otherwise fail with "argument is of length zero"
    stop("`search_string` must be set when `x` is given", call. = FALSE)
  }

  x_cols <- colnames(x)

  if (search_string %in% x_cols) {
    # 1. the search string literally is a column name
    ab_result <- search_string
  } else {
    search_string.ab <- suppressWarnings(as.ab(search_string))
    if (search_string.ab %in% x_cols) {
      # 2. the antibiotic code of the search string is a column name
      ab_result <- x_cols[x_cols == search_string.ab][1L]
    } else {
      abbreviations <- tolower(unlist(ab_property(search_string.ab, "abbreviations", language = NULL)))
      abbreviation_match <- tolower(x_cols) %in% abbreviations
      if (any(abbreviation_match)) {
        # 3. a column name equals one of the known abbreviations (case-insensitive)
        ab_result <- x_cols[abbreviation_match][1L]
      } else {
        # 4. translate every column name to an antibiotic code and compare;
        # sort colnames on length - longest first.
        # Sorting the names themselves (not `x[, order]`) keeps this working
        # for a data.frame with a single column, which would otherwise be
        # dropped to a vector and lose its column name.
        cols <- x_cols[rev(order(nchar(x_cols)))]
        cols_ab <- suppressWarnings(as.ab(cols))
        ab_result <- cols[which(cols_ab == search_string.ab)]
        ab_result <- ab_result[!is.na(ab_result)]
        # keep at most one match; indexing with `[1L]` would turn "no match"
        # into NA, so that the documented NULL could never be returned
        ab_result <- ab_result[seq_len(min(1L, length(ab_result)))]
      }
    }
  }

  if (length(ab_result) == 0) {
    if (verbose == TRUE) {
      message_("No column found as input for ", search_string,
               " (", ab_name(search_string, language = NULL, tolower = TRUE), ").",
               add_fn = font_black,
               as_note = FALSE)
    }
    return(NULL)
  }

  if (verbose == TRUE) {
    message_("Using column '", font_bold(ab_result), "' as input for ", search_string,
             " (", ab_name(search_string, language = NULL, tolower = TRUE), ").")
  }
  ab_result
}
2019-06-27 11:57:45 +02:00
# Find the columns of `x` that hold antibiotic results.
#
# Arguments:
#   x                  a data.frame
#   soft_dependencies  antibiotic codes that should preferably be available;
#                      a message lists the missing ones when `info` is TRUE
#   hard_dependencies  antibiotic codes that must be available; when one is
#                      missing a warning is given and NA is returned
#   verbose            print the column that was chosen for each antibiotic
#   info               print progress messages and duplicate-match warnings
#   ...                manual overrides as `<antibiotic> = "<column name>"`;
#                      a NULL value removes that antibiotic from the result
#
# Returns a character vector of column names, named with the antibiotic code
# that as.ab() assigned to each column and sorted on those names; an empty
# vector when nothing was found; or NA when a hard dependency is missing.
get_column_abx <- function(x,
                           soft_dependencies = NULL,
                           hard_dependencies = NULL,
                           verbose = FALSE,
                           info = TRUE,
                           ...) {
  meet_criteria(x, allow_class = "data.frame")
  meet_criteria(soft_dependencies, allow_class = "character", allow_NULL = TRUE)
  meet_criteria(hard_dependencies, allow_class = "character", allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = "logical", has_length = 1)
  meet_criteria(info, allow_class = "logical", has_length = 1)

  if (info == TRUE) {
    message_("Auto-guessing columns suitable for analysis", appendLF = FALSE)
  }

  x <- as.data.frame(x, stringsAsFactors = FALSE)
  if (NROW(x) > 10000) {
    # only test maximum of 10,000 values per column
    if (info == TRUE) {
      message_(" (using only ", font_bold("the first 10,000 rows"), ")...",
               appendLF = FALSE,
               as_note = FALSE)
    }
    x <- x[seq_len(10000), , drop = FALSE]
  } else if (info == TRUE) {
    message_("...", appendLF = FALSE, as_note = FALSE)
  }

  # only check columns that are a valid AB code, ATC code, name, abbreviation or synonym,
  # or already have the <rsi> class (as.rsi)
  # and that they have no more than 50% invalid values
  vectr_antibiotics <- unique(toupper(unlist(antibiotics[, c("ab", "atc", "name", "abbreviations", "synonyms")])))
  vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]
  x_columns <- vapply(FUN.VALUE = character(1), colnames(x), function(col) {
    if (toupper(col) %in% vectr_antibiotics) {
      return(col)
    }
    # `x` is already a plain data.frame here, so no further conversion is needed
    values <- x[, col, drop = TRUE]
    if (is.rsi(values) || is.rsi.eligible(values, threshold = 0.5)) {
      col
    } else {
      NA_character_
    }
  })
  x_columns <- x_columns[!is.na(x_columns)]
  # without drop = FALSE, x would become a vector when x_columns is length 1
  x <- x[, x_columns, drop = FALSE]

  # translate the remaining column names to antibiotic codes
  df_trans <- data.frame(colnames = colnames(x),
                         abcode = suppressWarnings(as.ab(colnames(x), info = FALSE)),
                         stringsAsFactors = FALSE)
  df_trans <- df_trans[!is.na(df_trans$abcode), , drop = FALSE]
  x <- as.character(df_trans$colnames)
  names(x) <- df_trans$abcode

  # add from self-defined dots (...):
  # such as get_column_abx(example_isolates %pm>% rename(thisone = AMX), amox = "thisone")
  dots <- list(...)
  if (length(dots) > 0) {
    newnames <- suppressWarnings(as.ab(names(dots), info = FALSE))
    if (any(is.na(newnames))) {
      warning_("Invalid antibiotic reference(s): ", toString(names(dots)[is.na(newnames)]),
               call = FALSE,
               immediate = TRUE)
    }
    # turn all NULLs to NAs
    dots <- unlist(lapply(dots, function(dot) if (is.null(dot)) NA else dot))
    names(dots) <- newnames
    dots <- dots[!is.na(names(dots))]
    # merge, but overwrite automatically determined ones by 'dots'
    x <- c(x[!x %in% dots & !names(x) %in% names(dots)], dots)
    # delete NAs, this will make e.g. eucast_rules(... TMP = NULL) work to prevent TMP from being used
    x <- x[!is.na(x)]
  }

  if (length(x) == 0) {
    if (info == TRUE) {
      message_("No columns found.")
    }
    return(x)
  }

  # sort on name
  x <- x[order(names(x), x)]
  # a column matched to several antibiotics, or an antibiotic matched to
  # several columns: keep one entry per antibiotic code
  duplicates <- c(x[duplicated(x)], x[duplicated(names(x))])
  duplicates <- duplicates[unique(names(duplicates))]
  x <- c(x[!names(x) %in% names(duplicates)], duplicates)
  x <- x[order(names(x), x)]

  # succeeded with auto-guessing
  if (info == TRUE) {
    message_(" OK.", add_fn = list(font_green, font_bold), as_note = FALSE)
  }

  for (i in seq_along(x)) {
    is_duplicate <- names(x)[i] %in% names(duplicates)
    if (info == TRUE && verbose == TRUE && !is_duplicate) {
      message_("Using column '", font_bold(x[i]), "' as input for ", names(x)[i],
               " (", ab_name(names(x)[i], tolower = TRUE, language = NULL), ").")
    }
    if (info == TRUE && is_duplicate) {
      warning_(paste0("Using column '", font_bold(x[i]), "' as input for ", names(x)[i],
                      " (", ab_name(names(x)[i], tolower = TRUE, language = NULL),
                      "), although it was matched for multiple antibiotics or columns."),
               add_fn = font_red,
               call = FALSE,
               immediate = verbose)
    }
  }

  if (!is.null(hard_dependencies)) {
    hard_dependencies <- unique(hard_dependencies)
    if (!all(hard_dependencies %in% names(x))) {
      # missing a hard dependency will return NA and consequently the data will not be analysed
      missing <- hard_dependencies[!hard_dependencies %in% names(x)]
      generate_warning_abs_missing(missing, any = FALSE)
      return(NA)
    }
  }
  if (!is.null(soft_dependencies)) {
    soft_dependencies <- unique(soft_dependencies)
    if (info == TRUE && !all(soft_dependencies %in% names(x))) {
      # missing a soft dependency may lower the reliability
      missing <- soft_dependencies[!soft_dependencies %in% names(x)]
      missing_msg <- paste(paste0(ab_name(missing, tolower = TRUE, language = NULL),
                                  " (", font_bold(missing, collapse = NULL), ")"),
                           collapse = ", ")
      message_("Reliability would be improved if these antimicrobial results would be available too: ",
               missing_msg)
    }
  }
  x
}
# Warn that NAs are introduced because required antimicrobials are missing.
#
# Arguments:
#   missing  antibiotic codes that were not found; each one is shown together
#            with its lower-case name as given by ab_name()
#   any      TRUE words the warning as "any of these ... is required",
#            FALSE as "these ... are required"
generate_warning_abs_missing <- function(missing, any = FALSE) {
  labelled <- paste0(missing, " (", ab_name(missing, tolower = TRUE, language = NULL), ")")
  # first element goes before "these antimicrobials", second is the verb after it
  wording <- if (any == TRUE) c(" any of", "is") else c("", "are")
  warning_(paste0("Introducing NAs since", wording[1], " these antimicrobials ", wording[2], " required: ",
                  paste(labelled, collapse = ", ")),
           immediate = TRUE,
           call = FALSE)
}