2019-01-03 23:56:19 +01:00
# ==================================================================== #
# TITLE #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2019-01-03 23:56:19 +01:00
# #
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2019-01-03 23:56:19 +01:00
# #
2022-10-05 09:12:22 +02:00
# CITE AS #
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
# doi:10.18637/jss.v104.i03 #
# #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
2022-08-28 10:31:50 +02:00
# Diagnostics & Advice, and University Medical Center Groningen. #
2019-01-03 23:56:19 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2019-01-03 23:56:19 +01:00
# ==================================================================== #
2021-01-18 16:57:56 +01:00
#' Guess Antibiotic Column
2019-01-03 23:56:19 +01:00
#'
2019-11-28 22:32:17 +01:00
#' This tries to find a column name in a data set based on information from the [antibiotics] data set. Also supports WHONET abbreviations.
2020-09-18 16:05:53 +02:00
#' @param x a [data.frame]
2019-11-28 22:32:17 +01:00
#' @param search_string a text to search `x` for, will be checked with [as.ab()] if this value is not a column in `x`
2021-05-12 18:15:03 +02:00
#' @param verbose a [logical] to indicate whether additional info should be printed
#' @param only_rsi_columns a [logical] to indicate whether only antibiotic columns must be detected that were transformed to class `<rsi>` (see [as.rsi()]) beforehand (defaults to `FALSE`)
2022-08-29 09:35:36 +02:00
#' @details You can look for an antibiotic (trade) name or abbreviation and it will search `x` and the [antibiotics] data set for any column containing a name or code of that antibiotic.
2019-11-28 22:32:17 +01:00
#' @return A column name of `x`, or `NULL` when no result is found.
2019-01-03 23:56:19 +01:00
#' @export
2019-01-11 20:37:23 +01:00
#' @examples
2022-08-28 10:31:50 +02:00
#' df <- data.frame(
#' amox = "S",
#' tetr = "R"
#' )
2019-01-11 20:37:23 +01:00
#'
#' guess_ab_col(df, "amoxicillin")
2019-05-10 16:44:59 +02:00
#' guess_ab_col(df, "J01AA07") # ATC code of tetracycline
2019-01-11 20:37:23 +01:00
#'
#' guess_ab_col(df, "J01AA07", verbose = TRUE)
2020-12-03 16:59:04 +01:00
#' # NOTE: Using column 'tetr' as input for J01AA07 (tetracycline).
2019-01-29 00:06:50 +01:00
#'
#' # WHONET codes
2022-08-28 10:31:50 +02:00
#' df <- data.frame(
#' AMP_ND10 = "R",
#' AMC_ED20 = "S"
#' )
2019-01-29 00:06:50 +01:00
#' guess_ab_col(df, "ampicillin")
#' guess_ab_col(df, "J01CR02")
2019-05-10 16:44:59 +02:00
#' guess_ab_col(df, as.ab("augmentin"))
2021-02-08 14:18:42 +01:00
guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE, only_rsi_columns = FALSE) {
  # Looks up which column of `x` holds the antibiotic that `search_string`
  # refers to. All candidate columns are collected with get_column_abx() and
  # the one whose antibiotic code equals as.ab(search_string) is returned.
  #
  # Returns the matching column name(s) of `x` as an unnamed character vector,
  # or NULL when no column matches. When both `x` and `search_string` are NULL,
  # a symbol created with as.name() is returned instead.
  meet_criteria(x, allow_class = " data.frame", allow_NULL = TRUE)
  meet_criteria(search_string, allow_class = " character", has_length = 1, allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = " logical", has_length = 1)
  meet_criteria(only_rsi_columns, allow_class = " logical", has_length = 1)

  if (is.null(x) && is.null(search_string)) {
    return(as.name(" guess_ab_col"))
  }
  # from here on a search string is mandatory
  meet_criteria(search_string, allow_class = " character", has_length = 1, allow_NULL = FALSE)

  all_found <- get_column_abx(x,
    info = verbose, only_rsi_columns = only_rsi_columns,
    verbose = verbose, fn = " guess_ab_col"
  )
  search_string.ab <- suppressWarnings(as.ab(search_string))
  # which() discards NA comparisons: if `search_string` cannot be coerced to an
  # antibiotic code, the comparison is NA for every element and plain logical
  # subsetting would return a vector of NAs instead of an empty result
  ab_result <- unname(all_found[which(names(all_found) == search_string.ab)])

  if (length(ab_result) == 0) {
    if (isTRUE(verbose)) {
      message_(" No column found as input for ", search_string,
        " (", ab_name(search_string, language = NULL, tolower = TRUE), " ).",
        add_fn = font_black,
        as_note = FALSE
      )
    }
    return(NULL)
  }

  if (isTRUE(verbose)) {
    message_(
      " Using column '", font_bold(ab_result), " ' as input for ", search_string,
      " (", ab_name(search_string, language = NULL, tolower = TRUE), " )."
    )
  }
  ab_result
}
2019-06-27 11:57:45 +02:00
# Determine which columns of `x` contain antimicrobial results.
#
# Arguments:
#   x                      a data.frame
#   ...                    manual overrides, named by an antibiotic reference
#                          with the column name as value, e.g. amox = "thisone";
#                          a NULL value prevents that antibiotic from being
#                          used; data.frames passed here are ignored
#   soft_dependencies      antibiotic codes whose absence only triggers a message
#   hard_dependencies      antibiotic codes whose absence makes the function
#                          warn and return NA
#   verbose                print the column that is used for every antibiotic
#   info                   print progress and result messages
#   only_rsi_columns       only consider columns for which is.rsi() is TRUE
#   sort                   sort the result on antibiotic code
#   reuse_previous_result  allow reusing the result stored in AMR_env when the
#                          call ID is identical to the stored one
#   fn                     passed on to unique_call_id() as `match_fn`
#
# Returns a character vector of column names, named by antibiotic code, or NA
# when a hard dependency is missing. The result, the call ID and the checked
# column names are stored in AMR_env for reuse.
get_column_abx <- function(x,
                           ...,
                           soft_dependencies = NULL,
                           hard_dependencies = NULL,
                           verbose = FALSE,
                           info = TRUE,
                           only_rsi_columns = FALSE,
                           sort = TRUE,
                           reuse_previous_result = TRUE,
                           fn = NULL) {
  # check if retrieved before, then get it from package environment
  if (isTRUE(reuse_previous_result) && identical(
    unique_call_id(
      entire_session = FALSE,
      match_fn = fn
    ),
    AMR_env$get_column_abx.call
  )) {
    # so within the same call, within the same environment, we got here again.
    # but we could've come from another function within the same call, so now only check the columns that changed

    # first remove the columns that are not existing anymore
    previous <- AMR_env$get_column_abx.out
    current <- previous[previous %in% colnames(x)]

    # then compare columns in current call with columns in original call
    new_cols <- colnames(x)[!colnames(x) %in% AMR_env$get_column_abx.checked_cols]
    if (length(new_cols) > 0) {
      # these columns did not exist in the last call, so add them
      new_cols_rsi <- get_column_abx(x[, new_cols, drop = FALSE], reuse_previous_result = FALSE, info = FALSE, sort = FALSE)
      current <- c(current, new_cols_rsi)
      # order according to columns in current call
      current <- current[match(colnames(x)[colnames(x) %in% current], current)]
    }

    # update pkg environment to improve speed on next run
    AMR_env$get_column_abx.out <- current
    AMR_env$get_column_abx.checked_cols <- colnames(x)
    # and return right values
    return(AMR_env$get_column_abx.out)
  }

  meet_criteria(x, allow_class = " data.frame")
  meet_criteria(soft_dependencies, allow_class = " character", allow_NULL = TRUE)
  meet_criteria(hard_dependencies, allow_class = " character", allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = " logical", has_length = 1)
  meet_criteria(info, allow_class = " logical", has_length = 1)
  meet_criteria(only_rsi_columns, allow_class = " logical", has_length = 1)
  meet_criteria(sort, allow_class = " logical", has_length = 1)

  if (info == TRUE) {
    message_(" Auto-guessing columns suitable for analysis", appendLF = FALSE, as_note = FALSE)
  }

  x <- as.data.frame(x, stringsAsFactors = FALSE)
  # keep the untouched input, its column names are stored as 'checked' at the end
  x.bak <- x
  if (only_rsi_columns == TRUE) {
    x <- x[, which(is.rsi(x)), drop = FALSE]
  }
  if (NROW(x) > 10000) {
    # only test maximum of 10,000 values per column
    if (info == TRUE) {
      message_(" (using only ", font_bold(" the first 10,000 rows"), " )...",
        appendLF = FALSE,
        as_note = FALSE
      )
    }
    x <- x[1:10000, , drop = FALSE]
  } else if (info == TRUE) {
    message_(" ...", appendLF = FALSE, as_note = FALSE)
  }

  # only check columns that are a valid AB code, ATC code, name, abbreviation or synonym,
  # or already have the <rsi> class (as.rsi)
  # and that they have no more than 50% invalid values
  vectr_antibiotics <- unlist(AMR_env$AB_lookup$generalised_all)
  vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]
  x_columns <- vapply(
    FUN.VALUE = character(1),
    colnames(x),
    function(col) {
      if (generalise_antibiotic_name(col) %in% vectr_antibiotics ||
        is.rsi(x[, col, drop = TRUE]) ||
        is.rsi.eligible(x[, col, drop = TRUE], threshold = 0.5)
      ) {
        return(col)
      } else {
        return(NA_character_)
      }
    },
    USE.NAMES = FALSE
  )
  x_columns <- x_columns[!is.na(x_columns)]
  x <- x[, x_columns, drop = FALSE] # without drop = FALSE, x will become a vector when x_columns is length 1
  df_trans <- data.frame(
    colnames = colnames(x),
    abcode = suppressWarnings(as.ab(colnames(x), info = FALSE)),
    stringsAsFactors = FALSE
  )
  df_trans <- df_trans[!is.na(df_trans$abcode), , drop = FALSE]
  # values are the column names, names are the antibiotic codes
  out <- as.character(df_trans$colnames)
  names(out) <- df_trans$abcode

  # add from self-defined dots (...):
  # such as get_column_abx(example_isolates %>% rename(thisone = AMX), amox = "thisone")
  all_okay <- TRUE
  dots <- list(...)
  # remove data.frames, since this is also used running `eucast_rules(eucast_rules_df = df)`
  dots <- dots[!vapply(FUN.VALUE = logical(1), dots, is.data.frame)]
  if (length(dots) > 0) {
    newnames <- suppressWarnings(as.ab(names(dots), info = FALSE))
    if (anyNA(newnames)) {
      if (info == TRUE) {
        message_(" WARNING", add_fn = list(font_yellow, font_bold), as_note = FALSE)
      }
      warning_(" Invalid antibiotic reference(s): ", vector_and(names(dots)[is.na(newnames)], quotes = FALSE),
        call = FALSE,
        immediate = TRUE
      )
      all_okay <- FALSE
    }
    unexisting_cols <- which(!vapply(FUN.VALUE = logical(1), dots, function(col) all(col %in% x_columns)))
    if (length(unexisting_cols) > 0) {
      if (info == TRUE) {
        message_(" ERROR", add_fn = list(font_red, font_bold), as_note = FALSE)
      }
      # single-bracket subsetting: `[[` with an index vector of length > 1 would
      # index recursively and fail with "subscript out of bounds" as soon as
      # more than one entry refers to a missing column
      stop_(" Column(s) not found: ", vector_and(unlist(dots[unexisting_cols], use.names = FALSE), quotes = FALSE),
        call = FALSE
      )
      # NOTE(review): presumably never reached since stop_() is expected to throw - confirm
      all_okay <- FALSE
    }
    # turn all NULLs to NAs
    dots <- unlist(lapply(dots, function(dot) if (is.null(dot)) NA else dot))
    names(dots) <- newnames
    dots <- dots[!is.na(names(dots))]
    # merge, but overwrite automatically determined ones by 'dots'
    out <- c(out[!out %in% dots & !names(out) %in% names(dots)], dots)
    # delete NAs, this will make e.g. eucast_rules(... TMP = NULL) work to prevent TMP from being used
    out <- out[!is.na(out)]
  }

  if (length(out) == 0) {
    if (info == TRUE && all_okay == TRUE) {
      message_(" No columns found.")
    }
    AMR_env$get_column_abx.call <- unique_call_id(entire_session = FALSE, match_fn = fn)
    AMR_env$get_column_abx.checked_cols <- colnames(x.bak)
    AMR_env$get_column_abx.out <- out
    return(out)
  }

  # sort on name
  if (sort == TRUE) {
    out <- out[order(names(out), out)]
  }
  # only keep the first hits, no duplicates
  # (collect both repeated antibiotic codes and repeated column names)
  duplicates <- c(out[duplicated(names(out))], out[duplicated(unname(out))])
  if (length(duplicates) > 0) {
    all_okay <- FALSE
  }

  if (info == TRUE) {
    if (all_okay == TRUE) {
      message_(" OK.", add_fn = list(font_green, font_bold), as_note = FALSE)
    } else {
      message_(" WARNING.", add_fn = list(font_yellow, font_bold), as_note = FALSE)
    }
    for (i in seq_along(out)) {
      if (verbose == TRUE && !names(out[i]) %in% names(duplicates)) {
        message_(
          " Using column '", font_bold(out[i]), " ' as input for ", names(out)[i],
          " (", ab_name(names(out)[i], tolower = TRUE, language = NULL), " )."
        )
      }
      if (names(out[i]) %in% names(duplicates)) {
        # first entry that points to the same column as entry i
        already_set_as <- out[unname(out) == unname(out[i])][1L]
        warning_(
          paste0(
            " Column '", font_bold(out[i]), " ' will not be used for ",
            names(out)[i], " (", ab_name(names(out)[i], tolower = TRUE, language = NULL), " )",
            " , as it is already set for ",
            names(already_set_as), " (", ab_name(names(already_set_as), tolower = TRUE, language = NULL), " )"
          ),
          add_fn = font_red,
          immediate = verbose
        )
      }
    }
  }

  # drop repeated antibiotic codes first, then repeated column names
  out <- out[!duplicated(names(out))]
  out <- out[!duplicated(unname(out))]
  if (sort == TRUE) {
    out <- out[order(names(out), out)]
  }

  if (!is.null(hard_dependencies)) {
    hard_dependencies <- unique(hard_dependencies)
    if (!all(hard_dependencies %in% names(out))) {
      # missing a hard dependency will return NA and consequently the data will not be analysed
      missing <- hard_dependencies[!hard_dependencies %in% names(out)]
      generate_warning_abs_missing(missing, any = FALSE)
      return(NA)
    }
  }
  if (!is.null(soft_dependencies)) {
    soft_dependencies <- unique(soft_dependencies)
    if (info == TRUE && !all(soft_dependencies %in% names(out))) {
      # missing a soft dependency may lower the reliability
      missing <- soft_dependencies[!soft_dependencies %in% names(out)]
      missing_msg <- vector_and(
        paste0(
          ab_name(missing, tolower = TRUE, language = NULL),
          " (", font_bold(missing, collapse = NULL), " )"
        ),
        quotes = FALSE
      )
      message_(
        " Reliability would be improved if these antimicrobial results would be available too: ",
        missing_msg
      )
    }
  }

  # remember this call and its result so a repeated call can reuse it
  AMR_env$get_column_abx.call <- unique_call_id(entire_session = FALSE, match_fn = fn)
  AMR_env$get_column_abx.checked_cols <- colnames(x.bak)
  AMR_env$get_column_abx.out <- out
  out
}
2021-11-28 23:01:26 +01:00
get_ab_from_namespace <- function(x, cols_ab) {
  # Translates a comma-separated string of antibiotic codes and/or antibiotic
  # group names into the matching entries of `cols_ab`.
  #
  # x        character; split on the separator, upper-cased and trimmed
  # cols_ab  comes from get_column_abx(), i.e. column names named by antibiotic code
  #
  # Returns the elements of `cols_ab` whose names match the resolved codes;
  # codes without a matching column are dropped. Unknown input raises an error
  # through stop_().

  # the namespace content does not change while looping, so list it only once
  amr_namespace <- asNamespace(" AMR")
  namespace_objects <- ls(envir = amr_namespace)

  x <- trimws2(unique(toupper(unlist(strsplit(x, " ,", fixed = TRUE)))))
  resolved <- lapply(x, function(val) {
    group_object <- paste0(" AB_", val)
    if (group_object %in% namespace_objects) {
      # antibiotic group names, as defined in data-raw/_pre_commit_hook.R, such as `AB_CARBAPENEMS`
      # get() retrieves the object by name without parsing and evaluating text
      get(group_object, envir = amr_namespace, inherits = FALSE)
    } else if (val %in% AMR_env$AB_lookup$ab) {
      # separate drugs, such as `AMX`
      as.ab(val)
    } else {
      stop_(" unknown antimicrobial agent (group): ", val, call = FALSE)
    }
  })
  # flatten to a plain character vector, also when nothing was requested
  x_new <- unique(as.character(unlist(resolved)))
  out <- cols_ab[match(x_new, names(cols_ab))]
  out[!is.na(out)]
}
2019-06-27 11:57:45 +02:00
generate_warning_abs_missing <- function(missing, any = FALSE) {
  # Raises an immediate warning that lists the required antimicrobials in
  # `missing`, each followed by its name from ab_name().
  # `any` selects the wording: "any of ... is" or "... are".
  described <- paste0(missing, " (", ab_name(missing, tolower = TRUE, language = NULL), " )")
  any_txt <- if (any == TRUE) c(" any of", " is") else c(" ", " are")
  warning_text <- paste0(
    " Introducing NAs since", any_txt[1], " these antimicrobials ", any_txt[2], " required: ",
    vector_and(described, quotes = FALSE)
  )
  warning_(warning_text, immediate = TRUE)
}