2019-01-03 23:56:19 +01:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2019-01-03 23:56:19 +01:00
# #
# LICENCE #
2020-01-05 17:22:09 +01:00
# (c) 2018-2020 Berends MS, Luz CF et al. #
2019-01-03 23:56:19 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-07-08 14:48:06 +02:00
# Visit our website for more info: https://msberends.github.io/AMR. #
2019-01-03 23:56:19 +01:00
# ==================================================================== #
#' Guess antibiotic column
#'
2019-11-28 22:32:17 +01:00
#' This tries to find a column name in a data set based on information from the [antibiotics] data set. Also supports WHONET abbreviations.
2020-01-05 17:22:09 +01:00
#' @inheritSection lifecycle Maturing lifecycle
2019-11-28 22:32:17 +01:00
#' @param x a [`data.frame`]
#' @param search_string a text to search `x` for, will be checked with [as.ab()] if this value is not a column in `x`
2019-01-03 23:56:19 +01:00
#' @param verbose a logical to indicate whether additional info should be printed
2019-11-28 22:32:17 +01:00
#' @details You can look for an antibiotic (trade) name or abbreviation and it will search `x` and the [antibiotics] data set for any column containing a name or code of that antibiotic. **Longer column names take precedence over shorter column names.**
#' @return A column name of `x`, or `NULL` when no result is found.
2019-01-03 23:56:19 +01:00
#' @export
#' @inheritSection AMR Read more on our website!
2019-01-11 20:37:23 +01:00
#' @examples
#' df <- data.frame(amox = "S",
#' tetr = "R")
#'
#' guess_ab_col(df, "amoxicillin")
#' # [1] "amox"
2019-05-10 16:44:59 +02:00
#' guess_ab_col(df, "J01AA07") # ATC code of tetracycline
2019-01-11 20:37:23 +01:00
#' # [1] "tetr"
#'
#' guess_ab_col(df, "J01AA07", verbose = TRUE)
2020-07-22 10:24:23 +02:00
#' # NOTE: Using column `tetr` as input for `J01AA07` (tetracycline).
2019-01-11 20:37:23 +01:00
#' # [1] "tetr"
2019-01-29 00:06:50 +01:00
#'
#' # WHONET codes
#' df <- data.frame(AMP_ND10 = "R",
#' AMC_ED20 = "S")
#' guess_ab_col(df, "ampicillin")
#' # [1] "AMP_ND10"
#' guess_ab_col(df, "J01CR02")
#' # [1] "AMC_ED20"
2019-05-10 16:44:59 +02:00
#' guess_ab_col(df, as.ab("augmentin"))
2019-01-29 00:06:50 +01:00
#' # [1] "AMC_ED20"
2019-05-31 14:40:15 +02:00
#'
#' # Longer names take precedence:
#' df <- data.frame(AMP_ED2 = "S",
#' AMP_ED20 = "S")
#' guess_ab_col(df, "ampicillin")
#' # [1] "AMP_ED20"
2019-05-13 10:10:16 +02:00
guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE) {
  # Without any input at all, return the function's own name as a symbol.
  if (is.null(x) && is.null(search_string)) {
    return(as.name("guess_ab_col"))
  }
  # NOTE(review): several message literals in this function start with a
  # blank; they are kept verbatim - confirm whether that is intended.
  stop_ifnot(is.data.frame(x), " `x` must be a data.frame")

  if (length(search_string) > 1) {
    warning(" argument 'search_string' has length > 1 and only the first element will be used")
    search_string <- search_string[1]
  }
  search_string <- as.character(search_string)
  # a zero-length value (e.g. NULL) would otherwise fail further down with
  # the cryptic "argument is of length zero"
  stop_ifnot(length(search_string) == 1, "`search_string` must be a single value")

  col_names <- colnames(x)

  if (search_string %in% col_names) {
    # 1. the search string literally is a column name
    ab_result <- search_string
  } else {
    search_ab <- suppressWarnings(as.ab(search_string))
    if (search_ab %in% col_names) {
      # 2. the antibiotic code of the search string is a column name
      ab_result <- col_names[col_names == search_ab][1L]
    } else {
      abbreviations <- tolower(unlist(ab_property(search_ab, "abbreviations", language = NULL)))
      is_abbreviation <- tolower(col_names) %in% abbreviations
      if (any(is_abbreviation)) {
        # 3. a column name is a known abbreviation of the antibiotic
        ab_result <- col_names[is_abbreviation][1L]
      } else {
        # 4. translate every column name to an antibiotic code and compare;
        # column names are sorted on length, longest first, so that longer
        # names take precedence. Sorting the names themselves (instead of
        # subsetting `x`) also works for a data.frame with a single column.
        sorted_cols <- col_names[rev(order(nchar(col_names)))]
        sorted_abs <- suppressWarnings(as.ab(sorted_cols))
        ab_result <- sorted_cols[which(sorted_abs == search_ab)]
      }
    }
  }

  # Drop NAs *before* taking the first element: indexing an empty vector with
  # [1L] yields NA, which would hide the "nothing found" case.
  ab_result <- ab_result[!is.na(ab_result)]

  if (length(ab_result) == 0) {
    if (isTRUE(verbose)) {
      message(paste0(" No column found as input for `", search_string,
                     "` (", ab_name(search_string, language = NULL, tolower = TRUE), ")."))
    }
    return(NULL)
  }

  ab_result <- ab_result[1L]
  if (isTRUE(verbose)) {
    message(font_blue(paste0("NOTE: Using column `", font_bold(ab_result), "` as input for `", search_string,
                             "` (", ab_name(search_string, language = NULL, tolower = TRUE), ").")))
  }
  ab_result
}
2019-06-27 11:57:45 +02:00
# Find the columns of `x` that hold antibiotic results and map them to
# antibiotic codes.
#
# x                  data set; coerced with as.data.frame(). Only the first
#                    10,000 rows are inspected.
# soft_dependencies  antibiotic codes that should be available; a message is
#                    shown for the ones that are not.
# hard_dependencies  antibiotic codes that must be available; if any is
#                    missing, a warning is given and NA is returned.
# verbose            logical; print a note for every column that is used.
# ...                manual overrides in the form <antibiotic> = "<column>";
#                    <antibiotic> = NULL excludes that antibiotic.
#
# Returns a character vector of column names, named with the antibiotic code
# that as.ab() determined for each of them (sorted on code), or NA when a
# hard dependency is missing.
get_column_abx <- function(x,
                           soft_dependencies = NULL,
                           hard_dependencies = NULL,
                           verbose = FALSE,
                           ...) {

  # NOTE(review): apart from the "Using column ... as input for ..." texts,
  # message literals are kept verbatim, including leading blanks - confirm
  # whether those blanks are intended.
  message(font_blue(" NOTE: Auto-guessing columns suitable for analysis"), appendLF = FALSE)
  x <- as.data.frame(x, stringsAsFactors = FALSE)
  if (NROW(x) > 10000) {
    # only test maximum of 10,000 values per column
    message(font_blue(paste0(" (using only ", font_bold(" the first 10,000 rows"), " )...")), appendLF = FALSE)
    x <- x[seq_len(10000), , drop = FALSE]
  } else {
    message(font_blue(" ..."), appendLF = FALSE)
  }

  # only check columns that are a valid AB code, ATC code, name, abbreviation or synonym,
  # or already have the rsi class (as.rsi)
  # and that have no more than 50% invalid values
  vectr_antibiotics <- unique(toupper(unlist(antibiotics[, c("ab", "atc", "name", "abbreviations", "synonyms")])))
  vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]
  # vapply() keeps the result a character vector, also when `x` has no columns
  x_columns <- vapply(colnames(x), function(col) {
    values <- x[, col]
    # `||` short-circuits: the values are only inspected when the column
    # name itself is not already a known antibiotic reference
    if (toupper(col) %in% vectr_antibiotics ||
        is.rsi(values) ||
        is.rsi.eligible(values, threshold = 0.5)) {
      col
    } else {
      NA_character_
    }
  }, FUN.VALUE = character(1))
  x_columns <- x_columns[!is.na(x_columns)]
  x <- x[, x_columns, drop = FALSE] # without drop = FALSE, x would become a vector when x_columns is length 1

  df_trans <- data.frame(colnames = colnames(x),
                         abcode = suppressWarnings(as.ab(colnames(x))),
                         stringsAsFactors = FALSE)
  df_trans <- df_trans[!is.na(df_trans$abcode), ]
  # from here on, `x` is a character vector: column names, named by antibiotic code
  x <- as.character(df_trans$colnames)
  names(x) <- df_trans$abcode

  # add from self-defined dots (...):
  # such as get_column_abx(example_isolates %>% rename(thisone = AMX), amox = "thisone")
  dots <- list(...)
  if (length(dots) > 0) {
    newnames <- suppressWarnings(as.ab(names(dots)))
    if (any(is.na(newnames))) {
      warning(" Invalid antibiotic reference(s): ", toString(names(dots)[is.na(newnames)]),
              call. = FALSE, immediate. = TRUE)
    }
    # turn all NULLs to NAs
    dots <- unlist(lapply(dots, function(value) if (is.null(value)) NA else value))
    names(dots) <- newnames
    dots <- dots[!is.na(names(dots))]
    # merge, but overwrite automatically determined ones by 'dots'
    x <- c(x[!x %in% dots & !names(x) %in% names(dots)], dots)
    # delete NAs, this will make e.g. eucast_rules(... TMP = NULL) work to prevent TMP from being used
    x <- x[!is.na(x)]
  }

  if (length(x) == 0) {
    message(font_blue(" No columns found."))
    return(x)
  }

  # sort on name
  x <- x[order(names(x), x)]
  # columns matched more than once and codes matched by more than one column;
  # for each such code only one entry is kept
  duplicates <- c(x[base::duplicated(x)], x[base::duplicated(names(x))])
  duplicates <- duplicates[unique(names(duplicates))]
  x <- c(x[!names(x) %in% names(duplicates)], duplicates)
  x <- x[order(names(x), x)]

  # succeeded with auto-guessing
  message(font_blue(" OK."))

  for (i in seq_along(x)) {
    is_duplicate <- names(x[i]) %in% names(duplicates)
    if (isTRUE(verbose) && !is_duplicate) {
      message(font_blue(paste0("NOTE: Using column `", font_bold(x[i]), "` as input for `", names(x)[i],
                               "` (", ab_name(names(x)[i], tolower = TRUE, language = NULL), ").")))
    }
    if (is_duplicate) {
      warning(font_red(paste0(" Using column `", font_bold(x[i]), "` as input for `", names(x)[i],
                              "` (", ab_name(names(x)[i], tolower = TRUE, language = NULL),
                              "), although it was matched for multiple antibiotics or columns.")),
              call. = FALSE,
              immediate. = verbose)
    }
  }

  if (!is.null(hard_dependencies)) {
    hard_dependencies <- unique(hard_dependencies)
    if (!all(hard_dependencies %in% names(x))) {
      # missing a hard dependency will return NA and consequently the data will not be analysed
      missing_abs <- hard_dependencies[!hard_dependencies %in% names(x)]
      generate_warning_abs_missing(missing_abs, any = FALSE)
      return(NA)
    }
  }
  if (!is.null(soft_dependencies)) {
    soft_dependencies <- unique(soft_dependencies)
    if (!all(soft_dependencies %in% names(x))) {
      # missing a soft dependency may lower the reliability
      missing_abs <- soft_dependencies[!soft_dependencies %in% names(x)]
      missing_txt <- paste(paste0(ab_name(missing_abs, tolower = TRUE, language = NULL),
                                  " (", font_bold(missing_abs, collapse = NULL), " )"),
                           collapse = " , ")
      message(font_blue(" NOTE: Reliability would be improved if these antimicrobial results would be available too:",
                        missing_txt))
    }
  }
  x
}
# Warn (immediately, without the call) that NAs are introduced because
# required antimicrobials are missing.
#
# missing  antibiotic codes that were not found; each one is shown together
#          with the name returned by ab_name().
# any      TRUE to phrase the warning as "any of these ... is required",
#          otherwise it is phrased as "these ... are required".
generate_warning_abs_missing <- function(missing, any = FALSE) {
  labelled <- paste0(missing, " (", ab_name(missing, tolower = TRUE, language = NULL), " )")
  # first element: quantifier, second element: matching verb
  wording <- if (any == TRUE) c(" any of", " is") else c(" ", " are")
  warning_txt <- paste0(" Introducing NAs since", wording[1], " these antimicrobials ", wording[2], " required: ",
                        paste(labelled, collapse = " , "))
  warning(warning_txt,
          immediate. = TRUE,
          call. = FALSE)
}