2019-05-10 16:44:59 +02:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# SOURCE #
# https://gitlab.com/msberends/AMR #
# #
# LICENCE #
2020-01-05 17:22:09 +01:00
# (c) 2018-2020 Berends MS, Luz CF et al. #
2019-05-10 16:44:59 +02:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2019-05-10 16:44:59 +02:00
# Visit our website for more info: https://msberends.gitlab.io/AMR. #
# ==================================================================== #
#' Transform to antibiotic ID
#'
2019-11-28 22:32:17 +01:00
#' Use this function to determine the antibiotic code of one or more antibiotics. The data set [antibiotics] will be searched for abbreviations, official names and synonyms (brand names).
2020-01-05 17:22:09 +01:00
#' @inheritSection lifecycle Maturing lifecycle
2019-05-10 16:44:59 +02:00
#' @param x character vector to determine to antibiotic ID
2019-10-04 15:36:12 +02:00
#' @param ... arguments passed on to internal functions
2019-05-10 16:44:59 +02:00
#' @rdname as.ab
#' @inheritSection WHOCC WHOCC
#' @importFrom dplyr %>% filter slice pull
2019-11-28 22:32:17 +01:00
#' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
2019-05-13 10:10:16 +02:00
#'
2019-11-28 22:32:17 +01:00
#' Use the [ab_property()] functions to get properties based on the returned antibiotic ID, see Examples.
2019-05-10 16:44:59 +02:00
#' @section Source:
#' World Health Organization (WHO) Collaborating Centre for Drug Statistics Methodology: \url{https://www.whocc.no/atc_ddd_index/}
#'
#' WHONET 2019 software: \url{http://www.whonet.org/software.html}
#'
#' European Commission Public Health PHARMACEUTICALS - COMMUNITY REGISTER: \url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}
2019-11-28 22:32:17 +01:00
#' @aliases ab
#' @return Character (vector) with class [`ab`]. Unknown values will return `NA`.
#' @seealso [antibiotics] for the dataframe that is being used to determine ATCs.
2019-05-10 16:44:59 +02:00
#' @inheritSection AMR Read more on our website!
2019-11-28 22:32:17 +01:00
#' @export
2019-05-10 16:44:59 +02:00
#' @examples
2020-01-08 11:30:33 +01:00
#' # these examples all return "ERY", the ID of erythromycin:
2019-05-10 16:44:59 +02:00
#' as.ab("J01FA01")
#' as.ab("J 01 FA 01")
#' as.ab("Erythromycin")
#' as.ab("eryt")
#' as.ab(" eryt 123")
#' as.ab("ERYT")
#' as.ab("ERY")
2019-05-16 21:20:00 +02:00
#' as.ab("eritromicine") # spelled wrong, yet works
2019-05-10 16:44:59 +02:00
#' as.ab("Erythrocin") # trade name
#' as.ab("Romycin") # trade name
2020-01-08 11:30:33 +01:00
#'
#' # spelling from different languages and dyslexia are no problem
#' ab_atc("ceftriaxon")
2020-02-16 22:43:56 +01:00
#' ab_atc("cephtriaxone") # small spelling error
#' ab_atc("cephthriaxone") # or a bit more severe
#' ab_atc("seephthriaaksone") # and even this works
2019-05-10 16:44:59 +02:00
#'
2020-01-08 11:30:33 +01:00
#' # use ab_* functions to get a specific properties (see ?ab_property);
2019-05-10 16:44:59 +02:00
#' # they use as.ab() internally:
#' ab_name("J01FA01") # "Erythromycin"
#' ab_name("eryt") # "Erythromycin"
2019-10-04 15:36:12 +02:00
as.ab <- function ( x , ... ) {
2020-02-14 19:54:13 +01:00
check_dataset_integrity ( )
2019-05-10 16:44:59 +02:00
if ( is.ab ( x ) ) {
return ( x )
}
2019-05-16 21:20:00 +02:00
2020-02-14 19:54:13 +01:00
if ( all ( toupper ( x ) %in% antibiotics $ ab ) ) {
2019-05-16 21:20:00 +02:00
# valid AB code, but not yet right class
return ( structure ( .Data = toupper ( x ) ,
class = " ab" ) )
}
2019-05-10 16:44:59 +02:00
x_bak <- x
2019-10-04 15:36:12 +02:00
# remove diacritics
x <- iconv ( x , from = " UTF-8" , to = " ASCII//TRANSLIT" )
x <- gsub ( ' "' , " " , x , fixed = TRUE )
2019-05-10 16:44:59 +02:00
# remove suffices
2019-06-11 14:18:25 +02:00
x_bak_clean <- gsub ( " _(mic|rsi|dis[ck])$" , " " , x , ignore.case = TRUE )
2019-05-10 16:44:59 +02:00
# remove disk concentrations, like LVX_NM -> LVX
2019-11-09 11:33:22 +01:00
x_bak_clean <- gsub ( " _[A-Z]{2}[0-9_.]{0,3}$" , " " , x_bak_clean , ignore.case = TRUE )
2019-06-11 14:18:25 +02:00
# remove part between brackets if that's followed by another string
x_bak_clean <- gsub ( " (.*)+ [(].*[)]" , " \\1" , x_bak_clean )
# keep only max 1 space
x_bak_clean <- trimws ( gsub ( " +" , " " , x_bak_clean , ignore.case = TRUE ) )
2019-10-04 15:36:12 +02:00
# non-character, space or number should be a slash
2019-10-06 21:07:38 +02:00
x_bak_clean <- gsub ( " [^A-Za-z0-9 -]" , " /" , x_bak_clean )
2019-10-04 15:36:12 +02:00
# spaces around non-characters must be removed: amox + clav -> amox/clav
x_bak_clean <- gsub ( " (.*[a-zA-Z0-9]) ([^a-zA-Z0-9].*)" , " \\1\\2" , x_bak_clean )
x_bak_clean <- gsub ( " (.*[^a-zA-Z0-9]) ([a-zA-Z0-9].*)" , " \\1\\2" , x_bak_clean )
2020-01-08 11:30:33 +01:00
# remove hyphen after a starting "co"
x_bak_clean <- gsub ( " ^co-" , " co" , x_bak_clean , ignore.case = TRUE )
# replace text 'and' with a slash
x_bak_clean <- gsub ( " and " , " /" , x_bak_clean , ignore.case = TRUE )
2019-10-04 15:36:12 +02:00
2019-05-10 16:44:59 +02:00
x <- unique ( x_bak_clean )
x_new <- rep ( NA_character_ , length ( x ) )
x_unknown <- character ( 0 )
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( length ( x ) ) ) {
2019-05-10 16:44:59 +02:00
if ( is.na ( x [i ] ) | is.null ( x [i ] ) ) {
next
}
if ( identical ( x [i ] , " " ) ) {
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
next
}
2019-06-07 22:47:37 +02:00
# prevent "bacteria" from coercing to TMP, since Bacterial is a brand name of it
if ( identical ( tolower ( x [i ] ) , " bacteria" ) ) {
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
next
}
2019-05-10 16:44:59 +02:00
# exact AB code
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( antibiotics $ ab == toupper ( x [i ] ) ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
# exact ATC code
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( antibiotics $ atc == toupper ( x [i ] ) ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
# exact CID code
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( antibiotics $ cid == x [i ] ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
# exact name
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( toupper ( antibiotics $ name ) == toupper ( x [i ] ) ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
2020-01-26 20:20:00 +01:00
# exact LOINC code
2020-02-14 19:54:13 +01:00
loinc_found <- unlist ( lapply ( antibiotics $ loinc ,
2020-01-26 20:20:00 +01:00
function ( s ) if ( x [i ] %in% s ) {
TRUE
} else {
FALSE
} ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [loinc_found == TRUE ]
2020-01-26 20:20:00 +01:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
2019-05-10 16:44:59 +02:00
# exact synonym
2020-02-14 19:54:13 +01:00
synonym_found <- unlist ( lapply ( antibiotics $ synonyms ,
2019-05-10 16:44:59 +02:00
function ( s ) if ( toupper ( x [i ] ) %in% toupper ( s ) ) {
TRUE
} else {
FALSE
} ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [synonym_found == TRUE ]
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
# exact abbreviation
2020-02-14 19:54:13 +01:00
abbr_found <- unlist ( lapply ( antibiotics $ abbreviations ,
2019-05-10 16:44:59 +02:00
function ( a ) if ( toupper ( x [i ] ) %in% toupper ( a ) ) {
TRUE
} else {
FALSE
} ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [abbr_found == TRUE ]
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
# first >=4 characters of name
if ( nchar ( x [i ] ) >= 4 ) {
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( toupper ( antibiotics $ name ) %like% paste0 ( " ^" , x [i ] ) ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
}
# allow characters that resemble others, but only continue when having more than 3 characters
if ( nchar ( x [i ] ) <= 3 ) {
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
next
}
2019-05-16 21:20:00 +02:00
x_spelling <- tolower ( x [i ] )
x_spelling <- gsub ( " [iy]+" , " [iy]+" , x_spelling )
2019-06-07 22:47:37 +02:00
x_spelling <- gsub ( " (c|k|q|qu|s|z|x|ks)+" , " (c|k|q|qu|s|z|x|ks)+" , x_spelling )
2019-05-16 21:20:00 +02:00
x_spelling <- gsub ( " (ph|f|v)+" , " (ph|f|v)+" , x_spelling )
x_spelling <- gsub ( " (th|t)+" , " (th|t)+" , x_spelling )
x_spelling <- gsub ( " a+" , " a+" , x_spelling )
x_spelling <- gsub ( " e+" , " e+" , x_spelling )
x_spelling <- gsub ( " o+" , " o+" , x_spelling )
2019-05-10 16:44:59 +02:00
# allow any ending of -in/-ine and -im/-ime
2019-05-16 21:20:00 +02:00
x_spelling <- gsub ( " (\\[iy\\]\\+(n|m)|\\[iy\\]\\+(n|m)e\\+)$" , " [iy]+(n|m)e*" , x_spelling )
2019-05-10 16:44:59 +02:00
# allow any ending of -ol/-ole
2019-05-16 21:20:00 +02:00
x_spelling <- gsub ( " (o\\+l|o\\+le\\+)$" , " o+le*" , x_spelling )
# allow any ending of -on/-one
x_spelling <- gsub ( " (o\\+n|o\\+ne\\+)$" , " o+ne*" , x_spelling )
2019-06-07 22:47:37 +02:00
# replace multiple same characters to single one with '+', like "ll" -> "l+"
x_spelling <- gsub ( " (.)\\1+" , " \\1+" , x_spelling )
2019-10-04 15:36:12 +02:00
2019-05-10 16:44:59 +02:00
# try if name starts with it
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( antibiotics $ name %like% paste0 ( " ^" , x_spelling ) ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
# and try if any synonym starts with it
2020-02-14 19:54:13 +01:00
synonym_found <- unlist ( lapply ( antibiotics $ synonyms ,
2019-05-10 16:44:59 +02:00
function ( s ) if ( any ( s %like% paste0 ( " ^" , x_spelling ) ) ) {
TRUE
} else {
FALSE
} ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [synonym_found == TRUE ]
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
2019-06-11 14:18:25 +02:00
# try by removing all spaces
if ( x [i ] %like% " " ) {
found <- suppressWarnings ( as.ab ( gsub ( " +" , " " , x [i ] ) ) )
2019-06-11 15:31:32 +02:00
if ( length ( found ) > 0 & ! is.na ( found ) ) {
2019-06-11 14:18:25 +02:00
x_new [i ] <- found [1L ]
next
}
}
# try by removing all spaces and numbers
if ( x [i ] %like% " " | x [i ] %like% " [0-9]" ) {
found <- suppressWarnings ( as.ab ( gsub ( " [ 0-9]" , " " , x [i ] ) ) )
2019-06-11 15:31:32 +02:00
if ( length ( found ) > 0 & ! is.na ( found ) ) {
2019-06-11 14:18:25 +02:00
x_new [i ] <- found [1L ]
next
}
}
2020-04-14 14:12:31 +02:00
2019-10-04 15:36:12 +02:00
if ( ! isFALSE ( list ( ... ) $ initial_search ) ) {
# transform back from other languages and try again
x_translated <- paste ( lapply ( strsplit ( x [i ] , " [^a-zA-Z0-9 ]" ) ,
function ( y ) {
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( length ( y ) ) ) {
2019-10-04 15:36:12 +02:00
y [i ] <- ifelse ( tolower ( y [i ] ) %in% tolower ( translations_file $ replacement ) ,
translations_file [which ( tolower ( translations_file $ replacement ) == tolower ( y [i ] ) &
! isFALSE ( translations_file $ fixed ) ) , " pattern" ] ,
y [i ] )
}
y
} ) [ [1 ] ] ,
collapse = " /" )
x_translated_guess <- suppressWarnings ( as.ab ( x_translated , initial_search = FALSE ) )
if ( ! is.na ( x_translated_guess ) ) {
x_new [i ] <- x_translated_guess
next
}
2019-10-06 21:07:38 +02:00
if ( ! isFALSE ( list ( ... ) $ initial_search2 ) ) {
# now also try to coerce brandname combinations like "Amoxy/clavulanic acid"
x_translated <- paste ( lapply ( strsplit ( x_translated , " [^a-zA-Z0-9 ]" ) ,
function ( y ) {
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( length ( y ) ) ) {
2019-10-06 21:07:38 +02:00
y_name <- suppressWarnings ( ab_name ( y [i ] , language = NULL , initial_search = FALSE , initial_search2 = FALSE ) )
y [i ] <- ifelse ( ! is.na ( y_name ) ,
y_name ,
y [i ] )
}
y
} ) [ [1 ] ] ,
collapse = " /" )
x_translated_guess <- suppressWarnings ( as.ab ( x_translated , initial_search = FALSE ) )
if ( ! is.na ( x_translated_guess ) ) {
x_new [i ] <- x_translated_guess
next
}
2019-10-04 15:36:12 +02:00
}
}
2020-04-14 14:12:31 +02:00
# try by removing all trailing capitals
if ( x [i ] %like_case% " [a-z]+[A-Z]+$" ) {
found <- suppressWarnings ( as.ab ( gsub ( " [A-Z]+$" , " " , x [i ] ) ) )
if ( length ( found ) > 0 & ! is.na ( found ) ) {
x_new [i ] <- found [1L ]
next
}
}
2019-05-10 16:44:59 +02:00
# not found
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
}
2019-06-11 15:31:32 +02:00
# take failed ATC codes apart from rest
x_unknown_ATCs <- x_unknown [x_unknown %like% " [A-Z][0-9][0-9][A-Z][A-Z][0-9][0-9]" ]
x_unknown <- x_unknown [ ! x_unknown %in% x_unknown_ATCs ]
if ( length ( x_unknown_ATCs ) > 0 ) {
warning ( " These ATC codes are not (yet) in the antibiotics data set: " ,
2019-10-11 17:21:02 +02:00
paste ( ' "' , sort ( unique ( x_unknown_ATCs ) ) , ' "' , sep = " " , collapse = " , " ) ,
2019-06-11 15:31:32 +02:00
" ." ,
call. = FALSE )
}
2019-05-10 16:44:59 +02:00
if ( length ( x_unknown ) > 0 ) {
2019-08-11 19:07:26 +02:00
warning ( " These values could not be coerced to a valid antimicrobial ID: " ,
2019-10-11 17:21:02 +02:00
paste ( ' "' , sort ( unique ( x_unknown ) ) , ' "' , sep = " " , collapse = " , " ) ,
2019-05-10 16:44:59 +02:00
" ." ,
call. = FALSE )
}
x_result <- data.frame ( x = x_bak_clean , stringsAsFactors = FALSE ) %>%
left_join ( data.frame ( x = x , x_new = x_new , stringsAsFactors = FALSE ) , by = " x" ) %>%
pull ( x_new )
2019-05-16 21:20:00 +02:00
if ( length ( x_result ) == 0 ) {
x_result <- NA_character_
}
2019-05-10 16:44:59 +02:00
structure ( .Data = x_result ,
class = " ab" )
}
2019-05-16 21:20:00 +02:00
#' @rdname as.ab
2019-05-10 16:44:59 +02:00
#' @export
is.ab <- function ( x ) {
2020-01-31 23:27:38 +01:00
inherits ( x , " ab" )
2019-05-10 16:44:59 +02:00
}
#' @exportMethod print.ab
#' @export
#' @noRd
print.ab <- function ( x , ... ) {
cat ( " Class 'ab'\n" )
2019-08-07 15:37:39 +02:00
print ( as.character ( x ) , quote = FALSE )
2019-05-10 16:44:59 +02:00
}
#' @exportMethod as.data.frame.ab
#' @export
#' @noRd
2019-10-11 17:21:02 +02:00
as.data.frame.ab <- function ( x , ... ) {
2019-05-10 16:44:59 +02:00
# same as as.data.frame.character but with removed stringsAsFactors
nm <- paste ( deparse ( substitute ( x ) , width.cutoff = 500L ) ,
collapse = " " )
if ( ! " nm" %in% names ( list ( ... ) ) ) {
as.data.frame.vector ( x , ... , nm = nm )
} else {
as.data.frame.vector ( x , ... )
}
}
2019-08-12 14:48:09 +02:00
#' @exportMethod [.ab
2019-05-10 16:44:59 +02:00
#' @export
#' @noRd
2019-08-14 14:57:06 +02:00
" [.ab" <- function ( x , ... ) {
2019-08-12 14:48:09 +02:00
y <- NextMethod ( )
2019-08-14 14:57:06 +02:00
attributes ( y ) <- attributes ( x )
y
}
2019-08-26 16:02:03 +02:00
#' @exportMethod [[.ab
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [[.ab" <- function ( x , ... ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( x )
2019-08-14 14:57:06 +02:00
y
}
2019-08-26 16:02:03 +02:00
#' @exportMethod [<-.ab
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [<-.ab" <- function ( i , j , ... , value ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-02-14 19:54:13 +01:00
class_integrity_check ( y , " antimicrobial code" , antibiotics $ ab )
2019-08-14 14:57:06 +02:00
}
#' @exportMethod [[<-.ab
#' @export
#' @noRd
" [[<-.ab" <- function ( i , j , ... , value ) {
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-02-14 19:54:13 +01:00
class_integrity_check ( y , " antimicrobial code" , antibiotics $ ab )
2019-08-14 14:57:06 +02:00
}
#' @exportMethod c.ab
#' @export
#' @noRd
c.ab <- function ( x , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
2020-02-14 19:54:13 +01:00
class_integrity_check ( y , " antimicrobial code" , antibiotics $ ab )
2019-05-10 16:44:59 +02:00
}
2019-08-07 15:37:39 +02:00
2020-03-14 14:05:43 +01:00
#' @importFrom vctrs vec_ptype_abbr
2019-08-07 15:37:39 +02:00
#' @export
2020-03-14 14:05:43 +01:00
vec_ptype_abbr.ab <- function ( x , ... ) {
" ab"
}
#' @importFrom vctrs vec_ptype_full
#' @export
vec_ptype_full.ab <- function ( x , ... ) {
2019-08-07 15:37:39 +02:00
" ab"
}
#' @importFrom pillar pillar_shaft
#' @export
pillar_shaft.ab <- function ( x , ... ) {
out <- format ( x )
2019-08-08 22:39:42 +02:00
out [is.na ( x ) ] <- pillar :: style_na ( " NA" )
2019-08-07 15:37:39 +02:00
pillar :: new_pillar_shaft_simple ( out , align = " left" , min_width = 4 )
}