2019-05-10 16:44:59 +02:00
# ==================================================================== #
# TITLE #
2021-02-02 23:57:35 +01:00
# Antimicrobial Resistance (AMR) Data Analysis for R #
2019-05-10 16:44:59 +02:00
# #
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2019-05-10 16:44:59 +02:00
# #
# LICENCE #
2020-12-27 00:30:28 +01:00
# (c) 2018-2021 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
# Diagnostics & Advice, and University Medical Center Groningen. #
2019-05-10 16:44:59 +02:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2019-05-10 16:44:59 +02:00
# ==================================================================== #
2021-01-18 16:57:56 +01:00
#' Transform Input to an Antibiotic ID
2019-05-10 16:44:59 +02:00
#'
2019-11-28 22:32:17 +01:00
#' Use this function to determine the antibiotic code of one or more antibiotics. The data set [antibiotics] will be searched for abbreviations, official names and synonyms (brand names).
2021-01-18 16:57:56 +01:00
#' @inheritSection lifecycle Stable Lifecycle
2019-05-10 16:44:59 +02:00
#' @param x character vector to determine to antibiotic ID
2020-06-26 12:31:27 +02:00
#' @param flag_multiple_results logical to indicate whether a note should be printed to the console that probably more than one antibiotic code or name can be retrieved from a single input value.
2020-08-14 13:36:10 +02:00
#' @param info logical to indicate whether a progress bar should be printed
2019-10-04 15:36:12 +02:00
#' @param ... arguments passed on to internal functions
2019-05-10 16:44:59 +02:00
#' @rdname as.ab
#' @inheritSection WHOCC WHOCC
2019-11-28 22:32:17 +01:00
#' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
2020-07-01 16:21:36 +02:00
#'
#' All these properties will be searched for the user input. The [as.ab()] can correct for different forms of misspelling:
#'
2020-12-17 16:22:25 +01:00
#' * Wrong spelling of drug names (such as "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.
2020-07-01 16:21:36 +02:00
#' * Too few or too many vowels or consonants
2020-12-17 16:22:25 +01:00
#' * Switching two characters (such as "mreopenem", often the case in clinical data, when doctors typed too fast)
2020-07-01 16:21:36 +02:00
#' * Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.
2019-05-13 10:10:16 +02:00
#'
2021-01-18 16:57:56 +01:00
#' Use the [`ab_*`][ab_property()] functions to get properties based on the returned antibiotic ID, see *Examples*.
2020-06-25 17:34:50 +02:00
#'
2020-12-17 16:22:25 +01:00
#' Note: the [as.ab()] and [`ab_*`][ab_property()] functions may use very long regular expression to match brand names of antimicrobial agents. This may fail on some systems.
2019-05-10 16:44:59 +02:00
#' @section Source:
#' World Health Organization (WHO) Collaborating Centre for Drug Statistics Methodology: \url{https://www.whocc.no/atc_ddd_index/}
#'
#' WHONET 2019 software: \url{http://www.whonet.org/software.html}
#'
#' European Commission Public Health PHARMACEUTICALS - COMMUNITY REGISTER: \url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}
2019-11-28 22:32:17 +01:00
#' @aliases ab
2020-09-18 16:05:53 +02:00
#' @return A [character] [vector] with additional class [`ab`]
2020-06-25 17:34:50 +02:00
#' @seealso
2020-09-18 16:05:53 +02:00
#' * [antibiotics] for the [data.frame] that is being used to determine ATCs
2020-06-25 17:34:50 +02:00
#' * [ab_from_text()] for a function to retrieve antimicrobial drugs from clinical text (from health care records)
2021-01-18 16:57:56 +01:00
#' @inheritSection AMR Reference Data Publicly Available
#' @inheritSection AMR Read more on Our Website!
2019-11-28 22:32:17 +01:00
#' @export
2019-05-10 16:44:59 +02:00
#' @examples
2020-01-08 11:30:33 +01:00
#' # these examples all return "ERY", the ID of erythromycin:
2019-05-10 16:44:59 +02:00
#' as.ab("J01FA01")
#' as.ab("J 01 FA 01")
#' as.ab("Erythromycin")
#' as.ab("eryt")
#' as.ab(" eryt 123")
#' as.ab("ERYT")
#' as.ab("ERY")
2019-05-16 21:20:00 +02:00
#' as.ab("eritromicine") # spelled wrong, yet works
2019-05-10 16:44:59 +02:00
#' as.ab("Erythrocin") # trade name
#' as.ab("Romycin") # trade name
2020-01-08 11:30:33 +01:00
#'
#' # spelling from different languages and dyslexia are no problem
#' ab_atc("ceftriaxon")
2020-02-16 22:43:56 +01:00
#' ab_atc("cephtriaxone") # small spelling error
#' ab_atc("cephthriaxone") # or a bit more severe
#' ab_atc("seephthriaaksone") # and even this works
2019-05-10 16:44:59 +02:00
#'
2020-01-08 11:30:33 +01:00
#' # use ab_* functions to get a specific properties (see ?ab_property);
2019-05-10 16:44:59 +02:00
#' # they use as.ab() internally:
#' ab_name("J01FA01") # "Erythromycin"
#' ab_name("eryt") # "Erythromycin"
2021-01-14 14:41:44 +01:00
#'
#' if (require("dplyr")) {
#'
#' # you can quickly rename <rsi> columns using dplyr >= 1.0.0:
#' example_isolates %>%
#' rename_with(as.ab, where(is.rsi))
#'
#' }
2020-08-14 13:36:10 +02:00
as.ab <- function ( x , flag_multiple_results = TRUE , info = TRUE , ... ) {
2020-10-19 20:44:45 +02:00
meet_criteria ( x , allow_class = c ( " character" , " numeric" , " integer" , " factor" ) , allow_NA = TRUE )
2020-10-19 17:09:19 +02:00
meet_criteria ( flag_multiple_results , allow_class = " logical" , has_length = 1 )
meet_criteria ( info , allow_class = " logical" , has_length = 1 )
2020-02-14 19:54:13 +01:00
check_dataset_integrity ( )
2019-05-10 16:44:59 +02:00
if ( is.ab ( x ) ) {
return ( x )
}
2020-07-13 09:17:24 +02:00
2020-06-26 12:31:27 +02:00
initial_search <- is.null ( list ( ... ) $ initial_search )
2020-06-25 17:34:50 +02:00
already_regex <- isTRUE ( list ( ... ) $ already_regex )
2021-02-02 23:57:35 +01:00
fast_mode <- isTRUE ( list ( ... ) $ fast_mode )
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
x_bak <- x
2020-06-25 17:34:50 +02:00
x <- toupper ( x )
2021-03-05 10:32:09 +01:00
x_nonNA <- x [ ! is.na ( x ) ]
2021-03-08 02:38:32 +01:00
if ( all ( x_nonNA %in% antibiotics $ ab , na.rm = TRUE ) ) {
# all valid AB codes, but not yet right class
return ( set_clean_class ( x ,
new_class = c ( " ab" , " character" ) ) )
}
if ( all ( x_nonNA %in% toupper ( antibiotics $ name ) , na.rm = TRUE ) ) {
# all valid AB names
out <- antibiotics $ ab [match ( x , toupper ( antibiotics $ name ) ) ]
out [is.na ( x ) ] <- NA_character_
return ( out )
}
if ( all ( x_nonNA %in% antibiotics $ atc , na.rm = TRUE ) ) {
# all valid ATC codes
out <- antibiotics $ ab [match ( x , antibiotics $ atc ) ]
out [is.na ( x ) ] <- NA_character_
return ( out )
}
2021-03-05 10:32:09 +01:00
2019-10-04 15:36:12 +02:00
# remove diacritics
x <- iconv ( x , from = " UTF-8" , to = " ASCII//TRANSLIT" )
x <- gsub ( ' "' , " " , x , fixed = TRUE )
2021-02-02 23:57:35 +01:00
x <- gsub ( " (specimen|specimen date|specimen_date|spec_date|^dates?$)" , " " , x , ignore.case = TRUE , perl = TRUE )
2020-06-25 17:34:50 +02:00
x_bak_clean <- x
if ( already_regex == FALSE ) {
2020-09-18 16:05:53 +02:00
x_bak_clean <- generalise_antibiotic_name ( x_bak_clean )
2020-06-25 17:34:50 +02:00
}
2020-07-13 09:17:24 +02:00
2020-09-19 11:54:01 +02:00
x <- unique ( x_bak_clean ) # this means that every x is in fact generalise_antibiotic_name(x)
2019-05-10 16:44:59 +02:00
x_new <- rep ( NA_character_ , length ( x ) )
x_unknown <- character ( 0 )
2020-06-02 16:05:56 +02:00
2020-06-26 12:31:27 +02:00
note_if_more_than_one_found <- function ( found , index , from_text ) {
if ( initial_search == TRUE & isTRUE ( length ( from_text ) > 1 ) ) {
2020-09-24 00:30:11 +02:00
abnames <- ab_name ( from_text , tolower = TRUE , initial_search = FALSE )
if ( ab_name ( found [1L ] , language = NULL ) %like% " clavulanic acid" ) {
abnames <- abnames [ ! abnames == " clavulanic acid" ]
}
if ( length ( abnames ) > 1 ) {
2020-10-27 15:56:51 +01:00
message_ ( " More than one result was found for item " , index , " : " ,
2021-02-04 16:48:16 +01:00
vector_and ( abnames , quotes = FALSE ) )
2020-09-24 00:30:11 +02:00
}
2020-06-26 12:31:27 +02:00
}
found [1L ]
}
if ( initial_search == TRUE ) {
2020-09-18 16:05:53 +02:00
progress <- progress_ticker ( n = length ( x ) , n_min = ifelse ( isTRUE ( info ) , 25 , length ( x ) + 1 ) ) # start if n >= 25
2020-06-26 12:31:27 +02:00
on.exit ( close ( progress ) )
}
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( length ( x ) ) ) {
2021-02-02 23:57:35 +01:00
2020-06-26 12:31:27 +02:00
if ( initial_search == TRUE ) {
progress $ tick ( )
}
2019-05-10 16:44:59 +02:00
if ( is.na ( x [i ] ) | is.null ( x [i ] ) ) {
next
}
2020-06-02 16:05:56 +02:00
if ( identical ( x [i ] , " " ) |
# no short names:
nchar ( x [i ] ) <= 2 |
# prevent "bacteria" from coercing to TMP, since Bacterial is a brand name of it:
identical ( tolower ( x [i ] ) , " bacteria" ) ) {
2019-06-07 22:47:37 +02:00
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
next
}
2020-06-02 16:05:56 +02:00
2021-02-02 23:57:35 +01:00
if ( fast_mode == FALSE && flag_multiple_results == TRUE && x [i ] %like% " [ ]" ) {
2020-12-03 22:30:14 +01:00
from_text <- tryCatch ( suppressWarnings ( ab_from_text ( x [i ] , initial_search = FALSE , translate_ab = FALSE ) [ [1 ] ] ) ,
error = function ( e ) character ( 0 ) )
2020-06-26 12:31:27 +02:00
} else {
from_text <- character ( 0 )
}
2021-01-12 22:08:04 +01:00
# old code for phenoxymethylpenicillin (Peni V)
if ( x [i ] == " PNV" ) {
x_new [i ] <- " PHN"
next
}
2020-08-14 13:36:10 +02:00
# exact name
2020-09-18 16:05:53 +02:00
found <- antibiotics [which ( AB_lookup $ generalised_name == x [i ] ) , ] $ ab
2020-08-14 13:36:10 +02:00
if ( length ( found ) > 0 ) {
x_new [i ] <- found [1L ]
next
}
2019-05-10 16:44:59 +02:00
# exact AB code
2020-06-25 17:34:50 +02:00
found <- antibiotics [which ( antibiotics $ ab == x [i ] ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# exact ATC code
2020-06-25 17:34:50 +02:00
found <- antibiotics [which ( antibiotics $ atc == x [i ] ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# exact CID code
2020-02-14 19:54:13 +01:00
found <- antibiotics [which ( antibiotics $ cid == x [i ] ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-01-26 20:20:00 +01:00
# exact LOINC code
2020-09-18 16:05:53 +02:00
loinc_found <- unlist ( lapply ( AB_lookup $ generalised_loinc ,
2020-09-19 11:54:01 +02:00
function ( s ) x [i ] %in% s ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [loinc_found == TRUE ]
2020-01-26 20:20:00 +01:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-01-26 20:20:00 +01:00
next
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# exact synonym
2020-09-18 16:05:53 +02:00
synonym_found <- unlist ( lapply ( AB_lookup $ generalised_synonyms ,
2020-09-19 11:54:01 +02:00
function ( s ) x [i ] %in% s ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [synonym_found == TRUE ]
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# exact abbreviation
2020-09-18 16:05:53 +02:00
abbr_found <- unlist ( lapply ( AB_lookup $ generalised_abbreviations ,
2020-09-19 11:54:01 +02:00
function ( s ) x [i ] %in% s ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [abbr_found == TRUE ]
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# allow characters that resemble others, but only continue when having more than 3 characters
if ( nchar ( x [i ] ) <= 3 ) {
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
next
}
2020-06-25 17:34:50 +02:00
x_spelling <- x [i ]
if ( already_regex == FALSE ) {
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " [IY]+" , " [IY]+" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " (C|K|Q|QU|S|Z|X|KS)+" , " (C|K|Q|QU|S|Z|X|KS)+" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " (PH|F|V)+" , " (PH|F|V)+" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " (TH|T)+" , " (TH|T)+" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " A+" , " A+" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " E+" , " E+" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " O+" , " O+" , x_spelling , perl = TRUE )
2020-06-25 17:34:50 +02:00
# allow any ending of -in/-ine and -im/-ime
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " (\\[IY\\]\\+(N|M)|\\[IY\\]\\+(N|M)E\\+)$" , " [IY]+(N|M)E*" , x_spelling , perl = TRUE )
2020-06-25 17:34:50 +02:00
# allow any ending of -ol/-ole
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " (O\\+L|O\\+LE\\+)$" , " O+LE*" , x_spelling , perl = TRUE )
2020-06-25 17:34:50 +02:00
# allow any ending of -on/-one
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " (O\\+N|O\\+NE\\+)$" , " O+NE*" , x_spelling , perl = TRUE )
2020-06-25 17:34:50 +02:00
# replace multiple same characters to single one with '+', like "ll" -> "l+"
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " (.)\\1+" , " \\1+" , x_spelling , perl = TRUE )
2020-06-25 17:34:50 +02:00
# replace spaces and slashes with a possibility on both
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " [ /]" , " ( .*|.*/)" , x_spelling , perl = TRUE )
2020-06-25 17:34:50 +02:00
# correct for digital reading text (OCR)
2020-09-25 14:44:50 +02:00
x_spelling <- gsub ( " [NRD8B]" , " [NRD8B]" , x_spelling , perl = TRUE )
x_spelling <- gsub ( " (O|0)" , " (O|0)+" , x_spelling , perl = TRUE )
2020-07-01 16:21:36 +02:00
x_spelling <- gsub ( " ++" , " +" , x_spelling , fixed = TRUE )
2020-06-25 17:34:50 +02:00
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# try if name starts with it
2020-09-18 16:05:53 +02:00
found <- antibiotics [which ( AB_lookup $ generalised_name %like% paste0 ( " ^" , x_spelling ) ) , ] $ ab
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-06-25 17:34:50 +02:00
# try if name ends with it
2020-09-18 16:05:53 +02:00
found <- antibiotics [which ( AB_lookup $ generalised_name %like% paste0 ( x_spelling , " $" ) ) , ] $ ab
2020-06-25 17:34:50 +02:00
if ( nchar ( x [i ] ) >= 4 & length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2020-07-13 09:17:24 +02:00
2019-05-10 16:44:59 +02:00
# and try if any synonym starts with it
2020-09-18 16:05:53 +02:00
synonym_found <- unlist ( lapply ( AB_lookup $ generalised_synonyms ,
2020-09-19 11:54:01 +02:00
function ( s ) any ( s %like% paste0 ( " ^" , x_spelling ) ) ) )
2020-02-14 19:54:13 +01:00
found <- antibiotics $ ab [synonym_found == TRUE ]
2019-05-10 16:44:59 +02:00
if ( length ( found ) > 0 ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-05-10 16:44:59 +02:00
next
}
2020-07-13 09:17:24 +02:00
2020-06-26 12:31:27 +02:00
# INITIAL SEARCH - More uncertain results ----
2021-02-02 23:57:35 +01:00
if ( initial_search == TRUE && fast_mode == FALSE ) {
2020-06-25 17:34:50 +02:00
# only run on first try
# try by removing all spaces
if ( x [i ] %like% " " ) {
2020-09-25 14:44:50 +02:00
found <- suppressWarnings ( as.ab ( gsub ( " +" , " " , x [i ] , perl = TRUE ) , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
if ( length ( found ) > 0 & ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2019-06-11 14:18:25 +02:00
}
2020-06-25 17:34:50 +02:00
# try by removing all spaces and numbers
if ( x [i ] %like% " " | x [i ] %like% " [0-9]" ) {
2020-09-25 14:44:50 +02:00
found <- suppressWarnings ( as.ab ( gsub ( " [ 0-9]" , " " , x [i ] , perl = TRUE ) , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
if ( length ( found ) > 0 & ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2019-06-11 14:18:25 +02:00
}
2020-06-25 17:34:50 +02:00
2019-10-04 15:36:12 +02:00
# transform back from other languages and try again
2020-09-18 16:05:53 +02:00
x_translated <- paste ( lapply ( strsplit ( x [i ] , " [^A-Z0-9]" ) ,
2019-10-04 15:36:12 +02:00
function ( y ) {
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( length ( y ) ) ) {
2021-03-05 01:31:46 +01:00
for ( lang in LANGUAGES_SUPPORTED [LANGUAGES_SUPPORTED != " en" ] ) {
y [i ] <- ifelse ( tolower ( y [i ] ) %in% tolower ( translations_file [ , lang , drop = TRUE ] ) ,
translations_file [which ( tolower ( translations_file [ , lang , drop = TRUE ] ) == tolower ( y [i ] ) &
! isFALSE ( translations_file $ fixed ) ) , " pattern" ] ,
y [i ] )
}
2019-10-04 15:36:12 +02:00
}
2020-09-18 16:05:53 +02:00
generalise_antibiotic_name ( y )
2019-10-04 15:36:12 +02:00
} ) [ [1 ] ] ,
collapse = " /" )
2020-06-26 12:31:27 +02:00
x_translated_guess <- suppressWarnings ( as.ab ( x_translated , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
if ( ! is.na ( x_translated_guess ) ) {
x_new [i ] <- x_translated_guess
next
}
# now also try to coerce brandname combinations like "Amoxy/clavulanic acid"
x_translated <- paste ( lapply ( strsplit ( x_translated , " [^A-Z0-9 ]" ) ,
function ( y ) {
for ( i in seq_len ( length ( y ) ) ) {
2020-06-26 12:31:27 +02:00
y_name <- suppressWarnings ( ab_name ( y [i ] , language = NULL , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
y [i ] <- ifelse ( ! is.na ( y_name ) ,
y_name ,
y [i ] )
}
2020-09-18 16:05:53 +02:00
generalise_antibiotic_name ( y )
2020-06-25 17:34:50 +02:00
} ) [ [1 ] ] ,
collapse = " /" )
2020-06-26 12:31:27 +02:00
x_translated_guess <- suppressWarnings ( as.ab ( x_translated , initial_search = FALSE ) )
2019-10-04 15:36:12 +02:00
if ( ! is.na ( x_translated_guess ) ) {
x_new [i ] <- x_translated_guess
next
}
2019-10-06 21:07:38 +02:00
2020-06-25 17:34:50 +02:00
# try by removing all trailing capitals
if ( x [i ] %like_case% " [a-z]+[A-Z]+$" ) {
2020-09-25 14:44:50 +02:00
found <- suppressWarnings ( as.ab ( gsub ( " [A-Z]+$" , " " , x [i ] , perl = TRUE ) , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2019-10-06 21:07:38 +02:00
next
}
2019-10-04 15:36:12 +02:00
}
2020-06-25 17:34:50 +02:00
# keep only letters
2020-09-25 14:44:50 +02:00
found <- suppressWarnings ( as.ab ( gsub ( " [^A-Z]" , " " , x [i ] , perl = TRUE ) , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-04-14 14:12:31 +02:00
next
}
2020-06-25 17:34:50 +02:00
# try from a bigger text, like from a health care record, see ?ab_from_text
2020-06-26 12:31:27 +02:00
# already calculated above if flag_multiple_results = TRUE
2021-02-02 23:57:35 +01:00
if ( flag_multiple_results == TRUE ) {
2020-06-26 12:31:27 +02:00
found <- from_text [1L ]
} else {
2020-12-03 22:30:14 +01:00
found <- tryCatch ( suppressWarnings ( ab_from_text ( x [i ] , initial_search = FALSE , translate_ab = FALSE ) [ [1 ] ] [1L ] ) ,
error = function ( e ) NA_character_ )
2020-06-26 12:31:27 +02:00
}
2020-06-25 17:34:50 +02:00
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2020-07-13 09:17:24 +02:00
2020-06-25 17:34:50 +02:00
# first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!)
2020-06-26 12:31:27 +02:00
found <- suppressWarnings ( as.ab ( substr ( x [i ] , 1 , 5 ) , initial_search = FALSE ) )
if ( ! is.na ( found ) && ! ab_group ( found , initial_search = FALSE ) %like% " cephalosporins" ) {
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2020-06-26 12:31:27 +02:00
found <- suppressWarnings ( as.ab ( substr ( x [i ] , 1 , 7 ) , initial_search = FALSE ) )
2020-06-25 17:34:50 +02:00
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
# make all consonants facultative
2020-09-25 14:44:50 +02:00
search_str <- gsub ( " ([BCDFGHJKLMNPQRSTVWXZ])" , " \\1*" , x [i ] , perl = TRUE )
2020-06-26 12:31:27 +02:00
found <- suppressWarnings ( as.ab ( search_str , initial_search = FALSE , already_regex = TRUE ) )
2020-06-25 17:34:50 +02:00
# keep at least 4 normal characters
2020-09-25 14:44:50 +02:00
if ( nchar ( gsub ( " .\\*" , " " , search_str , perl = TRUE ) ) < 4 ) {
2020-06-25 17:34:50 +02:00
found <- NA
}
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2020-07-13 09:17:24 +02:00
2020-06-25 17:34:50 +02:00
# make all vowels facultative
2020-09-25 14:44:50 +02:00
search_str <- gsub ( " ([AEIOUY])" , " \\1*" , x [i ] , perl = TRUE )
2020-06-26 12:31:27 +02:00
found <- suppressWarnings ( as.ab ( search_str , initial_search = FALSE , already_regex = TRUE ) )
2020-06-25 17:34:50 +02:00
# keep at least 5 normal characters
2020-09-25 14:44:50 +02:00
if ( nchar ( gsub ( " .\\*" , " " , search_str , perl = TRUE ) ) < 5 ) {
2020-06-25 17:34:50 +02:00
found <- NA
}
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
# allow misspelling of vowels
x_spelling <- gsub ( " A+" , " [AEIOU]+" , x_spelling , fixed = TRUE )
x_spelling <- gsub ( " E+" , " [AEIOU]+" , x_spelling , fixed = TRUE )
x_spelling <- gsub ( " I+" , " [AEIOU]+" , x_spelling , fixed = TRUE )
x_spelling <- gsub ( " O+" , " [AEIOU]+" , x_spelling , fixed = TRUE )
x_spelling <- gsub ( " U+" , " [AEIOU]+" , x_spelling , fixed = TRUE )
2020-06-26 12:31:27 +02:00
found <- suppressWarnings ( as.ab ( x_spelling , initial_search = FALSE , already_regex = TRUE ) )
2020-06-25 17:34:50 +02:00
if ( ! is.na ( found ) ) {
2020-06-26 12:31:27 +02:00
x_new [i ] <- note_if_more_than_one_found ( found , i , from_text )
2020-06-25 17:34:50 +02:00
next
}
2020-07-01 16:21:36 +02:00
# try with switched character, like "mreopenem"
for ( j in seq_len ( nchar ( x [i ] ) ) ) {
x_switched <- paste0 (
# beginning part:
substr ( x [i ] , 1 , j - 1 ) ,
# here is the switching of 2 characters:
substr ( x [i ] , j + 1 , j + 1 ) ,
substr ( x [i ] , j , j ) ,
# ending part:
substr ( x [i ] , j + 2 , nchar ( x [i ] ) ) )
found <- suppressWarnings ( as.ab ( x_switched , initial_search = FALSE ) )
if ( ! is.na ( found ) ) {
break
}
}
if ( ! is.na ( found ) ) {
x_new [i ] <- found [1L ]
next
}
2020-06-26 12:31:27 +02:00
} # end of initial_search = TRUE
2020-07-01 16:21:36 +02:00
2019-05-10 16:44:59 +02:00
# not found
x_unknown <- c ( x_unknown , x_bak [x [i ] == x_bak_clean ] [1 ] )
}
2020-06-26 12:31:27 +02:00
if ( initial_search == TRUE ) {
close ( progress )
}
2020-07-13 09:17:24 +02:00
2019-06-11 15:31:32 +02:00
# take failed ATC codes apart from rest
x_unknown_ATCs <- x_unknown [x_unknown %like% " [A-Z][0-9][0-9][A-Z][A-Z][0-9][0-9]" ]
x_unknown <- x_unknown [ ! x_unknown %in% x_unknown_ATCs ]
if ( length ( x_unknown_ATCs ) > 0 ) {
2020-11-10 16:35:56 +01:00
warning_ ( " These ATC codes are not (yet) in the antibiotics data set: " ,
2021-02-04 16:48:16 +01:00
vector_and ( x_unknown_ATCs ) , " ." ,
2020-11-10 16:35:56 +01:00
call = FALSE )
2019-06-11 15:31:32 +02:00
}
2020-06-25 17:34:50 +02:00
2021-02-02 23:57:35 +01:00
if ( length ( x_unknown ) > 0 & fast_mode == FALSE ) {
2020-11-10 16:35:56 +01:00
warning_ ( " These values could not be coerced to a valid antimicrobial ID: " ,
2021-02-04 16:48:16 +01:00
vector_and ( x_unknown ) , " ." ,
2020-11-10 16:35:56 +01:00
" ." ,
call = FALSE )
2019-05-10 16:44:59 +02:00
}
2020-07-13 09:17:24 +02:00
2020-09-18 16:05:53 +02:00
x_result <- data.frame ( x = x_bak_clean , stringsAsFactors = FALSE ) %pm>%
pm_left_join ( data.frame ( x = x , x_new = x_new , stringsAsFactors = FALSE ) , by = " x" ) %pm>%
2021-02-02 23:57:35 +01:00
pm_pull ( x_new )
2020-07-13 09:17:24 +02:00
2019-05-16 21:20:00 +02:00
if ( length ( x_result ) == 0 ) {
x_result <- NA_character_
}
2020-07-13 09:17:24 +02:00
2020-11-16 16:57:55 +01:00
set_clean_class ( x_result ,
new_class = c ( " ab" , " character" ) )
2019-05-10 16:44:59 +02:00
}
2019-05-16 21:20:00 +02:00
#' @rdname as.ab
2019-05-10 16:44:59 +02:00
#' @export
is.ab <- function ( x ) {
2020-01-31 23:27:38 +01:00
inherits ( x , " ab" )
2019-05-10 16:44:59 +02:00
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 15:34:12 +02:00
pillar_shaft.ab <- function ( x , ... ) {
2020-08-28 21:55:47 +02:00
out <- trimws ( format ( x ) )
out [is.na ( x ) ] <- font_na ( NA )
create_pillar_column ( out , align = " left" , min_width = 4 )
2020-08-26 15:34:12 +02:00
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 15:34:12 +02:00
type_sum.ab <- function ( x , ... ) {
" ab"
}
2020-05-28 16:48:55 +02:00
#' @method print ab
2019-05-10 16:44:59 +02:00
#' @export
#' @noRd
print.ab <- function ( x , ... ) {
2020-05-27 16:37:49 +02:00
cat ( " Class <ab>\n" )
2019-08-07 15:37:39 +02:00
print ( as.character ( x ) , quote = FALSE )
2019-05-10 16:44:59 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method as.data.frame ab
2019-05-10 16:44:59 +02:00
#' @export
#' @noRd
2020-05-19 13:18:01 +02:00
as.data.frame.ab <- function ( x , ... ) {
2020-05-19 12:08:49 +02:00
nm <- deparse1 ( substitute ( x ) )
2019-05-10 16:44:59 +02:00
if ( ! " nm" %in% names ( list ( ... ) ) ) {
2020-05-19 12:08:49 +02:00
as.data.frame.vector ( as.ab ( x ) , ... , nm = nm )
2019-05-10 16:44:59 +02:00
} else {
2020-05-19 12:08:49 +02:00
as.data.frame.vector ( as.ab ( x ) , ... )
2019-05-10 16:44:59 +02:00
}
}
2020-05-28 16:48:55 +02:00
#' @method [ ab
2019-05-10 16:44:59 +02:00
#' @export
#' @noRd
2019-08-14 14:57:06 +02:00
" [.ab" <- function ( x , ... ) {
2019-08-12 14:48:09 +02:00
y <- NextMethod ( )
2019-08-14 14:57:06 +02:00
attributes ( y ) <- attributes ( x )
y
}
2020-05-28 16:48:55 +02:00
#' @method [[ ab
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [[.ab" <- function ( x , ... ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( x )
2019-08-14 14:57:06 +02:00
y
}
2020-05-28 16:48:55 +02:00
#' @method [<- ab
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [<-.ab" <- function ( i , j , ... , value ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-02-14 19:54:13 +01:00
class_integrity_check ( y , " antimicrobial code" , antibiotics $ ab )
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method [[<- ab
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
" [[<-.ab" <- function ( i , j , ... , value ) {
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-02-14 19:54:13 +01:00
class_integrity_check ( y , " antimicrobial code" , antibiotics $ ab )
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method c ab
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
c.ab <- function ( x , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
2020-02-14 19:54:13 +01:00
class_integrity_check ( y , " antimicrobial code" , antibiotics $ ab )
2019-05-10 16:44:59 +02:00
}
2020-09-18 16:05:53 +02:00
2020-09-25 14:44:50 +02:00
#' @method unique ab
#' @export
#' @noRd
unique.ab <- function ( x , incomparables = FALSE , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
y
}
2020-09-18 16:05:53 +02:00
generalise_antibiotic_name <- function ( x ) {
x <- toupper ( x )
# remove suffices
2020-09-25 14:44:50 +02:00
x <- gsub ( " _(MIC|RSI|DIS[CK])$" , " " , x , perl = TRUE )
2020-09-18 16:05:53 +02:00
# remove disk concentrations, like LVX_NM -> LVX
2020-09-25 14:44:50 +02:00
x <- gsub ( " _[A-Z]{2}[0-9_.]{0,3}$" , " " , x , perl = TRUE )
2020-09-18 16:05:53 +02:00
# remove part between brackets if that's followed by another string
x <- gsub ( " (.*)+ [(].*[)]" , " \\1" , x )
# keep only max 1 space
2020-09-25 14:44:50 +02:00
x <- trimws2 ( gsub ( " +" , " " , x , perl = TRUE ) )
2020-09-18 16:05:53 +02:00
# non-character, space or number should be a slash
2020-09-25 14:44:50 +02:00
x <- gsub ( " [^A-Z0-9 -]" , " /" , x , perl = TRUE )
2020-09-18 16:05:53 +02:00
# spaces around non-characters must be removed: amox + clav -> amox/clav
2020-09-25 14:44:50 +02:00
x <- gsub ( " (.*[A-Z0-9]) ([^A-Z0-9].*)" , " \\1\\2" , x , perl = TRUE )
x <- gsub ( " (.*[^A-Z0-9]) ([A-Z0-9].*)" , " \\1\\2" , x , perl = TRUE )
2020-09-18 16:05:53 +02:00
# remove hyphen after a starting "co"
2020-09-25 14:44:50 +02:00
x <- gsub ( " ^CO-" , " CO" , x , perl = TRUE )
2020-09-18 16:05:53 +02:00
# replace operators with a space
2020-09-25 14:44:50 +02:00
x <- gsub ( " (/| AND | WITH | W/|[+]|[-])+" , " " , x , perl = TRUE )
2020-09-18 16:05:53 +02:00
x
}