diff --git a/DESCRIPTION b/DESCRIPTION index f8366141..d652038a 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.2.0.9019 -Date: 2018-08-01 +Version: 0.2.0.9020 +Date: 2018-08-02 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NEWS.md b/NEWS.md index b3daa2a4..80c5afdb 100755 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,11 @@ * Universal: amoxicillin, amoxicillin/clavlanic acid, cefuroxime, piperacillin/tazobactam, ciprofloxacin, trimethoprim/sulfamethoxazole * Gram-positive: vancomycin, teicoplanin, tetracycline, erythromycin, oxacillin, rifampicin * Gram-negative: gentamicin, tobramycin, colistin, cefotaxime, ceftazidime, meropenem -* Functions `as.bactid` and `is.bactid` to transform/look up microbial ID's +* Determining bacterial ID: + * New functions `as.bactid` and `is.bactid` to transform/look up microbial ID's. + * The existing function `guess_bactid` is now an alias of `as.bactid` + * New Becker classification for *Staphylococcus* to categorise them into Coagulase Negative *Staphylococci* (CoNS) and Coagulase Positive *Staphylococci* (CoPS) + * New Lancefield classification for *Streptococcus* to categorise them into Lancefield groups * For convience, new descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices * Function `g.test` to perform the Χ2 distributed [*G*-test](https://en.wikipedia.org/wiki/G-test), which use is the same as `chisq.test` * Function `ratio` to transform a vector of values to a preset ratio diff --git a/R/bactid.R b/R/bactid.R index 0c1bcb98..07218704 100644 --- a/R/bactid.R +++ b/R/bactid.R @@ -20,10 +20,12 @@ #' #' Use this function to determine a valid ID based on a genus (and species). This input can be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. 
You could also \code{\link{select}} a genus and species column, zie Examples. #' @param x a character vector or a dataframe with one or two columns +#' @param Becker a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1]. This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS". +#' @param Lancefield a logical to indicate whether beta-haemolytic \emph{Streptococci} should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield [2]. These \emph{Streptococci} will be categorised in their first group, i.e. \emph{Streptococcus dysgalactiae} will be group C, although officially it was also categorised into groups G and L. Groups D and E will be ignored, since they are \emph{Enterococci}. #' @rdname as.bactid -#' @details \code{guess_bactid} does exactly the same as \code{as.bactid}. +#' @details \code{guess_bactid} is an alias of \code{as.bactid}. #' -#' Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. For example: +#' Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. These are: #' \itemize{ #' \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first} #' \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae}} @@ -32,6 +34,11 @@ #' } #' Moreover, this function also supports ID's based on only Gram stain, when the species is not known. \cr #' For example, \code{"Gram negative rods"} and \code{"GNR"} will both return the ID of a Gram negative rod: \code{GNR}. 
+#' @source +#' [1] Becker K \emph{et al.} \strong{Coagulase-Negative Staphylococci}. 2014. Clin Microbiol Rev. 27(4): 870–926. \cr +#' \url{https://dx.doi.org/10.1128/CMR.00109-13} \cr +#' [2] Lancefield RC \strong{A serological differentiation of human and other groups of hemolytic streptococci}. 1933. J Exp Med. 57(4): 571–95. \cr +#' \url{https://dx.doi.org/10.1084/jem.57.4.571} #' @export #' @importFrom dplyr %>% filter pull #' @return Character (vector) with class \code{"bactid"}. Unknown values will return \code{NA}. @@ -48,6 +55,12 @@ #' as.bactid("VISA") # Vancomycin Intermediate S. aureus #' as.bactid("VRSA") # Vancomycin Resistant S. aureus #' +#' guess_bactid("S. epidermidis") # will remain species: STAEPI +#' guess_bactid("S. epidermidis", Becker = TRUE) # will not remain species: STACNS +#' +#' guess_bactid("S. pyogenes") # will remain species: STCPYO +#' guess_bactid("S. pyogenes", Lancefield = TRUE) # will not remain species: STCGRA +#' #' \dontrun{ #' df$bactid <- as.bactid(df$microorganism_name) #' @@ -66,7 +79,7 @@ #' df <- df %>% #' mutate(bactid = guess_bactid(paste(genus, species))) #' } -as.bactid <- function(x) { +as.bactid <- function(x, Becker = FALSE, Lancefield = FALSE) { failures <- character(0) @@ -96,13 +109,79 @@ as.bactid <- function(x) { x <- trimws(x, which = "both") x.backup <- x # replace space by regex sign + x_withspaces <- gsub(" ", ".* ", x, fixed = TRUE) x <- gsub(" ", ".*", x, fixed = TRUE) - # add start and stop + # for species x_species <- paste(x, 'species') + # add start and stop regex x <- paste0('^', x, '$') + x_withspaces <- paste0('^', x_withspaces, '$') for (i in 1:length(x)) { + if (Becker == TRUE | Becker == "all") { + mo <- suppressWarnings(guess_bactid(x.fullbackup[i])) + if (mo %like% '^STA') { + # See Source. 
It's this figure: + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/ + species <- left_join_microorganisms(mo)$species + if (species %in% c("arlettae", "auricularis", "capitis", + "caprae", "carnosus", "cohnii", "condimene", + "devriesei", "epidermidis", "equorum", + "fleurettii", "gallinarum", "haemolyticus", + "hominis", "jettensis", "kloosii", "lentus", + "lugdunensis", "massiliensis", "microti", + "muscae", "nepalensis", "pasteuri", "perrasii", + "pettenkoleri", "piscifermentans", "rostri", + "saccharott", "saprophyticus", "sciuri", + "siepanovicii", "simulans", "succinus", + "vitulinus", "warneri", "xylosus")) { + x[i] <- "STACNS" + next + } else if ((Becker == "all" & species == "aureus") + | species %in% c("simiae", "agnetis", "chromogenes", + "delphirul", "felis", "futrae", + "hyicus", "intermedius", + "pseudointermedius", "schleiferi")) { + x[i] <- "STACPS" + next + } + } + } + + if (Lancefield == TRUE) { + mo <- suppressWarnings(guess_bactid(x.fullbackup[i])) + if (mo %like% '^STC') { + # See Source + species <- left_join_microorganisms(mo)$species + if (species == "pyogenes") { + x[i] <- "STCGRA" + next + } + if (species == "agalactiae") { + x[i] <- "STCGRB" + next + } + if (species %in% c("equisimilis", "equi", + "zooepidemicus", "dysgalactiae")) { + x[i] <- "STCGRC" + next + } + if (species == "anginosus") { + x[i] <- "STCGRF" + next + } + if (species == "sanguis") { + x[i] <- "STCGRH" + next + } + if (species == "salivarius") { + x[i] <- "STCGRK" + next + } + } + } + if (identical(x.backup[i], "")) { # empty values x[i] <- NA @@ -142,7 +221,7 @@ as.bactid <- function(x) { x[i] <- 'PSEAER' next } - if (tolower(x[i]) %like% 'coagulase' + if (tolower(x[i]) %like% 'coagulase negative' | tolower(x[i]) %like% 'cns' | tolower(x[i]) %like% 'cons') { # coerce S. 
coagulase negative, also as CNS and CoNS @@ -192,7 +271,14 @@ as.bactid <- function(x) { next } - # try any match + # try any match keeping spaces + found <- AMR::microorganisms[which(AMR::microorganisms$fullname %like% x_withspaces[i]),]$bactid + if (length(found) > 0) { + x[i] <- found[1L] + next + } + + # try any match disregarding spaces found <- AMR::microorganisms[which(AMR::microorganisms$fullname %like% x[i]),]$bactid if (length(found) > 0) { x[i] <- found[1L] @@ -200,7 +286,7 @@ as.bactid <- function(x) { } # try exact match of only genus, with 'species' attached - # (e.g. this prevents Streptococcus for becoming Peptostreptococcus, since "p" < "s") + # (this prevents Streptococcus from becoming Peptostreptococcus, since "p" < "s") found <- AMR::microorganisms[which(AMR::microorganisms$fullname == x_species[i]),]$bactid if (length(found) > 0) { x[i] <- found[1L] diff --git a/R/data.R b/R/data.R index a83b798c..aee20e3e 100755 --- a/R/data.R +++ b/R/data.R @@ -236,8 +236,8 @@ #' Dataset with ~2500 microorganisms #' -#' A dataset containing 2453 microorganisms. MO codes of the UMCG can be looked up using \code{\link{microorganisms.umcg}}. -#' @format A data.frame with 2453 observations and 12 variables: +#' A dataset containing 2456 microorganisms. MO codes of the UMCG can be looked up using \code{\link{microorganisms.umcg}}. 
+#' @format A data.frame with 2456 observations and 12 variables: #' \describe{ #' \item{\code{bactid}}{ID of microorganism} #' \item{\code{bactsys}}{Bactsyscode of microorganism} diff --git a/README.md b/README.md index 805465f6..b68843fc 100755 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ With `AMR` you can: * Universal: amoxicillin, amoxicillin/clavlanic acid, cefuroxime, piperacillin/tazobactam, ciprofloxacin, trimethoprim/sulfamethoxazole * Specific for Gram-positives: vancomycin, teicoplanin, tetracycline, erythromycin, oxacillin, rifampicin * Specific for Gram-negatives: gentamicin, tobramycin, colistin, cefotaxime, ceftazidime, meropenem +* Categorise *Staphylococci* into Coagulase Negative *Staphylococci* (CoNS) and Coagulase Positive *Staphylococci* (CoPS) according to [Karsten Becker *et al.*](https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/25278577/) +* Categorise *Streptococci* into Lancefield groups * Get antimicrobial ATC properties from the WHO Collaborating Centre for Drug Statistics Methodology ([WHOCC](https://www.whocc.no/atc_ddd_methodology/who_collaborating_centre/)), to be able to: * Translate antibiotic codes (like *AMOX*), official names (like *amoxicillin*) and even trade names (like *Amoxil* or *Trimox*) to an [ATC code](https://www.whocc.no/atc_ddd_index/?code=J01CA04&showdescription=no) (like *J01CA04*) and vice versa with the `abname` function * Get the latest antibiotic properties like hierarchic groups and [defined daily dose](https://en.wikipedia.org/wiki/Defined_daily_dose) (DDD) with units and administration form from the WHOCC website with the `atc_property` function diff --git a/data/microorganisms.rda b/data/microorganisms.rda index d66b105d..a540ee69 100755 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/man/as.bactid.Rd b/man/as.bactid.Rd index ebd58f68..4bf99cc7 100644 --- a/man/as.bactid.Rd +++ b/man/as.bactid.Rd @@ -5,15 +5,25 @@ \alias{guess_bactid} \alias{is.bactid} 
\title{Transform to bacteria ID} +\source{ +[1] Becker K \emph{et al.} \strong{Coagulase-Negative Staphylococci}. 2014. Clin Microbiol Rev. 27(4): 870–926. \cr + \url{https://dx.doi.org/10.1128/CMR.00109-13} \cr +[2] Lancefield RC \strong{A serological differentiation of human and other groups of hemolytic streptococci}. 1933. J Exp Med. 57(4): 571–95. \cr + \url{https://dx.doi.org/10.1084/jem.57.4.571} +} \usage{ -as.bactid(x) +as.bactid(x, Becker = FALSE, Lancefield = FALSE) -guess_bactid(x) +guess_bactid(x, Becker = FALSE, Lancefield = FALSE) is.bactid(x) } \arguments{ \item{x}{a character vector or a dataframe with one or two columns} + +\item{Becker}{a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1]. This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS".} + +\item{Lancefield}{a logical to indicate whether beta-haemolytic \emph{Streptococci} should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield [2]. These \emph{Streptococci} will be categorised in their first group, i.e. \emph{Streptococcus dysgalactiae} will be group C, although officially it was also categorised into groups G and L. Groups D and E will be ignored, since they are \emph{Enterococci}.} } \value{ Character (vector) with class \code{"bactid"}. Unknown values will return \code{NA}. @@ -22,9 +32,9 @@ Character (vector) with class \code{"bactid"}. Unknown values will return \code{ Use this function to determine a valid ID based on a genus (and species). This input can be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples. 
} \details{ -\code{guess_bactid} does exactly the same as \code{as.bactid}. +\code{guess_bactid} is an alias of \code{as.bactid}. -Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. For example: +Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. These are: \itemize{ \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first} \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae}} @@ -46,6 +56,12 @@ as.bactid("MRSA") # Methicillin Resistant S. aureus as.bactid("VISA") # Vancomycin Intermediate S. aureus as.bactid("VRSA") # Vancomycin Resistant S. aureus +guess_bactid("S. epidermidis") # will remain species: STAEPI +guess_bactid("S. epidermidis", Becker = TRUE) # will not remain species: STACNS + +guess_bactid("S. pyogenes") # will remain species: STCPYO +guess_bactid("S. pyogenes", Lancefield = TRUE) # will not remain species: STCGRA + \dontrun{ df$bactid <- as.bactid(df$microorganism_name) diff --git a/man/microorganisms.Rd b/man/microorganisms.Rd index 2dd3a2d6..48a792d3 100755 --- a/man/microorganisms.Rd +++ b/man/microorganisms.Rd @@ -4,7 +4,7 @@ \name{microorganisms} \alias{microorganisms} \title{Dataset with ~2500 microorganisms} -\format{A data.frame with 2453 observations and 12 variables: +\format{A data.frame with 2456 observations and 12 variables: \describe{ \item{\code{bactid}}{ID of microorganism} \item{\code{bactsys}}{Bactsyscode of microorganism} @@ -23,7 +23,7 @@ microorganisms } \description{ -A dataset containing 2453 microorganisms. MO codes of the UMCG can be looked up using \code{\link{microorganisms.umcg}}. +A dataset containing 2456 microorganisms. MO codes of the UMCG can be looked up using \code{\link{microorganisms.umcg}}. 
} \seealso{ \code{\link{guess_bactid}} \code{\link{antibiotics}} \code{\link{microorganisms.umcg}} diff --git a/tests/testthat/test-bactid.R b/tests/testthat/test-bactid.R index 38ff93d9..73b32342 100644 --- a/tests/testthat/test-bactid.R +++ b/tests/testthat/test-bactid.R @@ -30,6 +30,33 @@ test_that("as.bactid works", { "VISA"))), rep("STAAUR", 8)) + # check for Becker classification + expect_identical(as.character(guess_bactid("S. epidermidis", Becker = FALSE)), "STAEPI") + expect_identical(as.character(guess_bactid("S. epidermidis", Becker = TRUE)), "STACNS") + expect_identical(as.character(guess_bactid("STAEPI", Becker = TRUE)), "STACNS") + expect_identical(as.character(guess_bactid("S. intermedius", Becker = FALSE)), "STAINT") + expect_identical(as.character(guess_bactid("S. intermedius", Becker = TRUE)), "STACPS") + expect_identical(as.character(guess_bactid("STAINT", Becker = TRUE)), "STACPS") + # aureus must only be influenced if Becker = "all" + expect_identical(as.character(guess_bactid("STAAUR", Becker = FALSE)), "STAAUR") + expect_identical(as.character(guess_bactid("STAAUR", Becker = TRUE)), "STAAUR") + expect_identical(as.character(guess_bactid("STAAUR", Becker = "all")), "STACPS") + + # check for Lancefield classification + expect_identical(as.character(guess_bactid("S. pyogenes", Lancefield = FALSE)), "STCPYO") + expect_identical(as.character(guess_bactid("S. pyogenes", Lancefield = TRUE)), "STCGRA") + expect_identical(as.character(guess_bactid("STCPYO", Lancefield = TRUE)), "STCGRA") + expect_identical(as.character(guess_bactid("S. agalactiae", Lancefield = FALSE)), "STCAGA") + expect_identical(as.character(guess_bactid("S. agalactiae", Lancefield = TRUE)), "STCGRB") # group B + expect_identical(as.character(guess_bactid("S. equisimilis", Lancefield = FALSE)), "STCEQS") + expect_identical(as.character(guess_bactid("S. equisimilis", Lancefield = TRUE)), "STCGRC") # group C + expect_identical(as.character(guess_bactid("S. 
anginosus", Lancefield = FALSE)), "STCANG") + expect_identical(as.character(guess_bactid("S. anginosus", Lancefield = TRUE)), "STCGRF") # group F + expect_identical(as.character(guess_bactid("S. sanguis", Lancefield = FALSE)), "STCSAN") + expect_identical(as.character(guess_bactid("S. sanguis", Lancefield = TRUE)), "STCGRH") # group H + expect_identical(as.character(guess_bactid("S. salivarius", Lancefield = FALSE)), "STCSAL") + expect_identical(as.character(guess_bactid("S. salivarius", Lancefield = TRUE)), "STCGRK") # group K + # select with one column expect_identical( septic_patients[1:10,] %>% diff --git a/tests/testthat/test-first_isolate.R b/tests/testthat/test-first_isolate.R index 8f37f4f7..4f13ee73 100755 --- a/tests/testthat/test-first_isolate.R +++ b/tests/testthat/test-first_isolate.R @@ -1,7 +1,7 @@ context("first_isolate.R") test_that("first isolates work", { - # septic_patients contains 1959 out of 2000 first isolates + # septic_patients contains 1331 out of 2000 first isolates expect_equal( sum( first_isolate(tbl = septic_patients, @@ -10,9 +10,9 @@ test_that("first isolates work", { col_bactid = "bactid", info = TRUE), na.rm = TRUE), - 1326) + 1331) - # septic_patients contains 1962 out of 2000 first *weighted* isolates + # septic_patients contains 1426 out of 2000 first *weighted* isolates expect_equal( suppressWarnings( sum( @@ -24,8 +24,8 @@ test_that("first isolates work", { type = "keyantibiotics", info = TRUE), na.rm = TRUE)), - 1421) - # and 1961 when using points + 1426) + # and 1430 when using points expect_equal( suppressWarnings( sum( @@ -37,9 +37,9 @@ test_that("first isolates work", { type = "points", info = TRUE), na.rm = TRUE)), - 1425) + 1430) - # septic_patients contains 1732 out of 2000 first non-ICU isolates + # septic_patients contains 1176 out of 2000 first non-ICU isolates expect_equal( sum( first_isolate(septic_patients, @@ -50,7 +50,7 @@ test_that("first isolates work", { info = TRUE, icu_exclude = TRUE), na.rm = TRUE), - 
1171) + 1176) # set 1500 random observations to be of specimen type 'Urine' random_rows <- sample(x = 1:2000, size = 1500, replace = FALSE)