diff --git a/DESCRIPTION b/DESCRIPTION index fb4059e5..b28deb19 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,6 @@ Depends: R (>= 3.0.0) Imports: backports, - broom, clipr, curl, dplyr (>= 0.7.0), diff --git a/NAMESPACE b/NAMESPACE index ce3edb46..60753e49 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(as.data.frame,bactid) S3method(as.data.frame,frequency_tbl) S3method(as.double,mic) S3method(as.integer,mic) @@ -16,6 +17,7 @@ S3method(kurtosis,matrix) S3method(plot,frequency_tbl) S3method(plot,mic) S3method(plot,rsi) +S3method(print,bactid) S3method(print,frequency_tbl) S3method(print,mic) S3method(print,rsi) @@ -32,6 +34,7 @@ export(MDRO) export(MRGN) export(abname) export(anti_join_microorganisms) +export(as.bactid) export(as.mic) export(as.rsi) export(atc_ddd) @@ -48,6 +51,7 @@ export(guess_atc) export(guess_bactid) export(inner_join_microorganisms) export(interpretive_reading) +export(is.bactid) export(is.mic) export(is.rsi) export(key_antibiotics) @@ -68,6 +72,7 @@ export(semi_join_microorganisms) export(skewness) export(susceptibility) export(top_freq) +exportMethods(as.data.frame.bactid) exportMethods(as.data.frame.frequency_tbl) exportMethods(as.double.mic) exportMethods(as.integer.mic) @@ -85,6 +90,7 @@ exportMethods(kurtosis.matrix) exportMethods(plot.frequency_tbl) exportMethods(plot.mic) exportMethods(plot.rsi) +exportMethods(print.bactid) exportMethods(print.frequency_tbl) exportMethods(print.mic) exportMethods(print.rsi) diff --git a/NEWS.md b/NEWS.md index a331d627..850c2a17 100755 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ * **BREAKING**: the methodology for determining first weighted isolates was changed. The antibiotics that are compared between isolates (call *key antibiotics*) to include more first isolates (afterwards called first *weighted* isolates) are now as follows: * Gram-positive: amoxicillin, amoxicillin/clavlanic acid, cefuroxime, piperacillin/tazobactam, ciprofloxacin, trimethoprim/sulfamethoxazole, vancomycin, teicoplanin, tetracycline, erythromycin, oxacillin, rifampicin * Gram-negative: amoxicillin, amoxicillin/clavlanic acid, cefuroxime, piperacillin/tazobactam, ciprofloxacin, trimethoprim/sulfamethoxazole, gentamicin, tobramycin, colistin, cefotaxime, ceftazidime, meropenem +* Functions `as.bactid` and `is.bactid` to transform/look up microbial ID's; this replaces the function `guess_bactid` but it will remain available for backwards compatibility * For convience, new descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices * Function `g.test` to perform the Χ2 distributed [*G*-test](https://en.wikipedia.org/wiki/G-test), which use is the same as `chisq.test` * Function `ratio` to transform a vector of values to a preset ratio @@ -28,8 +29,9 @@ * Printing of class `mic` now shows all MIC values * `%like%` now supports multiple patterns * Frequency tables are now actual `data.frame`s with altered console printing to make it look like a frequency table. Because of this, the parameter `toConsole` is not longer needed. -* Small translational improvements to the `septic_patients` dataset -* Small improvements to the `microorganisms` dataset, especially for *Salmonella* +* Fix for `freq` where the class of an item would be lost +* Small translational improvements to the `septic_patients` dataset and the column `bactid` now has the new class `"bactid"` +* Small improvements to the `microorganisms` dataset (especially for *Salmonella*) and the column `bactid` now has the new class `"bactid"` * Combined MIC/RSI values will now be coerced by the `rsi` and `mic` functions: * `as.rsi("<=0.002; S")` will return `S` * `as.mic("<=0.002; S")` will return `<=0.002` @@ -38,7 +40,8 @@ * Build-in host check for `atc_property` as it requires the host set by `url` to be responsive * Improved `first_isolate` algorithm to exclude isolates where bacteria ID or genus is unavailable * Fix for warning *hybrid evaluation forced for row_number* ([`924b62`](https://github.com/tidyverse/dplyr/commit/924b62)) from the `dplyr` package v0.7.5 and above -* Support for 1 or 2 columns as input for `guess_bactid` +* Support for empty values and for 1 or 2 columns as input for `guess_bactid` (now called `as.bactid`) + * So `yourdata %>% select(genus, species) %>% as.bactid()` now also works #### Other * Unit testing for R 3.0 and the latest available release: https://travis-ci.org/msberends/AMR diff --git a/R/guess_bactid.R b/R/bactid.R similarity index 53% rename from R/guess_bactid.R rename to R/bactid.R index 785ca0ae..93c10e7f 100644 --- a/R/guess_bactid.R +++ b/R/bactid.R @@ -16,37 +16,58 @@ # GNU General Public License for more details. # # ==================================================================== # -#' Find bacteria ID based on genus/species +#' Transform to bacteria ID #' -#' Use this function to determine a valid ID based on a genus (and species). This input could be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples. -#' @param x character vector or a dataframe with one or two columns +#' Use this function to determine a valid ID based on a genus (and species). This input can be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples. +#' @param x a character vector or a dataframe with one or two columns +#' @rdname as.bactid +#' @details Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. For example: +#' \itemize{ +#' \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first} +#' \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae}} +#' \item{Something like \code{"p aer"} will return the ID of \emph{Pseudomonas aeruginosa} and not \emph{Pasteurella aerogenes}} +#' \item{Something like \code{"stau"} or \code{"staaur"} will return the ID of \emph{Staphylococcus aureus} and not \emph{Staphylococcus auricularis}} +#' } +#' Moreover, this function also supports ID's based on only Gram stain, when the species is not known. \cr +#' For example, \code{"Gram negative rods"} and \code{"GNR"} will both return the ID of a Gram negative rod: \code{GNR}. #' @export #' @importFrom dplyr %>% filter pull -#' @return Character (vector). +#' @return Character (vector) with class \code{"bactid"}. Unknown values will return \code{NA}. #' @seealso \code{\link{microorganisms}} for the dataframe that is being used to determine ID's. #' @examples #' # These examples all return "STAAUR", the ID of S. aureus: -#' guess_bactid("stau") -#' guess_bactid("STAU") -#' guess_bactid("staaur") -#' guess_bactid("S. aureus") -#' guess_bactid("S aureus") -#' guess_bactid("Staphylococcus aureus") -#' guess_bactid("MRSA") # Methicillin-resistant S. aureus -#' guess_bactid("VISA") # Vancomycin Intermediate S. aureus +#' as.bactid("stau") +#' as.bactid("STAU") +#' as.bactid("staaur") +#' as.bactid("S. aureus") +#' as.bactid("S aureus") +#' as.bactid("Staphylococcus aureus") +#' as.bactid("MRSA") # Methicillin Resistant S. aureus +#' as.bactid("VISA") # Vancomycin Intermediate S. aureus +#' as.bactid("VRSA") # Vancomycin Resistant S. aureus #' #' \dontrun{ -#' df$bactid <- guess_bactid(df$microorganism_name) +#' df$bactid <- as.bactid(df$microorganism_name) #' #' # the select function of tidyverse is also supported: -#' df$bactid <- df %>% select(microorganism_name) %>% guess_bactid() +#' library(dplyr) +#' df$bactid <- df %>% +#' select(microorganism_name) %>% +#' as.bactid() #' #' # and can even contain 2 columns, which is convenient for genus/species combinations: -#' df$bactid <- df %>% select(genus, species) %>% guess_bactid() +#' df$bactid <- df %>% +#' select(genus, species) %>% +#' as.bactid() +#' #' # same result: -#' df <- df %>% mutate(bactid = paste(genus, species)) %>% guess_bactid()) +#' df <- df %>% +#' mutate(bactid = paste(genus, species) %>% +#' as.bactid()) #' } -guess_bactid <- function(x) { +as.bactid <- function(x) { + + failures <- character(0) if (NCOL(x) == 2) { # support tidyverse selection like: df %>% select(colA, colB) @@ -60,17 +81,19 @@ guess_bactid <- function(x) { if (NCOL(x) > 2) { stop('`x` can be 2 columns at most', call. = FALSE) } + # support tidyverse selection like: df %>% select(colA) if (!is.vector(x)) { x <- pull(x, 1) } } + x.fullbackup <- x # remove dots and other non-text in case of "E. coli" except spaces - x <- gsub("[^a-zA-Z ]+", "", x) + x <- gsub("[^a-zA-Z0-9 ]+", "", x) # but spaces before and after should be omitted x <- trimws(x, which = "both") - x.bak <- x + x.backup <- x # replace space by regex sign x <- gsub(" ", ".*", x, fixed = TRUE) # add start and stop @@ -96,42 +119,44 @@ guess_bactid <- function(x) { # avoid detection of Pasteurella aerogenes in case of Pseudomonas aeruginosa x[i] <- 'Pseudomonas aeruginosa' } - if (tolower(x[i]) %like% 'coagulase') { - # coerce S. coagulase negative + if (tolower(x[i]) %like% 'coagulase' + | tolower(x[i]) %like% 'cns' + | tolower(x[i]) %like% 'cons') { + # coerce S. coagulase negative, also as CNS and CoNS x[i] <- 'Coagulase Negative Staphylococcus (CNS)' } # translate known trivial names to genus+species - if (!is.na(x.bak[i])) { - if (toupper(x.bak[i]) == 'MRSA' - | toupper(x.bak[i]) == 'VISA' - | toupper(x.bak[i]) == 'VRSA') { + if (!is.na(x.backup[i])) { + if (toupper(x.backup[i]) == 'MRSA' + | toupper(x.backup[i]) == 'VISA' + | toupper(x.backup[i]) == 'VRSA') { x[i] <- 'Staphylococcus aureus' } - if (toupper(x.bak[i]) == 'MRSE') { + if (toupper(x.backup[i]) == 'MRSE') { x[i] <- 'Staphylococcus epidermidis' } - if (toupper(x.bak[i]) == 'VRE') { + if (toupper(x.backup[i]) == 'VRE') { x[i] <- 'Enterococcus' } - if (toupper(x.bak[i]) == 'MRPA') { + if (toupper(x.backup[i]) == 'MRPA') { # multi resistant P. aeruginosa x[i] <- 'Pseudomonas aeruginosa' } - if (toupper(x.bak[i]) == 'PISP' - | toupper(x.bak[i]) == 'PRSP') { + if (toupper(x.backup[i]) == 'PISP' + | toupper(x.backup[i]) == 'PRSP') { # peni resistant S. pneumoniae x[i] <- 'Streptococcus pneumoniae' } - if (toupper(x.bak[i]) == 'VISP' - | toupper(x.bak[i]) == 'VRSP') { + if (toupper(x.backup[i]) == 'VISP' + | toupper(x.backup[i]) == 'VRSP') { # vanco resistant S. pneumoniae x[i] <- 'Streptococcus pneumoniae' } } # let's try the ID's first - found <- AMR::microorganisms %>% filter(bactid == x.bak[i]) + found <- AMR::microorganisms %>% filter(bactid == x.backup[i]) if (nrow(found) == 0) { # now try exact match @@ -152,38 +177,82 @@ guess_bactid <- function(x) { } if (nrow(found) == 0) { # search for GLIMS code - if (toupper(x.bak[i]) %in% toupper(AMR::microorganisms.umcg$mocode)) { - found <- AMR::microorganisms.umcg %>% filter(toupper(mocode) == toupper(x.bak[i])) + if (toupper(x.backup[i]) %in% toupper(AMR::microorganisms.umcg$mocode)) { + found <- AMR::microorganisms.umcg %>% filter(toupper(mocode) == toupper(x.backup[i])) } } if (nrow(found) == 0) { # try splitting of characters and then find ID # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus x_split <- x - x_length <- nchar(x.bak[i]) - x_split[i] <- paste0(x.bak[i] %>% substr(1, x_length / 2) %>% trimws(), + x_length <- nchar(x.backup[i]) + x_split[i] <- paste0(x.backup[i] %>% substr(1, x_length / 2) %>% trimws(), '.* ', - x.bak[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws()) + x.backup[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws()) found <- AMR::microorganisms %>% filter(fullname %like% paste0('^', x_split[i])) } if (nrow(found) == 0) { # try any match with text before and after original search string # so "negative rods" will be "GNR" - if (x.bak[i] %like% "^Gram") { - x.bak[i] <- gsub("^Gram", "", x.bak[i], ignore.case = TRUE) + if (x.backup[i] %like% "^Gram") { + x.backup[i] <- gsub("^Gram", "", x.backup[i], ignore.case = TRUE) # remove leading and trailing spaces again - x.bak[i] <- trimws(x.bak[i], which = "both") + x.backup[i] <- trimws(x.backup[i], which = "both") } - if (!is.na(x.bak[i])) { - found <- AMR::microorganisms %>% filter(fullname %like% x.bak[i]) + if (!is.na(x.backup[i])) { + found <- AMR::microorganisms %>% filter(fullname %like% x.backup[i]) } } - if (nrow(found) != 0) { + if (nrow(found) != 0 & x.backup[i] != "") { x[i] <- as.character(found[1, 'bactid']) } else { - x[i] <- "" + x[i] <- NA_character_ + failures <- c(failures, x.fullbackup[i]) } } + + failures <- failures[!failures %in% c(NA, NULL, NaN)] + if (length(failures) > 0) { + warning("These values could not be coerced to a valid bactid: ", + paste('"', unique(failures), '"', sep = "", collapse = ', '), + ".", + call. = FALSE) + } + class(x) <- "bactid" + attr(x, 'package') <- 'AMR' + attr(x, 'package.version') <- packageDescription('AMR')$Version x } + +#' @rdname as.bactid +#' @export +guess_bactid <- as.bactid + +#' @rdname as.bactid +#' @export +is.bactid <- function(x) { + identical(class(x), "bactid") +} + +#' @exportMethod print.bactid +#' @export +#' @noRd +print.bactid <- function(x, ...) { + cat("Class 'bactid'\n") + print.default(as.character(x), quote = FALSE) +} + +#' @exportMethod as.data.frame.bactid +#' @export +#' @noRd +as.data.frame.bactid <- function (x, ...) { + # same as as.data.frame.character but with removed stringsAsFactors + nm <- paste(deparse(substitute(x), width.cutoff = 500L), + collapse = " ") + if (!"nm" %in% names(list(...))) { + as.data.frame.vector(x, ..., nm = nm) + } else { + as.data.frame.vector(x, ...) + } +} diff --git a/R/eucast.R b/R/eucast.R index 3d2575fd..b1ec16c8 100755 --- a/R/eucast.R +++ b/R/eucast.R @@ -201,9 +201,10 @@ EUCAST_rules <- function(tbl, } # join to microorganisms table - joinby <- colnames(AMR::microorganisms)[1] - names(joinby) <- col_bactid - tbl <- tbl %>% left_join(y = AMR::microorganisms, by = joinby, suffix = c("_tempmicroorganisms", "")) + if (!tbl %>% pull(col_bactid) %>% is.bactid()) { + tbl[, col_bactid] <- tbl %>% pull(col_bactid) %>% as.bactid() + } + tbl <- tbl %>% left_join_microorganisms(by = col_bactid, suffix = c("_tempmicroorganisms", "")) # antibiotic classes aminoglycosides <- c(tobr, gent, kana, neom, neti, siso) diff --git a/R/first_isolates.R b/R/first_isolate.R similarity index 97% rename from R/first_isolates.R rename to R/first_isolate.R index 4f9f2bff..fd97bcf4 100755 --- a/R/first_isolates.R +++ b/R/first_isolate.R @@ -22,7 +22,7 @@ #' @param tbl a \code{data.frame} containing isolates. #' @param col_date column name of the result date (or date that is was received on the lab) #' @param col_patient_id column name of the unique IDs of the patients -#' @param col_bactid column name of the unique IDs of the microorganisms (should occur in the \code{\link{microorganisms}} dataset). Get your bactid's with the function \code{\link{guess_bactid}}, that takes microorganism names as input. +#' @param col_bactid column name of the unique IDs of the microorganisms: \code{bactid}'s. If this column has another class than \code{"bactid"}, values will be coerced using \code{\link{as.bactid}}. #' @param col_testcode column name of the test codes. Use \code{col_testcode = NA} to \strong{not} exclude certain test codes (like test codes for screening). In that case \code{testcodes_exclude} will be ignored. Supports tidyverse-like quotation. #' @param col_specimen column name of the specimen type or group #' @param col_icu column name of the logicals (\code{TRUE}/\code{FALSE}) whether a ward or department is an Intensive Care Unit (ICU) @@ -126,7 +126,7 @@ first_isolate <- function(tbl, # bactid OR genus+species must be available if (is.na(col_bactid) & (is.na(col_genus) | is.na(col_species))) { - stop('`col_bactid or both `col_genus` and `col_species` must be available.') + stop('`col_bactid` or both `col_genus` and `col_species` must be available.') } # check if columns exist @@ -152,6 +152,9 @@ first_isolate <- function(tbl, check_columns_existance(col_keyantibiotics) if (!is.na(col_bactid)) { + if (!tbl %>% pull(col_bactid) %>% is.bactid()) { + tbl[, col_bactid] <- tbl %>% pull(col_bactid) %>% as.bactid() + } tbl <- tbl %>% left_join_microorganisms(by = col_bactid) col_genus <- "genus" col_species <- "species" diff --git a/R/freq.R b/R/freq.R index 9d88c560..17cfab5a 100755 --- a/R/freq.R +++ b/R/freq.R @@ -273,8 +273,11 @@ frequency_tbl <- function(x, } else { NAs <- x[is.na(x)] } + if (na.rm == TRUE) { + x_class <- class(x) x <- x[!x %in% NAs] + class(x) <- x_class } if (missing(sort.count) & 'factor' %in% class(x)) { diff --git a/R/join.R b/R/join_microorganisms.R similarity index 72% rename from R/join.R rename to R/join_microorganisms.R index c22a0953..3cdaa276 100755 --- a/R/join.R +++ b/R/join_microorganisms.R @@ -26,8 +26,8 @@ #' df2 <- left_join_microorganisms(df, "bacteria_id") #' colnames(df2) inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) { - if (any(class(x) %in% c('character', 'factor'))) { - x <- data.frame(bactid = x, stringsAsFactors = FALSE) + if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) { + x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE) } # no name set to `by` parameter if (is.null(names(by))) { @@ -36,7 +36,9 @@ inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ... } else { joinby <- by } - join <- dplyr::inner_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + join <- suppressWarnings( + dplyr::inner_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + ) if (nrow(join) > nrow(x)) { warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original') } @@ -46,8 +48,8 @@ inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ... #' @rdname join #' @export left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) { - if (any(class(x) %in% c('character', 'factor'))) { - x <- data.frame(bactid = x, stringsAsFactors = FALSE) + if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) { + x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE) } # no name set to `by` parameter if (is.null(names(by))) { @@ -56,7 +58,9 @@ left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) } else { joinby <- by } - join <- dplyr::left_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + join <- suppressWarnings( + dplyr::left_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + ) if (nrow(join) > nrow(x)) { warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original') } @@ -66,8 +70,8 @@ left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) #' @rdname join #' @export right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) { - if (any(class(x) %in% c('character', 'factor'))) { - x <- data.frame(bactid = x, stringsAsFactors = FALSE) + if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) { + x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE) } # no name set to `by` parameter if (is.null(names(by))) { @@ -76,7 +80,9 @@ right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ... } else { joinby <- by } - join <- dplyr::right_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + join <- suppressWarnings( + dplyr::right_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + ) if (nrow(join) > nrow(x)) { warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original') } @@ -86,8 +92,8 @@ right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ... #' @rdname join #' @export full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) { - if (any(class(x) %in% c('character', 'factor'))) { - x <- data.frame(bactid = x, stringsAsFactors = FALSE) + if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) { + x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE) } # no name set to `by` parameter if (is.null(names(by))) { @@ -96,7 +102,9 @@ full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) } else { joinby <- by } - join <- dplyr::full_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + join <- suppressWarnings( + dplyr::full_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...) + ) if (nrow(join) > nrow(x)) { warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original') } @@ -106,8 +114,8 @@ full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) #' @rdname join #' @export semi_join_microorganisms <- function(x, by = 'bactid', ...) { - if (any(class(x) %in% c('character', 'factor'))) { - x <- data.frame(bactid = x, stringsAsFactors = FALSE) + if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) { + x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE) } # no name set to `by` parameter if (is.null(names(by))) { @@ -116,14 +124,16 @@ semi_join_microorganisms <- function(x, by = 'bactid', ...) { } else { joinby <- by } - dplyr::semi_join(x = x, y = AMR::microorganisms, by = joinby, ...) + suppressWarnings( + dplyr::semi_join(x = x, y = AMR::microorganisms, by = joinby, ...) + ) } #' @rdname join #' @export anti_join_microorganisms <- function(x, by = 'bactid', ...) { - if (any(class(x) %in% c('character', 'factor'))) { - x <- data.frame(bactid = x, stringsAsFactors = FALSE) + if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) { + x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE) } # no name set to `by` parameter if (is.null(names(by))) { @@ -132,5 +142,7 @@ anti_join_microorganisms <- function(x, by = 'bactid', ...) { } else { joinby <- by } - dplyr::anti_join(x = x, y = AMR::microorganisms, by = joinby, ...) + suppressWarnings( + dplyr::anti_join(x = x, y = AMR::microorganisms, by = joinby, ...) + ) } diff --git a/README.md b/README.md index b05851c1..1e9c5920 100755 --- a/README.md +++ b/README.md @@ -126,6 +126,86 @@ after # 5 PSEAER R R - - R ``` +Bacteria ID's can be retrieved with the `as.bactid` function. It uses any type of info about a microorganism as input. For example, all these will return value `STAAUR`, the ID of *S. aureus*: +```r +as.bactid("stau") +as.bactid("STAU") +as.bactid("staaur") +as.bactid("S. aureus") +as.bactid("S aureus") +as.bactid("Staphylococcus aureus") +as.bactid("MRSA") # Methicillin Resistant S. aureus +as.bactid("VISA") # Vancomycin Intermediate S. aureus +as.bactid("VRSA") # Vancomycin Resistant S. aureus +``` + +### New classes +This package contains two new S3 classes: `mic` for MIC values (e.g. from Vitek or Phoenix) and `rsi` for antimicrobial drug interpretations (i.e. S, I and R). Both are actually ordered factors under the hood (an MIC of `2` being higher than `<=1` but lower than `>=32`, and for class `rsi` factors are ordered as `S < I < R`). +Both classes have extensions for existing generic functions like `print`, `summary` and `plot`. + +```r +# Transform values to new classes +mic_data <- as.mic(c(">=32", "1.0", "8", "<=0.128", "8", "16", "16")) +rsi_data <- as.rsi(c(rep("S", 474), rep("I", 36), rep("R", 370))) +``` +These functions also try to coerce valid values. + +Quick overviews when just printing objects: +```r +mic_data +# Class 'mic': 7 isolates +# +# 0 +# +# <=0.128 1 8 16 >=32 +# 1 1 2 2 1 + +rsi_data +# Class 'rsi': 880 isolates +# +# : 0 +# Sum of S: 474 +# Sum of IR: 406 +# - Sum of R: 370 +# - Sum of I: 36 +# +# %S %IR %I %R +# 53.9 46.1 4.1 42.0 +``` + +A plot of `rsi_data`: +```r +plot(rsi_data) +``` + +![example1](man/figures/rsi_example.png) + +A plot of `mic_data` (defaults to bar plot): +```r +plot(mic_data) +``` + +![example2](man/figures/mic_example.png) + +Other epidemiological functions: + +```r +# Determine key antibiotic based on bacteria ID +key_antibiotics(...) + +# Selection of first isolates of any patient +first_isolate(...) + +# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr) +rsi(...) +# Predict resistance levels of antibiotics +rsi_predict(...) + +# Get name of antibiotic by ATC code +abname(...) +abname("J01CR02", from = "atc", to = "umcg") # "AMCL" +``` + ### Frequency tables Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). It can be used in two ways: ```r @@ -235,79 +315,12 @@ Learn more about this function with: ?freq ``` -### New classes -This package contains two new S3 classes: `mic` for MIC values (e.g. from Vitek or Phoenix) and `rsi` for antimicrobial drug interpretations (i.e. S, I and R). Both are actually ordered factors under the hood (an MIC of `2` being higher than `<=1` but lower than `>=32`, and for class `rsi` factors are ordered as `S < I < R`). -Both classes have extensions for existing generic functions like `print`, `summary` and `plot`. - -```r -# Transform values to new classes -mic_data <- as.mic(c(">=32", "1.0", "8", "<=0.128", "8", "16", "16")) -rsi_data <- as.rsi(c(rep("S", 474), rep("I", 36), rep("R", 370))) -``` -These functions also try to coerce valid values. - -Quick overviews when just printing objects: -```r -mic_data -# Class 'mic': 7 isolates -# -# 0 -# -# <=0.128 1 8 16 >=32 -# 1 1 2 2 1 - -rsi_data -# Class 'rsi': 880 isolates -# -# : 0 -# Sum of S: 474 -# Sum of IR: 406 -# - Sum of R: 370 -# - Sum of I: 36 -# -# %S %IR %I %R -# 53.9 46.1 4.1 42.0 -``` - -A plot of `rsi_data`: -```r -plot(rsi_data) -``` - -![example1](man/figures/rsi_example.png) - -A plot of `mic_data` (defaults to bar plot): -```r -plot(mic_data) -``` - -![example2](man/figures/mic_example.png) - -Other epidemiological functions: - -```r -# Determine key antibiotic based on bacteria ID -key_antibiotics(...) - -# Selection of first isolates of any patient -first_isolate(...) - -# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr) -rsi(...) -# Predict resistance levels of antibiotics -rsi_predict(...) - -# Get name of antibiotic by ATC code -abname(...) -abname("J01CR02", from = "atc", to = "umcg") # "AMCL" -``` - ### Databases included in package Datasets to work with antibiotics and bacteria properties. ```r # Dataset with 2000 random blood culture isolates from anonymised # septic patients between 2001 and 2017 in 5 Dutch hospitals -septic_patients # A tibble: 4,000 x 47 +septic_patients # A tibble: 2,000 x 47 # Dataset with ATC antibiotics codes, official names, trade names # and DDD's (oral and parenteral) diff --git a/data/microorganisms.rda b/data/microorganisms.rda index c466967a..d66b105d 100755 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/data/septic_patients.rda b/data/septic_patients.rda index 9a4cb8c3..fc482274 100755 Binary files a/data/septic_patients.rda and b/data/septic_patients.rda differ diff --git a/man/as.bactid.Rd b/man/as.bactid.Rd new file mode 100644 index 00000000..3a17bc6a --- /dev/null +++ b/man/as.bactid.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bactid.R +\name{as.bactid} +\alias{as.bactid} +\alias{guess_bactid} +\alias{is.bactid} +\title{Transform to bacteria ID} +\usage{ +as.bactid(x) + +guess_bactid(x) + +is.bactid(x) +} +\arguments{ +\item{x}{a character vector or a dataframe with one or two columns} +} +\value{ +Character (vector) with class \code{"bactid"}. Unknown values will return \code{NA}. +} +\description{ +Use this function to determine a valid ID based on a genus (and species). This input can be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples. +} +\details{ +Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. For example: +\itemize{ + \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first} + \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae}} + \item{Something like \code{"p aer"} will return the ID of \emph{Pseudomonas aeruginosa} and not \emph{Pasteurella aerogenes}} + \item{Something like \code{"stau"} or \code{"staaur"} will return the ID of \emph{Staphylococcus aureus} and not \emph{Staphylococcus auricularis}} +} +Moreover, this function also supports ID's based on only Gram stain, when the species is not known. \cr +For example, \code{"Gram negative rods"} and \code{"GNR"} will both return the ID of a Gram negative rod: \code{GNR}. +} +\examples{ +# These examples all return "STAAUR", the ID of S. aureus: +as.bactid("stau") +as.bactid("STAU") +as.bactid("staaur") +as.bactid("S. aureus") +as.bactid("S aureus") +as.bactid("Staphylococcus aureus") +as.bactid("MRSA") # Methicillin Resistant S. aureus +as.bactid("VISA") # Vancomycin Intermediate S. aureus +as.bactid("VRSA") # Vancomycin Resistant S. aureus + +\dontrun{ +df$bactid <- as.bactid(df$microorganism_name) + +# the select function of tidyverse is also supported: +library(dplyr) +df$bactid <- df \%>\% + select(microorganism_name) \%>\% + as.bactid() + +# and can even contain 2 columns, which is convenient for genus/species combinations: +df$bactid <- df \%>\% + select(genus, species) \%>\% + as.bactid() + +# same result: +df <- df \%>\% + mutate(bactid = paste(genus, species) \%>\% + as.bactid()) +} +} +\seealso{ +\code{\link{microorganisms}} for the dataframe that is being used to determine ID's. +} diff --git a/man/first_isolate.Rd b/man/first_isolate.Rd index f9e8e457..86b582cb 100755 --- a/man/first_isolate.Rd +++ b/man/first_isolate.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/first_isolates.R +% Please edit documentation in R/first_isolate.R \name{first_isolate} \alias{first_isolate} \title{Determine first (weighted) isolates} @@ -21,7 +21,7 @@ first_isolate(tbl, col_date, col_patient_id, col_bactid = NA, \item{col_patient_id}{column name of the unique IDs of the patients} -\item{col_bactid}{column name of the unique IDs of the microorganisms (should occur in the \code{\link{microorganisms}} dataset). Get your bactid's with the function \code{\link{guess_bactid}}, that takes microorganism names as input.} +\item{col_bactid}{column name of the unique IDs of the microorganisms: \code{bactid}'s. If this column has another class than \code{"bactid"}, values will be coerced using \code{\link{as.bactid}}.} \item{col_testcode}{column name of the test codes. Use \code{col_testcode = NA} to \strong{not} exclude certain test codes (like test codes for screening). In that case \code{testcodes_exclude} will be ignored. Supports tidyverse-like quotation.} diff --git a/man/guess_bactid.Rd b/man/guess_bactid.Rd deleted file mode 100755 index f86c197d..00000000 --- a/man/guess_bactid.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/guess_bactid.R -\name{guess_bactid} -\alias{guess_bactid} -\title{Find bacteria ID based on genus/species} -\usage{ -guess_bactid(x) -} -\arguments{ -\item{x}{character vector or a dataframe with one or two columns} -} -\value{ -Character (vector). -} -\description{ -Use this function to determine a valid ID based on a genus (and species). This input could be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples. -} -\examples{ -# These examples all return "STAAUR", the ID of S. aureus: -guess_bactid("stau") -guess_bactid("STAU") -guess_bactid("staaur") -guess_bactid("S. aureus") -guess_bactid("S aureus") -guess_bactid("Staphylococcus aureus") -guess_bactid("MRSA") # Methicillin-resistant S. aureus -guess_bactid("VISA") # Vancomycin Intermediate S. aureus - -\dontrun{ -df$bactid <- guess_bactid(df$microorganism_name) - -# the select function of tidyverse is also supported: -df$bactid <- df \%>\% select(microorganism_name) \%>\% guess_bactid() - -# and can even contain 2 columns, which is convenient for genus/species combinations: -df$bactid <- df \%>\% select(genus, species) \%>\% guess_bactid() -# same result: -df <- df \%>\% mutate(bactid = paste(genus, species)) \%>\% guess_bactid()) -} -} -\seealso{ -\code{\link{microorganisms}} for the dataframe that is being used to determine ID's. -} diff --git a/man/join.Rd b/man/join.Rd index 09254631..ca89e27d 100755 --- a/man/join.Rd +++ b/man/join.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/join.R +% Please edit documentation in R/join_microorganisms.R \name{join} \alias{join} \alias{inner_join_microorganisms} diff --git a/man/key_antibiotics.Rd b/man/key_antibiotics.Rd index 4e222c2b..8d97b7e7 100755 --- a/man/key_antibiotics.Rd +++ b/man/key_antibiotics.Rd @@ -19,7 +19,7 @@ key_antibiotics_equal(x, y, type = c("keyantibiotics", "points"), \arguments{ \item{tbl}{table with antibiotics coloms, like \code{amox} and \code{amcl}.} -\item{col_bactid}{column name of the unique IDs of the microorganisms (should occur in the \code{\link{microorganisms}} dataset). Get your bactid's with the function \code{\link{guess_bactid}}, that takes microorganism names as input.} +\item{col_bactid}{column name of the unique IDs of the microorganisms: \code{bactid}'s. If this column has another class than \code{"bactid"}, values will be coerced using \code{\link{as.bactid}}.} \item{universal_1, universal_2, universal_3, universal_4, universal_5, universal_6}{column names of \strong{broad-spectrum} antibiotics, case-insensitive} diff --git a/tests/testthat/test-bactid.R b/tests/testthat/test-bactid.R new file mode 100644 index 00000000..1fc8253f --- /dev/null +++ b/tests/testthat/test-bactid.R @@ -0,0 +1,64 @@ +context("bactid.R") + +test_that("as.bactid works", { + expect_identical( + as.character(as.bactid(c("E. coli", "H. influenzae"))), + c("ESCCOL", "HAEINF")) + + expect_equal(as.character(as.bactid("Escherichia coli")), "ESCCOL") + expect_equal(as.character(as.bactid("P. aer")), "PSEAER") # not Pasteurella aerogenes + + expect_equal(as.character(as.bactid("Negative rods")), "GNR") + + expect_equal(as.character(as.bactid("MRSE")), "STAEPI") + expect_equal(as.character(as.bactid("VRE")), "ENC") + expect_equal(as.character(as.bactid("MRPA")), "PSEAER") + expect_equal(as.character(as.bactid("PISP")), "STCPNE") + expect_equal(as.character(as.bactid("PRSP")), "STCPNE") + expect_equal(as.character(as.bactid("VISP")), "STCPNE") + expect_equal(as.character(as.bactid("VRSP")), "STCPNE") + + expect_identical( + as.character( + as.bactid(c("stau", + "STAU", + "staaur", + "S. aureus", + "S aureus", + "Staphylococcus aureus", + "MRSA", + "VISA"))), + rep("STAAUR", 8)) + + # select with one column + expect_identical( + septic_patients[1:10,] %>% + left_join_microorganisms() %>% + select(genus) %>% + as.bactid() %>% + as.character(), + c("STC", "STC", "NEI", "STA", "STA", + "NEI", "ENT", "ENT", "ESC", "KLE")) + + # select with two columns + expect_identical( + septic_patients[1:10,] %>% + pull(bactid), + septic_patients[1:10,] %>% + left_join_microorganisms() %>% + select(genus, species) %>% + as.bactid() %>% + as.character()) + + # unknown results + expect_warning(as.bactid(c("INVALID", "Yeah, unknown"))) + + # print + expect_output(print(as.bactid(c("ESCCOL", NA)))) + + # helper function + expect_identical(as.bactid("ESCCOL"), + guess_bactid("ESCCOL")) + + +}) diff --git a/tests/testthat/test-eucast.R b/tests/testthat/test-eucast.R index 666f5970..22892005 100755 --- a/tests/testthat/test-eucast.R +++ b/tests/testthat/test-eucast.R @@ -1,27 +1,34 @@ context("eucast.R") test_that("EUCAST rules work", { - a <- suppressWarnings(EUCAST_rules(septic_patients)) + expect_identical(colnames(septic_patients), + colnames(suppressWarnings(EUCAST_rules(septic_patients)))) - a <- data.frame(bactid = c("KLEPNE", # Klebsiella pneumoniae - "PSEAER", # Pseudomonas aeruginosa - "ENTAER"), # Enterobacter aerogenes + a <- data.frame(bactid = + c("KLEPNE", # Klebsiella pneumoniae + "PSEAER", # Pseudomonas aeruginosa + "ENTAER"), # Enterobacter aerogenes amox = "-", # Amoxicillin stringsAsFactors = FALSE) - b <- data.frame(bactid = c("KLEPNE", # Klebsiella pneumoniae - "PSEAER", # Pseudomonas aeruginosa - "ENTAER"), # Enterobacter aerogenes + b <- data.frame(bactid = + as.bactid( + c("KLEPNE", # Klebsiella pneumoniae + "PSEAER", # Pseudomonas aeruginosa + "ENTAER")), # Enterobacter aerogenes amox = "R", # Amoxicillin stringsAsFactors = FALSE) - expect_equal(EUCAST_rules(a, info = FALSE), b) - expect_equal(suppressWarnings(interpretive_reading(a, info = TRUE)), b) + expect_identical(EUCAST_rules(a, info = FALSE), b) + expect_identical(suppressWarnings(interpretive_reading(a, info = TRUE)), b) - a <- data.frame(bactid = c("STAAUR", # Staphylococcus aureus - "STCGRA"), # Streptococcus pyognenes (Lancefield Group A) + a <- data.frame(bactid = + c("STAAUR", # Staphylococcus aureus + "STCGRA"), # Streptococcus pyognenes (Lancefield Group A) coli = "-", # Colistin stringsAsFactors = FALSE) - b <- data.frame(bactid = c("STAAUR", # Staphylococcus aureus - "STCGRA"), # Streptococcus pyognenes (Lancefield Group A) + b <- data.frame(bactid = + as.bactid( + c("STAAUR", # Staphylococcus aureus + "STCGRA")), # Streptococcus pyognenes (Lancefield Group A) coli = "R", # Colistin stringsAsFactors = FALSE) expect_equal(EUCAST_rules(a, info = FALSE), b) diff --git a/tests/testthat/test-first_isolates.R b/tests/testthat/test-first_isolate.R similarity index 99% rename from tests/testthat/test-first_isolates.R rename to tests/testthat/test-first_isolate.R index 6b08494f..c809bbfc 100755 --- a/tests/testthat/test-first_isolates.R +++ b/tests/testthat/test-first_isolate.R @@ -1,4 +1,4 @@ -context("first_isolates.R") +context("first_isolate.R") test_that("first isolates work", { # septic_patients contains 1959 out of 2000 first isolates diff --git a/tests/testthat/test-guess_bactid.R b/tests/testthat/test-guess_bactid.R deleted file mode 100644 index 17d03d11..00000000 --- a/tests/testthat/test-guess_bactid.R +++ /dev/null @@ -1,49 +0,0 @@ -context("guess_bactid.R") - -test_that("guess_bactid works", { - expect_identical( - guess_bactid(c("E. coli", "H. influenzae")), - c("ESCCOL", "HAEINF")) - - expect_equal(guess_bactid("Escherichia coli"), "ESCCOL") - expect_equal(guess_bactid("P. aer"), "PSEAER") # not Pasteurella aerogenes - - expect_equal(guess_bactid("Negative rods"), "GNR") - - expect_equal(guess_bactid("MRSE"), "STAEPI") - expect_equal(guess_bactid("VRE"), "ENC") - expect_equal(guess_bactid("MRPA"), "PSEAER") - expect_equal(guess_bactid("PISP"), "STCPNE") - expect_equal(guess_bactid("PRSP"), "STCPNE") - expect_equal(guess_bactid("VISP"), "STCPNE") - expect_equal(guess_bactid("VRSP"), "STCPNE") - - expect_identical( - guess_bactid(c("stau", - "STAU", - "staaur", - "S. aureus", - "S aureus", - "Staphylococcus aureus", - "MRSA", - "VISA")), - rep("STAAUR", 8)) - - # select with one column - expect_identical( - septic_patients[1:10,] %>% - left_join_microorganisms() %>% - select(genus) %>% - guess_bactid(), - c("STC", "STC", "NEI", "STA", "STA", - "NEI", "ENT", "ENT", "ESC", "KLE")) - - # select with two columns - expect_identical( - septic_patients[1:10,] %>% - pull(bactid), - septic_patients[1:10,] %>% - left_join_microorganisms() %>% - select(genus, species) %>% - guess_bactid()) -}) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-join_microorganisms.R similarity index 96% rename from tests/testthat/test-joins.R rename to tests/testthat/test-join_microorganisms.R index 0ba877fe..3436a3f7 100755 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-join_microorganisms.R @@ -1,4 +1,4 @@ -context("joins.R") +context("join_microorganisms.R") test_that("joins work", { unjoined <- septic_patients