diff --git a/R/mo.R b/R/mo.R index 127f38e6..3c504f46 100644 --- a/R/mo.R +++ b/R/mo.R @@ -110,6 +110,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { if (NCOL(x) > 2) { stop('`x` can be 2 columns at most', call. = FALSE) } + x[is.null(x)] <- NA # support tidyverse selection like: df %>% select(colA) if (!is.vector(x)) { @@ -127,6 +128,8 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x_backup <- x # translate to English for supported languages of mo_property x <- gsub("(Gruppe|gruppe|groep|grupo)", "group", x) + # remove 'empty' genus and species values + x <- gsub("(no MO)", "", x, fixed = TRUE) # remove dots and other non-text in case of "E. coli" except spaces x <- gsub("[^a-zA-Z0-9 ]+", "", x) # but spaces before and after should be omitted @@ -144,11 +147,9 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x_withspaces <- paste0('^', x_withspaces, '$') for (i in 1:length(x)) { - if (identical(x_trimmed[i], "")) { # empty values x[i] <- NA - #failures <- c(failures, x_backup[i]) next } if (x_backup[i] %in% AMR::microorganisms$mo) { @@ -161,6 +162,11 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x[i] <- x_trimmed[i] next } + if (x_backup[i] %in% AMR::microorganisms$fullname) { + # is exact match in fullname + x[i] <- AMR::microorganisms[which(AMR::microorganisms$fullname == x_backup[i]), ]$mo[1] + next + } if (tolower(x[i]) == '^e.*coli$') { # avoid detection of Entamoeba coli in case of E. coli @@ -173,7 +179,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { next } if (tolower(x[i]) == '^c.*difficile$') { - # avoid detection of Clostridium difficile in case of C. difficile + # avoid detection of Catabacter difficile in case of C. difficile x[i] <- 'CLODIF' next } @@ -189,16 +195,18 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x[i] <- 'PSEAER' next } - if (tolower(x[i]) %like% 'coagulase negative' - | tolower(x[i]) %like% 'cns' - | tolower(x[i]) %like% 'cons') { + + # CoNS and CoPS in different languages (support for German, Dutch, Spanish, Portuguese) + if (tolower(x[i]) %like% '[ck]oagulas[ea] negatie?[vf]' + | tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] negatie?[vf]' + | tolower(x[i]) %like% '[ck]o?ns[^a-z]?$') { # coerce S. coagulase negative x[i] <- 'STACNS' next } - if (tolower(x[i]) %like% 'coagulase positive' - | tolower(x[i]) %like% 'cps' - | tolower(x[i]) %like% 'cops') { + if (tolower(x[i]) %like% '[ck]oagulas[ea] positie?[vf]' + | tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] positie?[vf]' + | tolower(x[i]) %like% '[ck]o?ps[^a-z]?$') { # coerce S. coagulase positive x[i] <- 'STACPS' next @@ -381,6 +389,10 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x[x == "STCSAL"] <- "STCGRK" # S. salivarius } + # for the returned genera without species (like "ESC"), add species (like "ESCSPP") where the input contained it + indices <- unique(x_input) %like% "[A-Z]{3}SPP" & !x %like% "[A-Z]{3}SPP" + x[indices] <- paste0(x[indices], 'SPP') + # left join the found results to the original input values (x_input) df_found <- data.frame(input = as.character(unique(x_input)), found = x, diff --git a/R/mo_property.R b/R/mo_property.R index 408f3e34..92cb9d4a 100644 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -99,10 +99,10 @@ #' mo_gramstain("E. coli", language = "es") # "Bacilos negativos" #' mo_gramstain("Giardia", language = "pt") # "Parasitas" #' -#' mo_fullname("S. pyo", +#' mo_fullname("S. pyogenes", #' Lancefield = TRUE, #' language = "de") # "Streptococcus Gruppe A" -#' mo_fullname("S. pyo", +#' mo_fullname("S. pyogenes", #' Lancefield = TRUE, #' language = "nl") # "Streptococcus groep A" mo_family <- function(x) { @@ -111,8 +111,8 @@ mo_family <- function(x) { #' @rdname mo_property #' @export -mo_genus <- function(x) { - mo_property(x, "genus") +mo_genus <- function(x, language = NULL) { + mo_property(x, "genus", language = language) } #' @rdname mo_property diff --git a/man/mo_property.Rd b/man/mo_property.Rd index c4551212..53208dd3 100644 --- a/man/mo_property.Rd +++ b/man/mo_property.Rd @@ -20,7 +20,7 @@ \usage{ mo_family(x) -mo_genus(x) +mo_genus(x, language = NULL) mo_species(x, Becker = FALSE, Lancefield = FALSE, language = NULL) @@ -42,6 +42,8 @@ mo_property(x, property = "fullname", Becker = FALSE, \arguments{ \item{x}{any (vector of) text that can be coerced to a valid microorganism code with \code{\link{as.mo}}} +\item{language}{language of the returned text, defaults to the systems language. Either one of \code{"en"} (English), \code{"de"} (German), \code{"nl"} (Dutch), \code{"es"} (Spanish) or \code{"pt"} (Portuguese).} + \item{Becker}{a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1]. This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS".} @@ -50,8 +52,6 @@ mo_property(x, property = "fullname", Becker = FALSE, This excludes \emph{Enterococci} at default (who are in group D), use \code{Lancefield = "all"} to also categorise all \emph{Enterococci} as group D.} -\item{language}{language of the returned text, defaults to the systems language. Either one of \code{"en"} (English), \code{"de"} (German), \code{"nl"} (Dutch), \code{"es"} (Spanish) or \code{"pt"} (Portuguese).} - \item{property}{one of the column names of one of the \code{\link{microorganisms}} data set, like \code{"mo"}, \code{"bactsys"}, \code{"family"}, \code{"genus"}, \code{"species"}, \code{"fullname"}, \code{"gramstain"} and \code{"aerobic"}} } \value{ @@ -126,10 +126,10 @@ mo_gramstain("E. coli", language = "nl") # "Negatieve staven" mo_gramstain("E. coli", language = "es") # "Bacilos negativos" mo_gramstain("Giardia", language = "pt") # "Parasitas" -mo_fullname("S. pyo", +mo_fullname("S. pyogenes", Lancefield = TRUE, language = "de") # "Streptococcus Gruppe A" -mo_fullname("S. pyo", +mo_fullname("S. pyogenes", Lancefield = TRUE, language = "nl") # "Streptococcus groep A" } diff --git a/tests/testthat/test-mo.R b/tests/testthat/test-mo.R index 405d4e23..e3fd4c78 100644 --- a/tests/testthat/test-mo.R +++ b/tests/testthat/test-mo.R @@ -1,6 +1,13 @@ context("mo.R") test_that("as.mo works", { + + library(dplyr) + MOs <- AMR::microorganisms %>% filter(!is.na(mo)) + + expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo))) + expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en")) + expect_identical( as.character(as.mo(c("E. coli", "H. influenzae"))), c("ESCCOL", "HAEINF")) diff --git a/vignettes/freq.Rmd b/vignettes/freq.Rmd index 3ab6bbd7..59ad453c 100755 --- a/vignettes/freq.Rmd +++ b/vignettes/freq.Rmd @@ -26,25 +26,8 @@ Frequency tables (or frequency distributions) are summaries of the distribution ## Frequencies of one variable To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `sex` variable of the `septic_patients` dataset: -```{r, echo = TRUE, results = 'hide'} -# just using base R -freq(septic_patients$sex) - -# using base R to select the variable and pass it on with a pipe from the dplyr package -septic_patients$sex %>% freq() - -# do it all with pipes, using the `select` function from the dplyr package -septic_patients %>% - select(sex) %>% - freq() - -# or the preferred way: using a pipe to pass the variable on to the freq function -septic_patients %>% freq(sex) # this also shows 'sex' in the title - -``` -This will all lead to the following table: -```{r, echo = FALSE} -freq(septic_patients$sex) +```{r, echo = TRUE} +septic_patients %>% freq(sex) ``` This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women.