mirror of
https://github.com/msberends/AMR.git
synced 2024-12-25 18:06:12 +01:00
AI improvements for microorganisms
This commit is contained in:
parent
936198372e
commit
4816419f0c
30
R/mo.R
30
R/mo.R
@ -110,6 +110,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
if (NCOL(x) > 2) {
|
||||
stop('`x` can be 2 columns at most', call. = FALSE)
|
||||
}
|
||||
x[is.null(x)] <- NA
|
||||
|
||||
# support tidyverse selection like: df %>% select(colA)
|
||||
if (!is.vector(x)) {
|
||||
@ -127,6 +128,8 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
x_backup <- x
|
||||
# translate to English for supported languages of mo_property
|
||||
x <- gsub("(Gruppe|gruppe|groep|grupo)", "group", x)
|
||||
# remove 'empty' genus and species values
|
||||
x <- gsub("(no MO)", "", x, fixed = TRUE)
|
||||
# remove dots and other non-text in case of "E. coli" except spaces
|
||||
x <- gsub("[^a-zA-Z0-9 ]+", "", x)
|
||||
# but spaces before and after should be omitted
|
||||
@ -144,11 +147,9 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
x_withspaces <- paste0('^', x_withspaces, '$')
|
||||
|
||||
for (i in 1:length(x)) {
|
||||
|
||||
if (identical(x_trimmed[i], "")) {
|
||||
# empty values
|
||||
x[i] <- NA
|
||||
#failures <- c(failures, x_backup[i])
|
||||
next
|
||||
}
|
||||
if (x_backup[i] %in% AMR::microorganisms$mo) {
|
||||
@ -161,6 +162,11 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
x[i] <- x_trimmed[i]
|
||||
next
|
||||
}
|
||||
if (x_backup[i] %in% AMR::microorganisms$fullname) {
|
||||
# is exact match in fullname
|
||||
x[i] <- AMR::microorganisms[which(AMR::microorganisms$fullname == x_backup[i]), ]$mo[1]
|
||||
next
|
||||
}
|
||||
|
||||
if (tolower(x[i]) == '^e.*coli$') {
|
||||
# avoid detection of Entamoeba coli in case of E. coli
|
||||
@ -173,7 +179,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
next
|
||||
}
|
||||
if (tolower(x[i]) == '^c.*difficile$') {
|
||||
# avoid detection of Clostridium difficile in case of C. difficile
|
||||
# avoid detection of Catabacter difficile in case of C. difficile
|
||||
x[i] <- 'CLODIF'
|
||||
next
|
||||
}
|
||||
@ -189,16 +195,18 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
x[i] <- 'PSEAER'
|
||||
next
|
||||
}
|
||||
if (tolower(x[i]) %like% 'coagulase negative'
|
||||
| tolower(x[i]) %like% 'cns'
|
||||
| tolower(x[i]) %like% 'cons') {
|
||||
|
||||
# CoNS and CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
|
||||
if (tolower(x[i]) %like% '[ck]oagulas[ea] negatie?[vf]'
|
||||
| tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] negatie?[vf]'
|
||||
| tolower(x[i]) %like% '[ck]o?ns[^a-z]?$') {
|
||||
# coerce S. coagulase negative
|
||||
x[i] <- 'STACNS'
|
||||
next
|
||||
}
|
||||
if (tolower(x[i]) %like% 'coagulase positive'
|
||||
| tolower(x[i]) %like% 'cps'
|
||||
| tolower(x[i]) %like% 'cops') {
|
||||
if (tolower(x[i]) %like% '[ck]oagulas[ea] positie?[vf]'
|
||||
| tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] positie?[vf]'
|
||||
| tolower(x[i]) %like% '[ck]o?ps[^a-z]?$') {
|
||||
# coerce S. coagulase positive
|
||||
x[i] <- 'STACPS'
|
||||
next
|
||||
@ -381,6 +389,10 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
|
||||
x[x == "STCSAL"] <- "STCGRK" # S. salivarius
|
||||
}
|
||||
|
||||
# for the returned genera without species (like "ESC"), add species (like "ESCSPP") where the input contained it
|
||||
indices <- unique(x_input) %like% "[A-Z]{3}SPP" & !x %like% "[A-Z]{3}SPP"
|
||||
x[indices] <- paste0(x[indices], 'SPP')
|
||||
|
||||
# left join the found results to the original input values (x_input)
|
||||
df_found <- data.frame(input = as.character(unique(x_input)),
|
||||
found = x,
|
||||
|
@ -99,10 +99,10 @@
|
||||
#' mo_gramstain("E. coli", language = "es") # "Bacilos negativos"
|
||||
#' mo_gramstain("Giardia", language = "pt") # "Parasitas"
|
||||
#'
|
||||
#' mo_fullname("S. pyo",
|
||||
#' mo_fullname("S. pyogenes",
|
||||
#' Lancefield = TRUE,
|
||||
#' language = "de") # "Streptococcus Gruppe A"
|
||||
#' mo_fullname("S. pyo",
|
||||
#' mo_fullname("S. pyogenes",
|
||||
#' Lancefield = TRUE,
|
||||
#' language = "nl") # "Streptococcus groep A"
|
||||
mo_family <- function(x) {
|
||||
@ -111,8 +111,8 @@ mo_family <- function(x) {
|
||||
|
||||
#' @rdname mo_property
|
||||
#' @export
|
||||
mo_genus <- function(x) {
|
||||
mo_property(x, "genus")
|
||||
mo_genus <- function(x, language = NULL) {
|
||||
mo_property(x, "genus", language = language)
|
||||
}
|
||||
|
||||
#' @rdname mo_property
|
||||
|
@ -20,7 +20,7 @@
|
||||
\usage{
|
||||
mo_family(x)
|
||||
|
||||
mo_genus(x)
|
||||
mo_genus(x, language = NULL)
|
||||
|
||||
mo_species(x, Becker = FALSE, Lancefield = FALSE, language = NULL)
|
||||
|
||||
@ -42,6 +42,8 @@ mo_property(x, property = "fullname", Becker = FALSE,
|
||||
\arguments{
|
||||
\item{x}{any (vector of) text that can be coerced to a valid microorganism code with \code{\link{as.mo}}}
|
||||
|
||||
\item{language}{language of the returned text, defaults to the systems language. Either one of \code{"en"} (English), \code{"de"} (German), \code{"nl"} (Dutch), \code{"es"} (Spanish) or \code{"pt"} (Portuguese).}
|
||||
|
||||
\item{Becker}{a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1].
|
||||
|
||||
This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS".}
|
||||
@ -50,8 +52,6 @@ mo_property(x, property = "fullname", Becker = FALSE,
|
||||
|
||||
This excludes \emph{Enterococci} at default (who are in group D), use \code{Lancefield = "all"} to also categorise all \emph{Enterococci} as group D.}
|
||||
|
||||
\item{language}{language of the returned text, defaults to the systems language. Either one of \code{"en"} (English), \code{"de"} (German), \code{"nl"} (Dutch), \code{"es"} (Spanish) or \code{"pt"} (Portuguese).}
|
||||
|
||||
\item{property}{one of the column names of one of the \code{\link{microorganisms}} data set, like \code{"mo"}, \code{"bactsys"}, \code{"family"}, \code{"genus"}, \code{"species"}, \code{"fullname"}, \code{"gramstain"} and \code{"aerobic"}}
|
||||
}
|
||||
\value{
|
||||
@ -126,10 +126,10 @@ mo_gramstain("E. coli", language = "nl") # "Negatieve staven"
|
||||
mo_gramstain("E. coli", language = "es") # "Bacilos negativos"
|
||||
mo_gramstain("Giardia", language = "pt") # "Parasitas"
|
||||
|
||||
mo_fullname("S. pyo",
|
||||
mo_fullname("S. pyogenes",
|
||||
Lancefield = TRUE,
|
||||
language = "de") # "Streptococcus Gruppe A"
|
||||
mo_fullname("S. pyo",
|
||||
mo_fullname("S. pyogenes",
|
||||
Lancefield = TRUE,
|
||||
language = "nl") # "Streptococcus groep A"
|
||||
}
|
||||
|
@ -1,6 +1,13 @@
|
||||
context("mo.R")
|
||||
|
||||
test_that("as.mo works", {
|
||||
|
||||
library(dplyr)
|
||||
MOs <- AMR::microorganisms %>% filter(!is.na(mo))
|
||||
|
||||
expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo)))
|
||||
expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en"))
|
||||
|
||||
expect_identical(
|
||||
as.character(as.mo(c("E. coli", "H. influenzae"))),
|
||||
c("ESCCOL", "HAEINF"))
|
||||
|
@ -26,25 +26,8 @@ Frequency tables (or frequency distributions) are summaries of the distribution
|
||||
## Frequencies of one variable
|
||||
|
||||
To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `sex` variable of the `septic_patients` dataset:
|
||||
```{r, echo = TRUE, results = 'hide'}
|
||||
# just using base R
|
||||
freq(septic_patients$sex)
|
||||
|
||||
# using base R to select the variable and pass it on with a pipe from the dplyr package
|
||||
septic_patients$sex %>% freq()
|
||||
|
||||
# do it all with pipes, using the `select` function from the dplyr package
|
||||
septic_patients %>%
|
||||
select(sex) %>%
|
||||
freq()
|
||||
|
||||
# or the preferred way: using a pipe to pass the variable on to the freq function
|
||||
septic_patients %>% freq(sex) # this also shows 'sex' in the title
|
||||
|
||||
```
|
||||
This will all lead to the following table:
|
||||
```{r, echo = FALSE}
|
||||
freq(septic_patients$sex)
|
||||
```{r, echo = TRUE}
|
||||
septic_patients %>% freq(sex)
|
||||
```
|
||||
This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user