AI improvements for microorganisms

2026-02-24 07:09:10 +01:00 · 2018-09-09 12:11:44 +02:00
parent 936198372e
commit 4816419f0c
5 changed files with 39 additions and 37 deletions
--- a/R/mo.R
+++ b/R/mo.R
@@ -110,6 +110,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
    if (NCOL(x) > 2) {
      stop('`x` can be 2 columns at most', call. = FALSE)
    }
+    x[is.null(x)] <- NA

    # support tidyverse selection like: df %>% select(colA)
    if (!is.vector(x)) {
@@ -127,6 +128,8 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
  x_backup <- x
  # translate to English for supported languages of mo_property
  x <- gsub("(Gruppe|gruppe|groep|grupo)", "group", x)
+  # remove 'empty' genus and species values
+  x <- gsub("(no MO)", "", x, fixed = TRUE)
  # remove dots and other non-text in case of "E. coli" except spaces
  x <- gsub("[^a-zA-Z0-9 ]+", "", x)
  # but spaces before and after should be omitted
@@ -144,11 +147,9 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
  x_withspaces <- paste0('^', x_withspaces, '$')

  for (i in 1:length(x)) {
-
    if (identical(x_trimmed[i], "")) {
      # empty values
      x[i] <- NA
-      #failures <- c(failures, x_backup[i])
      next
    }
    if (x_backup[i] %in% AMR::microorganisms$mo) {
@@ -161,6 +162,11 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      x[i] <- x_trimmed[i]
      next
    }
+    if (x_backup[i] %in% AMR::microorganisms$fullname) {
+      # is exact match in fullname
+      x[i] <- AMR::microorganisms[which(AMR::microorganisms$fullname == x_backup[i]), ]$mo[1]
+      next
+    }

    if (tolower(x[i]) == '^e.*coli$') {
      # avoid detection of Entamoeba coli in case of E. coli
@@ -173,7 +179,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      next
    }
    if (tolower(x[i]) == '^c.*difficile$') {
-      # avoid detection of Clostridium difficile in case of C. difficile
+      # avoid detection of Catabacter difficile in case of C. difficile
      x[i] <- 'CLODIF'
      next
    }
@@ -189,16 +195,18 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      x[i] <- 'PSEAER'
      next
    }
-    if (tolower(x[i]) %like% 'coagulase negative'
-        | tolower(x[i]) %like% 'cns'
-        | tolower(x[i]) %like% 'cons') {
+
+    # CoNS and CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
+    if (tolower(x[i]) %like% '[ck]oagulas[ea] negatie?[vf]'
+        | tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] negatie?[vf]'
+        | tolower(x[i]) %like% '[ck]o?ns[^a-z]?$') {
      # coerce S. coagulase negative
      x[i] <- 'STACNS'
      next
    }
-    if (tolower(x[i]) %like% 'coagulase positive'
-        | tolower(x[i]) %like% 'cps'
-        | tolower(x[i]) %like% 'cops') {
+    if (tolower(x[i]) %like% '[ck]oagulas[ea] positie?[vf]'
+        | tolower(x_trimmed[i]) %like% '[ck]oagulas[ea] positie?[vf]'
+        | tolower(x[i]) %like% '[ck]o?ps[^a-z]?$') {
      # coerce S. coagulase positive
      x[i] <- 'STACPS'
      next
@@ -381,6 +389,10 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
    x[x == "STCSAL"] <- "STCGRK" # S. salivarius
  }

+  # for returned genus-only codes (like "ESC"), append "SPP" (giving "ESCSPP") where the input contained the "SPP" suffix
+  indices <- unique(x_input) %like% "[A-Z]{3}SPP" & !x %like% "[A-Z]{3}SPP"
+  x[indices] <- paste0(x[indices], 'SPP')
+
  # left join the found results to the original input values (x_input)
  df_found <- data.frame(input = as.character(unique(x_input)),
                         found = x,
--- a/R/mo_property.R
+++ b/R/mo_property.R
@@ -99,10 +99,10 @@
 #' mo_gramstain("E. coli", language = "es")  # "Bacilos negativos"
 #' mo_gramstain("Giardia", language = "pt")  # "Parasitas"
 #'
-#' mo_fullname("S. pyo",
+#' mo_fullname("S. pyogenes",
 #'             Lancefield = TRUE,
 #'             language = "de")              # "Streptococcus Gruppe A"
-#' mo_fullname("S. pyo",
+#' mo_fullname("S. pyogenes",
 #'             Lancefield = TRUE,
 #'             language = "nl")              # "Streptococcus groep A"
 mo_family <- function(x) {
@@ -111,8 +111,8 @@ mo_family <- function(x) {

 #' @rdname mo_property
 #' @export
-mo_genus <- function(x) {
-  mo_property(x, "genus")
+mo_genus <- function(x, language = NULL) {
+  mo_property(x, "genus", language = language)
 }

 #' @rdname mo_property
--- a/man/mo_property.Rd
+++ b/man/mo_property.Rd
@@ -20,7 +20,7 @@
 \usage{
 mo_family(x)

-mo_genus(x)
+mo_genus(x, language = NULL)

 mo_species(x, Becker = FALSE, Lancefield = FALSE, language = NULL)

@@ -42,6 +42,8 @@ mo_property(x, property = "fullname", Becker = FALSE,
 \arguments{
 \item{x}{any (vector of) text that can be coerced to a valid microorganism code with \code{\link{as.mo}}}

+\item{language}{language of the returned text, defaults to the system's language. One of \code{"en"} (English), \code{"de"} (German), \code{"nl"} (Dutch), \code{"es"} (Spanish) or \code{"pt"} (Portuguese).}
+
 \item{Becker}{a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1].

  This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS".}
@@ -50,8 +52,6 @@ mo_property(x, property = "fullname", Becker = FALSE,

  This excludes \emph{Enterococci} at default (who are in group D), use \code{Lancefield = "all"} to also categorise all \emph{Enterococci} as group D.}

-\item{language}{language of the returned text, defaults to the systems language. Either one of \code{"en"} (English), \code{"de"} (German), \code{"nl"} (Dutch), \code{"es"} (Spanish) or \code{"pt"} (Portuguese).}
-
 \item{property}{one of the column names of one of the \code{\link{microorganisms}} data set, like \code{"mo"}, \code{"bactsys"}, \code{"family"}, \code{"genus"}, \code{"species"}, \code{"fullname"}, \code{"gramstain"} and \code{"aerobic"}}
 }
 \value{
@@ -126,10 +126,10 @@ mo_gramstain("E. coli", language = "nl")  # "Negatieve staven"
 mo_gramstain("E. coli", language = "es")  # "Bacilos negativos"
 mo_gramstain("Giardia", language = "pt")  # "Parasitas"

-mo_fullname("S. pyo",
+mo_fullname("S. pyogenes",
            Lancefield = TRUE,
            language = "de")              # "Streptococcus Gruppe A"
-mo_fullname("S. pyo",
+mo_fullname("S. pyogenes",
            Lancefield = TRUE,
            language = "nl")              # "Streptococcus groep A"
 }
--- a/tests/testthat/test-mo.R
+++ b/tests/testthat/test-mo.R
@@ -1,6 +1,13 @@
 context("mo.R")

 test_that("as.mo works", {
+
+  library(dplyr)
+  MOs <- AMR::microorganisms %>% filter(!is.na(mo))
+
+  expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo)))
+  expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en"))
+
  expect_identical(
    as.character(as.mo(c("E. coli", "H. influenzae"))),
    c("ESCCOL", "HAEINF"))
--- a/vignettes/freq.Rmd
+++ b/vignettes/freq.Rmd
@@ -26,25 +26,8 @@ Frequency tables (or frequency distributions) are summaries of the distribution
 ## Frequencies of one variable

 To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `sex` variable of the `septic_patients` dataset:
-```{r, echo = TRUE, results = 'hide'}
-# just using base R
-freq(septic_patients$sex)
-
-# using base R to select the variable and pass it on with a pipe from the dplyr package
-septic_patients$sex %>% freq()
-
-# do it all with pipes, using the `select` function from the dplyr package
-septic_patients %>%
-  select(sex) %>%
-  freq()
-
-# or the preferred way: using a pipe to pass the variable on to the freq function
-septic_patients %>% freq(sex) # this also shows 'sex' in the title
-
-```
-This will all lead to the following table:
-```{r, echo = FALSE}
-freq(septic_patients$sex)
+```{r, echo = TRUE}
+septic_patients %>% freq(sex)
 ```
 This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women.