algorithm improvement, removed all Catabacter except for C. hongkongensis

2026-01-11 17:54:33 +01:00 · 2018-09-10 11:40:54 +02:00
parent 4816419f0c
commit b83e6a9380
9 changed files with 56 additions and 11 deletions
--- a/R/data.R
+++ b/R/data.R
@@ -122,8 +122,8 @@

 #' Data set with human pathogenic microorganisms
 #'
-#' A data set containing 2,669 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}.
-#' @format A \code{\link{tibble}} with 2,669 observations and 10 variables:
+#' A data set containing 2,630 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}.
+#' @format A \code{\link{tibble}} with 2,630 observations and 10 variables:
 #' \describe{
 #'   \item{\code{mo}}{ID of microorganism}
 #'   \item{\code{bactsys}}{Bactsyscode of microorganism}
--- a/R/mo.R
+++ b/R/mo.R
@@ -131,7 +131,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
  # remove 'empty' genus and species values
  x <- gsub("(no MO)", "", x, fixed = TRUE)
  # remove dots and other non-text in case of "E. coli" except spaces
-  x <- gsub("[^a-zA-Z0-9 ]+", "", x)
+  x <- gsub("[^a-zA-Z0-9/ \\-]+", "", x)
  # but spaces before and after should be omitted
  x <- trimws(x, which = "both")
  x_trimmed <- x
@@ -146,6 +146,12 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
  x_withspaces_start <- paste0('^', x_withspaces)
  x_withspaces <- paste0('^', x_withspaces, '$')

+  # print(x)
+  # print(x_withspaces_all)
+  # print(x_withspaces_start)
+  # print(x_withspaces)
+  # print(x_backup)
+
  for (i in 1:length(x)) {
    if (identical(x_trimmed[i], "")) {
      # empty values
@@ -195,6 +201,11 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      x[i] <- 'PSEAER'
      next
    }
+    if (x_backup[i] %like% '^l.*pneum.*' & !x_backup[i] %like% '^l.*non.*pneum.*') {
+      # avoid detection of Legionella non pneumophila in case of Legionella pneumophila
+      x[i] <- 'LEGPNE'
+      next
+    }

    # CoNS and CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
    if (tolower(x[i]) %like% '[ck]oagulas[ea] negatie?[vf]'
@@ -250,6 +261,12 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {
      x[i] <- found[1L]
      next
    }
+    # try the same, now based on genus + species
+    found <- MOs[which(paste(MOs$genus, MOs$species) %like% x_withspaces[i]),]$mo
+    if (length(found) > 0) {
+      x[i] <- found[1L]
+      next
+    }
    # try any match keeping spaces, not ending with $
    found <- MOs[which(MOs$fullname %like% x_withspaces_start[i]),]$mo
    if (length(found) > 0) {
@@ -329,7 +346,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) {

  failures <- failures[!failures %in% c(NA, NULL, NaN)]
  if (length(failures) > 0) {
-    warning("These values could not be coerced to a valid mo: ",
+    warning("These ", length(failures) , " values could not be coerced to a valid mo: ",
            paste('"', unique(failures), '"', sep = "", collapse = ', '),
            ".",
            call. = FALSE)
--- a/R/mo_property.R
+++ b/R/mo_property.R
@@ -159,7 +159,7 @@ mo_shortname <- function(x, Becker = FALSE, Lancefield = FALSE, language = NULL)
    # return G. species
    result <- paste0(substr(mo_genus(x), 1, 1), ". ", suppressWarnings(mo_species(x)))
  }
-  result[result %in% c(". ")] <- ""
+  result[result %in% c(". ", "(. ")] <- ""
  mo_translate(result, language = language)
 }

@@ -224,6 +224,7 @@ mo_translate <- function(x, language) {
    language == "de" ~ x %>%
      gsub("Coagulase Negative Staphylococcus","Koagulase-negative Staphylococcus", ., fixed = TRUE) %>%
      gsub("Coagulase Positive Staphylococcus","Koagulase-positive Staphylococcus", ., fixed = TRUE) %>%
+      gsub("Beta-haemolytic Streptococcus",    "Beta-h\u00e4molytischer Streptococcus", ., fixed = TRUE) %>%
      gsub("(no MO)",          "(kein MO)", ., fixed = TRUE) %>%
      gsub("Negative rods",    "Negative St\u00e4bchen", ., fixed = TRUE) %>%
      gsub("Negative cocci",   "Negative Kokken", ., fixed = TRUE) %>%
@@ -244,6 +245,7 @@ mo_translate <- function(x, language) {
    language == "nl" ~ x %>%
      gsub("Coagulase Negative Staphylococcus","Coagulase-negatieve Staphylococcus", ., fixed = TRUE) %>%
      gsub("Coagulase Positive Staphylococcus","Coagulase-positieve Staphylococcus", ., fixed = TRUE) %>%
+      gsub("Beta-haemolytic Streptococcus",    "Beta-hemolytische Streptococcus", ., fixed = TRUE) %>%
      gsub("(no MO)",          "(geen MO)", ., fixed = TRUE) %>%
      gsub("Negative rods",    "Negatieve staven", ., fixed = TRUE) %>%
      gsub("Negative cocci",   "Negatieve kokken", ., fixed = TRUE) %>%
@@ -264,6 +266,7 @@ mo_translate <- function(x, language) {
    language == "es" ~ x %>%
      gsub("Coagulase Negative Staphylococcus","Staphylococcus coagulasa negativo", ., fixed = TRUE) %>%
      gsub("Coagulase Positive Staphylococcus","Staphylococcus coagulasa positivo", ., fixed = TRUE) %>%
+      gsub("Beta-haemolytic Streptococcus",    "Streptococcus Beta-hemol\u00edtico", ., fixed = TRUE) %>%
      gsub("(no MO)",          "(sin MO)", ., fixed = TRUE) %>%
      gsub("Negative rods",    "Bacilos negativos", ., fixed = TRUE) %>%
      gsub("Negative cocci",   "Cocos negativos", ., fixed = TRUE) %>%
@@ -284,6 +287,7 @@ mo_translate <- function(x, language) {
    language == "pt" ~ x %>%
      gsub("Coagulase Negative Staphylococcus","Staphylococcus coagulase negativo", ., fixed = TRUE) %>%
      gsub("Coagulase Positive Staphylococcus","Staphylococcus coagulase positivo", ., fixed = TRUE) %>%
+      gsub("Beta-haemolytic Streptococcus",    "Streptococcus Beta-hemol\u00edtico", ., fixed = TRUE) %>%
      gsub("(no MO)",          "(sem MO)", ., fixed = TRUE) %>%
      gsub("Negative rods",    "Bacilos negativos", ., fixed = TRUE) %>%
      gsub("Negative cocci",   "Cocos negativos", ., fixed = TRUE) %>%