diff --git a/NEWS.md b/NEWS.md index 690d3c65..6af7a6fa 100755 --- a/NEWS.md +++ b/NEWS.md @@ -25,7 +25,7 @@ * Introduction to AMR as a vignette #### Changed -* Added 231 microorganisms to the `microorganisms` data set and removed the few viruses it contained, now *n* = 2,669 (2,230 bacteria, 285 fungi/yeasts, 153 parasites, 1 other) +* Added almost 200 microorganisms to the `microorganisms` data set and removed the few viruses it contained * Added three antimicrobial agents to the `antibiotics` data set: Terbinafine (D01BA02), Rifaximin (A07AA11) and Isoconazole (D01AC05) * Added 163 trade names to the `antibiotics` data set, it now contains 298 different trade names in total, e.g.: ```r diff --git a/R/data.R b/R/data.R index 22dcb533..6d391b7c 100755 --- a/R/data.R +++ b/R/data.R @@ -122,8 +122,8 @@ #' Data set with human pathogenic microorganisms #' -#' A data set containing 2,669 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}. -#' @format A \code{\link{tibble}} with 2,669 observations and 10 variables: +#' A data set containing 2,630 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}. +#' @format A \code{\link{tibble}} with 2,630 observations and 10 variables: #' \describe{ #' \item{\code{mo}}{ID of microorganism} #' \item{\code{bactsys}}{Bactsyscode of microorganism} diff --git a/R/mo.R b/R/mo.R index 3c504f46..f0a071e0 100644 --- a/R/mo.R +++ b/R/mo.R @@ -131,7 +131,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { # remove 'empty' genus and species values x <- gsub("(no MO)", "", x, fixed = TRUE) # remove dots and other non-text in case of "E. coli" except spaces - x <- gsub("[^a-zA-Z0-9 ]+", "", x) + x <- gsub("[^a-zA-Z0-9/ \\-]+", "", x) # but spaces before and after should be omitted x <- trimws(x, which = "both") x_trimmed <- x @@ -146,6 +146,12 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x_withspaces_start <- paste0('^', x_withspaces) x_withspaces <- paste0('^', x_withspaces, '$') + # print(x) + # print(x_withspaces_all) + # print(x_withspaces_start) + # print(x_withspaces) + # print(x_backup) + for (i in 1:length(x)) { if (identical(x_trimmed[i], "")) { # empty values @@ -195,6 +201,11 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x[i] <- 'PSEAER' next } + if (x_backup[i] %like% '^l.*pneum.*' & !x_backup[i] %like% '^l.*non.*pneum.*') { + # avoid detection of Legionella non pneumophila in case of Legionella pneumophila + x[i] <- 'LEGPNE' + next + } # CoNS and CoPS in different languages (support for German, Dutch, Spanish, Portuguese) if (tolower(x[i]) %like% '[ck]oagulas[ea] negatie?[vf]' @@ -250,6 +261,12 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { x[i] <- found[1L] next } + # try the same, now based on genus + species + found <- MOs[which(paste(MOs$genus, MOs$species) %like% x_withspaces[i]),]$mo + if (length(found) > 0) { + x[i] <- found[1L] + next + } # try any match keeping spaces, not ending with $ found <- MOs[which(MOs$fullname %like% x_withspaces_start[i]),]$mo if (length(found) > 0) { @@ -329,7 +346,7 @@ as.mo <- function(x, Becker = FALSE, Lancefield = FALSE) { failures <- failures[!failures %in% c(NA, NULL, NaN)] if (length(failures) > 0) { - warning("These values could not be coerced to a valid mo: ", + warning("These ", length(failures) , " values could not be coerced to a valid mo: ", paste('"', unique(failures), '"', sep = "", collapse = ', '), ".", call. = FALSE) diff --git a/R/mo_property.R b/R/mo_property.R index 92cb9d4a..f4c9034a 100644 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -159,7 +159,7 @@ mo_shortname <- function(x, Becker = FALSE, Lancefield = FALSE, language = NULL) # return G. species result <- paste0(substr(mo_genus(x), 1, 1), ". ", suppressWarnings(mo_species(x))) } - result[result %in% c(". ")] <- "" + result[result %in% c(". ", "(. ")] <- "" mo_translate(result, language = language) } @@ -224,6 +224,7 @@ mo_translate <- function(x, language) { language == "de" ~ x %>% gsub("Coagulase Negative Staphylococcus","Koagulase-negative Staphylococcus", ., fixed = TRUE) %>% gsub("Coagulase Positive Staphylococcus","Koagulase-positive Staphylococcus", ., fixed = TRUE) %>% + gsub("Beta-haemolytic Streptococcus", "Beta-h\u00e4molytischer Streptococcus", ., fixed = TRUE) %>% gsub("(no MO)", "(kein MO)", ., fixed = TRUE) %>% gsub("Negative rods", "Negative St\u00e4bchen", ., fixed = TRUE) %>% gsub("Negative cocci", "Negative Kokken", ., fixed = TRUE) %>% @@ -244,6 +245,7 @@ mo_translate <- function(x, language) { language == "nl" ~ x %>% gsub("Coagulase Negative Staphylococcus","Coagulase-negatieve Staphylococcus", ., fixed = TRUE) %>% gsub("Coagulase Positive Staphylococcus","Coagulase-positieve Staphylococcus", ., fixed = TRUE) %>% + gsub("Beta-haemolytic Streptococcus", "Beta-hemolytische Streptococcus", ., fixed = TRUE) %>% gsub("(no MO)", "(geen MO)", ., fixed = TRUE) %>% gsub("Negative rods", "Negatieve staven", ., fixed = TRUE) %>% gsub("Negative cocci", "Negatieve kokken", ., fixed = TRUE) %>% @@ -264,6 +266,7 @@ mo_translate <- function(x, language) { language == "es" ~ x %>% gsub("Coagulase Negative Staphylococcus","Staphylococcus coagulasa negativo", ., fixed = TRUE) %>% gsub("Coagulase Positive Staphylococcus","Staphylococcus coagulasa positivo", ., fixed = TRUE) %>% + gsub("Beta-haemolytic Streptococcus", "Streptococcus Beta-hemol\u00edtico", ., fixed = TRUE) %>% gsub("(no MO)", "(sin MO)", ., fixed = TRUE) %>% gsub("Negative rods", "Bacilos negativos", ., fixed = TRUE) %>% gsub("Negative cocci", "Cocos negativos", ., fixed = TRUE) %>% @@ -284,6 +287,7 @@ mo_translate <- function(x, language) { language == "pt" ~ x %>% gsub("Coagulase Negative Staphylococcus","Staphylococcus coagulase negativo", ., fixed = TRUE) %>% gsub("Coagulase Positive Staphylococcus","Staphylococcus coagulase positivo", ., fixed = TRUE) %>% + gsub("Beta-haemolytic Streptococcus", "Streptococcus Beta-hemol\u00edtico", ., fixed = TRUE) %>% gsub("(no MO)", "(sem MO)", ., fixed = TRUE) %>% gsub("Negative rods", "Bacilos negativos", ., fixed = TRUE) %>% gsub("Negative cocci", "Cocos negativos", ., fixed = TRUE) %>% diff --git a/README.md b/README.md index 19141c90..aba1ff4c 100755 --- a/README.md +++ b/README.md @@ -388,7 +388,7 @@ antibiotics # A tibble: 423 x 18 # Dataset with bacteria codes and properties like gram stain and # aerobic/anaerobic -microorganisms # A tibble: 2,669 x 10 +microorganisms # A tibble: 2,630 x 10 ``` ## Copyright diff --git a/data/microorganisms.rda b/data/microorganisms.rda index 16018a32..1bb364a1 100755 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/man/microorganisms.Rd b/man/microorganisms.Rd index f43be0d4..73944191 100755 --- a/man/microorganisms.Rd +++ b/man/microorganisms.Rd @@ -4,7 +4,7 @@ \name{microorganisms} \alias{microorganisms} \title{Data set with human pathogenic microorganisms} -\format{A \code{\link{tibble}} with 2,669 observations and 10 variables: +\format{A \code{\link{tibble}} with 2,630 observations and 10 variables: \describe{ \item{\code{mo}}{ID of microorganism} \item{\code{bactsys}}{Bactsyscode of microorganism} @@ -21,7 +21,7 @@ microorganisms } \description{ -A data set containing 2,669 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}. +A data set containing 2,630 (potential) human pathogenic microorganisms. MO codes can be looked up using \code{\link{guess_mo}}. } \seealso{ \code{\link{guess_mo}} \code{\link{antibiotics}} \code{\link{microorganisms.umcg}} diff --git a/tests/testthat/test-mo.R b/tests/testthat/test-mo.R index e3fd4c78..b86ef467 100644 --- a/tests/testthat/test-mo.R +++ b/tests/testthat/test-mo.R @@ -4,9 +4,7 @@ test_that("as.mo works", { library(dplyr) MOs <- AMR::microorganisms %>% filter(!is.na(mo)) - expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo))) - expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en")) expect_identical( as.character(as.mo(c("E. coli", "H. influenzae"))), @@ -21,6 +19,9 @@ test_that("as.mo works", { expect_equal(as.character(as.mo("K. pneu rhino")), "KLEPNERH") # K. pneumoniae subspp. rhinoscleromatis expect_equal(as.character(as.mo("Bartonella")), "BAR") expect_equal(as.character(as.mo("C. difficile")), "CLODIF") + expect_equal(as.character(as.mo("L. pneumophila")), "LEGPNE") + expect_equal(as.character(as.mo("L. non pneumophila")), "LEGNON") + expect_equal(as.character(as.mo("S. beta-haemolytic")), "STCHAE") expect_equal(as.character(as.mo("S. pyo")), "STCPYO") # not Actinomyces pyogenes diff --git a/tests/testthat/test-mo_property.R b/tests/testthat/test-mo_property.R index 08209c4c..de467241 100644 --- a/tests/testthat/test-mo_property.R +++ b/tests/testthat/test-mo_property.R @@ -22,4 +22,27 @@ test_that("mo_property works", { expect_equal(mo_gramstain("E. coli", language = "nl"), "Negatieve staven") expect_error(mo_type("E. coli", language = "INVALID")) + + # test integrity + library(dplyr) + MOs <- AMR::microorganisms %>% filter(!is.na(mo)) + expect_identical(MOs$fullname, mo_fullname(MOs$fullname, language = "en")) + + mo_clean <- MOs$mo + mo_from_shortname <- as.mo(mo_shortname(mo_clean)) + mo_clean <- mo_clean[nchar(mo_from_shortname) == 6 & + !is.na(mo_from_shortname) & + !mo_from_shortname %like% "...SPP"] + mo_from_shortname <- mo_from_shortname[nchar(mo_from_shortname) == 6 & + !is.na(mo_from_shortname) & + !mo_from_shortname %like% "...SPP"] + tb <- tibble(a = substr(mo_clean, 1, 6), + b = mo_from_shortname, + c = a == b, + d = mo_shortname(a), + e = mo_shortname(b), + f = d == e) + expect_gt(sum(tb$c) / nrow(tb), 0.9) # more than 90% of MO code should be identical + expect_identical(sum(tb$f), nrow(tb)) # all shortnames should be identical + })