From 60fe98bbbd8d63e275128222af292c983d8b19ac Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Thu, 6 Dec 2018 14:36:39 +0100 Subject: [PATCH] better as.mo handling --- DESCRIPTION | 8 +- NAMESPACE | 3 + NEWS.md | 18 +++- R/first_isolate.R | 4 +- R/mo.R | 174 +++++++++++++++++++++++++++++---------- R/mo_property.R | 9 ++ man/mo_failures.Rd | 14 ++++ man/mo_renamed.Rd | 14 ++++ tests/testthat/test-mo.R | 10 +++ 9 files changed, 204 insertions(+), 50 deletions(-) create mode 100644 man/mo_failures.Rd create mode 100644 man/mo_renamed.Rd diff --git a/DESCRIPTION b/DESCRIPTION index eb599d2e..9eb8ef9b 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.5.0 -Date: 2018-12-01 +Version: 0.5.0.9001 +Date: 2018-12-05 Title: Antimicrobial Resistance Analysis Authors@R: c( person( @@ -39,8 +39,8 @@ Authors@R: c( role = "ths", comment = c(ORCID = "0000-0003-1634-0010"))) Description: Functions to simplify the analysis and prediction of Antimicrobial - Resistance (AMR) to work with microbial and antimicrobial properties by using - evidence-based methods. + Resistance (AMR) and to work with microbial and antimicrobial properties by + using evidence-based methods. Depends: R (>= 3.1.0) Imports: diff --git a/NAMESPACE b/NAMESPACE index 89ac6b88..3aa2223b 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -94,6 +94,7 @@ export(mdro) export(mo_TSN) export(mo_authors) export(mo_class) +export(mo_failures) export(mo_family) export(mo_fullname) export(mo_genus) @@ -103,6 +104,7 @@ export(mo_order) export(mo_phylum) export(mo_property) export(mo_ref) +export(mo_renamed) export(mo_shortname) export(mo_species) export(mo_subkingdom) @@ -175,6 +177,7 @@ importFrom(crayon,blue) importFrom(crayon,bold) importFrom(crayon,green) importFrom(crayon,italic) +importFrom(crayon,magenta) importFrom(crayon,red) importFrom(crayon,silver) importFrom(crayon,strip_style) diff --git a/NEWS.md b/NEWS.md index 0d3f838a..ff75ace6 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,19 @@ -# 0.5.0 -**Published on CRAN: 2018-12-01** +# 0.5.0.90xx (latest development version) + +#### New +* Function `mo_failures` to review values that could not be coerced to a valid MO code, using `as.mo`. This latter function will now only show a maximum of 25 uncoerced values. + +#### Changed +* Improvements for `as.mo`: + * Finds better results when input is in other languages + * Better handling for subspecies + * Better handling for *Salmonellae* +* Function `first_isolate` will now use a column named like "patid" for the patient ID, when this parameter was left blank + + + +# 0.5.0 (latest stable release) +**Published on CRAN: 2018-11-30** #### New * Repository moved to GitLab: https://gitlab.com/msberends/AMR diff --git a/R/first_isolate.R b/R/first_isolate.R index c87f76d8..ea553c41 100755 --- a/R/first_isolate.R +++ b/R/first_isolate.R @@ -164,8 +164,8 @@ first_isolate <- function(tbl, message("NOTE: Using column `", col_date, "` as input for `col_date`.") } # -- patient id - if (is.null(col_patient_id) & any(colnames(tbl) %like% "^patient")) { - col_patient_id <- colnames(tbl)[colnames(tbl) %like% "^patient"][1] + if (is.null(col_patient_id) & any(colnames(tbl) %like% "^(patient|patid)")) { + col_patient_id <- colnames(tbl)[colnames(tbl) %like% "^(patient|patid)"][1] message("NOTE: Using column `", col_patient_id, "` as input for `col_patient_id`.") } diff --git a/R/mo.R b/R/mo.R index b55c0478..083adf1b 100644 --- a/R/mo.R +++ b/R/mo.R @@ -137,10 +137,10 @@ #' mutate(mo = as.mo(paste(genus, species))) #' } as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) { - structure(mo_validate(x = x, property = "mo", - Becker = Becker, Lancefield = Lancefield, - allow_uncertain = allow_uncertain, reference_df = reference_df), - class = "mo") + mo <- mo_validate(x = x, property = "mo", + Becker = Becker, Lancefield = Lancefield, + allow_uncertain = allow_uncertain, reference_df = reference_df) + structure(.Data = mo, class = "mo") } #' @rdname as.mo @@ -155,9 +155,12 @@ is.mo <- function(x) { #' @export guess_mo <- as.mo -#' @importFrom dplyr %>% pull left_join +#' @importFrom dplyr %>% pull left_join n_distinct #' @importFrom data.table data.table as.data.table setkey -exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL, property = "mo") { +#' @importFrom crayon magenta red italic +exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, + allow_uncertain = FALSE, reference_df = NULL, + property = "mo", clear_options = TRUE) { if (!"AMR" %in% base::.packages()) { library("AMR") @@ -168,6 +171,11 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = # microorganisms.oldDT # old taxonomic names, sorted by name (genus+species), TSN } + if (clear_options == TRUE) { + options(mo_failures = NULL) + options(mo_renamed = NULL) + } + if (NCOL(x) == 2) { # support tidyverse selection like: df %>% select(colA, colB) # paste these columns together @@ -231,10 +239,10 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x_backup <- trimws(x, which = "both") # remove spp and species - x <- gsub(" +(spp.?|species)", "", x_backup) + x <- trimws(gsub(" +(spp.?|ssp.?|subsp.?|species)", " ", x_backup, ignore.case = TRUE), which = "both") x_species <- paste(x, "species") # translate to English for supported languages of mo_property - x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x) + x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) # remove 'empty' genus and species values x <- gsub("(no MO)", "", x, fixed = TRUE) # remove non-text in case of "E. coli" except dots and spaces @@ -244,6 +252,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x <- trimws(x, which = "both") x_trimmed <- x x_trimmed_species <- paste(x_trimmed, "species") + x_trimmed_without_group <- gsub(" group$", "", x_trimmed, ignore.case = TRUE) + # remove last part from "-" or "/" + x_trimmed_without_group <- gsub("(.*)[-/].*", "\\1", x_trimmed_without_group) # replace space and dot by regex sign x_withspaces <- gsub("[ .]+", ".* ", x) x <- gsub("[ .]+", ".*", x) @@ -252,13 +263,14 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x_withspaces_start <- paste0('^', x_withspaces) x_withspaces <- paste0('^', x_withspaces, '$') - # cat(paste0('x "', x, '"\n')) - # cat(paste0('x_species "', x_species, '"\n')) - # cat(paste0('x_withspaces_start "', x_withspaces_start, '"\n')) - # cat(paste0('x_withspaces "', x_withspaces, '"\n')) - # cat(paste0('x_backup "', x_backup, '"\n')) - # cat(paste0('x_trimmed "', x_trimmed, '"\n')) - # cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) + # cat(paste0('x "', x, '"\n')) + # cat(paste0('x_species "', x_species, '"\n')) + # cat(paste0('x_withspaces_start "', x_withspaces_start, '"\n')) + # cat(paste0('x_withspaces "', x_withspaces, '"\n')) + # cat(paste0('x_backup "', x_backup, '"\n')) + # cat(paste0('x_trimmed "', x_trimmed, '"\n')) + # cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n')) + # cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n')) for (i in 1:length(x)) { if (identical(x_trimmed[i], "")) { @@ -302,7 +314,8 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x[i] <- microorganismsDT[mo == 'B_STPHY_EPI', ..property][[1]][1L] next } - if (toupper(x_trimmed[i]) == 'VRE') { + if (toupper(x_trimmed[i]) == "VRE" + | x_trimmed[i] %like% '(enterococci|enterokok|enterococo)[a-z]*?$') { x[i] <- microorganismsDT[mo == 'B_ENTRC', ..property][[1]][1L] next } @@ -323,7 +336,18 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = next } if (toupper(x_trimmed[i]) %like% '^G[ABCDFGHK]S$') { - x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPTC_GR\\1", x_trimmed[i]), ..property][[1]][1L] + # Streptococci, like GBS = Group B Streptococci (B_STRPTC_GRB) + x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPTC_GR\\1", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L] + next + } + if (toupper(x_trimmed[i]) %like% '(streptococc|streptokok).* [ABCDFGHK]$') { + # Streptococci in different languages, like "estreptococos grupo B" + x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPTC_GR\\2", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L] + next + } + if (toupper(x_trimmed[i]) %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') { + # Streptococci in different languages, like "Group A Streptococci" + x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPTC_GR\\1", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L] next } # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ---- @@ -341,18 +365,24 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x[i] <- microorganismsDT[mo == 'B_STPHY_CPS', ..property][[1]][1L] next } - if (tolower(x[i]) %like% '^gram[ -]+nega.*' - | tolower(x_trimmed[i]) %like% '^gram[ -]+nega.*') { + if (tolower(x[i]) %like% 'gram[ -]?neg.*' + | tolower(x_trimmed[i]) %like% 'gram[ -]?neg.*') { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_GRAMN', ..property][[1]][1L] next } - if (tolower(x[i]) %like% '^gram[ -]+posi.*' - | tolower(x_trimmed[i]) %like% '^gram[ -]+posi.*') { + if (tolower(x[i]) %like% 'gram[ -]?pos.*' + | tolower(x_trimmed[i]) %like% 'gram[ -]?pos.*') { # coerce S. coagulase positive x[i] <- microorganismsDT[mo == 'B_GRAMP', ..property][[1]][1L] next } + if (grepl("[sS]almonella [A-Z][a-z]+ ?.*", x_trimmed[i])) { + # Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica + x[i] <- microorganismsDT[mo == 'B_SLMNL_ENT', ..property][[1]][1L] + base::message(magenta(paste0("Note: ", italic(x_trimmed[i]), " is a subspecies of ", italic("Salmonella enterica"), " (B_SLMNL_ENT)"))) + next + } } # FIRST TRY FULLNAMES AND CODES @@ -424,6 +454,12 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x[i] <- found[1L] next } + found <- microorganisms.prevDT[tolower(fullname) == tolower(x_trimmed_without_group[i]), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + next + } + # try any match keeping spaces ---- found <- microorganisms.prevDT[fullname %like% x_withspaces[i], ..property][[1]] @@ -495,28 +531,29 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x[i] <- found[1L] next } - + found <- microorganisms.unprevDT[tolower(fullname) == tolower(x_trimmed_without_group[i]), ..property][[1]] + if (length(found) > 0) { + x[i] <- found[1L] + next + } # try any match keeping spaces ---- found <- microorganisms.unprevDT[fullname %like% x_withspaces[i], ..property][[1]] if (length(found) > 0) { x[i] <- found[1L] next } - # try any match keeping spaces, not ending with $ ---- found <- microorganisms.unprevDT[fullname %like% x_withspaces_start[i], ..property][[1]] if (length(found) > 0) { x[i] <- found[1L] next } - # try any match diregarding spaces ---- found <- microorganisms.unprevDT[fullname %like% x[i], ..property][[1]] - if (length(found) > 0) { + if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) { x[i] <- found[1L] next } - # try splitting of characters in the middle and then find ID ---- # only when text length is 6 or lower # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus @@ -568,15 +605,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = found <- microorganisms.oldDT[name %like% x_withspaces[i] | name %like% x_withspaces_start[i] | name %like% x[i],] - if (NROW(found) > 0) { + if (NROW(found) > 0 & nchar(x_trimmed[i]) >= 6) { if (property == "ref") { x[i] <- found[1, ref] } else { x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]] } - warning("Uncertain interpretation: '", - x_backup[i], "' -> '", found[1, name], "'", - call. = FALSE, immediate. = TRUE) + warning(red(paste0("UNCERTAIN - '", + x_backup[i], "' -> ", italic(found[1, name]))), + call. = FALSE, immediate. = TRUE) renamed_note(name_old = found[1, name], name_new = microorganismsDT[tsn == found[1, tsn_new], fullname], ref_old = found[1, ref], @@ -584,14 +621,38 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = next } - # (2) try to strip off one element and check the remains - x_strip <- x_backup[i] %>% strsplit(" ") %>% unlist() - x_strip <- x_strip[1:length(x_strip) - 1] - x[i] <- suppressWarnings(suppressMessages(as.mo(x_strip))) + # (2) strip values between brackets ---- + found <- microorganismsDT[fullname %like% gsub("( [(].*[)]) ", " ", x_withspaces[i]) + | fullname %like% gsub("( [(].*[)]) ", " ", x_backup[i]) + | fullname %like% gsub("( [(].*[)]) ", " ", x[i]),] + if (NROW(found) > 0 & nchar(x_trimmed[i]) >= 6) { + x[i] <- found[1, ..property][[1]] + warning(red(paste0("UNCERTAIN - '", + x_backup[i], "' -> ", italic(found[1, fullname][[1]]), " (", found[1, mo][[1]], ")")), + call. = FALSE, immediate. = TRUE) + next + } + + # (3) try to strip off one element and check the remains ---- + look_for_part <- function(z) { + x_strip <- z %>% strsplit(" ") %>% unlist() + if (length(x_strip) > 1 & nchar(x_trimmed[i]) >= 6) { + for (i in 1:(length(x_strip) - 1)) { + x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, clear_options = FALSE))) + if (!is.na(found)) { + found <- microorganismsDT[mo == found, ..property][[1]] + warning(red(paste0("UNCERTAIN - '", + z, "' -> ", italic(microorganismsDT[mo == found[1L], fullname][[1]]), " (", found[1L], ")")), + call. = FALSE, immediate. = TRUE) + return(found[1L]) + } + } + } + return(NA_character_) + } + x[i] <- look_for_part(x_backup[i]) if (!is.na(x[i])) { - warning("Uncertain interpretation: '", - x_backup[i], "' -> '", microorganismsDT[mo == x[i], fullname], "' (", x[i], ")", - call. = FALSE, immediate. = TRUE) next } } @@ -605,10 +666,16 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = failures <- failures[!failures %in% c(NA, NULL, NaN)] if (length(failures) > 0) { - warning("These ", length(failures) , " values could not be coerced to a valid MO code: ", - paste('"', unique(failures), '"', sep = "", collapse = ', '), - ".", - call. = FALSE) + options(mo_failures = sort(unique(failures))) + if (n_distinct(failures) > 25) { + warning(n_distinct(failures), " different values could not be coerced to a valid MO code. See mo_failures() to review them.", + call. = FALSE) + } else { + warning("These ", length(failures) , " values could not be coerced to a valid MO code: ", + paste('"', unique(failures), '"', sep = "", collapse = ', '), + ". See mo_failures() to review them.", + call. = FALSE) + } } # Becker ---- @@ -687,6 +754,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = x } +#' @importFrom crayon blue renamed_note <- function(name_old, name_new, ref_old = "", ref_new = "") { if (!is.na(ref_old)) { ref_old <- paste0(" (", ref_old, ")") @@ -698,7 +766,11 @@ renamed_note <- function(name_old, name_new, ref_old = "", ref_new = "") { } else { ref_new <- "" } - base::message(paste0("Note: '", name_old, "'", ref_old, " was renamed '", name_new, "'", ref_new)) + msg <- paste0("'", name_old, "'", ref_old, " was renamed '", name_new, "'", ref_new) + msg_plain <- paste0(name_old, ref_old, " -> ", name_new, ref_new) + msg_plain <- c(getOption("mo_renamed", character(0)), msg_plain) + options(mo_renamed = sort(msg_plain)) + base::message(blue(paste("Note:", msg))) } #' @exportMethod print.mo @@ -733,3 +805,21 @@ as.data.frame.mo <- function (x, ...) { pull.mo <- function(.data, ...) { pull(as.data.frame(.data), ...) } + +#' Vector of failed coercion attempts +#' +#' Returns a vector of all failed attempts to coerce values to a valid MO code with \code{\link{as.mo}}. +#' @seealso \code{\link{as.mo}} +#' @export +mo_failures <- function() { + getOption("mo_failures") +} + +#' Vector of taxonomic renamed items +#' +#' Returns a vector of all renamed items of the last coercion to valid MO codes with \code{\link{as.mo}}. +#' @seealso \code{\link{as.mo}} +#' @export +mo_renamed <- function() { + getOption("mo_renamed") +} diff --git a/R/mo_property.R b/R/mo_property.R index 997f4322..2bc24ac9 100644 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -461,6 +461,15 @@ mo_validate <- function(x, property, ...) { Lancefield <- FALSE } + if (!"AMR" %in% base::.packages()) { + library("AMR") + # These data.tables are available as data sets when the AMR package is loaded: + # microorganismsDT # this one is sorted by kingdom (B