diff --git a/DESCRIPTION b/DESCRIPTION index db62ab0b..e011adeb 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.4.0.9012 -Date: 2018-11-17 +Version: 0.4.0.9013 +Date: 2018-11-24 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NEWS.md b/NEWS.md index 979f8240..c455c71e 100755 --- a/NEWS.md +++ b/NEWS.md @@ -23,6 +23,13 @@ * Added column `kingdom` to the microorganisms data set, and function `mo_kingdom` to look up values * Tremendous speed improvement for `as.mo` (and subsequently all `mo_*` functions), as empty values wil be ignored *a priori* * Fewer than 3 characters as input for `as.mo` will return NA +* Function `as.mo` (and all `mo_*` wrappers) now supports genus abbreviations with "species" attached + ```r + as.mo("E. species") # B_ESCHR + mo_fullname("E. spp.") # "Escherichia species" + as.mo("S. spp") # B_STPHY + mo_fullname("S. species") # "Staphylococcus species" + ``` * Added parameter `combine_IR` (TRUE/FALSE) to functions `portion_df` and `count_df`, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible) * Fix for `portion_*(..., as_percent = TRUE)` when minimal number of isolates would not be met * Added parameter `also_single_tested` for `portion_*` and `count_*` functions to also include cases where not all antibiotics were tested but at least one of the tested antibiotics includes the target antimicribial interpretation, see `?portion` @@ -45,9 +52,9 @@ * Now prints in markdown at default in non-interactive sessions * No longer adds the factor level column and sorts factors on count again * Support for class `difftime` - * New parameter `na`, to choose with character to print for empty values - * New parameter `header` to turn it off (default when `markdown = TRUE`) - * New parameter `title` to replace the automatically set title + * New parameter `na`, to choose which character to print for empty values + * New parameter `header` to turn the header info off (default when `markdown = TRUE`) + * New parameter `title` to manually setbthe title of the frequency table * `first_isolate` now tries to find columns to use as input when parameters are left blank * Improvements for MDRO algorithm (function `mdro`) * Data set `septic_patients` is now a `data.frame`, not a tibble anymore @@ -66,6 +73,7 @@ * In `g.test`, when `sum(x)` is below 1000 or any of the expected values is below 5, Fisher's Exact Test will be suggested * `ab_name` will try to fall back on `as.atc` when no results are found * Removed the addin to view data sets +* Percentages will now will rounded more logically (e.g. in `freq` function) #### Other * New dependency on package `crayon`, to support formatted text in the console diff --git a/R/freq.R b/R/freq.R index 5ea410cc..31de3aa9 100755 --- a/R/freq.R +++ b/R/freq.R @@ -313,16 +313,22 @@ frequency_tbl <- function(x, } } - na_txt <- paste0(NAs %>% length() %>% format(), ' = ', - (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% percent(force_zero = TRUE, round = digits) %>% - sub('NaN', '0', ., fixed = TRUE)) - if (!na_txt %like% "^0 =") { - na_txt <- red(na_txt) + if (NROW(x) > 0) { + na_txt <- paste0(NAs %>% length() %>% format(), ' = ', + (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% percent(force_zero = TRUE, round = digits) %>% + sub('NaN', '0', ., fixed = TRUE)) + if (!na_txt %like% "^0 =") { + na_txt <- red(na_txt) + } else { + na_txt <- green(na_txt) + } + na_txt <- paste0('(of which NA: ', na_txt, ')') } else { - na_txt <- green(na_txt) + na_txt <- "" } + header_txt <- header_txt %>% paste0(markdown_line, '\nLength: ', (NAs %>% length() + x %>% length()) %>% format(), - ' (of which NA: ', na_txt, ')') + ' ', na_txt) header_txt <- header_txt %>% paste0(markdown_line, '\nUnique: ', x %>% n_distinct() %>% format()) if (NROW(x) > 0 & any(class(x) == "character")) { @@ -592,7 +598,12 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = } title <- paste(title, group_var) } - title <- paste("Frequency table of", trimws(title)) + title <- trimws(title) + if (title == "") { + title <- "Frequency table" + } else { + title <- paste("Frequency table of", trimws(title)) + } } else { title <- opt$title } diff --git a/R/globals.R b/R/globals.R index 451f8856..0360affe 100755 --- a/R/globals.R +++ b/R/globals.R @@ -54,6 +54,8 @@ globalVariables(c(".", "microorganisms.prevDT", "microorganisms.unprevDT", "microorganisms.oldDT", + "microorganisms.certe", + "microorganisms.umcg", "mo", "mo.old", "n", diff --git a/R/misc.R b/R/misc.R index ddd7d796..7ba73a54 100755 --- a/R/misc.R +++ b/R/misc.R @@ -28,12 +28,24 @@ addin_insert_like <- function() { # No export, no Rd percent <- function(x, round = 1, force_zero = FALSE, ...) { - val <- base::round(x * 100, digits = round) - if (force_zero == TRUE & any(val == as.integer(val) & !is.na(val))) { - val[val == as.integer(val)] <- paste0(val[val == as.integer(val)], ".", strrep(0, round)) + + # https://stackoverflow.com/a/12688836/4575331 + round2 <- function(x, n) (trunc((abs(x) * 10 ^ n) + 0.5) / 10 ^ n) * sign(x) + + val <- round2(x, round + 2) # round up 0.5 + val <- round(x = val * 100, digits = round) # remove floating point error + + if (force_zero == TRUE) { + if (any(val == as.integer(val) & !is.na(val))) { + # add zeroes to all integers + val[val == as.integer(as.character(val))] <- paste0(val[val == as.integer(val)], ".", strrep(0, round)) + } + # add extra zeroes if needed + val_decimals <- nchar(gsub(".*[.](.*)", "\\1", as.character(val))) + val[val_decimals < round] <- paste0(val[val_decimals < round], strrep(0, max(0, round - val_decimals))) } pct <- base::paste0(val, "%") - pct[pct == "NA%"] <- NA_character_ + pct[pct %in% c("NA%", "NaN%")] <- NA_character_ pct } diff --git a/R/mo.R b/R/mo.R index a27b0e9f..8aadaab6 100644 --- a/R/mo.R +++ b/R/mo.R @@ -53,7 +53,7 @@ #' \itemize{ #' \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa} #' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones} -#' \item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations} +#' \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations} #' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches} #' } #' @@ -126,16 +126,15 @@ #' library(dplyr) #' df$mo <- df %>% #' select(microorganism_name) %>% -#' guess_mo() +#' as.mo() #' #' # and can even contain 2 columns, which is convenient for genus/species combinations: #' df$mo <- df %>% #' select(genus, species) %>% -#' guess_mo() -#' -#' # same result: +#' as.mo() +#' # although this works easier and does the same: #' df <- df %>% -#' mutate(mo = guess_mo(paste(genus, species))) +#' mutate(mo = as.mo(paste(genus, species))) #' } as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) { structure(mo_validate(x = x, property = "mo", @@ -160,11 +159,14 @@ guess_mo <- as.mo #' @importFrom data.table data.table as.data.table setkey exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL, property = "mo") { - # These data.tables are available as data sets when the AMR package is loaded: - # microorganismsDT # this one is sorted by kingdom (B% select(colA, colB) @@ -216,31 +218,35 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = suppressWarnings( x <- data.frame(x = x, stringsAsFactors = FALSE) %>% left_join(reference_df, by = "x") %>% - left_join(AMR::microorganisms, by = "mo") %>% + left_join(microorganisms, by = "mo") %>% pull(property) ) - } else if (all(toupper(x) %in% AMR::microorganisms.certe[, "certe"])) { + } else if (all(toupper(x) %in% microorganisms.certe[, "certe"])) { # old Certe codes - y <- as.data.table(AMR::microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ] + y <- as.data.table(microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ] x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] } else if (!all(x %in% microorganismsDT[[property]])) { x_backup <- trimws(x, which = "both") - x_species <- paste(x_backup, "species") + + # remove spp and species + x <- gsub(" +(spp.?|species)", "", x_backup) + x_species <- paste(x, "species") # translate to English for supported languages of mo_property x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x) # remove 'empty' genus and species values x <- gsub("(no MO)", "", x, fixed = TRUE) - # remove dots and other non-text in case of "E. coli" except spaces - x <- gsub("[^a-zA-Z0-9/ \\-]+", "", x) + # remove non-text in case of "E. coli" except dots and spaces + x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) + # but spaces before and after should be omitted x <- trimws(x, which = "both") x_trimmed <- x x_trimmed_species <- paste(x_trimmed, "species") - # replace space by regex sign - x_withspaces <- gsub(" ", ".* ", x, fixed = TRUE) - x <- gsub(" ", ".*", x, fixed = TRUE) + # replace space and dot by regex sign + x_withspaces <- gsub("[ .]+", ".* ", x) + x <- gsub("[ .]+", ".*", x) # add start en stop regex x <- paste0('^', x, '$') x_withspaces_start <- paste0('^', x_withspaces) @@ -261,10 +267,28 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = next } if (nchar(x_trimmed[i]) < 3) { - # fewer than 3 chars, add as failure - x[i] <- NA_character_ - failures <- c(failures, x_backup[i]) - next + # check if search term was like "A. species", then return first genus found with ^A + if (x_backup[i] %like% "species" | x_backup[i] %like% "spp[.]?") { + # get mo code of first hit + found <- microorganismsDT[fullname %like% x_withspaces_start[i], mo][[1]] + mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_") + found <- microorganismsDT[mo == mo_code, ..property][[1]] + # return first genus that begins with x_trimmed, e.g. when "E. spp." + if (length(found) > 0) { + x[i] <- found[1L] + next + } else { + # fewer than 3 chars, add as failure + x[i] <- NA_character_ + failures <- c(failures, x_backup[i]) + next + } + } else { + # fewer than 3 chars, add as failure + x[i] <- NA_character_ + failures <- c(failures, x_backup[i]) + next + } } # translate known trivial abbreviations to genus + species ---- @@ -353,15 +377,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = } # TRY OTHER SOURCES ---- - if (x_backup[i] %in% AMR::microorganisms.certe$certe) { - x[i] <- microorganismsDT[mo == AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == x_backup[i], 2], ..property][[1]][1L] - # x[i] <- exec_as.mo(x = AMR::microorganisms.certe[AMR::microorganisms.certe$certe == x_backup[i], "mo"], - # property = property) - # next - } - if (x_backup[i] %in% AMR::microorganisms.umcg[, 1]) { - mo_umcg <- AMR::microorganisms.umcg[AMR::microorganisms.umcg[, 1] == x_backup[i], 2] - mo_found <- AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == mo_umcg, 2] + if (x_backup[i] %in% microorganisms.umcg[, 1]) { + mo_umcg <- microorganisms.umcg[microorganisms.umcg[, 1] == x_backup[i], 2] + mo_found <- microorganisms.certe[microorganisms.certe[, 1] == mo_umcg, 2] if (length(mo_found) == 0) { # not found x[i] <- NA_character_ @@ -371,13 +389,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = } next } - if (x_backup[i] %in% reference_df[, 1]) { - ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2] - if (ref_mo %in% microorganismsDT[, mo]) { - x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] - next - } else { - warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) + if (!is.null(reference_df)) { + if (x_backup[i] %in% reference_df[, 1]) { + ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2] + if (ref_mo %in% microorganismsDT[, mo]) { + x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] + next + } else { + warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) + } } } diff --git a/R/zzz.R b/R/zzz.R index 6a5d0771..e3074e56 100755 --- a/R/zzz.R +++ b/R/zzz.R @@ -47,28 +47,3 @@ NULL .onLoad <- function(libname, pkgname) { backports::import(pkgname) } - -.onAttach <- function(libname, pkgname) { - # save data.tables to improve speed of as.mo: - - # microorganismsDT <- data.table::as.data.table(AMR::microorganisms) - # microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old) - # - # data.table::setkey(microorganismsDT, prevalence, tsn) - # data.table::setkey(microorganisms.oldDT, tsn, name) - - base::assign(x = "microorganismsDT", - value = microorganismsDT, - envir = base::as.environment("package:AMR")) - base::assign(x = "microorganisms.prevDT", - value = microorganismsDT[prevalence != 9999,], - envir = base::as.environment("package:AMR")) - base::assign(x = "microorganisms.unprevDT", - value = microorganismsDT[prevalence == 9999,], - envir = base::as.environment("package:AMR")) - - base::assign(x = "microorganisms.oldDT", - value = microorganisms.oldDT, - envir = base::as.environment("package:AMR")) - -} diff --git a/README.md b/README.md index 167069c9..446c62fb 100755 --- a/README.md +++ b/README.md @@ -347,10 +347,8 @@ key_antibiotics(...) # Selection of first isolates of any patient first_isolate(...) -# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr) -rsi(...) # Predict resistance levels of antibiotics -rsi_predict(...) +resistance_predict(...) # Get name of antibiotic by ATC code abname(...) diff --git a/data/microorganisms.prevDT.rda b/data/microorganisms.prevDT.rda index 02121910..62bf1795 100644 Binary files a/data/microorganisms.prevDT.rda and b/data/microorganisms.prevDT.rda differ diff --git a/data/microorganisms.unprevDT.rda b/data/microorganisms.unprevDT.rda index 4ab178ca..1eb394f0 100644 Binary files a/data/microorganisms.unprevDT.rda and b/data/microorganisms.unprevDT.rda differ diff --git a/man/as.mo.Rd b/man/as.mo.Rd index dd8e9aea..42cd7442 100644 --- a/man/as.mo.Rd +++ b/man/as.mo.Rd @@ -58,7 +58,7 @@ This function uses Artificial Intelligence (AI) to help getting fast and logical \itemize{ \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa} \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones} - \item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations} + \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations} \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches} } @@ -132,16 +132,15 @@ df$mo <- as.mo(df$microorganism_name) library(dplyr) df$mo <- df \%>\% select(microorganism_name) \%>\% - guess_mo() + as.mo() # and can even contain 2 columns, which is convenient for genus/species combinations: df$mo <- df \%>\% select(genus, species) \%>\% - guess_mo() - -# same result: + as.mo() +# although this works easier and does the same: df <- df \%>\% - mutate(mo = guess_mo(paste(genus, species))) + mutate(mo = as.mo(paste(genus, species))) } } \seealso{ diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R index 164a9c9f..a1052717 100755 --- a/tests/testthat/test-misc.R +++ b/tests/testthat/test-misc.R @@ -5,6 +5,9 @@ test_that("percentages works", { expect_equal(percent(0.5), "50%") expect_equal(percent(0.500, force_zero = TRUE), "50.0%") expect_equal(percent(0.1234), "12.3%") + # round up 0.5 + expect_equal(percent(0.0054), "0.5%") + expect_equal(percent(0.0055), "0.6%") }) test_that("size format works", { diff --git a/tests/testthat/test-mo.R b/tests/testthat/test-mo.R index 165c9182..5d37764f 100644 --- a/tests/testthat/test-mo.R +++ b/tests/testthat/test-mo.R @@ -195,6 +195,11 @@ test_that("as.mo works", { # TSN of prevalent and non prevalent ones expect_equal(mo_TSN(c("Gomphosphaeria aponina delicatula", "Escherichia coli")), - c(717, 285)) + c(717, 285)) + + expect_equal(mo_fullname(c("E. spp.", + "E. spp", + "E. species")), + rep("Escherichia species", 3)) }) diff --git a/tests/testthat/test-read.4d.R b/tests/testthat/test-read.4d.R index 9b99b1e5..25c24d89 100644 --- a/tests/testthat/test-read.4d.R +++ b/tests/testthat/test-read.4d.R @@ -18,7 +18,7 @@ test_that("read 4D works", { tf <- tempfile() write.table(test1, file = tf, quote = F, sep = "\t") - x <- read.4D(tf, skip = 0) + x <- read.4D(tf, skip = 0, info = TRUE) unlink(tf) expect_equal(ncol(x), 11)