support A. species for as.mo, cleanup

This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-11-24 20:25:09 +01:00
parent 63ba4b2980
commit 9ddf6dc530
14 changed files with 126 additions and 93 deletions

View File

@ -1,6 +1,6 @@
Package: AMR Package: AMR
Version: 0.4.0.9012 Version: 0.4.0.9013
Date: 2018-11-17 Date: 2018-11-24
Title: Antimicrobial Resistance Analysis Title: Antimicrobial Resistance Analysis
Authors@R: c( Authors@R: c(
person( person(

14
NEWS.md
View File

@ -23,6 +23,13 @@
* Added column `kingdom` to the microorganisms data set, and function `mo_kingdom` to look up values * Added column `kingdom` to the microorganisms data set, and function `mo_kingdom` to look up values
* Tremendous speed improvement for `as.mo` (and subsequently all `mo_*` functions), as empty values wil be ignored *a priori* * Tremendous speed improvement for `as.mo` (and subsequently all `mo_*` functions), as empty values wil be ignored *a priori*
* Fewer than 3 characters as input for `as.mo` will return NA * Fewer than 3 characters as input for `as.mo` will return NA
* Function `as.mo` (and all `mo_*` wrappers) now supports genus abbreviations with "species" attached
```r
as.mo("E. species") # B_ESCHR
mo_fullname("E. spp.") # "Escherichia species"
as.mo("S. spp") # B_STPHY
mo_fullname("S. species") # "Staphylococcus species"
```
* Added parameter `combine_IR` (TRUE/FALSE) to functions `portion_df` and `count_df`, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible) * Added parameter `combine_IR` (TRUE/FALSE) to functions `portion_df` and `count_df`, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible)
* Fix for `portion_*(..., as_percent = TRUE)` when minimal number of isolates would not be met * Fix for `portion_*(..., as_percent = TRUE)` when minimal number of isolates would not be met
* Added parameter `also_single_tested` for `portion_*` and `count_*` functions to also include cases where not all antibiotics were tested but at least one of the tested antibiotics includes the target antimicribial interpretation, see `?portion` * Added parameter `also_single_tested` for `portion_*` and `count_*` functions to also include cases where not all antibiotics were tested but at least one of the tested antibiotics includes the target antimicribial interpretation, see `?portion`
@ -45,9 +52,9 @@
* Now prints in markdown at default in non-interactive sessions * Now prints in markdown at default in non-interactive sessions
* No longer adds the factor level column and sorts factors on count again * No longer adds the factor level column and sorts factors on count again
* Support for class `difftime` * Support for class `difftime`
* New parameter `na`, to choose with character to print for empty values * New parameter `na`, to choose which character to print for empty values
* New parameter `header` to turn it off (default when `markdown = TRUE`) * New parameter `header` to turn the header info off (default when `markdown = TRUE`)
* New parameter `title` to replace the automatically set title * New parameter `title` to manually setbthe title of the frequency table
* `first_isolate` now tries to find columns to use as input when parameters are left blank * `first_isolate` now tries to find columns to use as input when parameters are left blank
* Improvements for MDRO algorithm (function `mdro`) * Improvements for MDRO algorithm (function `mdro`)
* Data set `septic_patients` is now a `data.frame`, not a tibble anymore * Data set `septic_patients` is now a `data.frame`, not a tibble anymore
@ -66,6 +73,7 @@
* In `g.test`, when `sum(x)` is below 1000 or any of the expected values is below 5, Fisher's Exact Test will be suggested * In `g.test`, when `sum(x)` is below 1000 or any of the expected values is below 5, Fisher's Exact Test will be suggested
* `ab_name` will try to fall back on `as.atc` when no results are found * `ab_name` will try to fall back on `as.atc` when no results are found
* Removed the addin to view data sets * Removed the addin to view data sets
* Percentages will now will rounded more logically (e.g. in `freq` function)
#### Other #### Other
* New dependency on package `crayon`, to support formatted text in the console * New dependency on package `crayon`, to support formatted text in the console

View File

@ -313,16 +313,22 @@ frequency_tbl <- function(x,
} }
} }
na_txt <- paste0(NAs %>% length() %>% format(), ' = ', if (NROW(x) > 0) {
(NAs %>% length() / (NAs %>% length() + x %>% length())) %>% percent(force_zero = TRUE, round = digits) %>% na_txt <- paste0(NAs %>% length() %>% format(), ' = ',
sub('NaN', '0', ., fixed = TRUE)) (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% percent(force_zero = TRUE, round = digits) %>%
if (!na_txt %like% "^0 =") { sub('NaN', '0', ., fixed = TRUE))
na_txt <- red(na_txt) if (!na_txt %like% "^0 =") {
na_txt <- red(na_txt)
} else {
na_txt <- green(na_txt)
}
na_txt <- paste0('(of which NA: ', na_txt, ')')
} else { } else {
na_txt <- green(na_txt) na_txt <- ""
} }
header_txt <- header_txt %>% paste0(markdown_line, '\nLength: ', (NAs %>% length() + x %>% length()) %>% format(), header_txt <- header_txt %>% paste0(markdown_line, '\nLength: ', (NAs %>% length() + x %>% length()) %>% format(),
' (of which NA: ', na_txt, ')') ' ', na_txt)
header_txt <- header_txt %>% paste0(markdown_line, '\nUnique: ', x %>% n_distinct() %>% format()) header_txt <- header_txt %>% paste0(markdown_line, '\nUnique: ', x %>% n_distinct() %>% format())
if (NROW(x) > 0 & any(class(x) == "character")) { if (NROW(x) > 0 & any(class(x) == "character")) {
@ -592,7 +598,12 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
} }
title <- paste(title, group_var) title <- paste(title, group_var)
} }
title <- paste("Frequency table of", trimws(title)) title <- trimws(title)
if (title == "") {
title <- "Frequency table"
} else {
title <- paste("Frequency table of", trimws(title))
}
} else { } else {
title <- opt$title title <- opt$title
} }

View File

@ -54,6 +54,8 @@ globalVariables(c(".",
"microorganisms.prevDT", "microorganisms.prevDT",
"microorganisms.unprevDT", "microorganisms.unprevDT",
"microorganisms.oldDT", "microorganisms.oldDT",
"microorganisms.certe",
"microorganisms.umcg",
"mo", "mo",
"mo.old", "mo.old",
"n", "n",

View File

@ -28,12 +28,24 @@ addin_insert_like <- function() {
# No export, no Rd # No export, no Rd
percent <- function(x, round = 1, force_zero = FALSE, ...) { percent <- function(x, round = 1, force_zero = FALSE, ...) {
val <- base::round(x * 100, digits = round)
if (force_zero == TRUE & any(val == as.integer(val) & !is.na(val))) { # https://stackoverflow.com/a/12688836/4575331
val[val == as.integer(val)] <- paste0(val[val == as.integer(val)], ".", strrep(0, round)) round2 <- function(x, n) (trunc((abs(x) * 10 ^ n) + 0.5) / 10 ^ n) * sign(x)
val <- round2(x, round + 2) # round up 0.5
val <- round(x = val * 100, digits = round) # remove floating point error
if (force_zero == TRUE) {
if (any(val == as.integer(val) & !is.na(val))) {
# add zeroes to all integers
val[val == as.integer(as.character(val))] <- paste0(val[val == as.integer(val)], ".", strrep(0, round))
}
# add extra zeroes if needed
val_decimals <- nchar(gsub(".*[.](.*)", "\\1", as.character(val)))
val[val_decimals < round] <- paste0(val[val_decimals < round], strrep(0, max(0, round - val_decimals)))
} }
pct <- base::paste0(val, "%") pct <- base::paste0(val, "%")
pct[pct == "NA%"] <- NA_character_ pct[pct %in% c("NA%", "NaN%")] <- NA_character_
pct pct
} }

100
R/mo.R
View File

@ -53,7 +53,7 @@
#' \itemize{ #' \itemize{
#' \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa} #' \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
#' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones} #' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
#' \item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations} #' \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
#' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches} #' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
#' } #' }
#' #'
@ -126,16 +126,15 @@
#' library(dplyr) #' library(dplyr)
#' df$mo <- df %>% #' df$mo <- df %>%
#' select(microorganism_name) %>% #' select(microorganism_name) %>%
#' guess_mo() #' as.mo()
#' #'
#' # and can even contain 2 columns, which is convenient for genus/species combinations: #' # and can even contain 2 columns, which is convenient for genus/species combinations:
#' df$mo <- df %>% #' df$mo <- df %>%
#' select(genus, species) %>% #' select(genus, species) %>%
#' guess_mo() #' as.mo()
#' #' # although this works easier and does the same:
#' # same result:
#' df <- df %>% #' df <- df %>%
#' mutate(mo = guess_mo(paste(genus, species))) #' mutate(mo = as.mo(paste(genus, species)))
#' } #' }
as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) { as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL) {
structure(mo_validate(x = x, property = "mo", structure(mo_validate(x = x, property = "mo",
@ -160,11 +159,14 @@ guess_mo <- as.mo
#' @importFrom data.table data.table as.data.table setkey #' @importFrom data.table data.table as.data.table setkey
exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL, property = "mo") { exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain = FALSE, reference_df = NULL, property = "mo") {
# These data.tables are available as data sets when the AMR package is loaded: if (!"AMR" %in% base::.packages()) {
# microorganismsDT # this one is sorted by kingdom (B<F<P), prevalence, TSN library("AMR")
# microorganisms.prevDT # same as microorganismsDT, but with prevalence != 9999 # These data.tables are available as data sets when the AMR package is loaded:
# microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999 # microorganismsDT # this one is sorted by kingdom (B<F<P), prevalence, TSN
# microorganisms.oldDT # old taxonomic names, sorted by name (genus+species), TSN # microorganisms.prevDT # same as microorganismsDT, but with prevalence != 9999
# microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999
# microorganisms.oldDT # old taxonomic names, sorted by name (genus+species), TSN
}
if (NCOL(x) == 2) { if (NCOL(x) == 2) {
# support tidyverse selection like: df %>% select(colA, colB) # support tidyverse selection like: df %>% select(colA, colB)
@ -216,31 +218,35 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
suppressWarnings( suppressWarnings(
x <- data.frame(x = x, stringsAsFactors = FALSE) %>% x <- data.frame(x = x, stringsAsFactors = FALSE) %>%
left_join(reference_df, by = "x") %>% left_join(reference_df, by = "x") %>%
left_join(AMR::microorganisms, by = "mo") %>% left_join(microorganisms, by = "mo") %>%
pull(property) pull(property)
) )
} else if (all(toupper(x) %in% AMR::microorganisms.certe[, "certe"])) { } else if (all(toupper(x) %in% microorganisms.certe[, "certe"])) {
# old Certe codes # old Certe codes
y <- as.data.table(AMR::microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ] y <- as.data.table(microorganisms.certe)[data.table(certe = toupper(x)), on = "certe", ]
x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]] x <- microorganismsDT[data.table(mo = y[["mo"]]), on = "mo", ..property][[1]]
} else if (!all(x %in% microorganismsDT[[property]])) { } else if (!all(x %in% microorganismsDT[[property]])) {
x_backup <- trimws(x, which = "both") x_backup <- trimws(x, which = "both")
x_species <- paste(x_backup, "species")
# remove spp and species
x <- gsub(" +(spp.?|species)", "", x_backup)
x_species <- paste(x, "species")
# translate to English for supported languages of mo_property # translate to English for supported languages of mo_property
x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x) x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x)
# remove 'empty' genus and species values # remove 'empty' genus and species values
x <- gsub("(no MO)", "", x, fixed = TRUE) x <- gsub("(no MO)", "", x, fixed = TRUE)
# remove dots and other non-text in case of "E. coli" except spaces # remove non-text in case of "E. coli" except dots and spaces
x <- gsub("[^a-zA-Z0-9/ \\-]+", "", x) x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x)
# but spaces before and after should be omitted # but spaces before and after should be omitted
x <- trimws(x, which = "both") x <- trimws(x, which = "both")
x_trimmed <- x x_trimmed <- x
x_trimmed_species <- paste(x_trimmed, "species") x_trimmed_species <- paste(x_trimmed, "species")
# replace space by regex sign # replace space and dot by regex sign
x_withspaces <- gsub(" ", ".* ", x, fixed = TRUE) x_withspaces <- gsub("[ .]+", ".* ", x)
x <- gsub(" ", ".*", x, fixed = TRUE) x <- gsub("[ .]+", ".*", x)
# add start en stop regex # add start en stop regex
x <- paste0('^', x, '$') x <- paste0('^', x, '$')
x_withspaces_start <- paste0('^', x_withspaces) x_withspaces_start <- paste0('^', x_withspaces)
@ -261,10 +267,28 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
next next
} }
if (nchar(x_trimmed[i]) < 3) { if (nchar(x_trimmed[i]) < 3) {
# fewer than 3 chars, add as failure # check if search term was like "A. species", then return first genus found with ^A
x[i] <- NA_character_ if (x_backup[i] %like% "species" | x_backup[i] %like% "spp[.]?") {
failures <- c(failures, x_backup[i]) # get mo code of first hit
next found <- microorganismsDT[fullname %like% x_withspaces_start[i], mo][[1]]
mo_code <- found[1L] %>% strsplit("_") %>% unlist() %>% .[1:2] %>% paste(collapse = "_")
found <- microorganismsDT[mo == mo_code, ..property][[1]]
# return first genus that begins with x_trimmed, e.g. when "E. spp."
if (length(found) > 0) {
x[i] <- found[1L]
next
} else {
# fewer than 3 chars, add as failure
x[i] <- NA_character_
failures <- c(failures, x_backup[i])
next
}
} else {
# fewer than 3 chars, add as failure
x[i] <- NA_character_
failures <- c(failures, x_backup[i])
next
}
} }
# translate known trivial abbreviations to genus + species ---- # translate known trivial abbreviations to genus + species ----
@ -353,15 +377,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
} }
# TRY OTHER SOURCES ---- # TRY OTHER SOURCES ----
if (x_backup[i] %in% AMR::microorganisms.certe$certe) { if (x_backup[i] %in% microorganisms.umcg[, 1]) {
x[i] <- microorganismsDT[mo == AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == x_backup[i], 2], ..property][[1]][1L] mo_umcg <- microorganisms.umcg[microorganisms.umcg[, 1] == x_backup[i], 2]
# x[i] <- exec_as.mo(x = AMR::microorganisms.certe[AMR::microorganisms.certe$certe == x_backup[i], "mo"], mo_found <- microorganisms.certe[microorganisms.certe[, 1] == mo_umcg, 2]
# property = property)
# next
}
if (x_backup[i] %in% AMR::microorganisms.umcg[, 1]) {
mo_umcg <- AMR::microorganisms.umcg[AMR::microorganisms.umcg[, 1] == x_backup[i], 2]
mo_found <- AMR::microorganisms.certe[AMR::microorganisms.certe[, 1] == mo_umcg, 2]
if (length(mo_found) == 0) { if (length(mo_found) == 0) {
# not found # not found
x[i] <- NA_character_ x[i] <- NA_character_
@ -371,13 +389,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE, allow_uncertain =
} }
next next
} }
if (x_backup[i] %in% reference_df[, 1]) { if (!is.null(reference_df)) {
ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2] if (x_backup[i] %in% reference_df[, 1]) {
if (ref_mo %in% microorganismsDT[, mo]) { ref_mo <- reference_df[reference_df[, 1] == x_backup[i], 2]
x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L] if (ref_mo %in% microorganismsDT[, mo]) {
next x[i] <- microorganismsDT[mo == ref_mo, ..property][[1]][1L]
} else { next
warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE) } else {
warning("Value '", x_backup[i], "' was found in reference_df, but '", ref_mo, "' is not a valid MO code.", call. = FALSE)
}
} }
} }

25
R/zzz.R
View File

@ -47,28 +47,3 @@ NULL
.onLoad <- function(libname, pkgname) { .onLoad <- function(libname, pkgname) {
backports::import(pkgname) backports::import(pkgname)
} }
.onAttach <- function(libname, pkgname) {
# save data.tables to improve speed of as.mo:
# microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
# microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
#
# data.table::setkey(microorganismsDT, prevalence, tsn)
# data.table::setkey(microorganisms.oldDT, tsn, name)
base::assign(x = "microorganismsDT",
value = microorganismsDT,
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganisms.prevDT",
value = microorganismsDT[prevalence != 9999,],
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganisms.unprevDT",
value = microorganismsDT[prevalence == 9999,],
envir = base::as.environment("package:AMR"))
base::assign(x = "microorganisms.oldDT",
value = microorganisms.oldDT,
envir = base::as.environment("package:AMR"))
}

View File

@ -347,10 +347,8 @@ key_antibiotics(...)
# Selection of first isolates of any patient # Selection of first isolates of any patient
first_isolate(...) first_isolate(...)
# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr)
rsi(...)
# Predict resistance levels of antibiotics # Predict resistance levels of antibiotics
rsi_predict(...) resistance_predict(...)
# Get name of antibiotic by ATC code # Get name of antibiotic by ATC code
abname(...) abname(...)

Binary file not shown.

Binary file not shown.

View File

@ -58,7 +58,7 @@ This function uses Artificial Intelligence (AI) to help getting fast and logical
\itemize{ \itemize{
\item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa} \item{Taxonomic kingdom: it first searches in bacteria, then fungi, then protozoa}
\item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones} \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
\item{Valid MO codes and full names: it first searches in already valid MO code and genus/species combinations} \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
\item{Breakdown of input values: from here it starts to breakdown input values to find possible matches} \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
} }
@ -132,16 +132,15 @@ df$mo <- as.mo(df$microorganism_name)
library(dplyr) library(dplyr)
df$mo <- df \%>\% df$mo <- df \%>\%
select(microorganism_name) \%>\% select(microorganism_name) \%>\%
guess_mo() as.mo()
# and can even contain 2 columns, which is convenient for genus/species combinations: # and can even contain 2 columns, which is convenient for genus/species combinations:
df$mo <- df \%>\% df$mo <- df \%>\%
select(genus, species) \%>\% select(genus, species) \%>\%
guess_mo() as.mo()
# although this works easier and does the same:
# same result:
df <- df \%>\% df <- df \%>\%
mutate(mo = guess_mo(paste(genus, species))) mutate(mo = as.mo(paste(genus, species)))
} }
} }
\seealso{ \seealso{

View File

@ -5,6 +5,9 @@ test_that("percentages works", {
expect_equal(percent(0.5), "50%") expect_equal(percent(0.5), "50%")
expect_equal(percent(0.500, force_zero = TRUE), "50.0%") expect_equal(percent(0.500, force_zero = TRUE), "50.0%")
expect_equal(percent(0.1234), "12.3%") expect_equal(percent(0.1234), "12.3%")
# round up 0.5
expect_equal(percent(0.0054), "0.5%")
expect_equal(percent(0.0055), "0.6%")
}) })
test_that("size format works", { test_that("size format works", {

View File

@ -195,6 +195,11 @@ test_that("as.mo works", {
# TSN of prevalent and non prevalent ones # TSN of prevalent and non prevalent ones
expect_equal(mo_TSN(c("Gomphosphaeria aponina delicatula", "Escherichia coli")), expect_equal(mo_TSN(c("Gomphosphaeria aponina delicatula", "Escherichia coli")),
c(717, 285)) c(717, 285))
expect_equal(mo_fullname(c("E. spp.",
"E. spp",
"E. species")),
rep("Escherichia species", 3))
}) })

View File

@ -18,7 +18,7 @@ test_that("read 4D works", {
tf <- tempfile() tf <- tempfile()
write.table(test1, file = tf, quote = F, sep = "\t") write.table(test1, file = tf, quote = F, sep = "\t")
x <- read.4D(tf, skip = 0) x <- read.4D(tf, skip = 0, info = TRUE)
unlink(tf) unlink(tf)
expect_equal(ncol(x), 11) expect_equal(ncol(x), 11)