1
0
mirror of https://github.com/msberends/AMR.git synced 2024-12-26 07:26:13 +01:00
This commit is contained in:
dr. M.S. (Matthijs) Berends 2019-03-28 21:33:28 +01:00
parent 429814c29b
commit b25f2d6213
10 changed files with 143 additions and 82 deletions

View File

@ -1,6 +1,6 @@
Package: AMR Package: AMR
Version: 0.6.0 Version: 0.6.1
Date: 2019-03-27 Date: 2019-03-28
Title: Antimicrobial Resistance Analysis Title: Antimicrobial Resistance Analysis
Authors@R: c( Authors@R: c(
person( person(

View File

@ -1,3 +1,9 @@
# AMR 0.6.1
#### Changed
* Fixed a critical bug when using `eucast_rules()` with `verbose = TRUE`
* Coerced microbial IDs are now written to the package namespace instead of the user's home folder, to comply with the CRAN policy
# AMR 0.6.0 # AMR 0.6.0
**New website!** **New website!**

View File

@ -454,10 +454,10 @@ eucast_rules <- function(tbl,
stop(e, call. = FALSE) stop(e, call. = FALSE)
} }
) )
suppressMessages( # suppressMessages(
suppressWarnings( # suppressWarnings(
tbl[rows, cols] <<- to # tbl[rows, cols] <<- to
)) # ))
after <- as.character(unlist(as.list(tbl_original[rows, cols]))) after <- as.character(unlist(as.list(tbl_original[rows, cols])))
@ -489,48 +489,24 @@ eucast_rules <- function(tbl,
number_newly_changed_to_R number_newly_changed_to_R
if (verbose == TRUE) { if (verbose == TRUE) {
for (r in 1:length(rows)) { old <- as.data.frame(tbl_bak, stringsAsFactors = FALSE)[rows,]
for (c in 1:length(cols)) { new <- as.data.frame(tbl, stringsAsFactors = FALSE)[rows,]
old <- before_df[rows[r], cols[c]] MOs <- as.data.frame(tbl_original, stringsAsFactors = FALSE)[rows, col_mo][[1]]
new <- tbl[rows[r], cols[c]] for (i in 1:length(cols)) {
if (!identical(old, new)) { verbose_new <- data.frame(row = rows,
verbose_new <- data.frame(row = rows[r], col = cols[i],
col = cols[c], mo = MOs,
mo = tbl_original[rows[r], col_mo],
mo_fullname = "", mo_fullname = "",
old = old, old = as.character(old[, cols[i]]),
new = new, new = as.character(new[, cols[i]]),
rule_source = strip_style(rule[1]), rule_source = strip_style(rule[1]),
rule_group = strip_style(rule[2]), rule_group = strip_style(rule[2]),
stringsAsFactors = FALSE) stringsAsFactors = FALSE)
colnames(verbose_new) <- c("row", "col", "mo", "mo_fullname", "old", "new", "rule_source", "rule_group")
verbose_info <<- rbind(verbose_info, verbose_new) verbose_info <<- rbind(verbose_info, verbose_new)
} }
} }
} }
# verbose_new <- data.frame(row = integer(0),
# col = character(0),
# old = character(0),
# new = character(0),
# rule_source = character(0),
# rule_group = character(0),
# stringsAsFactors = FALSE)
# a <<- rule
# for (i in 1:length(cols)) {
# # add new row for every affected column
# verbose_new <- data.frame(rule_type = strip_style(rule[1]),
# rule_set = strip_style(rule[2]),
# force_to = to,
# found = length(before),
# changed = sum(before != after, na.rm = TRUE),
# target_column = cols[i],
# stringsAsFactors = FALSE)
# verbose_new$target_rows <- list(unname(rows))
# rownames(verbose_new) <- NULL
# verbose_info <<- rbind(verbose_info, verbose_new)
# }
}
}
} }
na.rm <- function(col) { na.rm <- function(col) {
@ -543,6 +519,7 @@ eucast_rules <- function(tbl,
# save original table # save original table
tbl_original <- tbl tbl_original <- tbl
tbl_bak <- tbl
# join to microorganisms data set # join to microorganisms data set
tbl <- tbl %>% tbl <- tbl %>%
@ -1905,6 +1882,9 @@ eucast_rules <- function(tbl,
formatnr(number_changed_to_I), " to I; ", formatnr(number_changed_to_I), " to I; ",
formatnr(number_changed_to_R), " to R)"), formatnr(number_changed_to_R), " to R)"),
"\n"))) "\n")))
if (verbose == FALSE) {
cat(paste("Use", bold("verbose = TRUE"), "to get a data.frame with all specified edits.\n"))
}
} }
if (verbose == TRUE) { if (verbose == TRUE) {
@ -1913,6 +1893,9 @@ eucast_rules <- function(tbl,
verbose_info$mo_fullname <- mo_fullname(verbose_info$mo) verbose_info$mo_fullname <- mo_fullname(verbose_info$mo)
) )
) )
verbose_info <- verbose_info %>%
filter(!is.na(new) & !identical(old, new)) %>%
arrange(row)
return(verbose_info) return(verbose_info)
} }
@ -1932,3 +1915,4 @@ interpretive_reading <- function(...) {
.Deprecated("eucast_rules") .Deprecated("eucast_rules")
eucast_rules(...) eucast_rules(...)
} }

View File

@ -54,7 +54,7 @@ guess_ab_col <- function(tbl = NULL, col = NULL, verbose = FALSE) {
if (is.null(tbl) & is.null(col)) { if (is.null(tbl) & is.null(col)) {
return(as.name("guess_ab_col")) return(as.name("guess_ab_col"))
} }
#stop("This function should not be called directly.")
if (length(col) > 1) { if (length(col) > 1) {
warning("argument 'col' has length > 1 and only the first element will be used") warning("argument 'col' has length > 1 and only the first element will be used")
col <- col[1] col <- col[1]
@ -114,7 +114,7 @@ guess_ab_col <- function(tbl = NULL, col = NULL, verbose = FALSE) {
if (length(ab_result) == 0) { if (length(ab_result) == 0) {
if (verbose == TRUE) { if (verbose == TRUE) {
message('no result found for col "', col, '"') message('no column found for input "', col, '"')
} }
return(NULL) return(NULL)
} else { } else {
@ -124,7 +124,7 @@ guess_ab_col <- function(tbl = NULL, col = NULL, verbose = FALSE) {
} }
if (length(result) == 0) { if (length(result) == 0) {
if (verbose == TRUE) { if (verbose == TRUE) {
message('no result found for col "', col, '"') message('no column found for input "', col, '"')
} }
return(NULL) return(NULL)
} }

6
R/mo.R
View File

@ -61,7 +61,11 @@
#' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}). #' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}).
#' #'
#' \strong{Self-learning algorithm} \cr #' \strong{Self-learning algorithm} \cr
#' The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 80-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}. #' The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge.
#'
#' Usually, any guess after the first try runs 80-95\% faster than the first try.
#'
#' For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a future version.
#' #'
#' \strong{Intelligent rules} \cr #' \strong{Intelligent rules} \cr
#' This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order: #' This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:

View File

@ -19,10 +19,9 @@
# Visit our website for more info: https://msberends.gitlab.io/AMR. # # Visit our website for more info: https://msberends.gitlab.io/AMR. #
# ==================================================================== # # ==================================================================== #
# print successful as.mo coercions to file, not uncertain ones # print successful as.mo coercions to AMR environment
#' @importFrom dplyr %>% distinct filter #' @importFrom dplyr %>% distinct filter
set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) { set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
file_location <- base::path.expand('~/.Rhistory_mo')
if (base::interactive() | force == TRUE) { if (base::interactive() | force == TRUE) {
mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force) mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
df <- data.frame(x, mo, stringsAsFactors = FALSE) %>% df <- data.frame(x, mo, stringsAsFactors = FALSE) %>%
@ -37,12 +36,17 @@ set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
# save package version too, as both the as.mo() algorithm and the reference data set may change # save package version too, as both the as.mo() algorithm and the reference data set may change
if (NROW(mo_hist[base::which(mo_hist$x == x[i] & if (NROW(mo_hist[base::which(mo_hist$x == x[i] &
mo_hist$uncertainty_level >= uncertainty_level & mo_hist$uncertainty_level >= uncertainty_level &
mo_hist$package_version == utils::packageVersion("AMR")),]) == 0) { mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) {
base::write(x = c(x[i], mo[i], uncertainty_level, base::as.character(utils::packageVersion("AMR"))), assign(x = "mo_history",
file = file_location, value = rbind(mo_hist,
ncolumns = 4, data.frame(
append = TRUE, x = x[i],
sep = "\t") mo = mo[i],
uncertainty_level = uncertainty_level,
package_v = base::as.character(utils::packageVersion("AMR")),
stringsAsFactors = FALSE)
),
envir = asNamespace("AMR"))
} }
} }
} }
@ -50,35 +54,35 @@ set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
} }
get_mo_history <- function(x, uncertainty_level, force = FALSE) { get_mo_history <- function(x, uncertainty_level, force = FALSE) {
file_read <- read_mo_history(uncertainty_level = uncertainty_level, force = force) history <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
if (base::is.null(file_read)) { if (base::is.null(history)) {
NA NA
} else { } else {
data.frame(x = toupper(x), stringsAsFactors = FALSE) %>% data.frame(x = toupper(x), stringsAsFactors = FALSE) %>%
left_join(file_read, by = "x") %>% left_join(history, by = "x") %>%
pull(mo) pull(mo)
} }
} }
#' @importFrom dplyr %>% filter distinct #' @importFrom dplyr %>% filter distinct
read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE) { read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE) {
file_location <- base::path.expand('~/.Rhistory_mo') if ((!base::interactive() & force == FALSE)) {
if (!base::file.exists(file_location) | (!base::interactive() & force == FALSE)) {
return(NULL) return(NULL)
} }
uncertainty_level_param <- uncertainty_level uncertainty_level_param <- uncertainty_level
file_read <- utils::read.table(file = file_location,
header = FALSE, history <- tryCatch(get("mo_history", envir = asNamespace("AMR")),
sep = "\t", error = function(e) NULL)
col.names = c("x", "mo", "uncertainty_level", "package_version"), if (is.null(history)) {
stringsAsFactors = FALSE) return(NULL)
}
# Below: filter on current package version. # Below: filter on current package version.
# Even current fullnames may be replaced by new taxonomic names, so new versions of # Even current fullnames may be replaced by new taxonomic names, so new versions of
# the Catalogue of Life must not lead to data corruption. # the Catalogue of Life must not lead to data corruption.
if (unfiltered == FALSE) { if (unfiltered == FALSE) {
file_read <- file_read %>% history <- history %>%
filter(package_version == utils::packageVersion("AMR"), filter(package_v == as.character(utils::packageVersion("AMR")),
# only take unknowns if uncertainty_level_param is higher # only take unknowns if uncertainty_level_param is higher
((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) | ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) |
(mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>% (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>%
@ -86,10 +90,10 @@ read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = F
distinct(x, mo, .keep_all = TRUE) distinct(x, mo, .keep_all = TRUE)
} }
if (nrow(file_read) == 0) { if (nrow(history) == 0) {
NULL NULL
} else { } else {
file_read history
} }
} }
@ -98,20 +102,21 @@ read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = F
#' @importFrom utils menu #' @importFrom utils menu
#' @export #' @export
clean_mo_history <- function(...) { clean_mo_history <- function(...) {
file_location <- base::path.expand('~/.Rhistory_mo') if (!is.null(read_mo_history())) {
if (file.exists(file_location)) {
if (interactive() & !isTRUE(list(...)$force)) { if (interactive() & !isTRUE(list(...)$force)) {
q <- menu(title = paste("This will remove all", q <- menu(title = paste("This will remove all",
format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","), format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","),
"previously determined microbial IDs. Are you sure?"), "microbial IDs determined previously in this session. Are you sure?"),
choices = c("Yes", "No"), choices = c("Yes", "No"),
graphics = FALSE) graphics = FALSE)
if (q != 1) { if (q != 1) {
return(invisible()) return(invisible())
} }
} }
unlink(file_location) assign(x = "mo_history",
cat(red("File", file_location, "removed.")) value = NULL,
envir = asNamespace("AMR"))
cat(red("History removed."))
} }
} }

View File

@ -72,7 +72,11 @@ Use the \code{\link{mo_property}_*} functions to get properties based on the ret
The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}). The algorithm uses data from the Catalogue of Life (see below) and from one other source (see \code{?microorganisms}).
\strong{Self-learning algorithm} \cr \strong{Self-learning algorithm} \cr
The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge. Usually, any guess after the first try runs 80-95\% faster than the first try. The algorithm saves its previous findings to \code{~/.Rhistory_mo}. The \code{as.mo()} function gains experience from previously determined microbial IDs and learns from it. This drastically improves both speed and reliability. Use \code{clean_mo_history()} to reset the algorithms. Only experience from your current \code{AMR} package version is used. This is done because in the future the taxonomic tree (which is included in this package) may change for any organism and it consequently has to rebuild its knowledge.
Usually, any guess after the first try runs 80-95\% faster than the first try.
For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a future version.
\strong{Intelligent rules} \cr \strong{Intelligent rules} \cr
This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order: This function uses intelligent rules to help getting fast and logical results. It tries to find matches in this order:

View File

@ -0,0 +1,58 @@
# WORK IN PROGRESS --------------------------------------------------------
# Look up PubChem compound IDs (CIDs) for a vector of official compound names.
#
# @param ab character vector of compound names; each non-missing name is sent
#   to the PubChem PUG REST "compound/name" endpoint with name_type=complete.
# @return a vector with one element per element of `ab`: the first CID that
#   PubChem returned for that name, or NA when the name is missing or the
#   request failed. Empty input gives integer(0).
get_CID <- function(ab) {
  # nothing to look up; also avoids the 1:0 trap of 1:length(ab)
  if (length(ab) == 0) {
    return(integer(0))
  }
  CID <- rep(NA_integer_, length(ab))
  p <- progress_estimated(n = length(ab), min_time = 0)
  for (i in seq_along(ab)) {
    p$tick()$print()
    if (is.na(ab[i])) {
      # paste0() would turn a missing name into the literal name "NA" and
      # query PubChem for that, so leave the result as NA instead
      next
    }
    CID[i] <- tryCatch({
      # percent-encode the name so spaces, slashes etc. stay inside the
      # name segment of the URL path
      url <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
                    utils::URLencode(as.character(ab[i]), reserved = TRUE),
                    "/cids/TXT?name_type=complete")
      data.table::fread(url, showProgress = FALSE)[[1]][1]
    },
    # any failure (encoding, download, parsing) yields NA for this name
    error = function(e) NA_integer_)
    # short pause between consecutive requests
    Sys.sleep(0.2)
  }
  CID
}
# Fetch the PubChem synonyms (these include brand names) for a single CID.
#
# @param CID a single PubChem compound ID; it is pasted into the
#   "fastidentity/cid/<CID>/synonyms/TXT" request URL.
# @param clean if TRUE, bracketed text is removed, only entries of 6 to 20
#   characters without any of the characters - & { } , _ or digits are kept,
#   and the remaining entries are split on ";".
# @return the synonyms, sorted, with case-insensitive duplicates removed.
#   A failed request results in an empty (zero-length) result.
get_synonyms <- function(CID, clean = TRUE) {
  p <- progress_estimated(n = length(CID), min_time = 0)
  p$tick()$print()

  # one synonym per line; any download/parse failure becomes a single NA
  synonyms_txt <- tryCatch(
    data.table::fread(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastidentity/cid/",
                             CID,
                             "/synonyms/TXT"),
                      sep = "\n",
                      showProgress = FALSE)[[1]],
    error = function(e) NA_character_)

  if (clean == TRUE) {
    # remove txt between brackets: first [...] then (...)
    synonyms_txt <- trimws(gsub("[(].*[)]", "", gsub("[[].*[]]", "", synonyms_txt)))
    # only length 6 to 20 and no txt with reading marks or numbers
    synonyms_txt <- synonyms_txt[nchar(synonyms_txt) %in% c(6:20)
                                 & !synonyms_txt %like% "[-&{},_0-9]"]
    synonyms_txt <- unlist(strsplit(synonyms_txt, ";", fixed = TRUE))
  }

  # drop case-insensitive duplicates, keeping the first spelling encountered
  # (the previous `%in% unique(...)` test was TRUE for every element and so
  # never removed anything)
  synonyms_txt <- synonyms_txt[!duplicated(tolower(synonyms_txt))]
  sort(synonyms_txt)
}
# Build the `cid` and `trade_name` columns of the `antibiotics` data set:
# one CID per official name, and all synonyms of that CID collapsed into a
# single "|"-separated string.
CIDs <- get_CID(antibiotics$official)
synonyms <- character(length(CIDs))

# visit every entry (the loop used to start at a hard-coded index 365, which
# skipped the first 364 entries and counted backwards for shorter inputs);
# entries without a CID keep an empty trade name
for (i in seq_along(synonyms)) {
  if (!is.na(CIDs[i])) {
    synonyms[i] <- paste(get_synonyms(CIDs[i]), collapse = "|")
  }
}

antibiotics$cid <- CIDs
antibiotics$trade_name <- synonyms

View File

@ -40,5 +40,5 @@ test_that("mo_history works", {
expect_equal(as.character(as.mo("testsubject", force_mo_history = TRUE)), "B_ESCHR_COL") expect_equal(as.character(as.mo("testsubject", force_mo_history = TRUE)), "B_ESCHR_COL")
expect_equal(colnames(read_mo_history(force = TRUE)), expect_equal(colnames(read_mo_history(force = TRUE)),
c("x", "mo", "uncertainty_level", "package_version")) c("x", "mo", "uncertainty_level", "package_v"))
}) })

View File

@ -103,7 +103,7 @@ boxplot(microbenchmark(
main = "Benchmarks per prevalence") main = "Benchmarks per prevalence")
``` ```
The highest outliers are the first times. All next determinations were done in only thousandths of a second. The highest outliers are the first times. All next determinations were done in only thousandths of a second. For now, learning only works per session. If R is closed or terminated, the algorithms reset. This will probably be resolved in a future version.
Still, uncommon microorganisms take a lot more time than common microorganisms, especially the first time. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetitive results** and **already precalculated results**. Still, uncommon microorganisms take a lot more time than common microorganisms, especially the first time. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: **repetitive results** and **already precalculated results**.