1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-09 13:42:04 +02:00

(v0.7.1.9073) as.mo() self-learning algorithm

This commit is contained in:
2019-09-15 22:57:30 +02:00
parent cd178ee569
commit 398c5bdc4f
31 changed files with 1030 additions and 2360 deletions

View File

@ -21,7 +21,7 @@
#' Pattern Matching
#'
#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors.
#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive (use \code{a \%like_case\% b} for case-sensitive matching). Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors, or can both have the same length to iterate over all cases.
#' @inheritParams base::grepl
#' @return A \code{logical} vector
#' @name like
@ -53,14 +53,14 @@
#' left_join_microorganisms() %>%
#' filter(genus %like% '^ent') %>%
#' freq(genus, species)
like <- function(x, pattern) {
like <- function(x, pattern, ignore.case = TRUE) {
if (length(pattern) > 1) {
if (length(x) != length(pattern)) {
if (length(x) == 1) {
x <- rep(x, length(pattern))
}
# return TRUE for every 'x' that matches any 'pattern', FALSE otherwise
res <- sapply(pattern, function(pttrn) x %like% pttrn)
res <- sapply(pattern, function(pttrn) base::grepl(pttrn, x, ignore.case = ignore.case))
res2 <- as.logical(rowSums(res))
# get only first item of every hit in pattern
res2[duplicated(res)] <- FALSE
@ -71,9 +71,9 @@ like <- function(x, pattern) {
res <- vector(length = length(pattern))
for (i in 1:length(res)) {
if (is.factor(x[i])) {
res[i] <- as.integer(x[i]) %in% base::grep(pattern[i], levels(x[i]), ignore.case = TRUE)
res[i] <- as.integer(x[i]) %in% base::grep(pattern[i], levels(x[i]), ignore.case = ignore.case)
} else {
res[i] <- base::grepl(pattern[i], x[i], ignore.case = TRUE)
res[i] <- base::grepl(pattern[i], x[i], ignore.case = ignore.case)
}
}
return(res)
@ -82,16 +82,24 @@ like <- function(x, pattern) {
# the regular way how grepl works; just one pattern against one or more x
if (is.factor(x)) {
as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = TRUE)
as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = ignore.case)
} else {
tryCatch(base::grepl(pattern, x, ignore.case = TRUE),
tryCatch(base::grepl(pattern, x, ignore.case = ignore.case),
error = function(e) ifelse(test = grepl("Invalid regexp", e$message),
# try with perl = TRUE:
yes = return(base::grepl(pattern, x, ignore.case = TRUE, perl = TRUE)),
yes = return(base::grepl(pattern, x, ignore.case = ignore.case, perl = TRUE)),
no = stop(e$message)))
}
}
#' @rdname like
#' @export
"%like%" <- like
"%like%" <- function(x, pattern) {
like(x, pattern, ignore.case = TRUE)
}
#' @rdname like
#' @export
"%like_case%" <- function(x, pattern) {
like(x, pattern, ignore.case = FALSE)
}

View File

@ -41,8 +41,8 @@ percent <- function(x, round = 1, force_zero = FALSE, decimal.mark = getOption("
big.mark <- " "
}
}
x <- percent_clean(x = x, round = round, force_zero = force_zero,
decimal.mark = decimal.mark, big.mark = big.mark, ...)
percent_clean(x = x, round = round, force_zero = force_zero,
decimal.mark = decimal.mark, big.mark = big.mark, ...)
}
#' @importFrom crayon blue bold red

1431
R/mo.R

File diff suppressed because it is too large Load Diff

1489
R/mo2.R

File diff suppressed because it is too large Load Diff

View File

@ -19,116 +19,152 @@
# Visit our website for more info: https://msberends.gitlab.io/AMR. #
# ==================================================================== #
# print successful as.mo coercions to AMR environment
#' @importFrom dplyr %>% distinct filter
set_mo_history <- function(x, mo, uncertainty_level, force = FALSE) {
# disable function for now
return(base::invisible())
mo_history_file <- file.path(file.path(system.file(package = "AMR"), "mo_history"), "mo_history.csv")
# if (base::interactive() | force == TRUE) {
# mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
# df <- data.frame(x, mo, stringsAsFactors = FALSE) %>%
# distinct(x, .keep_all = TRUE) %>%
# filter(!is.na(x) & !is.na(mo))
# if (nrow(df) == 0) {
# return(base::invisible())
# }
# x <- toupper(df$x)
# mo <- df$mo
# for (i in 1:length(x)) {
# # save package version too, as both the as.mo() algorithm and the reference data set may change
# if (NROW(mo_hist[base::which(mo_hist$x == x[i] &
# mo_hist$uncertainty_level >= uncertainty_level &
# mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) {
# tryCatch(
# assign(x = "mo_history",
# value = rbind(mo_hist,
# data.frame(
# x = x[i],
# mo = mo[i],
# uncertainty_level = uncertainty_level,
# package_v = base::as.character(utils::packageVersion("AMR")),
# stringsAsFactors = FALSE)),
# envir = asNamespace("AMR")),
# error = function(e) invisible())
# }
# }
# }
# return(base::invisible())
# print successful as.mo coercions to a options entry
#' @importFrom dplyr %>% distinct filter
set_mo_history <- function(x, mo, uncertainty_level, force = FALSE, disable = FALSE) {
if (isTRUE(disable)) {
return(base::invisible())
}
if (base::interactive() | force == TRUE) {
mo_hist <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
df <- data.frame(x, mo, stringsAsFactors = FALSE) %>%
distinct(x, .keep_all = TRUE) %>%
filter(!is.na(x) & !is.na(mo))
if (nrow(df) == 0) {
return(base::invisible())
}
x <- toupper(df$x)
mo <- df$mo
for (i in 1:length(x)) {
# save package version too, as both the as.mo() algorithm and the reference data set may change
if (NROW(mo_hist[base::which(mo_hist$x == x[i] &
mo_hist$uncertainty_level >= uncertainty_level &
mo_hist$package_v == utils::packageVersion("AMR")),]) == 0) {
# # Not using the file system:
# tryCatch(options(mo_remembered_results = rbind(mo_hist,
# data.frame(
# x = x[i],
# mo = mo[i],
# uncertainty_level = uncertainty_level,
# package_v = base::as.character(utils::packageVersion("AMR")),
# stringsAsFactors = FALSE))),
# error = function(e) base::invisible())
# # don't remember more than 1,000 different input values
# if (tryCatch(nrow(getOption("mo_remembered_results")), error = function(e) 1001) > 1000) {
# return(base::invisible())
# }
if (is.null(mo_hist)) {
message(blue(paste0("NOTE: results are saved to ", mo_history_file, ".")))
}
tryCatch(write.csv(rbind(mo_hist,
data.frame(
x = x[i],
mo = mo[i],
uncertainty_level = uncertainty_level,
package_v = base::as.character(utils::packageVersion("AMR")),
stringsAsFactors = FALSE)),
file = mo_history_file, row.names = FALSE),
error = function(e) base::invisible())
}
}
}
return(base::invisible())
}
get_mo_history <- function(x, uncertainty_level, force = FALSE) {
# disable function for now
return(NA)
get_mo_history <- function(x, uncertainty_level, force = FALSE, disable = FALSE) {
if (isTRUE(disable)) {
return(to_class_mo(NA))
}
# history <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
# if (base::is.null(history)) {
# NA
# } else {
# data.frame(x = toupper(x), stringsAsFactors = FALSE) %>%
# left_join(history, by = "x") %>%
# pull(mo)
# }
history <- read_mo_history(uncertainty_level = uncertainty_level, force = force)
if (base::is.null(history)) {
result <- NA
} else {
result <- data.frame(x = toupper(x), stringsAsFactors = FALSE) %>%
left_join(history, by = "x") %>%
pull(mo)
}
to_class_mo(result)
}
#' @importFrom dplyr %>% filter distinct
read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE) {
# disable function for now
return(NULL)
read_mo_history <- function(uncertainty_level = 2, force = FALSE, unfiltered = FALSE, disable = FALSE) {
if (isTRUE(disable)) {
return(NULL)
}
# if ((!base::interactive() & force == FALSE)) {
# return(NULL)
# }
# uncertainty_level_param <- uncertainty_level
#
# history <- tryCatch(get("mo_history", envir = asNamespace("AMR")),
# error = function(e) NULL)
# if (is.null(history)) {
# return(NULL)
# }
# # Below: filter on current package version.
# # Even current fullnames may be replaced by new taxonomic names, so new versions of
# # the Catalogue of Life must not lead to data corruption.
#
# if (unfiltered == FALSE) {
# history <- history %>%
# filter(package_v == as.character(utils::packageVersion("AMR")),
# # only take unknowns if uncertainty_level_param is higher
# ((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) |
# (mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>%
# arrange(desc(uncertainty_level)) %>%
# distinct(x, mo, .keep_all = TRUE)
# }
#
# if (nrow(history) == 0) {
# NULL
# } else {
# history
# }
if ((!base::interactive() & force == FALSE)) {
return(NULL)
}
uncertainty_level_param <- uncertainty_level
# # Not using the file system:
# history <- tryCatch(getOption("mo_remembered_results"),
# error = function(e) NULL)
history <- tryCatch(read.csv(mo_history_file, stringsAsFactors = FALSE),
warning = function(w) invisible(),
error = function(e) NULL)
if (is.null(history)) {
return(NULL)
}
# Below: filter on current package version.
# Even current fullnames may be replaced by new taxonomic names, so new versions of
# the Catalogue of Life must not lead to data corruption.
if (unfiltered == FALSE) {
history <- history %>%
filter(package_v == as.character(utils::packageVersion("AMR")),
# only take unknowns if uncertainty_level_param is higher
((mo == "UNKNOWN" & uncertainty_level_param == uncertainty_level) |
(mo != "UNKNOWN" & uncertainty_level_param >= uncertainty_level))) %>%
arrange(desc(uncertainty_level)) %>%
distinct(x, mo, .keep_all = TRUE)
}
if (nrow(history) == 0) {
NULL
} else {
history
}
}
# @rdname as.mo
# @importFrom crayon red
# @importFrom utils menu
# @export
clean_mo_history <- function(...) {
# if (!is.null(read_mo_history())) {
# if (interactive() & !isTRUE(list(...)$force)) {
# q <- menu(title = paste("This will remove all",
# format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","),
# "microbial IDs determined previously in this session. Are you sure?"),
# choices = c("Yes", "No"),
# graphics = FALSE)
# if (q != 1) {
# return(invisible())
# }
# }
# tryCatch(
# assign(x = "mo_history",
# value = NULL,
# envir = asNamespace("AMR")),
# error = function(e) invisible())
# cat(red("History removed."))
# }
#' @rdname as.mo
#' @importFrom crayon red
#' @importFrom utils menu
#' @export
clear_mo_history <- function(...) {
if (!is.null(read_mo_history())) {
if (interactive() & !isTRUE(list(...)$force)) {
q <- menu(title = paste("This will clear all",
format(nrow(read_mo_history(999, unfiltered = TRUE)), big.mark = ","),
"previously determined microbial IDs. Are you sure?"),
choices = c("Yes", "No"),
graphics = FALSE)
if (q != 1) {
return(invisible())
}
}
# # Not using the file system:
# success <- tryCatch(options(mo_remembered_results = NULL),
# error = function(e) FALSE)
success <- create_blank_mo_history()
if (!isFALSE(success)) {
cat(red(paste("File", mo_history_file, "cleared.")))
}
}
}
create_blank_mo_history <- function() {
tryCatch(
write.csv(x = data.frame(x = character(0),
mo = character(0),
uncertainty_level = integer(0),
package_v = character(0),
stringsAsFactors = FALSE),
file = mo_history_file),
warning = function(w) invisible(),
error = function(e) TRUE)
}

View File

@ -408,7 +408,7 @@ mo_validate <- function(x, property, ...) {
if (!"AMR" %in% base::.packages()) {
require("AMR")
# check onLoad() in R/zzz.R: data tables are created there.
# check .onLoad() in R/zzz.R: data tables are created there.
}
# try to catch an error when inputting an invalid parameter

View File

@ -26,7 +26,8 @@
# register data
microorganisms.oldDT <- as.data.table(AMR::microorganisms.old)
microorganisms.oldDT$fullname_lower <- tolower(microorganisms.oldDT$fullname)
# for fullname_lower: keep only dots, letters, numbers, slashes, spaces and dashes
microorganisms.oldDT$fullname_lower <- gsub("[^.a-z0-9/ \\-]+", "", tolower(microorganisms.oldDT$fullname))
setkey(microorganisms.oldDT, col_id, fullname)
assign(x = "microorganismsDT",
@ -81,7 +82,8 @@
#' @importFrom data.table as.data.table setkey
make_DT <- function() {
microorganismsDT <- as.data.table(AMR::microorganisms)
microorganismsDT$fullname_lower <- tolower(microorganismsDT$fullname)
# for fullname_lower: keep only dots, letters, numbers, slashes, spaces and dashes
microorganismsDT$fullname_lower <- gsub("[^.a-z0-9/ \\-]+", "", tolower(microorganismsDT$fullname))
setkey(microorganismsDT,
prevalence,
kingdom,