1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-08 11:11:54 +02:00

(v1.5.0.9021) improve speed of %like%

This commit is contained in:
2021-02-21 20:15:09 +01:00
parent daa12ced2c
commit 5ef8cb41a7
23 changed files with 137 additions and 126 deletions

View File

@ -1077,6 +1077,14 @@ percentage <- function(x, digits = NULL, ...) {
digits = digits, ...)
}
time_start_tracking <- function() {
pkg_env$time_start <- round(as.numeric(Sys.time()) * 1000)
}
time_track <- function(name = NULL) {
paste("(until now:", trimws(round(as.numeric(Sys.time()) * 1000) - pkg_env$time_start), "ms)")
}
# prevent dependency on package 'backports'
# these functions were not available in previous versions of R (last checked: R 4.0.3)
# see here for the full list: https://github.com/r-lib/backports

View File

@ -896,7 +896,7 @@ eucast_rules <- function(x,
target_value <- eucast_rules_df[i, "to_value", drop = TRUE]
if (is.na(source_antibiotics)) {
rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value),
rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value),
error = function(e) integer(0))
} else {
source_antibiotics <- get_antibiotic_columns(source_antibiotics, x)
@ -906,17 +906,17 @@ eucast_rules <- function(x,
if (length(source_antibiotics) == 0) {
rows <- integer(0)
} else if (length(source_antibiotics) == 1) {
rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value
rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value
& as.rsi_no_warning(x[, source_antibiotics[1L]]) == source_value[1L]),
error = function(e) integer(0))
} else if (length(source_antibiotics) == 2) {
rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value
rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value
& as.rsi_no_warning(x[, source_antibiotics[1L]]) == source_value[1L]
& as.rsi_no_warning(x[, source_antibiotics[2L]]) == source_value[2L]),
error = function(e) integer(0))
# nolint start
# } else if (length(source_antibiotics) == 3) {
# rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value
# rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value
# & as.rsi_no_warning(x[, source_antibiotics[1L]]) == source_value[1L]
# & as.rsi_no_warning(x[, source_antibiotics[2L]]) == source_value[2L]
# & as.rsi_no_warning(x[, source_antibiotics[3L]]) == source_value[3L]),

View File

@ -39,7 +39,7 @@
#' * Is case-insensitive (use `%like_case%` for case-sensitive matching)
#' * Supports multiple patterns
#' * Checks if `pattern` is a regular expression and sets `fixed = TRUE` if not, to greatly improve speed
#' * Tries again with `perl = TRUE` if regex fails
#' * Always uses compatibility with Perl
#'
#' Using RStudio? The text `%like%` can also be directly inserted in your code from the Addins menu and can have its own Keyboard Shortcut like `Ctrl+Shift+L` or `Cmd+Shift+L` (see `Tools` > `Modify Keyboard Shortcuts...`).
#' @source Idea from the [`like` function from the `data.table` package](https://github.com/Rdatatable/data.table/blob/master/R/like.R)
@ -99,7 +99,7 @@ like <- function(x, pattern, ignore.case = TRUE) {
if (is.factor(x[i])) {
res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed)
} else {
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed)
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed, perl = !fixed)
}
}
res <- vapply(FUN.VALUE = logical(1), pattern, function(pttrn) grepl(pttrn, x, ignore.case = FALSE, fixed = fixed))
@ -112,9 +112,9 @@ like <- function(x, pattern, ignore.case = TRUE) {
# x and pattern are of same length, so items with each other
for (i in seq_len(length(res))) {
if (is.factor(x[i])) {
res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed)
res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed, perl = !fixed)
} else {
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed)
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed, perl = !fixed)
}
}
return(res)
@ -123,22 +123,9 @@ like <- function(x, pattern, ignore.case = TRUE) {
# the regular way how grepl works; just one pattern against one or more x
if (is.factor(x)) {
as.integer(x) %in% grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed)
as.integer(x) %in% grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed, perl = !fixed)
} else {
tryCatch(grepl(pattern, x, ignore.case = FALSE, fixed = fixed),
error = function(e) {
if (grepl("invalid reg(ular )?exp", e$message, ignore.case = TRUE)) {
# try with perl = TRUE:
return(grepl(pattern = pattern,
x = x,
ignore.case = FALSE,
fixed = fixed,
perl = TRUE))
} else {
# stop otherwise
stop(e$message)
}
})
grepl(pattern, x, ignore.case = FALSE, fixed = fixed, perl = !fixed)
}
}
@ -157,15 +144,3 @@ like <- function(x, pattern, ignore.case = TRUE) {
meet_criteria(pattern, allow_NA = FALSE)
like(x, pattern, ignore.case = FALSE)
}
"%like_perl%" <- function(x, pattern) {
meet_criteria(x, allow_NA = TRUE)
meet_criteria(pattern, allow_NA = FALSE)
# convenient for e.g. matching all Klebsiella and Raoultella, but not
# K. aerogenes: fullname %like_perl% "^(Klebsiella(?! aerogenes)|Raoultella)"
grepl(x = tolower(x),
pattern = tolower(pattern),
perl = TRUE,
fixed = FALSE,
ignore.case = TRUE)
}

38
R/mo.R
View File

@ -277,6 +277,10 @@ exec_as.mo <- function(x,
check_dataset_integrity()
if (isTRUE(debug) && initial_search == TRUE) {
time_start_tracking()
}
lookup <- function(needle,
column = property,
haystack = reference_data_to_use,
@ -295,7 +299,8 @@ exec_as.mo <- function(x,
# `column` can be NULL for all columns, or a selection
# returns a character (vector) - if `column` > length 1 then with columns as names
if (isTRUE(debug_mode)) {
cat(font_silver("looking up: ", substitute(needle), collapse = ""))
cat(font_silver("Looking up: ", substitute(needle), collapse = ""),
"\n ", time_track())
}
if (length(column) == 1) {
res_df <- haystack[which(eval(substitute(needle), envir = haystack, enclos = parent.frame())), , drop = FALSE]
@ -313,7 +318,7 @@ exec_as.mo <- function(x,
NA_character_
} else {
if (isTRUE(debug_mode)) {
cat(font_green(paste0(" **MATCH** (", NROW(res_df), " results)\n")))
cat(font_green(paste0(" MATCH (", NROW(res_df), " results)\n")))
}
if ((length(res) > n | uncertainty > 1) & uncertainty != -1) {
# save the other possible results as well, but not for forced certain results (then uncertainty == -1)
@ -327,16 +332,20 @@ exec_as.mo <- function(x,
res[seq_len(min(n, length(res)))]
}
} else {
if (isTRUE(debug_mode)) {
cat("\n")
}
if (is.null(column)) {
column <- names(haystack)
}
res <- haystack[which(eval(substitute(needle), envir = haystack, enclos = parent.frame())), , drop = FALSE]
res <- res[seq_len(min(n, nrow(res))), column, drop = TRUE]
if (NROW(res) == 0) {
if (isTRUE(debug_mode)) {
cat(font_red(" (no rows)\n"))
}
res <- rep(NA_character_, length(column))
} else {
if (isTRUE(debug_mode)) {
cat(font_green(paste0(" MATCH (", NROW(res), " rows)\n")))
}
}
res <- as.character(res)
names(res) <- column
@ -394,7 +403,7 @@ exec_as.mo <- function(x,
check_validity_mo_source(reference_df)
reference_df <- repair_reference_df(reference_df)
}
# all empty
if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) {
if (property == "mo") {
@ -471,10 +480,10 @@ exec_as.mo <- function(x,
x_backup[x_backup_untouched == "Fungi"] <- "Fungi" # is literally the kingdom
# Fill in fullnames and MO codes at once
known_names <- x_backup %in% MO_lookup$fullname
x[known_names] <- MO_lookup[match(x_backup[known_names], MO_lookup$fullname), property, drop = TRUE]
known_codes <- x_backup %in% MO_lookup$mo
x[known_codes] <- MO_lookup[match(x_backup[known_codes], MO_lookup$mo), property, drop = TRUE]
known_names <- tolower(x_backup) %in% MO_lookup$fullname_lower
x[known_names] <- MO_lookup[match(tolower(x_backup)[known_names], MO_lookup$fullname), property, drop = TRUE]
known_codes <- toupper(x_backup) %in% MO_lookup$mo
x[known_codes] <- MO_lookup[match(toupper(x_backup)[known_codes], MO_lookup$mo), property, drop = TRUE]
already_known <- known_names | known_codes
# now only continue where the right taxonomic output is not already known
@ -975,6 +984,7 @@ exec_as.mo <- function(x,
g.x_backup_without_spp %pm>% substr(1, x_length / 2),
".* ",
g.x_backup_without_spp %pm>% substr((x_length / 2) + 1, x_length))
print(x_split)
found <- lookup(fullname_lower %like_case% x_split,
haystack = data_to_check)
if (!is.na(found)) {
@ -1414,6 +1424,10 @@ exec_as.mo <- function(x,
close(progress)
}
if (isTRUE(debug) && initial_search == TRUE) {
cat("Ended search", time_track(), "\n")
}
# handling failures ----
failures <- failures[!failures %in% c(NA, NULL, NaN)]
@ -1571,6 +1585,10 @@ exec_as.mo <- function(x,
}
}
if (isTRUE(debug) && initial_search == TRUE) {
cat("Finished function", time_track(), "\n")
}
x
}