1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-12 06:21:57 +02:00

(v1.5.0.9021) improve speed of %like%

This commit is contained in:
2021-02-21 20:15:09 +01:00
parent daa12ced2c
commit 5ef8cb41a7
23 changed files with 137 additions and 126 deletions

View File

@ -39,7 +39,7 @@
#' * Is case-insensitive (use `%like_case%` for case-sensitive matching)
#' * Supports multiple patterns
#' * Checks if `pattern` is a regular expression and sets `fixed = TRUE` if not, to greatly improve speed
#' * Tries again with `perl = TRUE` if regex fails
#' * Always uses compatibility with Perl
#'
#' Using RStudio? The text `%like%` can also be directly inserted in your code from the Addins menu and can have its own Keyboard Shortcut like `Ctrl+Shift+L` or `Cmd+Shift+L` (see `Tools` > `Modify Keyboard Shortcuts...`).
#' @source Idea from the [`like` function from the `data.table` package](https://github.com/Rdatatable/data.table/blob/master/R/like.R)
@ -99,7 +99,7 @@ like <- function(x, pattern, ignore.case = TRUE) {
if (is.factor(x[i])) {
res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed)
} else {
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed)
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed, perl = !fixed)
}
}
res <- vapply(FUN.VALUE = logical(1), pattern, function(pttrn) grepl(pttrn, x, ignore.case = FALSE, fixed = fixed))
@ -112,9 +112,9 @@ like <- function(x, pattern, ignore.case = TRUE) {
# x and pattern are of same length, so items with each other
for (i in seq_len(length(res))) {
if (is.factor(x[i])) {
res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed)
res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed, perl = !fixed)
} else {
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed)
res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed, perl = !fixed)
}
}
return(res)
@ -123,22 +123,9 @@ like <- function(x, pattern, ignore.case = TRUE) {
# the regular way how grepl works; just one pattern against one or more x
if (is.factor(x)) {
as.integer(x) %in% grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed)
as.integer(x) %in% grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed, perl = !fixed)
} else {
tryCatch(grepl(pattern, x, ignore.case = FALSE, fixed = fixed),
error = function(e) {
if (grepl("invalid reg(ular )?exp", e$message, ignore.case = TRUE)) {
# try with perl = TRUE:
return(grepl(pattern = pattern,
x = x,
ignore.case = FALSE,
fixed = fixed,
perl = TRUE))
} else {
# stop otherwise
stop(e$message)
}
})
grepl(pattern, x, ignore.case = FALSE, fixed = fixed, perl = !fixed)
}
}
@ -157,15 +144,3 @@ like <- function(x, pattern, ignore.case = TRUE) {
meet_criteria(pattern, allow_NA = FALSE)
like(x, pattern, ignore.case = FALSE)
}
"%like_perl%" <- function(x, pattern) {
meet_criteria(x, allow_NA = TRUE)
meet_criteria(pattern, allow_NA = FALSE)
# convenient for e.g. matching all Klebsiella and Raoultella, but not
# K. aerogenes: fullname %like_perl% "^(Klebsiella(?! aerogenes)|Raoultella)"
grepl(x = tolower(x),
pattern = tolower(pattern),
perl = TRUE,
fixed = FALSE,
ignore.case = TRUE)
}