(v1.6.0.9003) like() fix

2026-07-14 11:10:53 +02:00 · 2021-04-16 11:41:05 +02:00
parent d277d58475
commit 00d3e437a8
29 changed files with 78 additions and 62 deletions
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -211,10 +211,20 @@ search_type_in_df <- function(x, type, info = TRUE) {
  found
 }

-is_possibly_regex <- function(x) {
-  tryCatch(vapply(FUN.VALUE = character(1), strsplit(x, ""),
-                  function(y) any(y %in% c("$", "(", ")", "*", "+", "-", ".", "?", "[", "]", "^", "{", "|", "}", "\\"), na.rm = TRUE)),
-           error = function(e) rep(TRUE, length(x)))
+is_valid_regex <- function(x) {
+  # Vectorised over `x`: returns one logical per element, TRUE only if that element contains
+  # at least one regex metacharacter AND compiles as a (Perl-compatible) regular expression.
+  regex_chars <- c("$", "(", ")", "*", "+", "-", ".", "?", "[", "]", "^", "{", "|", "}", "\\")
+  regex_at_all <- tryCatch(vapply(X = strsplit(x, ""), FUN.VALUE = logical(1), USE.NAMES = FALSE,
+                                  FUN = function(y) any(y %in% regex_chars, na.rm = TRUE)),
+                           error = function(e) rep(TRUE, length(x)))
+  # Compile every supplied pattern (not a fixed sample) so the result is as long as `x`; use the
+  # Perl engine because that is what like() matches with; silence the compile warning on failure.
+  regex_valid <- vapply(X = x, FUN.VALUE = logical(1), USE.NAMES = FALSE,
+                        FUN = function(y) {
+                          !inherits(try(suppressWarnings(grepl(y, "", perl = TRUE)), silent = TRUE), "try-error")
+                        })
+  regex_at_all & regex_valid
+}

 stop_ifnot_installed <- function(package) {
--- a/R/like.R
+++ b/R/like.R
@@ -28,21 +28,21 @@
 #' Convenient wrapper around [grepl()] to match a pattern: `x %like% pattern`. It always returns a [`logical`] vector and is always case-insensitive (use `x %like_case% pattern` for case-sensitive matching). Also, `pattern` can be as long as `x` to compare items of each index in both vectors, or they both can have the same length to iterate over all cases.
 #' @inheritSection lifecycle Stable Lifecycle
 #' @param x a character vector where matches are sought, or an object which can be coerced by [as.character()] to a character vector.
-#' @param pattern a character string containing a regular expression (or [character] string for `fixed = TRUE`) to be matched in the given character vector. Coerced by [as.character()] to a character string if possible.  If a [character] vector of length 2 or more is supplied, the first element is used with a warning.
+#' @param pattern a character vector containing regular expressions (or a [character] string for `fixed = TRUE`) to be matched in the given character vector. Coerced by [as.character()] to a character vector if possible.
 #' @param ignore.case if `FALSE`, the pattern matching is *case sensitive* and if `TRUE`, case is ignored during matching.
 #' @return A [logical] vector
 #' @name like
 #' @rdname like
 #' @export
 #' @details
-#' The `%like%` function:
+#' This `%like%` function:
 #' * Is case-insensitive (use `%like_case%` for case-sensitive matching)
 #' * Supports multiple patterns
-#' * Checks if `pattern` is a regular expression and sets `fixed = TRUE` if not, to greatly improve speed
+#' * Checks if `pattern` is a valid regular expression and sets `fixed = TRUE` if not, to greatly improve speed (vectorised over `pattern`)
 #' * Always uses compatibility with Perl unless `fixed = TRUE`, to greatly improve speed
 #' 
 #' Using RStudio? The text `%like%` can also be directly inserted in your code from the Addins menu and can have its own Keyboard Shortcut like `Ctrl+Shift+L` or `Cmd+Shift+L` (see `Tools` > `Modify Keyboard Shortcuts...`).
-#' @source Idea from the [`like` function from the `data.table` package](https://github.com/Rdatatable/data.table/blob/ec1259af1bf13fc0c96a1d3f9e84d55d8106a9a4/R/like.R)
+#' @source Idea from the [`like` function from the `data.table` package](https://github.com/Rdatatable/data.table/blob/ec1259af1bf13fc0c96a1d3f9e84d55d8106a9a4/R/like.R), although altered as explained in *Details*.
 #' @seealso [grepl()]
 #' @inheritSection AMR Read more on Our Website!
 #' @examples
@@ -79,9 +79,10 @@ like <- function(x, pattern, ignore.case = TRUE) {
  if (all(is.na(x))) {
    return(rep(FALSE, length(x)))
  }
-  
-  # set to fixed if no regex found
-  fixed <- !any(is_possibly_regex(pattern))
+
+  # set to fixed if no valid regex (vectorised)
+  fixed <- !is_valid_regex(pattern)
+
  if (ignore.case == TRUE) {
    # set here, otherwise if fixed = TRUE, this warning will be thrown: argument `ignore.case = TRUE` will be ignored
    x <- tolower(x)
@@ -91,7 +92,7 @@ like <- function(x, pattern, ignore.case = TRUE) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
-  
+
  if (length(pattern) == 1) {
    grepl(pattern, x, ignore.case = FALSE, fixed = fixed, perl = !fixed)
  } else {
@@ -105,7 +106,9 @@ like <- function(x, pattern, ignore.case = TRUE) {
      mapply(FUN = grepl,
             x = x,
             pattern = pattern,
-             MoreArgs = list(ignore.case = FALSE, fixed = fixed, perl = !fixed),
+             fixed = fixed,
+             perl = !fixed,
+             MoreArgs = list(ignore.case = FALSE),
             SIMPLIFY = FALSE,
             USE.NAMES = FALSE)
    )
--- a/R/mo_matching_score.R
+++ b/R/mo_matching_score.R
@@ -44,7 +44,7 @@
 #' * \ifelse{html}{\out{<i>p<sub>n</sub></i> is the human pathogenic prevalence group of <i>n</i>, as described below;}}{p_n is the human pathogenic prevalence group of \eqn{n}, as described below;}
 #' * \ifelse{html}{\out{<i>k<sub>n</sub></i> is the taxonomic kingdom of <i>n</i>, set as Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5.}}{l_n is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5.}
 #' 
-#' The grouping into human pathogenic prevalence (\eqn{p}) is based on experience from several microbiological laboratories in the Netherlands in conjunction with international reports on pathogen prevalence. **Group 1** (most prevalent microorganisms) consists of all microorganisms where the taxonomic class is Gammaproteobacteria or where the taxonomic genus is *Enterococcus*, *Staphylococcus* or *Streptococcus*. This group consequently contains all common Gram-negative bacteria, such as *Pseudomonas* and *Legionella* and all species within the order Enterobacterales. **Group 2** consists of all microorganisms where the taxonomic phylum is Proteobacteria, Firmicutes, Actinobacteria or Sarcomastigophora, or where the taxonomic genus is *Absidia*, *Acremonium*, *Actinotignum*, *Alternaria*, *Anaerosalibacter*, *Apophysomyces*, *Arachnia*, *Aspergillus*, *Aureobacterium*, *Aureobasidium*, *Bacteroides*, *Basidiobolus*, *Beauveria*, *Blastocystis*, *Branhamella*, *Calymmatobacterium*, *Candida*, *Capnocytophaga*, *Catabacter*, *Chaetomium*, *Chryseobacterium*, *Chryseomonas*, *Chrysonilia*, *Cladophialophora*, *Cladosporium*, *Conidiobolus*, *Cryptococcus*, *Curvularia*, *Exophiala*, *Exserohilum*, *Flavobacterium*, *Fonsecaea*, *Fusarium*, *Fusobacterium*, *Hendersonula*, *Hypomyces*, *Koserella*, *Lelliottia*, *Leptosphaeria*, *Leptotrichia*, *Malassezia*, *Malbranchea*, *Mortierella*, *Mucor*, *Mycocentrospora*, *Mycoplasma*, *Nectria*, *Ochroconis*, *Oidiodendron*, *Phoma*, *Piedraia*, *Pithomyces*, *Pityrosporum*, *Prevotella*, *Pseudallescheria*, *Rhizomucor*, *Rhizopus*, *Rhodotorula*, *Scolecobasidium*, *Scopulariopsis*, *Scytalidium*,*Sporobolomyces*, *Stachybotrys*, *Stomatococcus*, *Treponema*, *Trichoderma*, *Trichophyton*, *Trichosporon*, *Tritirachium* or *Ureaplasma*. **Group 3** consists of all other microorganisms.
+#' The grouping into human pathogenic prevalence (\eqn{p}) is based on experience from several microbiological laboratories in the Netherlands in conjunction with international reports on pathogen prevalence. **Group 1** (most prevalent microorganisms) consists of all microorganisms where the taxonomic class is Gammaproteobacteria or where the taxonomic genus is *Enterococcus*, *Staphylococcus* or *Streptococcus*. This group consequently contains all common Gram-negative bacteria, such as *Pseudomonas* and *Legionella* and all species within the order Enterobacterales. **Group 2** consists of all microorganisms where the taxonomic phylum is Proteobacteria, Firmicutes, Actinobacteria or Sarcomastigophora, or where the taxonomic genus is *Absidia*, *Acremonium*, *Actinotignum*, *Alternaria*, *Anaerosalibacter*, *Apophysomyces*, *Arachnia*, *Aspergillus*, *Aureobacterium*, *Aureobasidium*, *Bacteroides*, *Basidiobolus*, *Beauveria*, *Blastocystis*, *Branhamella*, *Calymmatobacterium*, *Candida*, *Capnocytophaga*, *Catabacter*, *Chaetomium*, *Chryseobacterium*, *Chryseomonas*, *Chrysonilia*, *Cladophialophora*, *Cladosporium*, *Conidiobolus*, *Cryptococcus*, *Curvularia*, *Exophiala*, *Exserohilum*, *Flavobacterium*, *Fonsecaea*, *Fusarium*, *Fusobacterium*, *Hendersonula*, *Hypomyces*, *Koserella*, *Lelliottia*, *Leptosphaeria*, *Leptotrichia*, *Malassezia*, *Malbranchea*, *Mortierella*, *Mucor*, *Mycocentrospora*, *Mycoplasma*, *Nectria*, *Ochroconis*, *Oidiodendron*, *Phoma*, *Piedraia*, *Pithomyces*, *Pityrosporum*, *Prevotella*, *Pseudallescheria*, *Rhizomucor*, *Rhizopus*, *Rhodotorula*, *Scolecobasidium*, *Scopulariopsis*, *Scytalidium*, *Sporobolomyces*, *Stachybotrys*, *Stomatococcus*, *Treponema*, *Trichoderma*, *Trichophyton*, *Trichosporon*, *Tritirachium* or *Ureaplasma*. **Group 3** consists of all other microorganisms.
 #' 
 #' All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., `"E. coli"` will return the microbial ID of *Escherichia coli* (\eqn{m = `r round(mo_matching_score("E. coli", "Escherichia coli"), 3)`}, a highly prevalent microorganism found in humans) and not *Entamoeba coli* (\eqn{m = `r round(mo_matching_score("E. coli", "Entamoeba coli"), 3)`}, a less prevalent microorganism in humans), although the latter would alphabetically come first. 
 #' @export
--- a/R/sysdata.rda
+++ b/R/sysdata.rda