diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 160c5d6d..982b96e4 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -159,7 +159,7 @@ jobs: _R_CHECK_LENGTH_1_CONDITION_: verbose _R_CHECK_LENGTH_1_LOGIC2_: verbose run: | - R CMD check data-raw/AMR_*.tar.gz --no-manual --no-build-vignettes + R CMD check data-raw/AMR_latest.tar.gz --no-manual --no-build-vignettes - name: Show testthat output if: always() diff --git a/DESCRIPTION b/DESCRIPTION index 9d727576..d95c9cc6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 1.5.0.9020 -Date: 2021-02-18 +Version: 1.5.0.9021 +Date: 2021-02-21 Title: Antimicrobial Resistance Data Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 402fb27d..9edfd0fe 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 1.5.0.9020 -## Last updated: 18 February 2021 +# AMR 1.5.0.9021 +## Last updated: 21 February 2021 ### New * Support for EUCAST Clinical Breakpoints v11.0 (2021), effective in the `eucast_rules()` function and in `as.rsi()` to interpret MIC and disk diffusion values. This is now the default guideline in this package. @@ -58,6 +58,7 @@ * Functions `print()` and `summary()` on a Principal Components Analysis object (`pca()`) now print additional group info if the original data was grouped using `dplyr::group_by()` * Improved speed and reliability of `guess_ab_col()`. As this also internally improves the reliability of `first_isolate()` and `mdro()`, this might have a slight impact on the results of those functions. * Fix for `mo_name()` when used in other languages than English +* The `like()` function (and its fast alias `%like%`) now always use Perl compatibility, improving speed for many functions in this package ### Other * Big documentation updates diff --git a/R/aa_helper_functions.R b/R/aa_helper_functions.R index 7d507fb2..d8f471a2 100755 --- a/R/aa_helper_functions.R +++ b/R/aa_helper_functions.R @@ -1077,6 +1077,14 @@ percentage <- function(x, digits = NULL, ...) { digits = digits, ...) } +time_start_tracking <- function() { + pkg_env$time_start <- round(as.numeric(Sys.time()) * 1000) +} + +time_track <- function(name = NULL) { + paste("(until now:", trimws(round(as.numeric(Sys.time()) * 1000) - pkg_env$time_start), "ms)") +} + # prevent dependency on package 'backports' # these functions were not available in previous versions of R (last checked: R 4.0.3) # see here for the full list: https://github.com/r-lib/backports diff --git a/R/eucast_rules.R b/R/eucast_rules.R index 89f38ba5..f59e11e0 100755 --- a/R/eucast_rules.R +++ b/R/eucast_rules.R @@ -896,7 +896,7 @@ eucast_rules <- function(x, target_value <- eucast_rules_df[i, "to_value", drop = TRUE] if (is.na(source_antibiotics)) { - rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value), + rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value), error = function(e) integer(0)) } else { source_antibiotics <- get_antibiotic_columns(source_antibiotics, x) @@ -906,17 +906,17 @@ eucast_rules <- function(x, if (length(source_antibiotics) == 0) { rows <- integer(0) } else if (length(source_antibiotics) == 1) { - rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value + rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value & as.rsi_no_warning(x[, source_antibiotics[1L]]) == source_value[1L]), error = function(e) integer(0)) } else if (length(source_antibiotics) == 2) { - rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value + rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value & as.rsi_no_warning(x[, source_antibiotics[1L]]) == source_value[1L] & as.rsi_no_warning(x[, source_antibiotics[2L]]) == source_value[2L]), error = function(e) integer(0)) # nolint start # } else if (length(source_antibiotics) == 3) { - # rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like_perl% mo_value + # rows <- tryCatch(which(x[, if_mo_property, drop = TRUE] %like% mo_value # & as.rsi_no_warning(x[, source_antibiotics[1L]]) == source_value[1L] # & as.rsi_no_warning(x[, source_antibiotics[2L]]) == source_value[2L] # & as.rsi_no_warning(x[, source_antibiotics[3L]]) == source_value[3L]), diff --git a/R/like.R b/R/like.R index 9a7dae52..7676af3c 100755 --- a/R/like.R +++ b/R/like.R @@ -39,7 +39,7 @@ #' * Is case-insensitive (use `%like_case%` for case-sensitive matching) #' * Supports multiple patterns #' * Checks if `pattern` is a regular expression and sets `fixed = TRUE` if not, to greatly improve speed -#' * Tries again with `perl = TRUE` if regex fails +#' * Always uses compatibility with Perl #' #' Using RStudio? The text `%like%` can also be directly inserted in your code from the Addins menu and can have its own Keyboard Shortcut like `Ctrl+Shift+L` or `Cmd+Shift+L` (see `Tools` > `Modify Keyboard Shortcuts...`). #' @source Idea from the [`like` function from the `data.table` package](https://github.com/Rdatatable/data.table/blob/master/R/like.R) @@ -99,7 +99,7 @@ like <- function(x, pattern, ignore.case = TRUE) { if (is.factor(x[i])) { res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed) } else { - res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed) + res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed, perl = !fixed) } } res <- vapply(FUN.VALUE = logical(1), pattern, function(pttrn) grepl(pttrn, x, ignore.case = FALSE, fixed = fixed)) @@ -112,9 +112,9 @@ like <- function(x, pattern, ignore.case = TRUE) { # x and pattern are of same length, so items with each other for (i in seq_len(length(res))) { if (is.factor(x[i])) { - res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed) + res[i] <- as.integer(x[i]) %in% grep(pattern[i], levels(x[i]), ignore.case = FALSE, fixed = fixed, perl = !fixed) } else { - res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed) + res[i] <- grepl(pattern[i], x[i], ignore.case = FALSE, fixed = fixed, perl = !fixed) } } return(res) @@ -123,22 +123,9 @@ like <- function(x, pattern, ignore.case = TRUE) { # the regular way how grepl works; just one pattern against one or more x if (is.factor(x)) { - as.integer(x) %in% grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed) + as.integer(x) %in% grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed, perl = !fixed) } else { - tryCatch(grepl(pattern, x, ignore.case = FALSE, fixed = fixed), - error = function(e) { - if (grepl("invalid reg(ular )?exp", e$message, ignore.case = TRUE)) { - # try with perl = TRUE: - return(grepl(pattern = pattern, - x = x, - ignore.case = FALSE, - fixed = fixed, - perl = TRUE)) - } else { - # stop otherwise - stop(e$message) - } - }) + grepl(pattern, x, ignore.case = FALSE, fixed = fixed, perl = !fixed) } } @@ -157,15 +144,3 @@ like <- function(x, pattern, ignore.case = TRUE) { meet_criteria(pattern, allow_NA = FALSE) like(x, pattern, ignore.case = FALSE) } - -"%like_perl%" <- function(x, pattern) { - meet_criteria(x, allow_NA = TRUE) - meet_criteria(pattern, allow_NA = FALSE) - # convenient for e.g. matching all Klebsiella and Raoultella, but not - # K. aerogenes: fullname %like_perl% "^(Klebsiella(?! aerogenes)|Raoultella)" - grepl(x = tolower(x), - pattern = tolower(pattern), - perl = TRUE, - fixed = FALSE, - ignore.case = TRUE) -} diff --git a/R/mo.R b/R/mo.R index 62076bdf..013c5707 100755 --- a/R/mo.R +++ b/R/mo.R @@ -277,6 +277,10 @@ exec_as.mo <- function(x, check_dataset_integrity() + if (isTRUE(debug) && initial_search == TRUE) { + time_start_tracking() + } + lookup <- function(needle, column = property, haystack = reference_data_to_use, @@ -295,7 +299,8 @@ exec_as.mo <- function(x, # `column` can be NULL for all columns, or a selection # returns a character (vector) - if `column` > length 1 then with columns as names if (isTRUE(debug_mode)) { - cat(font_silver("looking up: ", substitute(needle), collapse = "")) + cat(font_silver("Looking up: ", substitute(needle), collapse = ""), + "\n ", time_track()) } if (length(column) == 1) { res_df <- haystack[which(eval(substitute(needle), envir = haystack, enclos = parent.frame())), , drop = FALSE] @@ -313,7 +318,7 @@ exec_as.mo <- function(x, NA_character_ } else { if (isTRUE(debug_mode)) { - cat(font_green(paste0(" **MATCH** (", NROW(res_df), " results)\n"))) + cat(font_green(paste0(" MATCH (", NROW(res_df), " results)\n"))) } if ((length(res) > n | uncertainty > 1) & uncertainty != -1) { # save the other possible results as well, but not for forced certain results (then uncertainty == -1) @@ -327,16 +332,20 @@ exec_as.mo <- function(x, res[seq_len(min(n, length(res)))] } } else { - if (isTRUE(debug_mode)) { - cat("\n") - } if (is.null(column)) { column <- names(haystack) } res <- haystack[which(eval(substitute(needle), envir = haystack, enclos = parent.frame())), , drop = FALSE] res <- res[seq_len(min(n, nrow(res))), column, drop = TRUE] if (NROW(res) == 0) { + if (isTRUE(debug_mode)) { + cat(font_red(" (no rows)\n")) + } res <- rep(NA_character_, length(column)) + } else { + if (isTRUE(debug_mode)) { + cat(font_green(paste0(" MATCH (", NROW(res), " rows)\n"))) + } } res <- as.character(res) names(res) <- column @@ -394,7 +403,7 @@ exec_as.mo <- function(x, check_validity_mo_source(reference_df) reference_df <- repair_reference_df(reference_df) } - + # all empty if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) { if (property == "mo") { @@ -471,10 +480,10 @@ exec_as.mo <- function(x, x_backup[x_backup_untouched == "Fungi"] <- "Fungi" # is literally the kingdom # Fill in fullnames and MO codes at once - known_names <- x_backup %in% MO_lookup$fullname - x[known_names] <- MO_lookup[match(x_backup[known_names], MO_lookup$fullname), property, drop = TRUE] - known_codes <- x_backup %in% MO_lookup$mo - x[known_codes] <- MO_lookup[match(x_backup[known_codes], MO_lookup$mo), property, drop = TRUE] + known_names <- tolower(x_backup) %in% MO_lookup$fullname_lower + x[known_names] <- MO_lookup[match(tolower(x_backup)[known_names], MO_lookup$fullname), property, drop = TRUE] + known_codes <- toupper(x_backup) %in% MO_lookup$mo + x[known_codes] <- MO_lookup[match(toupper(x_backup)[known_codes], MO_lookup$mo), property, drop = TRUE] already_known <- known_names | known_codes # now only continue where the right taxonomic output is not already known @@ -975,6 +984,7 @@ exec_as.mo <- function(x, g.x_backup_without_spp %pm>% substr(1, x_length / 2), ".* ", g.x_backup_without_spp %pm>% substr((x_length / 2) + 1, x_length)) + print(x_split) found <- lookup(fullname_lower %like_case% x_split, haystack = data_to_check) if (!is.na(found)) { @@ -1414,6 +1424,10 @@ exec_as.mo <- function(x, close(progress) } + if (isTRUE(debug) && initial_search == TRUE) { + cat("Ended search", time_track(), "\n") + } + # handling failures ---- failures <- failures[!failures %in% c(NA, NULL, NaN)] @@ -1571,6 +1585,10 @@ exec_as.mo <- function(x, } } + if (isTRUE(debug) && initial_search == TRUE) { + cat("Finished function", time_track(), "\n") + } + x } diff --git a/data-raw/AMR_latest.tar.gz b/data-raw/AMR_latest.tar.gz index ee737731..c206b07b 100644 Binary files a/data-raw/AMR_latest.tar.gz and b/data-raw/AMR_latest.tar.gz differ diff --git a/docs/404.html b/docs/404.html index 17888f86..192d1171 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index adc8ceba..7db9ce64 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index e7b6b63f..215fa0a6 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -39,7 +39,7 @@ @@ -192,6 +192,7 @@vignettes/benchmarks.Rmd
benchmarks.Rmd
In the table above, all measurements are shown in milliseconds (thousands of seconds). A value of 5 milliseconds means it can determine 200 input values per second. It case of 100 milliseconds, this is only 10 input values per second. It is clear that accepted taxonomic names are extremely fast, but some variations can take up to 500-1000 times as much time.
To improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.
@@ -281,11 +288,11 @@ # now let's see: run_it <- microbenchmark(mo_name(x), times = 10) -print(run_it, unit = "ms", signif = 3) +print(run_it, unit = "ms", signif = 3) # Unit: milliseconds # expr min lq mean median uq max neval -# mo_name(x) 141 180 218 207 245 312 10 -So getting official taxonomic names of 2,000,000 (!!) items consisting of 90 unique values only takes 0.207 seconds. You only lose time on your unique input values.
+# mo_name(x) 125 144 182 171 186 298 10 +So getting official taxonomic names of 2,000,000 (!!) items consisting of 90 unique values only takes 0.171 seconds. You only lose time on your unique input values.
So going from mo_name("Staphylococcus aureus")
to "Staphylococcus aureus"
takes 0.0025 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:
So going from mo_name("Staphylococcus aureus")
to "Staphylococcus aureus"
takes 0.0017 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:
run_it <- microbenchmark(A = mo_species("aureus"),
B = mo_genus("Staphylococcus"),
@@ -313,17 +320,17 @@
G = mo_phylum("Firmicutes"),
H = mo_kingdom("Bacteria"),
times = 10)
-print(run_it, unit = "ms", signif = 3)
+print(run_it, unit = "ms", signif = 3)
# Unit: milliseconds
-# expr min lq mean median uq max neval
-# A 1.91 1.95 2.06 1.99 2.09 2.62 10
-# B 1.83 1.91 2.09 2.04 2.20 2.45 10
-# C 1.79 1.90 2.03 1.99 2.22 2.30 10
-# D 1.90 2.01 2.18 2.12 2.25 2.71 10
-# E 1.91 2.02 2.14 2.08 2.15 2.81 10
-# F 1.86 1.92 2.00 2.01 2.06 2.16 10
-# G 1.81 1.96 2.09 2.08 2.22 2.41 10
-# H 1.90 1.93 2.05 2.00 2.22 2.29 10
Of course, when running mo_phylum("Firmicutes")
the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes"
anyway, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.
Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.
diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png index 431f27bf..16afa08a 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index 422c81e8..67bc8815 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ diff --git a/docs/authors.html b/docs/authors.html index 824b29ee..64126664 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ diff --git a/docs/index.html b/docs/index.html index 01de6418..09b3025f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ diff --git a/docs/news/index.html b/docs/news/index.html index becdfb44..4251f57a 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -81,7 +81,7 @@ @@ -236,13 +236,13 @@ Source:NEWS.md
- guess_ab_col()
. As this also internally improves the reliability of first_isolate()
and mdro()
, this might have a slight impact on the results of those functions.mo_name()
when used in other languages than Englishlike()
function (and its fast alias %like%
) now always use Perl compatibility, improving speed for many functions in this packageIs case-insensitive (use %like_case%
for case-sensitive matching)
Supports multiple patterns
Checks if pattern
is a regular expression and sets fixed = TRUE
if not, to greatly improve speed
Tries again with perl = TRUE
if regex fails
Always uses compatibility with Perl
Using RStudio? The text %like%
can also be directly inserted in your code from the Addins menu and can have its own Keyboard Shortcut like Ctrl+Shift+L
or Cmd+Shift+L
(see Tools
> Modify Keyboard Shortcuts...
).