From e73f0e211c06d155e869cbe4ccac98c2489877ec Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Sat, 15 Aug 2020 12:54:47 +0200 Subject: [PATCH] (v1.3.0.9003) as.rsi() speed improvement --- DESCRIPTION | 4 +- NEWS.md | 27 +++++------ R/aa_helper_functions.R | 14 +++--- R/data.R | 1 - R/rsi.R | 21 ++++++--- _pkgdown.yml | 2 +- docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/index.html | 2 +- docs/articles/welcome_to_AMR.html | 4 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 59 ++++++++++++++----------- docs/pkgdown.yml | 2 +- docs/reference/antibiotics.html | 2 +- docs/reference/as.ab.html | 2 +- docs/reference/count.html | 4 +- docs/reference/index.html | 8 +++- docs/reference/intrinsic_resistant.html | 5 +-- docs/reference/lifecycle.html | 2 +- docs/reference/microorganisms.html | 2 +- docs/reference/proportion.html | 2 +- docs/reference/rsi_translation.html | 2 +- docs/survey.html | 2 +- man/intrinsic_resistant.Rd | 3 -- 25 files changed, 96 insertions(+), 82 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 69c0be44..1705c0fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 1.3.0.9002 -Date: 2020-08-14 +Version: 1.3.0.9003 +Date: 2020-08-15 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 039eac00..d1527534 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 1.3.0.9002 -## Last updated: 14 August 2020 +# AMR 1.3.0.9003 +## Last updated: 15 August 2020 ### New * Data set `intrinsic_resistant`. This data set contains all bug-drug combinations where the 'bug' is intrinsic resistant to the 'drug' according to the latest EUCAST insights. It contains just two columns: `microorganism` and `antibiotic`. 
@@ -15,18 +15,19 @@ ``` ### Changed -* Support for using `dplyr`'s `across()` in `as.rsi()` to interpret MIC values or disk zone diameters, that now also automatically determines the column with microorganism names or codes. - ```r - # until dplyr 1.0.0 - your_data %>% mutate_if(is.mic, as.rsi) - your_data %>% mutate_if(is.disk, as.rsi) +* Improvements for `as.rsi()`: + * Support for using `dplyr`'s `across()` to interpret MIC values or disk zone diameters, which also automatically determines the column with microorganism names or codes. + ```r + # until dplyr 1.0.0 + your_data %>% mutate_if(is.mic, as.rsi) + your_data %>% mutate_if(is.disk, as.rsi) - # since dplyr 1.0.0 - your_data %>% mutate(across(where(is.mic), as.rsi)) - your_data %>% mutate(across(where(is.disk), as.rsi)) - ``` -* Improved overall speed by tweaking joining functions - + # since dplyr 1.0.0 + your_data %>% mutate(across(where(is.mic), as.rsi)) + your_data %>% mutate(across(where(is.disk), as.rsi)) + ``` + * Big speed improvement for interpreting MIC values and disk zone diameters. When interpreting 5,000 MIC values of two antibiotics (10,000 values in total), our benchmarks showed a total run time going from 80.7-85.1 seconds to 1.8-2.0 seconds. 
+* Overall speed improvement by tweaking joining functions # AMR 1.3.0 diff --git a/R/aa_helper_functions.R b/R/aa_helper_functions.R index b72563b4..3e11363d 100755 --- a/R/aa_helper_functions.R +++ b/R/aa_helper_functions.R @@ -62,14 +62,12 @@ left_join <- function(x, y, by = NULL, suffix = c(".x", ".y")) { if (length(by) == 1) { by <- rep(by, 2) } - requires_suffix <- any(colnames(x) %in% colnames(y)) - if (requires_suffix == TRUE) { - int_x <- colnames(x) %in% colnames(y) & colnames(x) != by[1] - int_y <- colnames(y) %in% colnames(x) & colnames(y) != by[2] - - colnames(x)[int_x] <- paste0(colnames(x)[int_x], suffix[1L]) - colnames(y)[int_y] <- paste0(colnames(y)[int_y], suffix[2L]) - } + + int_x <- colnames(x) %in% colnames(y) & colnames(x) != by[1] + int_y <- colnames(y) %in% colnames(x) & colnames(y) != by[2] + colnames(x)[int_x] <- paste0(colnames(x)[int_x], suffix[1L]) + colnames(y)[int_y] <- paste0(colnames(y)[int_y], suffix[2L]) + merged <- cbind(x, y[match(x[, by[1], drop = TRUE], y[, by[2], drop = TRUE]), diff --git a/R/data.R b/R/data.R index 4ffa5a76..6f914640 100755 --- a/R/data.R +++ b/R/data.R @@ -255,5 +255,4 @@ catalogue_of_life <- list( #' pull(microorganism) #' # [1] "Enterococcus casseliflavus" "Enterococcus gallinarum" #' } -#' @seealso [intrinsic_resistant] "intrinsic_resistant" diff --git a/R/rsi.R b/R/rsi.R index 371b4baa..81add8b8 100755 --- a/R/rsi.R +++ b/R/rsi.R @@ -311,7 +311,7 @@ as.rsi.mic <- function(x, stop_('No information was supplied about the microorganisms (missing parameter "mo"). See ?as.rsi.\n\n', "To transform certain columns with e.g. 
mutate_at(), use\n", "`data %>% mutate_at(vars(...), as.rsi, mo = .$x)`, where x is your column with microorganisms.\n\n", - "To tranform all MIC variables in a data set, use `as.rsi(data)` or `data %>% as.rsi()`.", call = FALSE) + "To tranform all MIC values in a data set, use `data %>% as.rsi()` or data %>% mutate_if(is.mic, as.rsi).", call = FALSE) } ab_coerced <- suppressWarnings(as.ab(ab)) @@ -379,7 +379,7 @@ as.rsi.disk <- function(x, stop_('No information was supplied about the microorganisms (missing parameter "mo"). See ?as.rsi.\n\n', "To transform certain columns with e.g. mutate_at(), use\n", "`data %>% mutate_at(vars(...), as.rsi, mo = .$x)`, where x is your column with microorganisms.\n\n", - "To tranform all disk diffusion zones in a data set, use `as.rsi(data)` or `data %>% as.rsi()`.", call = FALSE) + "To tranform all disk diffusion zones in a data set, use `data %>% as.rsi()` or data %>% mutate_if(is.disk, as.rsi).", call = FALSE) } ab_coerced <- suppressWarnings(as.ab(ab)) @@ -535,6 +535,11 @@ get_guideline <- function(guideline) { } exec_as.rsi <- function(method, x, mo, ab, guideline, uti, conserve_capped_values) { + x_bak <- data.frame(x_mo = paste0(x, mo)) + df <- unique(data.frame(x, mo), stringsAsFactors = FALSE) + x <- df$x + mo <- df$mo + if (method == "mic") { x <- as.mic(x) # when as.rsi.mic is called directly } else if (method == "disk") { @@ -575,10 +580,10 @@ exec_as.rsi <- function(method, x, mo, ab, guideline, uti, conserve_capped_value warning("Interpretation of ", font_bold(ab_name(ab, tolower = TRUE)), " for some microorganisms is only available for (uncomplicated) urinary tract infections (UTI).\n Use parameter 'uti' to set which isolates are from urine. See ?as.rsi.", call. 
= FALSE) warned <- TRUE } - + for (i in seq_len(length(x))) { get_record <- trans %>% - # no UTI for now + # no sebsetting to UTI for now subset(lookup %in% c(lookup_mo[i], lookup_genus[i], lookup_family[i], @@ -591,7 +596,7 @@ exec_as.rsi <- function(method, x, mo, ab, guideline, uti, conserve_capped_value get_record <- get_record %>% # be as specific as possible (i.e. prefer species over genus): # desc(uti) = TRUE on top and FALSE on bottom - arrange(desc(uti), desc(nchar(mo))) # 'uti' is a column in rsi_translation + arrange(desc(uti), desc(nchar(mo))) # 'uti' is a column in data set 'rsi_translation' } else { get_record <- get_record %>% filter(uti == FALSE) %>% # 'uti' is a column in rsi_translation @@ -620,9 +625,15 @@ exec_as.rsi <- function(method, x, mo, ab, guideline, uti, conserve_capped_value } } } + + new_rsi <- x_bak %>% + left_join(data.frame(x_mo = paste0(df$x, df$mo), new_rsi), by = "x_mo") %>% + pull(new_rsi) + if (warned == FALSE) { message(font_green("OK.")) } + structure(.Data = factor(new_rsi, levels = c("S", "I", "R"), ordered = TRUE), class = c("rsi", "ordered", "factor")) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 81d368a5..1320a95a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -139,7 +139,7 @@ reference: contents: - "`microorganisms`" - "`antibiotics`" - - "`antivirals`" + - "`intrinsic_resistant`" - "`example_isolates`" - "`example_isolates_unclean`" - "`rsi_translation`" diff --git a/docs/404.html b/docs/404.html index a1a6f673..3469455d 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9002 + 1.3.0.9003 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index c0a2449d..16496ec7 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9002 + 1.3.0.9003 diff --git a/docs/articles/index.html b/docs/articles/index.html index cb9deb89..6821137a 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ AMR (for R) - 
1.3.0.9002 + 1.3.0.9003 diff --git a/docs/articles/welcome_to_AMR.html b/docs/articles/welcome_to_AMR.html index 2a42dbbc..3323932e 100644 --- a/docs/articles/welcome_to_AMR.html +++ b/docs/articles/welcome_to_AMR.html @@ -39,7 +39,7 @@ AMR (for R) - 1.3.0.9002 + 1.3.0.9003 @@ -186,7 +186,7 @@

Welcome to the AMR package

Matthijs S. Berends

-

14 August 2020

+

15 August 2020

Source: vignettes/welcome_to_AMR.Rmd diff --git a/docs/authors.html b/docs/authors.html index 35791072..fe5ae604 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9002 + 1.3.0.9003 diff --git a/docs/index.html b/docs/index.html index 99c7ec08..8e1b7767 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.3.0.9002 + 1.3.0.9003 diff --git a/docs/news/index.html b/docs/news/index.html index 08fed07b..a24afd8e 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9002 + 1.3.0.9003 @@ -229,13 +229,13 @@ Source: NEWS.md -
-

-AMR 1.3.0.9002 Unreleased +
+

+AMR 1.3.0.9003 Unreleased

-
+

-Last updated: 14 August 2020 +Last updated: 15 August 2020

@@ -259,8 +259,10 @@

Changed

    +
  • Improvements for as.rsi(): +
    • -

      Support for using dplyr’s across() in as.rsi() to interpret MIC values or disk zone diameters, that now also automatically determines the column with microorganism names or codes.

      +

      Support for using dplyr’s across() to interpret MIC values or disk zone diameters, which also automatically determines the column with microorganism names or codes.

       # until dplyr 1.0.0
       your_data %>% mutate_if(is.mic, as.rsi)
      @@ -271,7 +273,10 @@
       your_data %>% mutate(across(where(is.disk), as.rsi))
       
    • -
    • Improved overall speed by tweaking joining functions

    • +
    • Big speed improvement for interpreting MIC values and disk zone diameters. When interpreting 5,000 MIC values of two antibiotics (10,000 values in total), our benchmarks showed a total run time going from 80.7-85.1 seconds to 1.8-2.0 seconds.

    • +
    +
  • +
  • Overall speed improvement by tweaking joining functions
@@ -372,7 +377,7 @@

Making this package independent of especially the tidyverse (e.g. packages dplyr and tidyr) tremendously increases sustainability in the long term, since tidyverse functions change quite often. Good for users, but hard for package maintainers. Most of our functions are replaced with versions that only rely on base R, which keeps this package fully functional for many years to come, without requiring a lot of maintenance to keep up with other packages anymore. Another upside is that this package can now be used with all versions of R since R-3.0.0 (April 2013). Our package is being used in settings where the resources are very limited. Fewer dependencies on newer software are helpful for such settings.

Negative effects of this change are:

    -
  • Function freq() that was borrowed from the cleaner package was removed. Use cleaner::freq(), or run library("cleaner") before you use freq().
  • +
  • Function freq() that was borrowed from the cleaner package was removed. Use cleaner::freq(), or run library("cleaner") before you use freq().
  • Printing values of class mo or rsi in a tibble will no longer be in colour and printing rsi in a tibble will show the class <ord>, not <rsi> anymore. This is purely a visual effect.
  • All functions from the mo_* family (like mo_name() and mo_gramstain()) are noticeably slower when running on hundreds of thousands of rows.
  • For developers: classes mo and ab now both also inherit class character, to support any data transformation. This change invalidates code that checks for class length == 1.
  • @@ -709,7 +714,7 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/

This is important, because a value like "testvalue" could never be understood by e.g. mo_name(), although the class would suggest a valid microbial code.

-
  • Function freq() has moved to a new package, clean (CRAN link), since creating frequency tables actually does not fit the scope of this package. The freq() function still works, since it is re-exported from the clean package (which will be installed automatically upon updating this AMR package).

  • +
  • Function freq() has moved to a new package, clean (CRAN link), since creating frequency tables actually does not fit the scope of this package. The freq() function still works, since it is re-exported from the clean package (which will be installed automatically upon updating this AMR package).

  • Renamed data set septic_patients to example_isolates

  • @@ -978,7 +983,7 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/
  • The age() function gained a new parameter exact to determine ages with decimals
  • Removed deprecated functions guess_mo(), guess_atc(), EUCAST_rules(), interpretive_reading(), rsi()
  • -
  • Frequency tables (freq()): +
  • Frequency tables (freq()):
    • speed improvement for microbial IDs

    • fixed factor level names for R Markdown

    • @@ -987,12 +992,12 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/

      support for boxplots:

       septic_patients %>% 
      -  freq(age) %>% 
      +  freq(age) %>% 
         boxplot()
       # grouped boxplots:
       septic_patients %>% 
         group_by(hospital_id) %>% 
      -  freq(age) %>%
      +  freq(age) %>%
         boxplot()
       
      @@ -1003,7 +1008,7 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/
    • Added ceftazidime intrinsic resistance to Streptococci
    • Changed default settings for age_groups(), to let groups of fives and tens end with 100+ instead of 120+
    • -
    • Fix for freq() for when all values are NA +
    • Fix for freq() for when all values are NA
    • Fix for first_isolate() for when dates are missing
    • Improved speed of guess_ab_col() @@ -1244,7 +1249,7 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/
  • -
  • Frequency tables (freq() function): +
  • Frequency tables (freq() function):
    • Support for tidyverse quasiquotation! Now you can create frequency tables of function outcomes:

      @@ -1253,15 +1258,15 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/ # OLD WAY septic_patients %>% mutate(genus = mo_genus(mo)) %>% - freq(genus) + freq(genus) # NEW WAY septic_patients %>% - freq(mo_genus(mo)) + freq(mo_genus(mo)) # Even supports grouping variables: septic_patients %>% group_by(gender) %>% - freq(mo_genus(mo)) + freq(mo_genus(mo))
  • Header info is now available as a list, with the header function

  • @@ -1345,21 +1350,21 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/
  • Using portion_* functions now throws a warning when the total number of available isolates is below parameter minimum

  • Functions as.mo, as.rsi, as.mic, as.atc and freq will not set package name as attribute anymore

  • -

    Frequency tables - freq():

    +

    Frequency tables - freq():

    @@ -1626,13 +1631,13 @@ This works for all drug combinations, such as ampicillin/sulbactam, ceftazidime/