From 298e67a45bcea6530e12fa6f8afa12ad303a8210 Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Wed, 1 Jul 2020 16:21:36 +0200 Subject: [PATCH] (v1.2.0.9022) as.ab() improvement --- DESCRIPTION | 2 +- NEWS.md | 11 +++++--- R/ab.R | 40 ++++++++++++++++++++++++++---- docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 20 +++++++++------ docs/pkgdown.yml | 2 +- docs/reference/as.ab.html | 9 ++++++- docs/reference/index.html | 2 +- man/as.ab.Rd | 8 ++++++ tests/testthat/test-ab.R | 7 ++++-- tests/testthat/test-ab_from_text.R | 2 +- 15 files changed, 84 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4656d732..0551e029 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: AMR -Version: 1.2.0.9021 +Version: 1.2.0.9022 Date: 2020-07-01 Title: Antimicrobial Resistance Analysis Authors@R: c( diff --git a/NEWS.md b/NEWS.md index 09485fce..517f1fe8 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# AMR 1.2.0.9021 +# AMR 1.2.0.9022 ## Last updated: 01-Jul-2020 ### New @@ -20,16 +20,19 @@ ### Changed * Using unexisting columns in all `count_*()`, `proportion_*()`, `susceptibility()` and `resistance()` functions wil now return an error instead of dropping them silently +* Improvements for `as.ab()`: + * Dramatic improvement of the algorithm behind `as.ab()`, making many more input errors translatable like from digitalised health care records, using too few or too many vowels or consonants and many more + * Added progress bar + * Fixed a bug where `as.ab()` would return an error on invalid input values + * The `as.ab()` function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value. 
* Fixed a bug where `eucast_rules()` would not work on a tibble when the `tibble` or `dplyr` package was loaded * All `*_join_microorganisms()` functions and `bug_drug_combinations()` now return the original data class (e.g. `tibble`s and `data.table`s) -* Fixed a bug where `as.ab()` would return an error on invalid input values * Fixed a bug for using grouped versions of `rsi_df()`, `proportion_df()` and `count_df()`, and fixed a bug where not all different antimicrobial results were added as rows * Improved auto-determination for columns of types `` and `` * Fixed a bug in `bug_drug_combinations()` for when only one antibiotic was in the input data * Changed the summary for class ``, to highlight the %SI vs. %R * Improved error handling, giving more useful info when functions return an error -* Algorithm improvements to `as.ab()`, many more misspellings are now translatable. The `as.ab()` function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value. -* Added progress bar to `as.ab()` + # AMR 1.2.0 diff --git a/R/ab.R b/R/ab.R index 80845712..624dc57c 100755 --- a/R/ab.R +++ b/R/ab.R @@ -29,6 +29,13 @@ #' @rdname as.ab #' @inheritSection WHOCC WHOCC #' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem. +#' +#' All these properties will be searched for the user input. The [as.ab()] can correct for different forms of misspelling: +#' +#' * Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc. 
+#' * Too few or too many vowels or consonants +#' * Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast) +#' * Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc. #' #' Use the [ab_property()] functions to get properties based on the returned antibiotic ID, see Examples. #' @@ -231,7 +238,9 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) { # replace spaces and slashes with a possibility on both x_spelling <- gsub("[ /]", "( .*|.*/)", x_spelling) # correct for digital reading text (OCR) - x_spelling <- gsub("[NRD]", "[NRD]", x_spelling) + x_spelling <- gsub("[NRD8B]", "[NRD8B]", x_spelling) + x_spelling <- gsub("(O|0)", "(O|0)+", x_spelling) + x_spelling <- gsub("++", "+", x_spelling, fixed = TRUE) } # try if name starts with it @@ -246,6 +255,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } + # and try if any synonym starts with it synonym_found <- unlist(lapply(antibiotics$synonyms, function(s) any(s %like% paste0("^", x_spelling)))) @@ -254,7 +264,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # INITIAL SEARCH - More uncertain results ---- if (initial_search == TRUE) { @@ -341,7 +351,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) { x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!) found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial_search = FALSE)) if (!is.na(found) && !ab_group(found, initial_search = FALSE) %like% "cephalosporins") { @@ -365,7 +375,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) 
{ x_new[i] <- note_if_more_than_one_found(found, i, from_text) next } - + # make all vowels facultative search_str <- gsub("([AEIOUY])", "\\1*", x[i]) found <- suppressWarnings(as.ab(search_str, initial_search = FALSE, already_regex = TRUE)) @@ -390,8 +400,28 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) { next } + # try with switched character, like "mreopenem" + for (j in seq_len(nchar(x[i]))) { + x_switched <- paste0( + # beginning part: + substr(x[i], 1, j - 1), + # here is the switching of 2 characters: + substr(x[i], j + 1, j + 1), + substr(x[i], j, j), + # ending part: + substr(x[i], j + 2, nchar(x[i]))) + found <- suppressWarnings(as.ab(x_switched, initial_search = FALSE)) + if (!is.na(found)) { + break + } + } + if (!is.na(found)) { + x_new[i] <- found[1L] + next + } + } # end of initial_search = TRUE - + # not found x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1]) } diff --git a/docs/404.html b/docs/404.html index 9a286d3e..12d6418c 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index aaf38e4a..4c4c321b 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 diff --git a/docs/articles/index.html b/docs/articles/index.html index f8c582bf..8f128859 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 diff --git a/docs/authors.html b/docs/authors.html index 535b6b78..21efa42f 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 diff --git a/docs/index.html b/docs/index.html index 5e6b092c..be40d2f8 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 diff --git a/docs/news/index.html b/docs/news/index.html index b630c113..06e6aa06 100644 --- a/docs/news/index.html +++ 
b/docs/news/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 @@ -229,9 +229,9 @@ Source: NEWS.md -
-

-AMR 1.2.0.9021 Unreleased +
+

+AMR 1.2.0.9022 Unreleased

@@ -263,18 +263,22 @@ Changed

  • Using non-existent columns in all count_*(), proportion_*(), susceptibility() and resistance() functions will now return an error instead of dropping them silently
  • +
  • Improvements for as.ab(): +
      +
    • Dramatic improvement of the algorithm behind as.ab(), making many more input errors translatable, such as artefacts from digitalised health care records, the use of too few or too many vowels or consonants, and many more
    • +
    • Added progress bar
    • +
    • Fixed a bug where as.ab() would return an error on invalid input values
    • +
    • The as.ab() function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.
    • +
    +
  • Fixed a bug where eucast_rules() would not work on a tibble when the tibble or dplyr package was loaded
  • All *_join_microorganisms() functions and bug_drug_combinations() now return the original data class (e.g. tibbles and data.tables)
  • -
  • Fixed a bug where as.ab() would return an error on invalid input values
  • Fixed a bug for using grouped versions of rsi_df(), proportion_df() and count_df(), and fixed a bug where not all different antimicrobial results were added as rows
  • Improved auto-determination for columns of types <mo> and <Date>
  • Fixed a bug in bug_drug_combinations() for when only one antibiotic was in the input data
  • Changed the summary for class <mo>, to highlight the %SI vs. %R
  • Improved error handling, giving more useful info when functions return an error
  • -
  • Algorithm improvements to as.ab(), many more misspellings are now translatable. The as.ab() function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.
  • -
  • Added progress bar to as.ab() -
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 0e0ca883..97db21d3 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -10,7 +10,7 @@ articles: WHONET: WHONET.html benchmarks: benchmarks.html resistance_predict: resistance_predict.html -last_built: 2020-07-01T09:51Z +last_built: 2020-07-01T14:20Z urls: reference: https://msberends.gitlab.io/AMR/reference article: https://msberends.gitlab.io/AMR/articles diff --git a/docs/reference/as.ab.html b/docs/reference/as.ab.html index 264aa57e..7db72fc6 100644 --- a/docs/reference/as.ab.html +++ b/docs/reference/as.ab.html @@ -82,7 +82,7 @@ AMR (for R) - 1.2.0.9019 + 1.2.0.9022

@@ -262,6 +262,13 @@

Details

All entries in the antibiotics data set have three different identifiers: a human readable EARS-Net code (column ab, used by ECDC and WHONET), an ATC code (column atc, used by WHO), and a CID code (column cid, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.

+

All these properties will be searched for the user input. The as.ab() function can correct for different forms of misspelling:

    +
  • Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.

  • +
  • Too few or too many vowels or consonants

  • +
  • Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)

  • +
  • Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.

  • +
+

Use the ab_property() functions to get properties based on the returned antibiotic ID, see Examples.

Source

diff --git a/docs/reference/index.html b/docs/reference/index.html index e33a0bd7..72af1df4 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.2.0.9021 + 1.2.0.9022 diff --git a/man/as.ab.Rd b/man/as.ab.Rd index 105c55d2..28cb9911 100644 --- a/man/as.ab.Rd +++ b/man/as.ab.Rd @@ -26,6 +26,14 @@ Use this function to determine the antibiotic code of one or more antibiotics. T \details{ All entries in the \link{antibiotics} data set have three different identifiers: a human readable EARS-Net code (column \code{ab}, used by ECDC and WHONET), an ATC code (column \code{atc}, used by WHO), and a CID code (column \code{cid}, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem. +All these properties will be searched for the user input. The \code{\link[=as.ab]{as.ab()}} can correct for different forms of misspelling: +\itemize{ +\item Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc. +\item Too few or too many vowels or consonants +\item Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast) +\item Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc. +} + Use the \code{\link[=ab_property]{ab_property()}} functions to get properties based on the returned antibiotic ID, see Examples. 
} \section{Source}{ diff --git a/tests/testthat/test-ab.R b/tests/testthat/test-ab.R index 05000130..87aaa62b 100755 --- a/tests/testthat/test-ab.R +++ b/tests/testthat/test-ab.R @@ -40,7 +40,7 @@ test_that("as.ab works", { expect_output(print(as.ab("amox"))) expect_output(print(data.frame(a = as.ab("amox")))) - expect_warning(as.ab("Z00ZZ00")) # not yet available in data set + expect_warning(as.ab("J00AA00")) # ATC not yet available in data set expect_warning(as.ab("UNKNOWN")) expect_warning(as.ab("")) @@ -55,8 +55,11 @@ test_that("as.ab works", { expect_equal(as.character(as.ab("Amoxy + clavulaanzuur")), "AMC") + expect_equal(as.character(as.ab(c("mreopenem", "co-maoxiclav"))), + c("MEM", "AMC")) + expect_message(as.ab("cipro mero")) - + # assigning and subsetting x <- antibiotics$ab expect_s3_class(x[1], "ab") diff --git a/tests/testthat/test-ab_from_text.R b/tests/testthat/test-ab_from_text.R index 2c63625c..cd7776d5 100644 --- a/tests/testthat/test-ab_from_text.R +++ b/tests/testthat/test-ab_from_text.R @@ -28,7 +28,7 @@ test_that("ab_from_text works", { expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]], "Amoxicillin") expect_identical(ab_from_text("administered amoxi/clav and cipro", collapse = ", ")[[1]], - "AMX, CIP") + "AMC, CIP") expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "dose")[[1]], 500)