(v1.2.0.9022) as.ab() improvement

This commit is contained in:
dr. M.S. (Matthijs) Berends 2020-07-01 16:21:36 +02:00
parent afc660dc33
commit 298e67a45b
15 changed files with 84 additions and 29 deletions

View File

@ -1,5 +1,5 @@
Package: AMR Package: AMR
Version: 1.2.0.9021 Version: 1.2.0.9022
Date: 2020-07-01 Date: 2020-07-01
Title: Antimicrobial Resistance Analysis Title: Antimicrobial Resistance Analysis
Authors@R: c( Authors@R: c(

11
NEWS.md
View File

@ -1,4 +1,4 @@
# AMR 1.2.0.9021 # AMR 1.2.0.9022
## <small>Last updated: 01-Jul-2020</small> ## <small>Last updated: 01-Jul-2020</small>
### New ### New
@ -20,16 +20,19 @@
### Changed ### Changed
* Using unexisting columns in all `count_*()`, `proportion_*()`, `susceptibility()` and `resistance()` functions wil now return an error instead of dropping them silently * Using unexisting columns in all `count_*()`, `proportion_*()`, `susceptibility()` and `resistance()` functions wil now return an error instead of dropping them silently
* Improvements for `as.ab()`:
* Dramatic improvement of the algorithm behind `as.ab()`, making many more input errors translatable like from digitalised health care records, using too few or too many vowels or consonants and many more
* Added progress bar
* Fixed a bug where `as.ab()` would return an error on invalid input values
* The `as.ab()` function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.
* Fixed a bug where `eucast_rules()` would not work on a tibble when the `tibble` or `dplyr` package was loaded * Fixed a bug where `eucast_rules()` would not work on a tibble when the `tibble` or `dplyr` package was loaded
* All `*_join_microorganisms()` functions and `bug_drug_combinations()` now return the original data class (e.g. `tibble`s and `data.table`s) * All `*_join_microorganisms()` functions and `bug_drug_combinations()` now return the original data class (e.g. `tibble`s and `data.table`s)
* Fixed a bug where `as.ab()` would return an error on invalid input values
* Fixed a bug for using grouped versions of `rsi_df()`, `proportion_df()` and `count_df()`, and fixed a bug where not all different antimicrobial results were added as rows * Fixed a bug for using grouped versions of `rsi_df()`, `proportion_df()` and `count_df()`, and fixed a bug where not all different antimicrobial results were added as rows
* Improved auto-determination for columns of types `<mo>` and `<Date>` * Improved auto-determination for columns of types `<mo>` and `<Date>`
* Fixed a bug in `bug_drug_combinations()` for when only one antibiotic was in the input data * Fixed a bug in `bug_drug_combinations()` for when only one antibiotic was in the input data
* Changed the summary for class `<mo>`, to highlight the %SI vs. %R * Changed the summary for class `<mo>`, to highlight the %SI vs. %R
* Improved error handling, giving more useful info when functions return an error * Improved error handling, giving more useful info when functions return an error
* Algorithm improvements to `as.ab()`, many more misspellings are now translatable. The `as.ab()` function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.
* Added progress bar to `as.ab()`
# AMR 1.2.0 # AMR 1.2.0

40
R/ab.R
View File

@ -29,6 +29,13 @@
#' @rdname as.ab #' @rdname as.ab
#' @inheritSection WHOCC WHOCC #' @inheritSection WHOCC WHOCC
#' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem. #' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
#'
#' All these properties will be searched for the user input. The [as.ab()] can correct for different forms of misspelling:
#'
#' * Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.
#' * Too few or too many vowels or consonants
#' * Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)
#' * Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.
#' #'
#' Use the [ab_property()] functions to get properties based on the returned antibiotic ID, see Examples. #' Use the [ab_property()] functions to get properties based on the returned antibiotic ID, see Examples.
#' #'
@ -231,7 +238,9 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
# replace spaces and slashes with a possibility on both # replace spaces and slashes with a possibility on both
x_spelling <- gsub("[ /]", "( .*|.*/)", x_spelling) x_spelling <- gsub("[ /]", "( .*|.*/)", x_spelling)
# correct for digital reading text (OCR) # correct for digital reading text (OCR)
x_spelling <- gsub("[NRD]", "[NRD]", x_spelling) x_spelling <- gsub("[NRD8B]", "[NRD8B]", x_spelling)
x_spelling <- gsub("(O|0)", "(O|0)+", x_spelling)
x_spelling <- gsub("++", "+", x_spelling, fixed = TRUE)
} }
# try if name starts with it # try if name starts with it
@ -246,6 +255,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text) x_new[i] <- note_if_more_than_one_found(found, i, from_text)
next next
} }
# and try if any synonym starts with it # and try if any synonym starts with it
synonym_found <- unlist(lapply(antibiotics$synonyms, synonym_found <- unlist(lapply(antibiotics$synonyms,
function(s) any(s %like% paste0("^", x_spelling)))) function(s) any(s %like% paste0("^", x_spelling))))
@ -254,7 +264,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text) x_new[i] <- note_if_more_than_one_found(found, i, from_text)
next next
} }
# INITIAL SEARCH - More uncertain results ---- # INITIAL SEARCH - More uncertain results ----
if (initial_search == TRUE) { if (initial_search == TRUE) {
@ -341,7 +351,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text) x_new[i] <- note_if_more_than_one_found(found, i, from_text)
next next
} }
# first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!) # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!)
found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial_search = FALSE)) found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial_search = FALSE))
if (!is.na(found) && !ab_group(found, initial_search = FALSE) %like% "cephalosporins") { if (!is.na(found) && !ab_group(found, initial_search = FALSE) %like% "cephalosporins") {
@ -365,7 +375,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text) x_new[i] <- note_if_more_than_one_found(found, i, from_text)
next next
} }
# make all vowels facultative # make all vowels facultative
search_str <- gsub("([AEIOUY])", "\\1*", x[i]) search_str <- gsub("([AEIOUY])", "\\1*", x[i])
found <- suppressWarnings(as.ab(search_str, initial_search = FALSE, already_regex = TRUE)) found <- suppressWarnings(as.ab(search_str, initial_search = FALSE, already_regex = TRUE))
@ -390,8 +400,28 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
next next
} }
# try with switched character, like "mreopenem"
for (j in seq_len(nchar(x[i]))) {
x_switched <- paste0(
# beginning part:
substr(x[i], 1, j - 1),
# here is the switching of 2 characters:
substr(x[i], j + 1, j + 1),
substr(x[i], j, j),
# ending part:
substr(x[i], j + 2, nchar(x[i])))
found <- suppressWarnings(as.ab(x_switched, initial_search = FALSE))
if (!is.na(found)) {
break
}
}
if (!is.na(found)) {
x_new[i] <- found[1L]
next
}
} # end of initial_search = TRUE } # end of initial_search = TRUE
# not found # not found
x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1]) x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
} }

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="https://msberends.gitlab.io/AMR/index.html">AMR (for R)</a> <a class="navbar-link" href="https://msberends.gitlab.io/AMR/index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>

View File

@ -43,7 +43,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>
@ -229,9 +229,9 @@
<small>Source: <a href='https://gitlab.com/msberends/AMR/blob/master/NEWS.md'><code>NEWS.md</code></a></small> <small>Source: <a href='https://gitlab.com/msberends/AMR/blob/master/NEWS.md'><code>NEWS.md</code></a></small>
</div> </div>
<div id="amr-1209021" class="section level1"> <div id="amr-1209022" class="section level1">
<h1 class="page-header" data-toc-text="1.2.0.9021"> <h1 class="page-header" data-toc-text="1.2.0.9022">
<a href="#amr-1209021" class="anchor"></a>AMR 1.2.0.9021<small> Unreleased </small> <a href="#amr-1209022" class="anchor"></a>AMR 1.2.0.9022<small> Unreleased </small>
</h1> </h1>
<div id="last-updated-01-jul-2020" class="section level2"> <div id="last-updated-01-jul-2020" class="section level2">
<h2 class="hasAnchor"> <h2 class="hasAnchor">
@ -263,18 +263,22 @@
<a href="#changed" class="anchor"></a>Changed</h3> <a href="#changed" class="anchor"></a>Changed</h3>
<ul> <ul>
<li>Using unexisting columns in all <code>count_*()</code>, <code>proportion_*()</code>, <code><a href="../reference/proportion.html">susceptibility()</a></code> and <code><a href="../reference/proportion.html">resistance()</a></code> functions wil now return an error instead of dropping them silently</li> <li>Using unexisting columns in all <code>count_*()</code>, <code>proportion_*()</code>, <code><a href="../reference/proportion.html">susceptibility()</a></code> and <code><a href="../reference/proportion.html">resistance()</a></code> functions wil now return an error instead of dropping them silently</li>
<li>Improvements for <code><a href="../reference/as.ab.html">as.ab()</a></code>:
<ul>
<li>Dramatic improvement of the algorithm behind <code><a href="../reference/as.ab.html">as.ab()</a></code>, making many more input errors translatable like from digitalised health care records, using too few or too many vowels or consonants and many more</li>
<li>Added progress bar</li>
<li>Fixed a bug where <code><a href="../reference/as.ab.html">as.ab()</a></code> would return an error on invalid input values</li>
<li>The <code><a href="../reference/as.ab.html">as.ab()</a></code> function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.</li>
</ul>
</li>
<li>Fixed a bug where <code><a href="../reference/eucast_rules.html">eucast_rules()</a></code> would not work on a tibble when the <code>tibble</code> or <code>dplyr</code> package was loaded</li> <li>Fixed a bug where <code><a href="../reference/eucast_rules.html">eucast_rules()</a></code> would not work on a tibble when the <code>tibble</code> or <code>dplyr</code> package was loaded</li>
<li>All <code>*_join_microorganisms()</code> functions and <code><a href="../reference/bug_drug_combinations.html">bug_drug_combinations()</a></code> now return the original data class (e.g. <code>tibble</code>s and <code>data.table</code>s)</li> <li>All <code>*_join_microorganisms()</code> functions and <code><a href="../reference/bug_drug_combinations.html">bug_drug_combinations()</a></code> now return the original data class (e.g. <code>tibble</code>s and <code>data.table</code>s)</li>
<li>Fixed a bug where <code><a href="../reference/as.ab.html">as.ab()</a></code> would return an error on invalid input values</li>
<li>Fixed a bug for using grouped versions of <code><a href="../reference/proportion.html">rsi_df()</a></code>, <code><a href="../reference/proportion.html">proportion_df()</a></code> and <code><a href="../reference/count.html">count_df()</a></code>, and fixed a bug where not all different antimicrobial results were added as rows</li> <li>Fixed a bug for using grouped versions of <code><a href="../reference/proportion.html">rsi_df()</a></code>, <code><a href="../reference/proportion.html">proportion_df()</a></code> and <code><a href="../reference/count.html">count_df()</a></code>, and fixed a bug where not all different antimicrobial results were added as rows</li>
<li>Improved auto-determination for columns of types <code>&lt;mo&gt;</code> and <code>&lt;Date&gt;</code> <li>Improved auto-determination for columns of types <code>&lt;mo&gt;</code> and <code>&lt;Date&gt;</code>
</li> </li>
<li>Fixed a bug in <code><a href="../reference/bug_drug_combinations.html">bug_drug_combinations()</a></code> for when only one antibiotic was in the input data</li> <li>Fixed a bug in <code><a href="../reference/bug_drug_combinations.html">bug_drug_combinations()</a></code> for when only one antibiotic was in the input data</li>
<li>Changed the summary for class <code>&lt;mo&gt;</code>, to highlight the %SI vs. %R</li> <li>Changed the summary for class <code>&lt;mo&gt;</code>, to highlight the %SI vs. %R</li>
<li>Improved error handling, giving more useful info when functions return an error</li> <li>Improved error handling, giving more useful info when functions return an error</li>
<li>Algorithm improvements to <code><a href="../reference/as.ab.html">as.ab()</a></code>, many more misspellings are now translatable. The <code><a href="../reference/as.ab.html">as.ab()</a></code> function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.</li>
<li>Added progress bar to <code><a href="../reference/as.ab.html">as.ab()</a></code>
</li>
</ul> </ul>
</div> </div>
</div> </div>

View File

@ -10,7 +10,7 @@ articles:
WHONET: WHONET.html WHONET: WHONET.html
benchmarks: benchmarks.html benchmarks: benchmarks.html
resistance_predict: resistance_predict.html resistance_predict: resistance_predict.html
last_built: 2020-07-01T09:51Z last_built: 2020-07-01T14:20Z
urls: urls:
reference: https://msberends.gitlab.io/AMR/reference reference: https://msberends.gitlab.io/AMR/reference
article: https://msberends.gitlab.io/AMR/articles article: https://msberends.gitlab.io/AMR/articles

View File

@ -82,7 +82,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9019</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>
@ -262,6 +262,13 @@
<h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2> <h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2>
<p>All entries in the <a href='antibiotics.html'>antibiotics</a> data set have three different identifiers: a human readable EARS-Net code (column <code>ab</code>, used by ECDC and WHONET), an ATC code (column <code>atc</code>, used by WHO), and a CID code (column <code>cid</code>, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.</p> <p>All entries in the <a href='antibiotics.html'>antibiotics</a> data set have three different identifiers: a human readable EARS-Net code (column <code>ab</code>, used by ECDC and WHONET), an ATC code (column <code>atc</code>, used by WHO), and a CID code (column <code>cid</code>, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.</p>
<p>All these properties will be searched for the user input. The <code>as.ab()</code> can correct for different forms of misspelling:</p><ul>
<li><p>Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.</p></li>
<li><p>Too few or too many vowels or consonants</p></li>
<li><p>Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)</p></li>
<li><p>Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.</p></li>
</ul>
<p>Use the <code><a href='ab_property.html'>ab_property()</a></code> functions to get properties based on the returned antibiotic ID, see Examples.</p> <p>Use the <code><a href='ab_property.html'>ab_property()</a></code> functions to get properties based on the returned antibiotic ID, see Examples.</p>
<h2 class="hasAnchor" id="source"><a class="anchor" href="#source"></a>Source</h2> <h2 class="hasAnchor" id="source"><a class="anchor" href="#source"></a>Source</h2>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
</span> </span>
</div> </div>

View File

@ -26,6 +26,14 @@ Use this function to determine the antibiotic code of one or more antibiotics. T
\details{ \details{
All entries in the \link{antibiotics} data set have three different identifiers: a human readable EARS-Net code (column \code{ab}, used by ECDC and WHONET), an ATC code (column \code{atc}, used by WHO), and a CID code (column \code{cid}, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem. All entries in the \link{antibiotics} data set have three different identifiers: a human readable EARS-Net code (column \code{ab}, used by ECDC and WHONET), an ATC code (column \code{atc}, used by WHO), and a CID code (column \code{cid}, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
All these properties will be searched for the user input. The \code{\link[=as.ab]{as.ab()}} can correct for different forms of misspelling:
\itemize{
\item Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.
\item Too few or too many vowels or consonants
\item Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)
\item Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.
}
Use the \code{\link[=ab_property]{ab_property()}} functions to get properties based on the returned antibiotic ID, see Examples. Use the \code{\link[=ab_property]{ab_property()}} functions to get properties based on the returned antibiotic ID, see Examples.
} }
\section{Source}{ \section{Source}{

View File

@ -40,7 +40,7 @@ test_that("as.ab works", {
expect_output(print(as.ab("amox"))) expect_output(print(as.ab("amox")))
expect_output(print(data.frame(a = as.ab("amox")))) expect_output(print(data.frame(a = as.ab("amox"))))
expect_warning(as.ab("Z00ZZ00")) # not yet available in data set expect_warning(as.ab("J00AA00")) # ATC not yet available in data set
expect_warning(as.ab("UNKNOWN")) expect_warning(as.ab("UNKNOWN"))
expect_warning(as.ab("")) expect_warning(as.ab(""))
@ -55,8 +55,11 @@ test_that("as.ab works", {
expect_equal(as.character(as.ab("Amoxy + clavulaanzuur")), expect_equal(as.character(as.ab("Amoxy + clavulaanzuur")),
"AMC") "AMC")
expect_equal(as.character(as.ab(c("mreopenem", "co-maoxiclav"))),
c("MEM", "AMC"))
expect_message(as.ab("cipro mero")) expect_message(as.ab("cipro mero"))
# assigning and subsetting # assigning and subsetting
x <- antibiotics$ab x <- antibiotics$ab
expect_s3_class(x[1], "ab") expect_s3_class(x[1], "ab")

View File

@ -28,7 +28,7 @@ test_that("ab_from_text works", {
expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]], expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]],
"Amoxicillin") "Amoxicillin")
expect_identical(ab_from_text("administered amoxi/clav and cipro", collapse = ", ")[[1]], expect_identical(ab_from_text("administered amoxi/clav and cipro", collapse = ", ")[[1]],
"AMX, CIP") "AMC, CIP")
expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "dose")[[1]], expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "dose")[[1]],
500) 500)