From 298e67a45bcea6530e12fa6f8afa12ad303a8210 Mon Sep 17 00:00:00 2001
From: "Matthijs S. Berends" <berendsms@gmail.com>
Date: Wed, 1 Jul 2020 16:21:36 +0200
Subject: [PATCH] (v1.2.0.9022) as.ab() improvement

---
 DESCRIPTION                        |  2 +-
 NEWS.md                            | 11 +++++---
 R/ab.R                             | 40 ++++++++++++++++++++++++++----
 docs/404.html                      |  2 +-
 docs/LICENSE-text.html             |  2 +-
 docs/articles/index.html           |  2 +-
 docs/authors.html                  |  2 +-
 docs/index.html                    |  2 +-
 docs/news/index.html               | 20 +++++++++------
 docs/pkgdown.yml                   |  2 +-
 docs/reference/as.ab.html          |  9 ++++++-
 docs/reference/index.html          |  2 +-
 man/as.ab.Rd                       |  8 ++++++
 tests/testthat/test-ab.R           |  7 ++++--
 tests/testthat/test-ab_from_text.R |  2 +-
 15 files changed, 84 insertions(+), 29 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 4656d732..0551e029 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: AMR
-Version: 1.2.0.9021
+Version: 1.2.0.9022
 Date: 2020-07-01
 Title: Antimicrobial Resistance Analysis
 Authors@R: c(
diff --git a/NEWS.md b/NEWS.md
index 09485fce..517f1fe8 100755
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# AMR 1.2.0.9021
+# AMR 1.2.0.9022
 ## <small>Last updated: 01-Jul-2020</small>
 
 ### New
@@ -20,16 +20,19 @@
 
 ### Changed
 * Using unexisting columns in all `count_*()`, `proportion_*()`, `susceptibility()` and `resistance()` functions wil now return an error instead of dropping them silently
+* Improvements for `as.ab()`:
+  * Dramatic improvement of the algorithm behind `as.ab()`, making many more input errors translatable, such as artefacts from digitalised health care records and the use of too few or too many vowels or consonants, among many others
+  * Added progress bar
+  * Fixed a bug where `as.ab()` would return an error on invalid input values
+  * The `as.ab()` function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.
 * Fixed a bug where `eucast_rules()` would not work on a tibble when the `tibble` or `dplyr` package was loaded
 * All `*_join_microorganisms()` functions and `bug_drug_combinations()` now return the original data class (e.g. `tibble`s and `data.table`s)
-* Fixed a bug where `as.ab()` would return an error on invalid input values
 * Fixed a bug for using grouped versions of `rsi_df()`, `proportion_df()` and `count_df()`, and fixed a bug where not all different antimicrobial results were added as rows
 * Improved auto-determination for columns of types `<mo>` and `<Date>`
 * Fixed a bug in `bug_drug_combinations()` for when only one antibiotic was in the input data
 * Changed the summary for class `<mo>`, to highlight the %SI vs. %R
 * Improved error handling, giving more useful info when functions return an error
-* Algorithm improvements to `as.ab()`, many more misspellings are now translatable. The `as.ab()` function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.
-* Added progress bar to `as.ab()`
+
 
 # AMR 1.2.0
 
diff --git a/R/ab.R b/R/ab.R
index 80845712..624dc57c 100755
--- a/R/ab.R
+++ b/R/ab.R
@@ -29,6 +29,13 @@
 #' @rdname as.ab
 #' @inheritSection WHOCC WHOCC
 #' @details All entries in the [antibiotics] data set have three different identifiers: a human readable EARS-Net code (column `ab`, used by ECDC and WHONET), an ATC code (column `atc`, used by WHO), and a CID code (column `cid`, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
+#' 
+#' All these properties will be searched for the user input. The [as.ab()] function can correct for different forms of misspelling:
+#' 
+#'  * Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.
+#'  * Too few or too many vowels or consonants
+#'  * Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)
+#'  * Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.
 #'
 #' Use the [ab_property()] functions to get properties based on the returned antibiotic ID, see Examples.
 #' 
@@ -231,7 +238,9 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
       # replace spaces and slashes with a possibility on both
       x_spelling <- gsub("[ /]", "( .*|.*/)", x_spelling)
       # correct for digital reading text (OCR)
-      x_spelling <- gsub("[NRD]", "[NRD]", x_spelling)
+      x_spelling <- gsub("[NRD8B]", "[NRD8B]", x_spelling)
+      x_spelling <- gsub("(O|0)", "(O|0)+", x_spelling)
+      x_spelling <- gsub("++", "+", x_spelling, fixed = TRUE)
     }
 
     # try if name starts with it
@@ -246,6 +255,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
       x_new[i] <- note_if_more_than_one_found(found, i, from_text)
       next
     }
+
     # and try if any synonym starts with it
     synonym_found <- unlist(lapply(antibiotics$synonyms,
                                    function(s) any(s %like% paste0("^", x_spelling))))
@@ -254,7 +264,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
       x_new[i] <- note_if_more_than_one_found(found, i, from_text)
       next
     }
-    
+
     # INITIAL SEARCH - More uncertain results ----
     
     if (initial_search == TRUE) {
@@ -341,7 +351,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
         x_new[i] <- note_if_more_than_one_found(found, i, from_text)
         next
       }
-      
+
       # first 5 except for cephalosporins, then first 7 (those cephalosporins all start quite the same!)
       found <- suppressWarnings(as.ab(substr(x[i], 1, 5), initial_search = FALSE))
       if (!is.na(found) && !ab_group(found, initial_search = FALSE) %like% "cephalosporins") {
@@ -365,7 +375,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
         x_new[i] <- note_if_more_than_one_found(found, i, from_text)
         next
       }
-      
+  
       # make all vowels facultative
       search_str <- gsub("([AEIOUY])", "\\1*", x[i])
       found <- suppressWarnings(as.ab(search_str, initial_search = FALSE, already_regex = TRUE))
@@ -390,8 +400,28 @@ as.ab <- function(x, flag_multiple_results = TRUE, ...) {
         next
       }
       
+      # try with switched character, like "mreopenem"
+      for (j in seq_len(nchar(x[i]))) {
+        x_switched <- paste0(
+          # beginning part:
+          substr(x[i], 1, j - 1),
+          # here is the switching of 2 characters:
+          substr(x[i], j + 1, j + 1), 
+          substr(x[i], j, j), 
+          # ending part:
+          substr(x[i], j + 2, nchar(x[i])))
+        found <- suppressWarnings(as.ab(x_switched, initial_search = FALSE))
+        if (!is.na(found)) {
+          break
+        }
+      }
+      if (!is.na(found)) {
+        x_new[i] <- found[1L]
+        next
+      }
+      
     } # end of initial_search = TRUE
-
+    
     # not found
     x_unknown <- c(x_unknown, x_bak[x[i] == x_bak_clean][1])
   }
diff --git a/docs/404.html b/docs/404.html
index 9a286d3e..12d6418c 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -81,7 +81,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="https://msberends.gitlab.io/AMR/index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
index aaf38e4a..4c4c321b 100644
--- a/docs/LICENSE-text.html
+++ b/docs/LICENSE-text.html
@@ -81,7 +81,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
diff --git a/docs/articles/index.html b/docs/articles/index.html
index f8c582bf..8f128859 100644
--- a/docs/articles/index.html
+++ b/docs/articles/index.html
@@ -81,7 +81,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="../index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
diff --git a/docs/authors.html b/docs/authors.html
index 535b6b78..21efa42f 100644
--- a/docs/authors.html
+++ b/docs/authors.html
@@ -81,7 +81,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
diff --git a/docs/index.html b/docs/index.html
index 5e6b092c..be40d2f8 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -43,7 +43,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
diff --git a/docs/news/index.html b/docs/news/index.html
index b630c113..06e6aa06 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -81,7 +81,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="../index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
@@ -229,9 +229,9 @@
       <small>Source: <a href='https://gitlab.com/msberends/AMR/blob/master/NEWS.md'><code>NEWS.md</code></a></small>
     </div>
 
-    <div id="amr-1209021" class="section level1">
-<h1 class="page-header" data-toc-text="1.2.0.9021">
-<a href="#amr-1209021" class="anchor"></a>AMR 1.2.0.9021<small> Unreleased </small>
+    <div id="amr-1209022" class="section level1">
+<h1 class="page-header" data-toc-text="1.2.0.9022">
+<a href="#amr-1209022" class="anchor"></a>AMR 1.2.0.9022<small> Unreleased </small>
 </h1>
 <div id="last-updated-01-jul-2020" class="section level2">
 <h2 class="hasAnchor">
@@ -263,18 +263,22 @@
 <a href="#changed" class="anchor"></a>Changed</h3>
 <ul>
 <li>Using unexisting columns in all <code>count_*()</code>, <code>proportion_*()</code>, <code><a href="../reference/proportion.html">susceptibility()</a></code> and <code><a href="../reference/proportion.html">resistance()</a></code> functions wil now return an error instead of dropping them silently</li>
+<li>Improvements for <code><a href="../reference/as.ab.html">as.ab()</a></code>:
+<ul>
+<li>Dramatic improvement of the algorithm behind <code><a href="../reference/as.ab.html">as.ab()</a></code>, making many more input errors translatable, such as artefacts from digitalised health care records and the use of too few or too many vowels or consonants, among many others</li>
+<li>Added progress bar</li>
+<li>Fixed a bug where <code><a href="../reference/as.ab.html">as.ab()</a></code> would return an error on invalid input values</li>
+<li>The <code><a href="../reference/as.ab.html">as.ab()</a></code> function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.</li>
+</ul>
+</li>
 <li>Fixed a bug where <code><a href="../reference/eucast_rules.html">eucast_rules()</a></code> would not work on a tibble when the <code>tibble</code> or <code>dplyr</code> package was loaded</li>
 <li>All <code>*_join_microorganisms()</code> functions and <code><a href="../reference/bug_drug_combinations.html">bug_drug_combinations()</a></code> now return the original data class (e.g. <code>tibble</code>s and <code>data.table</code>s)</li>
-<li>Fixed a bug where <code><a href="../reference/as.ab.html">as.ab()</a></code> would return an error on invalid input values</li>
 <li>Fixed a bug for using grouped versions of <code><a href="../reference/proportion.html">rsi_df()</a></code>, <code><a href="../reference/proportion.html">proportion_df()</a></code> and <code><a href="../reference/count.html">count_df()</a></code>, and fixed a bug where not all different antimicrobial results were added as rows</li>
 <li>Improved auto-determination for columns of types <code>&lt;mo&gt;</code> and <code>&lt;Date&gt;</code>
 </li>
 <li>Fixed a bug in <code><a href="../reference/bug_drug_combinations.html">bug_drug_combinations()</a></code> for when only one antibiotic was in the input data</li>
 <li>Changed the summary for class <code>&lt;mo&gt;</code>, to highlight the %SI vs. %R</li>
 <li>Improved error handling, giving more useful info when functions return an error</li>
-<li>Algorithm improvements to <code><a href="../reference/as.ab.html">as.ab()</a></code>, many more misspellings are now translatable. The <code><a href="../reference/as.ab.html">as.ab()</a></code> function will now throw a note if more than 1 antimicrobial drug could be retrieved from a single input value.</li>
-<li>Added progress bar to <code><a href="../reference/as.ab.html">as.ab()</a></code>
-</li>
 </ul>
 </div>
 </div>
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index 0e0ca883..97db21d3 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -10,7 +10,7 @@ articles:
   WHONET: WHONET.html
   benchmarks: benchmarks.html
   resistance_predict: resistance_predict.html
-last_built: 2020-07-01T09:51Z
+last_built: 2020-07-01T14:20Z
 urls:
   reference: https://msberends.gitlab.io/AMR/reference
   article: https://msberends.gitlab.io/AMR/articles
diff --git a/docs/reference/as.ab.html b/docs/reference/as.ab.html
index 264aa57e..7db72fc6 100644
--- a/docs/reference/as.ab.html
+++ b/docs/reference/as.ab.html
@@ -82,7 +82,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="../index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9019</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
@@ -262,6 +262,13 @@
     <h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2>
 
     <p>All entries in the <a href='antibiotics.html'>antibiotics</a> data set have three different identifiers: a human readable EARS-Net code (column <code>ab</code>, used by ECDC and WHONET), an ATC code (column <code>atc</code>, used by WHO), and a CID code (column <code>cid</code>, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.</p>
+<p>All these properties will be searched for the user input. The <code>as.ab()</code> function can correct for different forms of misspelling:</p><ul>
+<li><p>Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.</p></li>
+<li><p>Too few or too many vowels or consonants</p></li>
+<li><p>Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)</p></li>
+<li><p>Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.</p></li>
+</ul>
+
 <p>Use the <code><a href='ab_property.html'>ab_property()</a></code> functions to get properties based on the returned antibiotic ID, see Examples.</p>
     <h2 class="hasAnchor" id="source"><a class="anchor" href="#source"></a>Source</h2>
 
diff --git a/docs/reference/index.html b/docs/reference/index.html
index e33a0bd7..72af1df4 100644
--- a/docs/reference/index.html
+++ b/docs/reference/index.html
@@ -81,7 +81,7 @@
       </button>
       <span class="navbar-brand">
         <a class="navbar-link" href="../index.html">AMR (for R)</a>
-        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9021</span>
+        <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.2.0.9022</span>
       </span>
     </div>
 
diff --git a/man/as.ab.Rd b/man/as.ab.Rd
index 105c55d2..28cb9911 100644
--- a/man/as.ab.Rd
+++ b/man/as.ab.Rd
@@ -26,6 +26,14 @@ Use this function to determine the antibiotic code of one or more antibiotics. T
 \details{
 All entries in the \link{antibiotics} data set have three different identifiers: a human readable EARS-Net code (column \code{ab}, used by ECDC and WHONET), an ATC code (column \code{atc}, used by WHO), and a CID code (column \code{cid}, Compound ID, used by PubChem). The data set contains more than 5,000 official brand names from many different countries, as found in PubChem.
 
+All these properties will be searched for the user input. The \code{\link[=as.ab]{as.ab()}} function can correct for different forms of misspelling:
+\itemize{
+\item Wrong spelling of drug names (like "tobramicin" or "gentamycin"), which corrects for most audible similarities such as f/ph, x/ks, c/z/s, t/th, etc.
+\item Too few or too many vowels or consonants
+\item Switching two characters (like "mreopenem", often the case in clinical data, when doctors typed too fast)
+\item Digitalised paper records, leaving artefacts like 0/o/O (zero and O's), B/8, n/r, etc.
+}
+
 Use the \code{\link[=ab_property]{ab_property()}} functions to get properties based on the returned antibiotic ID, see Examples.
 }
 \section{Source}{
diff --git a/tests/testthat/test-ab.R b/tests/testthat/test-ab.R
index 05000130..87aaa62b 100755
--- a/tests/testthat/test-ab.R
+++ b/tests/testthat/test-ab.R
@@ -40,7 +40,7 @@ test_that("as.ab works", {
   expect_output(print(as.ab("amox")))
   expect_output(print(data.frame(a = as.ab("amox"))))
 
-  expect_warning(as.ab("Z00ZZ00")) # not yet available in data set
+  expect_warning(as.ab("J00AA00")) # ATC not yet available in data set
   expect_warning(as.ab("UNKNOWN"))
   expect_warning(as.ab(""))
 
@@ -55,8 +55,11 @@ test_that("as.ab works", {
   expect_equal(as.character(as.ab("Amoxy + clavulaanzuur")),
                "AMC")
   
+  expect_equal(as.character(as.ab(c("mreopenem", "co-maoxiclav"))),
+               c("MEM", "AMC"))
+  
   expect_message(as.ab("cipro mero"))
-
+  
   # assigning and subsetting
   x <- antibiotics$ab
   expect_s3_class(x[1], "ab")
diff --git a/tests/testthat/test-ab_from_text.R b/tests/testthat/test-ab_from_text.R
index 2c63625c..cd7776d5 100644
--- a/tests/testthat/test-ab_from_text.R
+++ b/tests/testthat/test-ab_from_text.R
@@ -28,7 +28,7 @@ test_that("ab_from_text works", {
   expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", translate_ab = TRUE)[[1]],
                    "Amoxicillin")
   expect_identical(ab_from_text("administered amoxi/clav and cipro", collapse = ", ")[[1]],
-                   "AMX, CIP")
+                   "AMC, CIP")
   
   expect_identical(ab_from_text("28/03/2020 regular amoxicilliin 500mg po tds", type = "dose")[[1]],
                    500)