fix for NA in as.ab()

2025-07-23 08:23:20 +02:00 · 2022-10-10 15:44:59 +02:00
parent 43c638d122
commit 4fe90092c7
13 changed files with 67 additions and 47 deletions
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 Package: AMR
-Version: 1.8.2.9008
+Version: 1.8.2.9009
 Date: 2022-10-10
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
--- a/NEWS.md
+++ b/NEWS.md
@ -1,4 +1,4 @@
-# AMR 1.8.2.9008
+# AMR 1.8.2.9009

 This version will eventually become v2.0! We're happy to reach a new major milestone soon!

@ -42,6 +42,7 @@ This version will eventually become v2.0! We're happy to reach a new major miles
 * Fix for using `ab_atc()` on non-existing ATC codes
 * Black and white message texts are now reversed in colour if using an RStudio dark theme
 * `mo_snomed()` now returns class `character`, not `numeric` anymore (to make long SNOMED codes readable)
+* Fix for using `as.ab()` on `NA` values

 ### Other
 * New website to make use of the new Bootstrap 5 and pkgdown 2.0. The website now contains results for all examples and will be automatically regenerated with every change to our repository, using GitHub Actions
--- a/R/ab.R
+++ b/R/ab.R
@ -135,7 +135,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
        abnames <- abnames[!abnames %in% c("clavulanic acid", "avibactam")]
      }
      if (length(abnames) > 1) {
-        message_(
+        warning_(
          "More than one result was found for item ", index, ": ",
          vector_and(abnames, quotes = FALSE)
        )
@ -164,13 +164,19 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
    USE.NAMES = FALSE
  )]
  x_new[known_codes_cid] <- AB_lookup$ab[match(x[known_codes_cid], AB_lookup$cid)]
-  already_known <- known_names | known_codes_ab | known_codes_atc | known_codes_cid
-
+  previously_coerced <- x %in% AMR_env$ab_previously_coerced$x
+  x_new[previously_coerced & is.na(x_new)] <- AMR_env$ab_previously_coerced$ab[match(x[is.na(x_new) & x %in% AMR_env$ab_previously_coerced$x], AMR_env$ab_previously_coerced$x)]
+  already_known <- known_names | known_codes_ab | known_codes_atc | known_codes_cid | previously_coerced
+  
+  # NA input must stay NA and must never count as already known
+  x_new[is.na(x)] <- NA
+  already_known[is.na(x)] <- FALSE
+  
  if (initial_search == TRUE && sum(already_known) < length(x)) {
    progress <- progress_ticker(n = sum(!already_known), n_min = 25, print = info) # start if n >= 25
    on.exit(close(progress))
  }
-
+  
  for (i in which(!already_known)) {
    if (initial_search == TRUE) {
      progress$tick()
@ -481,6 +487,16 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = interactive(), ...) {
  if (initial_search == TRUE && sum(already_known) < length(x)) {
    close(progress)
  }
+  
+  # cache the results in the package environment to speed up later calls
+  AMR_env$ab_previously_coerced <- unique(rbind(AMR_env$ab_previously_coerced,
+                                                data.frame(
+                                                  x = x,
+                                                  ab = x_new,
+                                                  stringsAsFactors = FALSE
+                                                ),
+                                                stringsAsFactors = FALSE
+  ))

  # take failed ATC codes apart from rest
  if (length(x_unknown_ATCs) > 0 && fast_mode == FALSE) {
--- a/R/ab_from_text.R
+++ b/R/ab_from_text.R
@ -39,14 +39,14 @@
 #' @param ... arguments passed on to [as.ab()]
 #' @details This function is also internally used by [as.ab()], although it then only searches for the first drug name and will throw a note if more drug names could have been returned. Note: the [as.ab()] function may use very long regular expressions to match brand names of antimicrobial agents. This may fail on some systems.
 #'
-#' ## Argument `type`
+#' ### Argument `type`
 #' At default, the function will search for antimicrobial drug names. All text elements will be searched for official names, ATC codes and brand names. As it uses [as.ab()] internally, it will correct for misspelling.
 #'
 #' With `type = "dose"` (or similar, like "dosing", "doses"), all text elements will be searched for [numeric] values that are higher than 100 and do not resemble years. The output will be [numeric]. It supports any unit (g, mg, IE, etc.) and multiple values in one clinical text, see *Examples*.
 #'
 #' With `type = "administration"` (or abbreviations, like "admin", "adm"), all text elements will be searched for a form of drug administration. It supports the following forms (including common abbreviations): buccal, implant, inhalation, instillation, intravenous, nasal, oral, parenteral, rectal, sublingual, transdermal and vaginal. Abbreviations for oral (such as 'po', 'per os') will become "oral", all values for intravenous (such as 'iv', 'intraven') will become "iv". It supports multiple values in one clinical text, see *Examples*.
 #'
-#' ## Argument `collapse`
+#' ### Argument `collapse`
 #' Without using `collapse`, this function will return a [list]. This can be convenient to use e.g. inside a `mutate()`:\cr
 #' `df %>% mutate(abx = ab_from_text(clinical_text))`
 #'
--- a/R/data.R
+++ b/R/data.R
@ -31,7 +31,7 @@
 #'
 #' Two data sets containing all antibiotics/antimycotics and antivirals. Use [as.ab()] or one of the [`ab_*`][ab_property()] functions to retrieve values from the [antibiotics] data set. Three identifiers are included in this data set: an antibiotic ID (`ab`, primarily used in this package) as defined by WHONET/EARS-Net, an ATC code (`atc`) as defined by the WHO, and a Compound ID (`cid`) as found in PubChem. Other properties in this data set are derived from one or more of these codes. Note that some drugs have multiple ATC codes.
 #' @format
-#' ## For the [antibiotics] data set: a [tibble][tibble::tibble] with `r nrow(antibiotics)` observations and `r ncol(antibiotics)` variables:
+#' ### For the [antibiotics] data set: a [tibble][tibble::tibble] with `r nrow(antibiotics)` observations and `r ncol(antibiotics)` variables:
 #' - `ab`\cr Antibiotic ID as used in this package (such as `AMC`), using the official EARS-Net (European Antimicrobial Resistance Surveillance Network) codes where available
 #' - `cid`\cr Compound ID as found in PubChem
 #' - `name`\cr Official name as used by WHONET/EARS-Net or the WHO
@ -47,7 +47,7 @@
 #' - `iv_units`\cr Units of `iv_ddd`
 #' - `loinc`\cr All LOINC codes (Logical Observation Identifiers Names and Codes) associated with the name of the antimicrobial agent. Use [ab_loinc()] to retrieve them quickly, see [ab_property()].
 #'
-#' ## For the [antivirals] data set: a [tibble][tibble::tibble] with `r nrow(antivirals)` observations and `r ncol(antivirals)` variables:
+#' ### For the [antivirals] data set: a [tibble][tibble::tibble] with `r nrow(antivirals)` observations and `r ncol(antivirals)` variables:
 #' - `atc`\cr ATC codes (Anatomical Therapeutic Chemical) as defined by the WHOCC
 #' - `cid`\cr Compound ID as found in PubChem
 #' - `name`\cr Official name as used by WHONET/EARS-Net or the WHO
@ -61,7 +61,7 @@
 #'
 #' Synonyms (i.e. trade names) were derived from the Compound ID (`cid`) and consequently only available where a CID is available.
 #'
-#' ## Direct download
+#' ### Direct download
 #' Like all data sets in this package, these data sets are publicly available for download in the following formats: R, MS Excel, Apache Feather, Apache Parquet, SPSS, SAS, and Stata. Please visit [our website for the download links](https://msberends.github.io/AMR/articles/datasets.html). The actual files are of course available on [our GitHub repository](https://github.com/msberends/AMR/tree/main/data-raw).
 #' @source World Health Organization (WHO) Collaborating Centre for Drug Statistics Methodology (WHOCC): <https://www.whocc.no/atc_ddd_index/>
 #'
@ -111,7 +111,7 @@
 #' - The identifier of the parent taxons
 #' - The year and first author of the related scientific publication
 #'
-#' ## Manual additions
+#' ### Manual additions
 #' For convenience, some entries were added manually:
 #'
 #' - 11 entries of *Streptococcus* (beta-haemolytic: groups A, B, C, D, F, G, H, K and unspecified; other: viridans, milleri)
@ -122,7 +122,7 @@
 #'
 #' The syntax used to transform the original data to a cleansed \R format, can be found here: <https://github.com/msberends/AMR/blob/main/data-raw/reproduction_of_microorganisms.R>.
 #'
-#' ## Direct download
+#' ### Direct download
 #' Like all data sets in this package, this data set is publicly available for download in the following formats: R, MS Excel, Apache Feather, Apache Parquet, SPSS, SAS, and Stata. Please visit [our website for the download links](https://msberends.github.io/AMR/articles/datasets.html). The actual files are of course available on [our GitHub repository](https://github.com/msberends/AMR/tree/main/data-raw).
 #' @section About the Records from LPSN (see *Source*):
 #' LPSN is the main source for bacteriological taxonomy of this `AMR` package.
@ -253,7 +253,7 @@
 #' @details
 #' This data set is based on `r format_eucast_version_nr(3.3)`.
 #'
-#' ## Direct download
+#' ### Direct download
 #' Like all data sets in this package, this data set is publicly available for download in the following formats: R, MS Excel, Apache Feather, Apache Parquet, SPSS, SAS, and Stata. Please visit [our website for the download links](https://msberends.github.io/AMR/articles/datasets.html). The actual files are of course available on [our GitHub repository](https://github.com/msberends/AMR/tree/main/data-raw).
 #'
 #' They **allow for machine reading EUCAST and CLSI guidelines**, which is almost impossible with the MS Excel and PDF files distributed by EUCAST and CLSI.
@ -277,7 +277,7 @@
 #' @details
 #' This data set is based on `r format_eucast_version_nr(11.0)`.
 #'
-#' ## Direct download
+#' ### Direct download
 #' Like all data sets in this package, this data set is publicly available for download in the following formats: R, MS Excel, Apache Feather, Apache Parquet, SPSS, SAS, and Stata. Please visit [our website for the download links](https://msberends.github.io/AMR/articles/datasets.html). The actual files are of course available on [our GitHub repository](https://github.com/msberends/AMR/tree/main/data-raw).
 #' @examples
 #' dosage
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@ -59,7 +59,7 @@
 #'
 #' All isolates with a microbial ID of `NA` will be excluded as first isolate.
 #'
-#' ## Different methods
+#' ### Different methods
 #'
 #' According to Hindler *et al.* (2007, \doi{10.1086/511864}), there are different methods (algorithms) to select first isolates with increasing reliability: isolate-based, patient-based, episode-based and phenotype-based. All methods select on a combination of the taxonomic genus and species (not subspecies).
 #'
--- a/R/g.test.R
+++ b/R/g.test.R
@ -41,14 +41,14 @@
 #'
 #' In the goodness-of-fit case simulation is done by random sampling from the discrete distribution specified by `p`, each sample being of size `n = sum(x)`. This simulation is done in \R and may be slow.
 #'
-#' ## *G*-test Of Goodness-of-Fit (Likelihood Ratio Test)
+#' ### *G*-test Of Goodness-of-Fit (Likelihood Ratio Test)
 #' Use the *G*-test of goodness-of-fit when you have one nominal variable with two or more values (such as male and female, or red, pink and white flowers). You compare the observed counts of numbers of observations in each category with the expected counts, which you calculate using some kind of theoretical expectation (such as a 1:1 sex ratio or a 1:2:1 ratio in a genetic cross).
 #'
 #' If the expected number of observations in any category is too small, the *G*-test may give inaccurate results, and you should use an exact test instead ([fisher.test()]).
 #'
 #' The *G*-test of goodness-of-fit is an alternative to the chi-square test of goodness-of-fit ([chisq.test()]); each of these tests has some advantages and some disadvantages, and the results of the two tests are usually very similar.
 #'
-#' ## *G*-test of Independence
+#' ### *G*-test of Independence
 #' Use the *G*-test of independence when you have two nominal variables, each with two or more possible values. You want to know whether the proportions for one variable are different among values of the other variable.
 #'
 #' It is also possible to do a *G*-test of independence with more than two nominal variables. For example, Jackson et al. (2013) also had data for children under 3, so you could do an analysis of old vs. young, thigh vs. arm, and reaction vs. no reaction, all analyzed together.
@ -57,7 +57,7 @@
 #'
 #' The *G*-test of independence is an alternative to the chi-square test of independence ([chisq.test()]), and they will give approximately the same results.
 #'
-#' ## How the Test Works
+#' ### How the Test Works
 #' Unlike the exact test of goodness-of-fit ([fisher.test()]), the *G*-test does not directly calculate the probability of obtaining the observed results or something more extreme. Instead, like almost all statistical tests, the *G*-test has an intermediate step; it uses the data to calculate a test statistic that measures how far the observed data are from the null expectation. You then use a mathematical relationship, in this case the chi-square distribution, to estimate the probability of obtaining that value of the test statistic.
 #'
 #' The *G*-test uses the log of the ratio of two likelihoods as the test statistic, which is why it is also called a likelihood ratio test or log-likelihood ratio test. The formula to calculate a *G*-statistic is:
--- a/R/ggplot_rsi.R
+++ b/R/ggplot_rsi.R
@ -52,7 +52,7 @@
 #' @param ... other arguments passed on to [geom_rsi()] or, in case of [scale_rsi_colours()], named values to set colours. The default colours are colour-blind friendly, while maintaining the convention that e.g. 'susceptible' should be green and 'resistant' should be red. See *Examples*.
 #' @details At default, the names of antibiotics will be shown on the plots using [ab_name()]. This can be set with the `translate_ab` argument. See [count_df()].
 #'
-#' ## The Functions
+#' ### The Functions
 #' [geom_rsi()] will take any variable from the data that has an [`rsi`] class (created with [as.rsi()]) using [rsi_df()] and will plot bars with the percentage R, I and S. The default behaviour is to have the bars stacked and to have the different antibiotics on the x axis.
 #'
 #' [facet_rsi()] creates 2d plots (at default based on S/I/R) using [ggplot2::facet_wrap()].
--- a/R/rsi.R
+++ b/R/rsi.R
@ -43,7 +43,7 @@
 #' @param threshold maximum fraction of invalid antimicrobial interpretations of `x`, see *Examples*
 #' @param ... for using on a [data.frame]: names of columns to apply [as.rsi()] on (supports tidy selection such as `column1:column4`). Otherwise: arguments passed on to methods.
 #' @details
-#' ## How it Works
+#' ### How it Works
 #'
 #' The [as.rsi()] function works in four ways:
 #'
@ -66,21 +66,21 @@
 #'
 #' For points 2, 3 and 4: Use [rsi_interpretation_history()] to retrieve a [data.frame] (or [tibble][tibble::tibble()] if the `tibble` package is installed) with all results of the last [as.rsi()] call.
 #'
-#' ## Supported Guidelines
+#' ### Supported Guidelines
 #'
 #' For interpreting MIC values as well as disk diffusion diameters, currently implemented guidelines are EUCAST (`r min(as.integer(gsub("[^0-9]", "", subset(rsi_translation, guideline %like% "EUCAST")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(rsi_translation, guideline %like% "EUCAST")$guideline)))`) and CLSI (`r min(as.integer(gsub("[^0-9]", "", subset(rsi_translation, guideline %like% "CLSI")$guideline)))`-`r max(as.integer(gsub("[^0-9]", "", subset(rsi_translation, guideline %like% "CLSI")$guideline)))`).
 #'
 #' Thus, the `guideline` argument must be set to e.g., ``r paste0('"', subset(rsi_translation, guideline %like% "EUCAST")$guideline[1], '"')`` or ``r paste0('"', subset(rsi_translation, guideline %like% "CLSI")$guideline[1], '"')``. By simply using `"EUCAST"` (the default) or `"CLSI"` as input, the latest included version of that guideline will automatically be selected. You can set your own data set using the `reference_data` argument. The `guideline` argument will then be ignored.
 #'
-#' ## After Interpretation
+#' ### After Interpretation
 #'
 #' After using [as.rsi()], you can use the [eucast_rules()] defined by EUCAST to (1) apply inferred susceptibility and resistance based on results of other antimicrobials and (2) apply intrinsic resistance based on taxonomic properties of a microorganism.
 #'
-#' ## Machine-Readable Interpretation Guidelines
+#' ### Machine-Readable Interpretation Guidelines
 #'
 #' The repository of this package [contains a machine-readable version](https://github.com/msberends/AMR/blob/main/data-raw/rsi_translation.txt) of all guidelines. This is a CSV file consisting of `r format(nrow(AMR::rsi_translation), big.mark = ",")` rows and `r ncol(AMR::rsi_translation)` columns. This file is machine-readable, since it contains one row for every unique combination of the test method (MIC or disk diffusion), the antimicrobial agent and the microorganism. **This allows for easy implementation of these rules in laboratory information systems (LIS)**. Note that it only contains interpretation guidelines for humans - interpretation guidelines from CLSI for animals were removed.
 #'
-#' ## Other
+#' ### Other
 #'
 #' The function [is.rsi()] detects if the input contains class `<rsi>`. If the input is a [data.frame], it iterates over all columns and returns a [logical] vector.
 #'
--- a/R/translate.R
+++ b/R/translate.R
@ -42,15 +42,13 @@
 #'
 #' # Add e.g. Italian support to that file using:
 #' options(AMR_locale = "Italian")
-#' # or using:
-#' AMR::set_AMR_locale("Italian")
-#'
-#' # And save the file!
 #' ```
+#' 
+#' And then save the file.
 #'
 #' Please read about adding or updating a language in [our Wiki](https://github.com/msberends/AMR/wiki/).
 #'
-#' ## Changing the Default Language
+#' ### Changing the Default Language
 #' The system language will be used at default (as returned by `Sys.getenv("LANG")` or, if `LANG` is not set, [`Sys.getlocale("LC_COLLATE")`][Sys.getlocale()]), if that language is supported. But the language to be used can be overwritten in two ways and will be checked in this order:
 #'
 #'   1. Setting the R option `AMR_locale`, either by using e.g. `set_AMR_locale("German")` or by running e.g. `options(AMR_locale = "German")`.
@ -93,7 +91,6 @@ get_AMR_locale <- function() {
  if (!is.null(getOption("AMR_locale", default = NULL))) {
    return(validate_language(getOption("AMR_locale"), extra_txt = "set with `options(AMR_locale = ...)`"))
  }
-
  lang <- ""
  # now check the LANGUAGE system variable - return it if set
  if (!identical("", Sys.getenv("LANGUAGE"))) {
@ -105,16 +102,7 @@ get_AMR_locale <- function() {
  if (lang == "") {
    lang <- Sys.getlocale("LC_COLLATE")
  }
-
-  lang <- find_language(lang)
-  if (lang != "en" && interactive() && message_not_thrown_before("get_AMR_locale", entire_session = TRUE)) {
-    message_(
-      "Assuming the ", LANGUAGES_SUPPORTED_NAMES[[lang]]$exonym, " language (",
-      LANGUAGES_SUPPORTED_NAMES[[lang]]$endonym, ") for the AMR package. Change this with `set_AMR_locale()`. ",
-      "This note will be shown once per session but can be silenced, see `?set_AMR_locale()`."
-    )
-  }
-  lang
+  find_language(lang)
 }

 #' @rdname translate
--- a/R/zzz.R
+++ b/R/zzz.R
@ -45,6 +45,11 @@ AMR_env$mo_previously_coerced <- data.frame(
  mo = character(0),
  stringsAsFactors = FALSE
 )
+AMR_env$ab_previously_coerced <- data.frame(
+  x = character(0),
+  ab = character(0),
+  stringsAsFactors = FALSE
+)
 AMR_env$rsi_interpretation_history <- data.frame(
  datetime = Sys.time()[0],
  index = integer(0),
@ -147,6 +152,18 @@ if (utf8_supported && !is_latex) {
  assign(x = "INTRINSIC_R", value = create_intr_resistance(), envir = asNamespace("AMR"))
 }

+.onAttach <- function(lib, pkg) {
+  if (interactive() && is.null(getOption("AMR_locale", default = NULL))) {
+    current_lang <- get_AMR_locale()
+    if (current_lang != "en") {
+      packageStartupMessage(word_wrap(
+        "Assuming the ", LANGUAGES_SUPPORTED_NAMES[[current_lang]]$exonym, " language (",
+        LANGUAGES_SUPPORTED_NAMES[[current_lang]]$endonym, ") for the AMR package. See `set_AMR_locale()` to change this or to silence this note.",
+        add_fn = list(font_blue), as_note = TRUE))
+    }
+  }
+}
+
 # Helper functions --------------------------------------------------------

 create_AB_lookup <- function() {
--- a/man/first_isolate.Rd
+++ b/man/first_isolate.Rd
@ -126,6 +126,8 @@ All mentioned methods are covered in the \code{\link[=first_isolate]{first_isola
   - Any difference in key antimicrobial results \tab - \code{first_isolate(x, type = "keyantimicrobials")} \cr
 }

+}
+
 \subsection{Isolate-based}{

 This method does not require any selection, as all isolates should be included. It does, however, respect all arguments set in the \code{\link[=first_isolate]{first_isolate()}} function. For example, the default setting for \code{include_unknown} (\code{FALSE}) will omit selection of rows without a microbial ID.
@ -160,8 +162,6 @@ Key antimicrobials are internally selected using the \code{\link[=key_antimicrob
 }

 The default method is phenotype-based (using \code{type = "points"}) and episode-based (using \code{episode_days = 365}). This makes sure that every genus-species combination is selected per patient once per year, while taking into account all antimicrobial test results. If no antimicrobial test results are available in the data set, only the episode-based method is applied at default.
-}
-
 }
 }
 \examples{
--- a/man/translate.Rd
+++ b/man/translate.Rd
@ -34,12 +34,10 @@ utils::file.edit("~/.Rprofile")

 # Add e.g. Italian support to that file using:
 options(AMR_locale = "Italian")
-# or using:
-AMR::set_AMR_locale("Italian")
-
-# And save the file!
 }\if{html}{\out{</div>}}

+And then save the file.
+
 Please read about adding or updating a language in \href{https://github.com/msberends/AMR/wiki/}{our Wiki}.
 \subsection{Changing the Default Language}{