new class bactid

2026-06-21 10:56:23 +02:00 · 2018-07-23 14:14:03 +02:00
parent 40de1b4ac2
commit 8421638b60
21 changed files with 408 additions and 251 deletions
--- a/1
+++ b/1
@@ -28,7 +28,6 @@ Depends:
    R (>= 3.0.0)
 Imports:
    backports,
-    broom,
    clipr,
    curl,
    dplyr (>= 0.7.0),
--- a/6
+++ b/6
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand

+S3method(as.data.frame,bactid)
 S3method(as.data.frame,frequency_tbl)
 S3method(as.double,mic)
 S3method(as.integer,mic)
@@ -16,6 +17,7 @@ S3method(kurtosis,matrix)
 S3method(plot,frequency_tbl)
 S3method(plot,mic)
 S3method(plot,rsi)
+S3method(print,bactid)
 S3method(print,frequency_tbl)
 S3method(print,mic)
 S3method(print,rsi)
@@ -32,6 +34,7 @@ export(MDRO)
 export(MRGN)
 export(abname)
 export(anti_join_microorganisms)
+export(as.bactid)
 export(as.mic)
 export(as.rsi)
 export(atc_ddd)
@@ -48,6 +51,7 @@ export(guess_atc)
 export(guess_bactid)
 export(inner_join_microorganisms)
 export(interpretive_reading)
+export(is.bactid)
 export(is.mic)
 export(is.rsi)
 export(key_antibiotics)
@@ -68,6 +72,7 @@ export(semi_join_microorganisms)
 export(skewness)
 export(susceptibility)
 export(top_freq)
+exportMethods(as.data.frame.bactid)
 exportMethods(as.data.frame.frequency_tbl)
 exportMethods(as.double.mic)
 exportMethods(as.integer.mic)
@@ -85,6 +90,7 @@ exportMethods(kurtosis.matrix)
 exportMethods(plot.frequency_tbl)
 exportMethods(plot.mic)
 exportMethods(plot.rsi)
+exportMethods(print.bactid)
 exportMethods(print.frequency_tbl)
 exportMethods(print.mic)
 exportMethods(print.rsi)
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,6 +4,7 @@
 * **BREAKING**: the methodology for determining first weighted isolates was changed. The antibiotics that are compared between isolates (call *key antibiotics*) to include more first isolates (afterwards called first *weighted* isolates) are now as follows:
  * Gram-positive: amoxicillin, amoxicillin/clavlanic acid, cefuroxime, piperacillin/tazobactam, ciprofloxacin, trimethoprim/sulfamethoxazole, vancomycin, teicoplanin, tetracycline, erythromycin, oxacillin, rifampicin
  * Gram-negative: amoxicillin, amoxicillin/clavlanic acid, cefuroxime, piperacillin/tazobactam, ciprofloxacin, trimethoprim/sulfamethoxazole, gentamicin, tobramycin, colistin, cefotaxime, ceftazidime, meropenem
+* Functions `as.bactid` and `is.bactid` to transform/look up microbial ID's; this replaces the function `guess_bactid` but it will remain available for backwards compatibility
 * For convience, new descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices
 * Function `g.test` to perform the Χ<sup>2</sup> distributed [*G*-test](https://en.wikipedia.org/wiki/G-test), which use is the same as `chisq.test`
 * Function `ratio` to transform a vector of values to a preset ratio
@@ -28,8 +29,9 @@
 * Printing of class `mic` now shows all MIC values
 * `%like%` now supports multiple patterns
 * Frequency tables are now actual `data.frame`s with altered console printing to make it look like a frequency table. Because of this, the parameter `toConsole` is not longer needed.
-* Small translational improvements to the `septic_patients` dataset
-* Small improvements to the `microorganisms` dataset, especially for *Salmonella*
+* Fix for `freq` where the class of an item would be lost
+* Small translational improvements to the `septic_patients` dataset and the column `bactid` now has the new class `"bactid"`
+* Small improvements to the `microorganisms` dataset (especially for *Salmonella*) and the column `bactid` now has the new class `"bactid"`
 * Combined MIC/RSI values will now be coerced by the `rsi` and `mic` functions:
  * `as.rsi("<=0.002; S")` will return `S`
  * `as.mic("<=0.002; S")` will return `<=0.002`
@@ -38,7 +40,8 @@
 * Build-in host check for `atc_property` as it requires the host set by `url` to be responsive
 * Improved `first_isolate` algorithm to exclude isolates where bacteria ID or genus is unavailable
 * Fix for warning *hybrid evaluation forced for row_number* ([`924b62`](https://github.com/tidyverse/dplyr/commit/924b62)) from the `dplyr` package v0.7.5 and above
-* Support for 1 or 2 columns as input for `guess_bactid`
+* Support for empty values and for 1 or 2 columns as input for `guess_bactid` (now called `as.bactid`)
+  * So `yourdata %>% select(genus, species) %>% as.bactid()` now also works

 #### Other
 * Unit testing for R 3.0 and the latest available release: https://travis-ci.org/msberends/AMR
--- a/R/guess_bactid.R
+++ b/R/guess_bactid.R
@@ -16,37 +16,58 @@
 # GNU General Public License for more details.                         #
 # ==================================================================== #

-#' Find bacteria ID based on genus/species
+#' Transform to bacteria ID
 #'
-#' Use this function to determine a valid ID based on a genus (and species). This input could be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples.
-#' @param x character vector or a dataframe with one or two columns
+#' Use this function to determine a valid ID based on a genus (and species). This input can be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples.
+#' @param x a character vector or a dataframe with one or two columns
+#' @rdname as.bactid
+#' @details Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. For example:
+#' \itemize{
+#'   \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first}
+#'   \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae}}
+#'   \item{Something like \code{"p aer"} will return the ID of \emph{Pseudomonas aeruginosa} and not \emph{Pasteurella aerogenes}}
+#'   \item{Something like \code{"stau"} or \code{"staaur"} will return the ID of \emph{Staphylococcus aureus} and not \emph{Staphylococcus auricularis}}
+#' }
+#' Moreover, this function also supports ID's based on only Gram stain, when the species is not known. \cr
+#' For example, \code{"Gram negative rods"} and \code{"GNR"} will both return the ID of a Gram negative rod: \code{GNR}.
 #' @export
 #' @importFrom dplyr %>% filter pull
-#' @return Character (vector).
+#' @return Character (vector) with class \code{"bactid"}. Unknown values will return \code{NA}.
 #' @seealso \code{\link{microorganisms}} for the dataframe that is being used to determine ID's.
 #' @examples
 #' # These examples all return "STAAUR", the ID of S. aureus:
-#' guess_bactid("stau")
-#' guess_bactid("STAU")
-#' guess_bactid("staaur")
-#' guess_bactid("S. aureus")
-#' guess_bactid("S aureus")
-#' guess_bactid("Staphylococcus aureus")
-#' guess_bactid("MRSA") # Methicillin-resistant S. aureus
-#' guess_bactid("VISA") # Vancomycin Intermediate S. aureus
+#' as.bactid("stau")
+#' as.bactid("STAU")
+#' as.bactid("staaur")
+#' as.bactid("S. aureus")
+#' as.bactid("S aureus")
+#' as.bactid("Staphylococcus aureus")
+#' as.bactid("MRSA") # Methicillin Resistant S. aureus
+#' as.bactid("VISA") # Vancomycin Intermediate S. aureus
+#' as.bactid("VRSA") # Vancomycin Resistant S. aureus
 #'
 #' \dontrun{
-#' df$bactid <- guess_bactid(df$microorganism_name)
+#' df$bactid <- as.bactid(df$microorganism_name)
 #'
 #' # the select function of tidyverse is also supported:
-#' df$bactid <- df %>% select(microorganism_name) %>% guess_bactid()
+#' library(dplyr)
+#' df$bactid <- df %>%
+#'   select(microorganism_name) %>%
+#'   as.bactid()
 #'
 #' # and can even contain 2 columns, which is convenient for genus/species combinations:
-#' df$bactid <- df %>% select(genus, species) %>% guess_bactid()
+#' df$bactid <- df %>%
+#'   select(genus, species) %>%
+#'   as.bactid()
+#'
 #' # same result:
-#' df <- df %>% mutate(bactid = paste(genus, species)) %>% guess_bactid())
+#' df <- df %>%
+#'   mutate(bactid = paste(genus, species) %>%
+#'                     as.bactid())
 #' }
-guess_bactid <- function(x) {
+as.bactid <- function(x) {
+
+  failures <- character(0)

  if (NCOL(x) == 2) {
    # support tidyverse selection like: df %>% select(colA, colB)
@@ -60,17 +81,19 @@ guess_bactid <- function(x) {
    if (NCOL(x) > 2) {
      stop('`x` can be 2 columns at most', call. = FALSE)
    }
+
    # support tidyverse selection like: df %>% select(colA)
    if (!is.vector(x)) {
      x <- pull(x, 1)
    }
  }

+  x.fullbackup <- x
  # remove dots and other non-text in case of "E. coli" except spaces
-  x <- gsub("[^a-zA-Z ]+", "", x)
+  x <- gsub("[^a-zA-Z0-9 ]+", "", x)
  # but spaces before and after should be omitted
  x <- trimws(x, which = "both")
-  x.bak <- x
+  x.backup <- x
  # replace space by regex sign
  x <- gsub(" ", ".*", x, fixed = TRUE)
  # add start and stop
@@ -96,42 +119,44 @@ guess_bactid <- function(x) {
      # avoid detection of Pasteurella aerogenes in case of Pseudomonas aeruginosa
      x[i] <- 'Pseudomonas aeruginosa'
    }
-    if (tolower(x[i]) %like% 'coagulase') {
-      # coerce S. coagulase negative
+    if (tolower(x[i]) %like% 'coagulase'
+        | tolower(x[i]) %like% 'cns'
+        | tolower(x[i]) %like% 'cons') {
+      # coerce S. coagulase negative, also as CNS and CoNS
      x[i] <- 'Coagulase Negative Staphylococcus (CNS)'
    }

    # translate known trivial names to genus+species
-    if (!is.na(x.bak[i])) {
-      if (toupper(x.bak[i]) == 'MRSA'
-          | toupper(x.bak[i]) == 'VISA'
-          | toupper(x.bak[i]) == 'VRSA') {
+    if (!is.na(x.backup[i])) {
+      if (toupper(x.backup[i]) == 'MRSA'
+          | toupper(x.backup[i]) == 'VISA'
+          | toupper(x.backup[i]) == 'VRSA') {
        x[i] <- 'Staphylococcus aureus'
      }
-      if (toupper(x.bak[i]) == 'MRSE') {
+      if (toupper(x.backup[i]) == 'MRSE') {
        x[i] <- 'Staphylococcus epidermidis'
      }
-      if (toupper(x.bak[i]) == 'VRE') {
+      if (toupper(x.backup[i]) == 'VRE') {
        x[i] <- 'Enterococcus'
      }
-      if (toupper(x.bak[i]) == 'MRPA') {
+      if (toupper(x.backup[i]) == 'MRPA') {
        # multi resistant P. aeruginosa
        x[i] <- 'Pseudomonas aeruginosa'
      }
-      if (toupper(x.bak[i]) == 'PISP'
-          | toupper(x.bak[i]) == 'PRSP') {
+      if (toupper(x.backup[i]) == 'PISP'
+          | toupper(x.backup[i]) == 'PRSP') {
        # peni resistant S. pneumoniae
        x[i] <- 'Streptococcus pneumoniae'
      }
-      if (toupper(x.bak[i]) == 'VISP'
-          | toupper(x.bak[i]) == 'VRSP') {
+      if (toupper(x.backup[i]) == 'VISP'
+          | toupper(x.backup[i]) == 'VRSP') {
        # vanco resistant S. pneumoniae
        x[i] <- 'Streptococcus pneumoniae'
      }
    }

    # let's try the ID's first
-    found <- AMR::microorganisms %>% filter(bactid == x.bak[i])
+    found <- AMR::microorganisms %>% filter(bactid == x.backup[i])

    if (nrow(found) == 0) {
      # now try exact match
@@ -152,38 +177,82 @@ guess_bactid <- function(x) {
    }
    if (nrow(found) == 0) {
      # search for GLIMS code
-      if (toupper(x.bak[i]) %in% toupper(AMR::microorganisms.umcg$mocode)) {
-        found <- AMR::microorganisms.umcg %>% filter(toupper(mocode) == toupper(x.bak[i]))
+      if (toupper(x.backup[i]) %in% toupper(AMR::microorganisms.umcg$mocode)) {
+        found <- AMR::microorganisms.umcg %>% filter(toupper(mocode) == toupper(x.backup[i]))
      }
    }
    if (nrow(found) == 0) {
      # try splitting of characters and then find ID
      # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus
      x_split <- x
-      x_length <- nchar(x.bak[i])
-      x_split[i] <- paste0(x.bak[i] %>% substr(1, x_length / 2) %>% trimws(),
+      x_length <- nchar(x.backup[i])
+      x_split[i] <- paste0(x.backup[i] %>% substr(1, x_length / 2) %>% trimws(),
                           '.* ',
-                           x.bak[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws())
+                           x.backup[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws())
      found <- AMR::microorganisms %>% filter(fullname %like% paste0('^', x_split[i]))
    }
    if (nrow(found) == 0) {
      # try any match with text before and after original search string
      # so "negative rods" will be "GNR"
-      if (x.bak[i] %like% "^Gram") {
-        x.bak[i] <- gsub("^Gram", "", x.bak[i], ignore.case = TRUE)
+      if (x.backup[i] %like% "^Gram") {
+        x.backup[i] <- gsub("^Gram", "", x.backup[i], ignore.case = TRUE)
        # remove leading and trailing spaces again
-        x.bak[i] <- trimws(x.bak[i], which = "both")
+        x.backup[i] <- trimws(x.backup[i], which = "both")
      }
-      if (!is.na(x.bak[i])) {
-        found <- AMR::microorganisms %>% filter(fullname %like% x.bak[i])
+      if (!is.na(x.backup[i])) {
+        found <- AMR::microorganisms %>% filter(fullname %like% x.backup[i])
      }
    }

-    if (nrow(found) != 0) {
+    if (nrow(found) != 0 & x.backup[i] != "") {
      x[i] <- as.character(found[1, 'bactid'])
    } else {
-      x[i] <- ""
+      x[i] <- NA_character_
+      failures <- c(failures, x.fullbackup[i])
    }
  }
+
+  failures <- failures[!failures %in% c(NA, NULL, NaN)]
+  if (length(failures) > 0) {
+    warning("These values could not be coerced to a valid bactid: ",
+            paste('"', unique(failures), '"', sep = "", collapse = ', '),
+            ".",
+            call. = FALSE)
+  }
+  class(x) <- "bactid"
+  attr(x, 'package') <- 'AMR'
+  attr(x, 'package.version') <- packageDescription('AMR')$Version
  x
 }
+
+#' @rdname as.bactid
+#' @export
+guess_bactid <- as.bactid
+
+#' @rdname as.bactid
+#' @export
+is.bactid <- function(x) {
+  identical(class(x), "bactid")
+}
+
+#' @exportMethod print.bactid
+#' @export
+#' @noRd
+print.bactid <- function(x, ...) {
+  cat("Class 'bactid'\n")
+  print.default(as.character(x), quote = FALSE)
+}
+
+#' @exportMethod as.data.frame.bactid
+#' @export
+#' @noRd
+as.data.frame.bactid <- function (x, ...) {
+  # same as as.data.frame.character but with removed stringsAsFactors
+  nm <- paste(deparse(substitute(x), width.cutoff = 500L),
+              collapse = " ")
+  if (!"nm" %in% names(list(...))) {
+    as.data.frame.vector(x, ..., nm = nm)
+  } else {
+    as.data.frame.vector(x, ...)
+  }
+}
--- a/R/eucast.R
+++ b/R/eucast.R
@@ -201,9 +201,10 @@ EUCAST_rules <- function(tbl,
  }

  # join to microorganisms table
-  joinby <- colnames(AMR::microorganisms)[1]
-  names(joinby) <- col_bactid
-  tbl <- tbl %>% left_join(y = AMR::microorganisms, by = joinby, suffix = c("_tempmicroorganisms", ""))
+  if (!tbl %>% pull(col_bactid) %>% is.bactid()) {
+    tbl[, col_bactid] <- tbl %>% pull(col_bactid) %>% as.bactid()
+  }
+  tbl <- tbl %>% left_join_microorganisms(by = col_bactid, suffix = c("_tempmicroorganisms", ""))

  # antibiotic classes
  aminoglycosides <- c(tobr, gent, kana, neom, neti, siso)
--- a/R/first_isolates.R
+++ b/R/first_isolates.R
@@ -22,7 +22,7 @@
 #' @param tbl a \code{data.frame} containing isolates.
 #' @param col_date column name of the result date (or date that is was received on the lab)
 #' @param col_patient_id column name of the unique IDs of the patients
-#' @param col_bactid column name of the unique IDs of the microorganisms (should occur in the \code{\link{microorganisms}} dataset). Get your bactid's with the function \code{\link{guess_bactid}}, that takes microorganism names as input.
+#' @param col_bactid column name of the unique IDs of the microorganisms: \code{bactid}'s. If this column has another class than \code{"bactid"}, values will be coerced using \code{\link{as.bactid}}.
 #' @param col_testcode column name of the test codes. Use \code{col_testcode = NA} to \strong{not} exclude certain test codes (like test codes for screening). In that case \code{testcodes_exclude} will be ignored. Supports tidyverse-like quotation.
 #' @param col_specimen column name of the specimen type or group
 #' @param col_icu column name of the logicals (\code{TRUE}/\code{FALSE}) whether a ward or department is an Intensive Care Unit (ICU)
@@ -126,7 +126,7 @@ first_isolate <- function(tbl,

  # bactid OR genus+species must be available
  if (is.na(col_bactid) & (is.na(col_genus) | is.na(col_species))) {
-    stop('`col_bactid or both `col_genus` and `col_species` must be available.')
+    stop('`col_bactid` or both `col_genus` and `col_species` must be available.')
  }

  # check if columns exist
@@ -152,6 +152,9 @@ first_isolate <- function(tbl,
  check_columns_existance(col_keyantibiotics)

  if (!is.na(col_bactid)) {
+    if (!tbl %>% pull(col_bactid) %>% is.bactid()) {
+      tbl[, col_bactid] <- tbl %>% pull(col_bactid) %>% as.bactid()
+    }
    tbl <- tbl %>% left_join_microorganisms(by = col_bactid)
    col_genus <- "genus"
    col_species <- "species"
--- a/R/freq.R
+++ b/R/freq.R
@@ -273,8 +273,11 @@ frequency_tbl <- function(x,
  } else {
    NAs <- x[is.na(x)]
  }
+
  if (na.rm == TRUE) {
+    x_class <- class(x)
    x <- x[!x %in% NAs]
+    class(x) <- x_class
  }

  if (missing(sort.count) & 'factor' %in% class(x)) {
--- a/R/join_microorganisms.R
+++ b/R/join_microorganisms.R
@@ -26,8 +26,8 @@
 #' df2 <- left_join_microorganisms(df, "bacteria_id")
 #' colnames(df2)
 inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (any(class(x) %in% c('character', 'factor'))) {
-    x <- data.frame(bactid = x, stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -36,7 +36,9 @@ inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...
  } else {
    joinby <- by
  }
-  join <- dplyr::inner_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  join <- suppressWarnings(
+    dplyr::inner_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  )
  if (nrow(join) > nrow(x)) {
    warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original')
  }
@@ -46,8 +48,8 @@ inner_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...
 #' @rdname join
 #' @export
 left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (any(class(x) %in% c('character', 'factor'))) {
-    x <- data.frame(bactid = x, stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -56,7 +58,9 @@ left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...)
  } else {
    joinby <- by
  }
-  join <- dplyr::left_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  join <- suppressWarnings(
+    dplyr::left_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  )
  if (nrow(join) > nrow(x)) {
    warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original')
  }
@@ -66,8 +70,8 @@ left_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...)
 #' @rdname join
 #' @export
 right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (any(class(x) %in% c('character', 'factor'))) {
-    x <- data.frame(bactid = x, stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -76,7 +80,9 @@ right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...
  } else {
    joinby <- by
  }
-  join <- dplyr::right_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  join <- suppressWarnings(
+    dplyr::right_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  )
  if (nrow(join) > nrow(x)) {
    warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original')
  }
@@ -86,8 +92,8 @@ right_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...
 #' @rdname join
 #' @export
 full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...) {
-  if (any(class(x) %in% c('character', 'factor'))) {
-    x <- data.frame(bactid = x, stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -96,7 +102,9 @@ full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...)
  } else {
    joinby <- by
  }
-  join <- dplyr::full_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  join <- suppressWarnings(
+    dplyr::full_join(x = x, y = AMR::microorganisms, by = joinby, suffix = c("2", ""), ...)
+  )
  if (nrow(join) > nrow(x)) {
    warning('the newly joined tbl contains ', nrow(join) - nrow(x), ' rows more that its original')
  }
@@ -106,8 +114,8 @@ full_join_microorganisms <- function(x, by = 'bactid', suffix = c("2", ""), ...)
 #' @rdname join
 #' @export
 semi_join_microorganisms <- function(x, by = 'bactid', ...) {
-  if (any(class(x) %in% c('character', 'factor'))) {
-    x <- data.frame(bactid = x, stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -116,14 +124,16 @@ semi_join_microorganisms <- function(x, by = 'bactid', ...) {
  } else {
    joinby <- by
  }
-  dplyr::semi_join(x = x, y = AMR::microorganisms, by = joinby, ...)
+  suppressWarnings(
+    dplyr::semi_join(x = x, y = AMR::microorganisms, by = joinby, ...)
+  )
 }

 #' @rdname join
 #' @export
 anti_join_microorganisms <- function(x, by = 'bactid', ...) {
-  if (any(class(x) %in% c('character', 'factor'))) {
-    x <- data.frame(bactid = x, stringsAsFactors = FALSE)
+  if (!any(class(x) %in% c("bactid", "data.frame", "matrix"))) {
+    x <- data.frame(bactid = as.bactid(x), stringsAsFactors = FALSE)
  }
  # no name set to `by` parameter
  if (is.null(names(by))) {
@@ -132,5 +142,7 @@ anti_join_microorganisms <- function(x, by = 'bactid', ...) {
  } else {
    joinby <- by
  }
-  dplyr::anti_join(x = x, y = AMR::microorganisms, by = joinby, ...)
+  suppressWarnings(
+    dplyr::anti_join(x = x, y = AMR::microorganisms, by = joinby, ...)
+  )
 }
--- a/README.md
+++ b/README.md
@@ -126,6 +126,86 @@ after
 # 5 PSEAER    R    R    -    -    R
 ```

+Bacteria ID's can be retrieved with the `as.bactid` function. It uses any type of info about a microorganism as input. For example, all these will return value `STAAUR`, the ID of *S. aureus*:
+```r
+as.bactid("stau")
+as.bactid("STAU")
+as.bactid("staaur")
+as.bactid("S. aureus")
+as.bactid("S aureus")
+as.bactid("Staphylococcus aureus")
+as.bactid("MRSA") # Methicillin Resistant S. aureus
+as.bactid("VISA") # Vancomycin Intermediate S. aureus
+as.bactid("VRSA") # Vancomycin Resistant S. aureus
+```
+
+### New classes
+This package contains two new S3 classes: `mic` for MIC values (e.g. from Vitek or Phoenix) and `rsi` for antimicrobial drug interpretations (i.e. S, I and R). Both are actually ordered factors under the hood (an MIC of `2` being higher than `<=1` but lower than `>=32`, and for class `rsi` factors are ordered as `S < I < R`). 
+Both classes have extensions for existing generic functions like `print`, `summary` and `plot`.
+
+```r
+# Transform values to new classes
+mic_data <- as.mic(c(">=32", "1.0", "8", "<=0.128", "8", "16", "16"))
+rsi_data <- as.rsi(c(rep("S", 474), rep("I", 36), rep("R", 370)))
+```
+These functions also try to coerce valid values.
+
+Quick overviews when just printing objects:
+```r
+mic_data
+# Class 'mic': 7 isolates
+# 
+# <NA>  0
+# 
+# <=0.128       1       8      16    >=32
+#       1       1       2       2       1
+
+rsi_data
+# Class 'rsi': 880 isolates
+# 
+# <NA>:       0 
+# Sum of S:   474 
+# Sum of IR:  406 
+# - Sum of R: 370 
+# - Sum of I: 36 
+# 
+#   %S  %IR   %I   %R 
+# 53.9 46.1  4.1 42.0 
+```
+
+A plot of `rsi_data`:
+```r
+plot(rsi_data)
+```
+
+![example1](man/figures/rsi_example.png)
+
+A plot of `mic_data` (defaults to bar plot):
+```r
+plot(mic_data)
+```
+
+![example2](man/figures/mic_example.png)
+
+Other epidemiological functions:
+
+```r
+# Determine key antibiotic based on bacteria ID
+key_antibiotics(...)
+
+# Selection of first isolates of any patient
+first_isolate(...)
+
+# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr)
+rsi(...)
+# Predict resistance levels of antibiotics
+rsi_predict(...)
+
+# Get name of antibiotic by ATC code
+abname(...)
+abname("J01CR02", from = "atc", to = "umcg") # "AMCL"
+```
+
 ### Frequency tables
 Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). It can be used in two ways:
 ```r
@@ -235,79 +315,12 @@ Learn more about this function with:
 ?freq
 ```

-### New classes
-This package contains two new S3 classes: `mic` for MIC values (e.g. from Vitek or Phoenix) and `rsi` for antimicrobial drug interpretations (i.e. S, I and R). Both are actually ordered factors under the hood (an MIC of `2` being higher than `<=1` but lower than `>=32`, and for class `rsi` factors are ordered as `S < I < R`). 
-Both classes have extensions for existing generic functions like `print`, `summary` and `plot`.
-
-```r
-# Transform values to new classes
-mic_data <- as.mic(c(">=32", "1.0", "8", "<=0.128", "8", "16", "16"))
-rsi_data <- as.rsi(c(rep("S", 474), rep("I", 36), rep("R", 370)))
-```
-These functions also try to coerce valid values.
-
-Quick overviews when just printing objects:
-```r
-mic_data
-# Class 'mic': 7 isolates
-# 
-# <NA>  0
-# 
-# <=0.128       1       8      16    >=32
-#       1       1       2       2       1
-
-rsi_data
-# Class 'rsi': 880 isolates
-# 
-# <NA>:       0 
-# Sum of S:   474 
-# Sum of IR:  406 
-# - Sum of R: 370 
-# - Sum of I: 36 
-# 
-#   %S  %IR   %I   %R 
-# 53.9 46.1  4.1 42.0 
-```
-
-A plot of `rsi_data`:
-```r
-plot(rsi_data)
-```
-
-![example1](man/figures/rsi_example.png)
-
-A plot of `mic_data` (defaults to bar plot):
-```r
-plot(mic_data)
-```
-
-![example2](man/figures/mic_example.png)
-
-Other epidemiological functions:
-
-```r
-# Determine key antibiotic based on bacteria ID
-key_antibiotics(...)
-
-# Selection of first isolates of any patient
-first_isolate(...)
-
-# Calculate resistance levels of antibiotics, can be used with `summarise` (dplyr)
-rsi(...)
-# Predict resistance levels of antibiotics
-rsi_predict(...)
-
-# Get name of antibiotic by ATC code
-abname(...)
-abname("J01CR02", from = "atc", to = "umcg") # "AMCL"
-```
-
 ### Databases included in package
 Datasets to work with antibiotics and bacteria properties.
 ```r
 # Dataset with 2000 random blood culture isolates from anonymised
 # septic patients between 2001 and 2017 in 5 Dutch hospitals
-septic_patients   # A tibble: 4,000 x 47
+septic_patients   # A tibble: 2,000 x 47

 # Dataset with ATC antibiotics codes, official names, trade names 
 # and DDD's (oral and parenteral)
--- a/data/microorganisms.rda
+++ b/data/microorganisms.rda
--- a/data/septic_patients.rda
+++ b/data/septic_patients.rda
--- a/man/as.bactid.Rd
+++ b/man/as.bactid.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bactid.R
+\name{as.bactid}
+\alias{as.bactid}
+\alias{guess_bactid}
+\alias{is.bactid}
+\title{Transform to bacteria ID}
+\usage{
+as.bactid(x)
+
+guess_bactid(x)
+
+is.bactid(x)
+}
+\arguments{
+\item{x}{a character vector or a dataframe with one or two columns}
+}
+\value{
+Character (vector) with class \code{"bactid"}. Unknown values will return \code{NA}.
+}
+\description{
+Use this function to determine a valid ID based on a genus (and species). This input can be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples.
+}
+\details{
+Some exceptions have been built in to get more logical results, based on prevalence of human pathogens. For example:
+\itemize{
+  \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first}
+  \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae}}
+  \item{Something like \code{"p aer"} will return the ID of \emph{Pseudomonas aeruginosa} and not \emph{Pasteurella aerogenes}}
+  \item{Something like \code{"stau"} or \code{"staaur"} will return the ID of \emph{Staphylococcus aureus} and not \emph{Staphylococcus auricularis}}
+}
+Moreover, this function also supports ID's based on only Gram stain, when the species is not known. \cr
+For example, \code{"Gram negative rods"} and \code{"GNR"} will both return the ID of a Gram negative rod: \code{GNR}.
+}
+\examples{
+# These examples all return "STAAUR", the ID of S. aureus:
+as.bactid("stau")
+as.bactid("STAU")
+as.bactid("staaur")
+as.bactid("S. aureus")
+as.bactid("S aureus")
+as.bactid("Staphylococcus aureus")
+as.bactid("MRSA") # Methicillin Resistant S. aureus
+as.bactid("VISA") # Vancomycin Intermediate S. aureus
+as.bactid("VRSA") # Vancomycin Resistant S. aureus
+
+\dontrun{
+df$bactid <- as.bactid(df$microorganism_name)
+
+# the select function of tidyverse is also supported:
+library(dplyr)
+df$bactid <- df \%>\%
+  select(microorganism_name) \%>\%
+  as.bactid()
+
+# and can even contain 2 columns, which is convenient for genus/species combinations:
+df$bactid <- df \%>\%
+  select(genus, species) \%>\%
+  as.bactid()
+
+# same result:
+df <- df \%>\%
+  mutate(bactid = paste(genus, species) \%>\%
+                    as.bactid())
+}
+}
+\seealso{
+\code{\link{microorganisms}} for the dataframe that is being used to determine ID's.
+}
--- a/man/first_isolate.Rd
+++ b/man/first_isolate.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/first_isolates.R
+% Please edit documentation in R/first_isolate.R
 \name{first_isolate}
 \alias{first_isolate}
 \title{Determine first (weighted) isolates}
@@ -21,7 +21,7 @@ first_isolate(tbl, col_date, col_patient_id, col_bactid = NA,

 \item{col_patient_id}{column name of the unique IDs of the patients}

-\item{col_bactid}{column name of the unique IDs of the microorganisms (should occur in the \code{\link{microorganisms}} dataset). Get your bactid's with the function \code{\link{guess_bactid}}, that takes microorganism names as input.}
+\item{col_bactid}{column name of the unique IDs of the microorganisms: \code{bactid}'s. If this column has another class than \code{"bactid"}, values will be coerced using \code{\link{as.bactid}}.}

 \item{col_testcode}{column name of the test codes. Use \code{col_testcode = NA} to \strong{not} exclude certain test codes (like test codes for screening). In that case \code{testcodes_exclude} will be ignored. Supports tidyverse-like quotation.}

--- a/man/guess_bactid.Rd
+++ b/man/guess_bactid.Rd
@@ -1,43 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/guess_bactid.R
-\name{guess_bactid}
-\alias{guess_bactid}
-\title{Find bacteria ID based on genus/species}
-\usage{
-guess_bactid(x)
-}
-\arguments{
-\item{x}{character vector or a dataframe with one or two columns}
-}
-\value{
-Character (vector).
-}
-\description{
-Use this function to determine a valid ID based on a genus (and species). This input could be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples.
-}
-\examples{
-# These examples all return "STAAUR", the ID of S. aureus:
-guess_bactid("stau")
-guess_bactid("STAU")
-guess_bactid("staaur")
-guess_bactid("S. aureus")
-guess_bactid("S aureus")
-guess_bactid("Staphylococcus aureus")
-guess_bactid("MRSA") # Methicillin-resistant S. aureus
-guess_bactid("VISA") # Vancomycin Intermediate S. aureus
-
-\dontrun{
-df$bactid <- guess_bactid(df$microorganism_name)
-
-# the select function of tidyverse is also supported:
-df$bactid <- df \%>\% select(microorganism_name) \%>\% guess_bactid()
-
-# and can even contain 2 columns, which is convenient for genus/species combinations:
-df$bactid <- df \%>\% select(genus, species) \%>\% guess_bactid()
-# same result:
-df <- df \%>\% mutate(bactid = paste(genus, species)) \%>\% guess_bactid())
-}
-}
-\seealso{
-\code{\link{microorganisms}} for the dataframe that is being used to determine ID's.
-}
--- a/man/join.Rd
+++ b/man/join.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/join.R
+% Please edit documentation in R/join_microorganisms.R
 \name{join}
 \alias{join}
 \alias{inner_join_microorganisms}
--- a/man/key_antibiotics.Rd
+++ b/man/key_antibiotics.Rd
@@ -19,7 +19,7 @@ key_antibiotics_equal(x, y, type = c("keyantibiotics", "points"),
 \arguments{
 \item{tbl}{table with antibiotics coloms, like \code{amox} and \code{amcl}.}

-\item{col_bactid}{column name of the unique IDs of the microorganisms (should occur in the \code{\link{microorganisms}} dataset). Get your bactid's with the function \code{\link{guess_bactid}}, that takes microorganism names as input.}
+\item{col_bactid}{column name of the unique IDs of the microorganisms: \code{bactid}'s. If this column has another class than \code{"bactid"}, values will be coerced using \code{\link{as.bactid}}.}

 \item{universal_1, universal_2, universal_3, universal_4, universal_5, universal_6}{column names of \strong{broad-spectrum} antibiotics, case-insensitive}

--- a/tests/testthat/test-bactid.R
+++ b/tests/testthat/test-bactid.R
@@ -0,0 +1,64 @@
+context("bactid.R")
+
+test_that("as.bactid works", {
+  expect_identical(
+    as.character(as.bactid(c("E. coli", "H. influenzae"))),
+    c("ESCCOL", "HAEINF"))
+
+  expect_equal(as.character(as.bactid("Escherichia coli")), "ESCCOL")
+  expect_equal(as.character(as.bactid("P. aer")), "PSEAER") # not Pasteurella aerogenes
+
+  expect_equal(as.character(as.bactid("Negative rods")), "GNR")
+
+  expect_equal(as.character(as.bactid("MRSE")), "STAEPI")
+  expect_equal(as.character(as.bactid("VRE")), "ENC")
+  expect_equal(as.character(as.bactid("MRPA")), "PSEAER")
+  expect_equal(as.character(as.bactid("PISP")), "STCPNE")
+  expect_equal(as.character(as.bactid("PRSP")), "STCPNE")
+  expect_equal(as.character(as.bactid("VISP")), "STCPNE")
+  expect_equal(as.character(as.bactid("VRSP")), "STCPNE")
+
+  expect_identical(
+    as.character(
+      as.bactid(c("stau",
+                     "STAU",
+                     "staaur",
+                     "S. aureus",
+                     "S aureus",
+                     "Staphylococcus aureus",
+                     "MRSA",
+                     "VISA"))),
+    rep("STAAUR", 8))
+
+  # select with one column
+  expect_identical(
+    septic_patients[1:10,] %>%
+      left_join_microorganisms() %>%
+      select(genus) %>%
+      as.bactid() %>%
+      as.character(),
+    c("STC", "STC", "NEI", "STA", "STA",
+      "NEI", "ENT", "ENT", "ESC", "KLE"))
+
+  # select with two columns
+  expect_identical(
+    septic_patients[1:10,] %>%
+      pull(bactid),
+    septic_patients[1:10,] %>%
+      left_join_microorganisms() %>%
+      select(genus, species) %>%
+      as.bactid() %>%
+      as.character())
+
+  # unknown results
+  expect_warning(as.bactid(c("INVALID", "Yeah, unknown")))
+
+  # print
+  expect_output(print(as.bactid(c("ESCCOL", NA))))
+
+  # helper function
+  expect_identical(as.bactid("ESCCOL"),
+                   guess_bactid("ESCCOL"))
+
+
+})
--- a/tests/testthat/test-eucast.R
+++ b/tests/testthat/test-eucast.R
@@ -1,27 +1,34 @@
 context("eucast.R")

 test_that("EUCAST rules work", {
-  a <- suppressWarnings(EUCAST_rules(septic_patients))
+  expect_identical(colnames(septic_patients),
+                   colnames(suppressWarnings(EUCAST_rules(septic_patients))))

-  a <- data.frame(bactid = c("KLEPNE",  # Klebsiella pneumoniae
-                             "PSEAER",  # Pseudomonas aeruginosa
-                             "ENTAER"), # Enterobacter aerogenes
+  a <- data.frame(bactid =
+                    c("KLEPNE",  # Klebsiella pneumoniae
+                      "PSEAER",  # Pseudomonas aeruginosa
+                      "ENTAER"), # Enterobacter aerogenes
                  amox = "-",           # Amoxicillin
                  stringsAsFactors = FALSE)
-  b <- data.frame(bactid = c("KLEPNE",  # Klebsiella pneumoniae
-                             "PSEAER",  # Pseudomonas aeruginosa
-                             "ENTAER"), # Enterobacter aerogenes
+  b <- data.frame(bactid =
+                    as.bactid(
+                      c("KLEPNE",  # Klebsiella pneumoniae
+                        "PSEAER",  # Pseudomonas aeruginosa
+                        "ENTAER")), # Enterobacter aerogenes
                  amox = "R",           # Amoxicillin
                  stringsAsFactors = FALSE)
-  expect_equal(EUCAST_rules(a, info = FALSE), b)
-  expect_equal(suppressWarnings(interpretive_reading(a, info = TRUE)), b)
+  expect_identical(EUCAST_rules(a, info = FALSE), b)
+  expect_identical(suppressWarnings(interpretive_reading(a, info = TRUE)), b)

-  a <- data.frame(bactid = c("STAAUR",  # Staphylococcus aureus
-                             "STCGRA"), # Streptococcus pyognenes (Lancefield Group A)
+  a <- data.frame(bactid =
+                    c("STAAUR",  # Staphylococcus aureus
+                      "STCGRA"), # Streptococcus pyognenes (Lancefield Group A)
                  coli = "-",           # Colistin
                  stringsAsFactors = FALSE)
-  b <- data.frame(bactid = c("STAAUR",  # Staphylococcus aureus
-                             "STCGRA"), # Streptococcus pyognenes (Lancefield Group A)
+  b <- data.frame(bactid =
+                    as.bactid(
+                      c("STAAUR",  # Staphylococcus aureus
+                        "STCGRA")), # Streptococcus pyognenes (Lancefield Group A)
                  coli = "R",           # Colistin
                  stringsAsFactors = FALSE)
  expect_equal(EUCAST_rules(a, info = FALSE), b)
--- a/tests/testthat/test-first_isolates.R
+++ b/tests/testthat/test-first_isolates.R
@@ -1,4 +1,4 @@
-context("first_isolates.R")
+context("first_isolate.R")

 test_that("first isolates work", {
  # septic_patients contains 1959 out of 2000 first isolates
--- a/tests/testthat/test-guess_bactid.R
+++ b/tests/testthat/test-guess_bactid.R
@@ -1,49 +0,0 @@
-context("guess_bactid.R")
-
-test_that("guess_bactid works", {
-  expect_identical(
-    guess_bactid(c("E. coli", "H. influenzae")),
-    c("ESCCOL", "HAEINF"))
-
-  expect_equal(guess_bactid("Escherichia coli"), "ESCCOL")
-  expect_equal(guess_bactid("P. aer"), "PSEAER") # not Pasteurella aerogenes
-
-  expect_equal(guess_bactid("Negative rods"), "GNR")
-
-  expect_equal(guess_bactid("MRSE"), "STAEPI")
-  expect_equal(guess_bactid("VRE"), "ENC")
-  expect_equal(guess_bactid("MRPA"), "PSEAER")
-  expect_equal(guess_bactid("PISP"), "STCPNE")
-  expect_equal(guess_bactid("PRSP"), "STCPNE")
-  expect_equal(guess_bactid("VISP"), "STCPNE")
-  expect_equal(guess_bactid("VRSP"), "STCPNE")
-
-  expect_identical(
-    guess_bactid(c("stau",
-                   "STAU",
-                   "staaur",
-                   "S. aureus",
-                   "S aureus",
-                   "Staphylococcus aureus",
-                   "MRSA",
-                   "VISA")),
-    rep("STAAUR", 8))
-
-  # select with one column
-  expect_identical(
-    septic_patients[1:10,] %>%
-      left_join_microorganisms() %>%
-      select(genus) %>%
-      guess_bactid(),
-    c("STC", "STC", "NEI", "STA", "STA",
-      "NEI", "ENT", "ENT", "ESC", "KLE"))
-
-  # select with two columns
-  expect_identical(
-    septic_patients[1:10,] %>%
-      pull(bactid),
-    septic_patients[1:10,] %>%
-      left_join_microorganisms() %>%
-      select(genus, species) %>%
-      guess_bactid())
-})
--- a/tests/testthat/test-join_microorganisms.R
+++ b/tests/testthat/test-join_microorganisms.R
@@ -1,4 +1,4 @@
-context("joins.R")
+context("join_microorganisms.R")

 test_that("joins work", {
  unjoined <- septic_patients