Catalogue of Life, replaces ITIS

2025-08-24 16:22:10 +02:00 · 2019-02-18 02:33:37 +01:00
parent 9209787a53
commit 54f8258f8c
88 changed files with 607 additions and 590 deletions
--- a/R/data.R
+++ b/R/data.R
@@ -130,14 +130,14 @@
 #
 "antibiotics"

-#' Data set with ~20,000 microorganisms
+#' Data set with ~60,000 microorganisms
 #'
-#' A data set containing the complete microbial taxonomy of the kingdoms Bacteria, Fungi and Protozoa from ITIS. MO codes can be looked up using \code{\link{as.mo}}.
+#' A data set containing the microbial taxonomy of six kingdoms from the Catalogue of Life. MO codes can be looked up using \code{\link{as.mo}}.
 #' @inheritSection ITIS ITIS
-#' @format A \code{\link{data.frame}} with 19,456 observations and 15 variables:
+#' @format A \code{\link{data.frame}} with 56,659 observations and 15 variables:
 #' \describe{
 #'   \item{\code{mo}}{ID of microorganism}
-#'   \item{\code{tsn}}{Taxonomic Serial Number (TSN), as defined by ITIS}
+#'   \item{\code{col_id}}{Catalogue of Life ID}
 #'   \item{\code{genus}}{Taxonomic genus of the microorganism as found in ITIS, see Source}
 #'   \item{\code{species}}{Taxonomic species of the microorganism as found in ITIS, see Source}
 #'   \item{\code{subspecies}}{Taxonomic subspecies of the microorganism as found in ITIS, see Source}
@@ -155,15 +155,10 @@
 #' @source Integrated Taxonomic Information System (ITIS) public online database, \url{https://www.itis.gov}.
 #' @details Manually added were:
 #' \itemize{
-#'   \item{605 species of Aspergillus (as Aspergillus misses from ITIS, list from https://en.wikipedia.org/wiki/List_of_Aspergillus_species on 2019-02-05)}
-#'   \item{23 species of Trichophyton (as Trichophyton misses from ITIS, list from https://en.wikipedia.org/wiki/Trichophyton on 2019-02-05)}
-#'   \item{9 species of Streptococcus (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)}
-#'   \item{2 species of Straphylococcus (coagulase-negative [CoNS] and coagulase-positive [CoPS])}
-#'   \item{1 species of Candida (C. glabrata)}
+#'   \item{9 species of \emph{Streptococcus} (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)}
+#'   \item{2 species of \emph{Staphylococcus} (coagulase-negative [CoNS] and coagulase-positive [CoPS])}
 #'   \item{2 other undefined (unknown Gram negatives and unknown Gram positives)}
 #' }
-#'
-#' These manual entries have no Taxonomic Serial Number (TSN), so can be looked up with \code{filter(microorganisms, is.na(tsn)}.
 #' @inheritSection AMR Read more on our website!
 #' @seealso \code{\link{as.mo}} \code{\link{mo_property}} \code{\link{microorganisms.codes}}
 "microorganisms"
@@ -172,12 +167,12 @@
 #'
 #' A data set containing old (previously valid or accepted) taxonomic names according to ITIS. This data set is used internally by \code{\link{as.mo}}.
 #' @inheritSection as.mo ITIS
-#' @format A \code{\link{data.frame}} with 2,383 observations and 4 variables:
+#' @format A \code{\link{data.frame}} with 14,506 observations and 4 variables:
 #' \describe{
-#'   \item{\code{tsn}}{Old Taxonomic Serial Number (TSN), as defined by ITIS}
-#'   \item{\code{name}}{Old taxonomic name of the microorganism as found in ITIS, see Source}
-#'   \item{\code{tsn_new}}{New Taxonomic Serial Number (TSN), as defined by ITIS}
-#'   \item{\code{ref}}{Author(s) and year of concerning publication as found in ITIS, see Source}
+#'   \item{\code{col_id}}{Catalogue of Life ID}
+#'   \item{\code{tsn_new}}{New Catalogue of Life ID}
+#'   \item{\code{fullname}}{Old taxonomic name of the microorganism as found in the CoL, see Source}
+#'   \item{\code{ref}}{Author(s) and year of concerning publication as found in the CoL, see Source}
 #' }
 #' @source [3] Integrated Taxonomic Information System (ITIS) on-line database, \url{https://www.itis.gov}.
 #' @inheritSection AMR Read more on our website!
@@ -250,36 +245,3 @@
 #' }
 #' @inheritSection AMR Read more on our website!
 "WHONET"
-
-#' Supplementary Data
-#'
-#' These \code{\link{data.table}s} are transformed from the \code{\link{microorganisms}} and \code{\link{microorganisms}} data sets to improve speed of \code{\link{as.mo}}. They are meant for internal use only, and are only mentioned here for reference.
-#' @rdname supplementary_data
-#' @name supplementary_data
-#' @inheritSection AMR Read more on our website!
-# # Renew data:
-# # sorted on (1) bacteria, (2) fungi, (3) protozoa and then human pathogenic prevalence and then TSN:
-# microorganismsDT <- data.table::as.data.table(AMR::microorganisms)
-# data.table::setkey(microorganismsDT, kingdom, prevalence, fullname)
-# microorganisms.prevDT <- microorganismsDT[prevalence != 9999,]
-# microorganisms.unprevDT <- microorganismsDT[prevalence == 9999,]
-# microorganisms.oldDT <- data.table::as.data.table(AMR::microorganisms.old)
-# data.table::setkey(microorganisms.oldDT, tsn, name)
-# usethis::use_data(microorganismsDT, overwrite = TRUE)
-# usethis::use_data(microorganisms.prevDT, overwrite = TRUE)
-# usethis::use_data(microorganisms.unprevDT, overwrite = TRUE)
-# usethis::use_data(microorganisms.oldDT, overwrite = TRUE)
-# rm(microorganismsDT)
-# rm(microorganisms.prevDT)
-# rm(microorganisms.unprevDT)
-# rm(microorganisms.oldDT)
-"microorganismsDT"
-
-#' @rdname supplementary_data
-"microorganisms.prevDT"
-
-#' @rdname supplementary_data
-"microorganisms.unprevDT"
-
-#' @rdname supplementary_data
-"microorganisms.oldDT"
--- a/R/eucast_rules.R
+++ b/R/eucast_rules.R
@@ -548,6 +548,7 @@ eucast_rules <- function(tbl,
  tbl <- tbl %>%
    mutate_at(vars(col_mo), as.mo) %>%
    left_join_microorganisms(by = col_mo, suffix = c("_oldcols", "")) %>%
+    mutate(gramstain = mo_gramstain(pull(., col_mo))) %>%
    as.data.frame(stringsAsFactors = FALSE)

  if (info == TRUE) {
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -70,7 +70,7 @@
 #' @keywords isolate isolates first
 #' @seealso \code{\link{key_antibiotics}}
 #' @export
-#' @importFrom dplyr arrange_at lag between row_number filter mutate arrange
+#' @importFrom dplyr arrange_at lag between row_number filter mutate arrange pull
 #' @importFrom crayon blue bold silver
 #' @return Logical vector
 #' @source Methodology of this function is based on: \strong{M39 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition}, 2014, \emph{Clinical and Laboratory Standards Institute (CLSI)}. \url{https://clsi.org/standards/products/microbiology/documents/m39/}.
--- a/R/key_antibiotics.R
+++ b/R/key_antibiotics.R
@@ -43,7 +43,7 @@
 #' @inheritSection first_isolate Key antibiotics
 #' @rdname key_antibiotics
 #' @export
-#' @importFrom dplyr %>% mutate if_else
+#' @importFrom dplyr %>% mutate if_else pull
 #' @importFrom crayon blue bold
 #' @seealso \code{\link{first_isolate}}
 #' @inheritSection AMR Read more on our website!
@@ -149,7 +149,8 @@ key_antibiotics <- function(tbl,
  tbl <- tbl %>%
    mutate_at(vars(col_mo), as.mo) %>%
    left_join_microorganisms(by = col_mo) %>%
-    mutate(key_ab = NA_character_)
+    mutate(key_ab = NA_character_,
+           gramstain = mo_gramstain(pull(., col_mo)))

  # Gram +
  tbl <- tbl %>% mutate(key_ab =
--- a/R/mo.R
+++ b/R/mo.R
@@ -47,7 +47,8 @@
 #'   |   |    |    ----> subspecies, a 3-4 letter acronym
 #'   |   |     ----> species, a 3-4 letter acronym
 #'   |    ----> genus, a 5-7 letter acronym, mostly without vowels
-#'    ----> taxonomic kingdom, either B (Bacteria), F (Fungi) or P (Protozoa)
+#'    ----> taxonomic kingdom: A (Archaea), B (Bacteria), C (Chromista),
+#'                             F (Fungi), P (Protozoa) or V (Viruses)
 #' }
 #'
 #' Use the \code{\link{mo_property}} functions to get properties based on the returned code, see Examples.
@@ -81,7 +82,7 @@
 #'
 #' Examples:
 #' \itemize{
-#'   \item{\code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPTC_GRB}) needs review.}
+#'   \item{\code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPT_GRB}) needs review.}
 #'   \item{\code{"S. aureus - please mind: MRSA"}. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result \emph{Staphylococcus aureus} (\code{B_STPHY_AUR}) needs review.}
 #'   \item{\code{"D. spartina"}. This is the abbreviation of an old taxonomic name: \emph{Didymosphaeria spartinae} (the last "e" was missing from the input). This fungus was renamed to \emph{Leptosphaeria obiones}, so a warning will be thrown that this result (\code{F_LPTSP_OBI}) needs review.}
 #'   \item{\code{"Fluoroquinolone-resistant Neisseria gonorrhoeae"}. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result \emph{Neisseria gonorrhoeae} (\code{B_NESSR_GON}) needs review.}
@@ -118,8 +119,6 @@
 #' as.mo("MRSA") # Methicillin Resistant S. aureus
 #' as.mo("VISA") # Vancomycin Intermediate S. aureus
 #' as.mo("VRSA") # Vancomycin Resistant S. aureus
-#' as.mo(369)    # Search on TSN (Taxonomic Serial Number), a unique identifier
-#'               # for the Integrated Taxonomic Information System (ITIS)
 #'
 #' as.mo("Streptococcus group A")
 #' as.mo("GAS") # Group A Streptococci
@@ -128,8 +127,8 @@
 #' as.mo("S. epidermidis")                 # will remain species: B_STPHY_EPI
 #' as.mo("S. epidermidis", Becker = TRUE)  # will not remain species: B_STPHY_CNS
 #'
-#' as.mo("S. pyogenes")                    # will remain species: B_STRPTC_PYO
-#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPTC_GRA
+#' as.mo("S. pyogenes")                    # will remain species: B_STRPT_PYO
+#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRA
 #'
 #' # Use mo_* functions to get a specific property based on `mo`
 #' Ecoli <- as.mo("E. coli")     # returns `B_ESCHR_COL`
@@ -178,11 +177,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,

  if (!"AMR" %in% base::.packages()) {
    library("AMR")
-    # These data.tables are available as data sets when the AMR package is loaded:
-    #   microorganismsDT        # this one is sorted by kingdom (B<F<P), prevalence, TSN
-    #   microorganisms.prevDT   # same as microorganismsDT, but with prevalence != 9999
-    #   microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999
-    #   microorganisms.oldDT    # old taxonomic names, sorted by name (genus+species), TSN
+    # check onLoad() in R/zzz.R: data tables are created there.
  }

  if (clear_options == TRUE) {
@@ -220,6 +215,17 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
  # remove empty values (to later fill them in again with NAs)
  x <- x[!is.na(x) & !is.null(x) & !identical(x, "")]

+
+  # conversion v0.5.0 to v0.6.0, remove for v0.7.0
+  x <- gsub("B_STRPTC", "B_STRPT", x)
+  x <- gsub("B_STRPT_EQUI", "B_STRPT_EQU", x)
+  x <- gsub("B_PDMNS", "B_PSDMN", x)
+  x <- gsub("B_CTRDM", "B_CLSTR", x)
+  x <- gsub("F_CANDD_GLB", "F_CANDD_GLA", x)
+  x <- gsub("F_CANDD_LUS", "F_CANDD", x)
+  x <- gsub("B_FCTRM", "B_FSBCT", x)
+
+
  # defined df to check for
  if (!is.null(reference_df)) {
    if (!is.data.frame(reference_df) | NCOL(reference_df) < 2) {
@@ -239,8 +245,6 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
    # all empty
    if (property == "mo") {
      return(structure(rep(NA_character_, length(x_input)), class = "mo"))
-    } else if (property == "tsn") {
-      return(rep(NA_integer_, length(x_input)))
    } else {
      return(rep(NA_character_, length(x_input)))
    }
@@ -283,7 +287,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
    # replace hemolytic by haemolytic
    x <- gsub("ha?emoly", "haemoly", x)
    # place minus back in streptococci
-    x <- gsub("(alpha|beta|gamma) haemoly", "\\1-haemolytic", x)
+    x <- gsub("(alpha|beta|gamma) ha?emoly", "\\1-haemoly", x)
    # remove genus as first word
    x <- gsub("^Genus ", "", x)

@@ -372,7 +376,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
        }
        if (toupper(x_trimmed[i]) == 'MRPA') {
          # multi resistant P. aeruginosa
-          x[i] <- microorganismsDT[mo == 'B_PDMNS_AER', ..property][[1]][1L]
+          x[i] <- microorganismsDT[mo == 'B_PSDMN_AER', ..property][[1]][1L]
          next
        }
        if (toupper(x_trimmed[i]) == 'CRS'
@@ -383,22 +387,22 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
        }
        if (toupper(x_trimmed[i]) %in% c('PISP', 'PRSP', 'VISP', 'VRSP')) {
          # peni I, peni R, vanco I, vanco R: S. pneumoniae
-          x[i] <- microorganismsDT[mo == 'B_STRPTC_PNE', ..property][[1]][1L]
+          x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L]
          next
        }
        if (toupper(x_trimmed[i]) %like% '^G[ABCDFGHK]S$') {
-          # Streptococci, like GBS = Group B Streptococci (B_STRPTC_GRB)
-          x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPTC_GR\\1", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L]
+          # Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB)
+          x[i] <- microorganismsDT[mo == gsub("G([ABCDFGHK])S", "B_STRPT_GR\\1", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L]
          next
        }
        if (toupper(x_trimmed[i]) %like% '(streptococc|streptokok).* [ABCDFGHK]$') {
          # Streptococci in different languages, like "estreptococos grupo B"
-          x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPTC_GR\\2", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L]
+          x[i] <- microorganismsDT[mo == gsub(".*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$", "B_STRPT_GR\\2", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L]
          next
        }
        if (toupper(x_trimmed[i]) %like% 'group [ABCDFGHK] (streptococ|streptokok|estreptococ)') {
          # Streptococci in different languages, like "Group A Streptococci"
-          x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPTC_GR\\1", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L]
+          x[i] <- microorganismsDT[mo == gsub(".*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*", "B_STRPT_GR\\1", x_trimmed[i], ignore.case = TRUE), ..property][[1]][1L]
          next
        }
        # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ----
@@ -490,15 +494,72 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
        }
      }

-      # TRY FIRST THOUSAND MOST PREVALENT IN HUMAN INFECTIONS ----
-      found <- microorganisms.prevDT[tolower(fullname) %in% tolower(c(x_backup[i], x_trimmed[i])), ..property][[1]]
+      # FIRST TRY SUPERPREVALENT IN HUMAN INFECTIONS ----
+      found <- microorganisms.superprevDT[tolower(fullname) %in% tolower(c(x_backup[i], x_trimmed[i])), ..property][[1]]
      # most probable: is exact match in fullname
      if (length(found) > 0) {
        x[i] <- found[1L]
        next
      }
-      found <- microorganisms.prevDT[tsn == x_trimmed[i], ..property][[1]]
-      # is a valid TSN
+      found <- microorganisms.superprevDT[mo == toupper(x_backup[i]), ..property][[1]]
+      # is a valid mo
+      if (length(found) > 0) {
+        x[i] <- found[1L]
+        next
+      }
+      found <- microorganisms.superprevDT[tolower(fullname) == tolower(x_trimmed_without_group[i]), ..property][[1]]
+      if (length(found) > 0) {
+        x[i] <- found[1L]
+        next
+      }
+
+      # try any match keeping spaces ----
+      found <- microorganisms.superprevDT[fullname %like% x_withspaces_start_end[i], ..property][[1]]
+      if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
+        x[i] <- found[1L]
+        next
+      }
+
+      # try any match keeping spaces, not ending with $ ----
+      found <- microorganisms.superprevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
+      if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
+        x[i] <- found[1L]
+        next
+      }
+
+      # try any match diregarding spaces ----
+      found <- microorganisms.superprevDT[fullname %like% x[i], ..property][[1]]
+      if (length(found) > 0 & nchar(x_trimmed[i]) >= 6) {
+        x[i] <- found[1L]
+        next
+      }
+
+      # try splitting of characters in the middle and then find ID ----
+      # only when text length is 6 or lower
+      # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus
+      if (nchar(x_trimmed[i]) <= 6) {
+        x_length <- nchar(x_trimmed[i])
+        x[i] <- paste0(x_trimmed[i] %>% substr(1, x_length / 2),
+                       '.* ',
+                       x_trimmed[i] %>% substr((x_length / 2) + 1, x_length))
+        found <- microorganisms.superprevDT[fullname %like% paste0('^', x[i]), ..property][[1]]
+        if (length(found) > 0) {
+          x[i] <- found[1L]
+          next
+        }
+      }
+
+      # try fullname without start and stop regex, to also find subspecies ----
+      # like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
+      found <- microorganisms.superprevDT[fullname %like% x_withspaces_start_only[i], ..property][[1]]
+      if (length(found) > 0) {
+        x[i] <- found[1L]
+        next
+      }
+
+      # TRY PREVALENT IN HUMAN INFECTIONS ----
+      found <- microorganisms.prevDT[tolower(fullname) %in% tolower(c(x_backup[i], x_trimmed[i])), ..property][[1]]
+      # most probable: is exact match in fullname
      if (length(found) > 0) {
        x[i] <- found[1L]
        next
@@ -559,7 +620,7 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
        next
      }

-      # THEN TRY ALL OTHERS ----
+      # THEN UNPREVALENT IN HUMAN INFECTIONS ----
      found <- microorganisms.unprevDT[tolower(fullname) == tolower(x_backup[i]), ..property][[1]]
      # most probable: is exact match in fullname
      if (length(found) > 0) {
@@ -572,12 +633,6 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
        x[i] <- found[1L]
        next
      }
-      found <- microorganisms.unprevDT[tsn == x_trimmed[i], ..property][[1]]
-      # is a valid TSN
-      if (length(found) > 0) {
-        x[i] <- found[1L]
-        next
-      }
      found <- microorganisms.unprevDT[mo == toupper(x_backup[i]), ..property][[1]]
      # is a valid mo
      if (length(found) > 0) {
@@ -633,23 +688,23 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
      # MISCELLANEOUS ----

      # look for old taxonomic names ----
-      found <- microorganisms.oldDT[tolower(name) == tolower(x_backup[i])
-                                    | tsn == x_trimmed[i]
-                                    | name %like% x_withspaces_start_end[i],]
+      found <- microorganisms.oldDT[tolower(fullname) == tolower(x_backup[i])
+                                    | fullname %like% x_withspaces_start_end[i],]
      if (NROW(found) > 0) {
+        col_id_new <- found[1, col_id_new]
        # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
        # mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning)
        # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999"
        if (property == "ref") {
          x[i] <- found[1, ref]
        } else {
-          x[i] <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
+          x[i] <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]]
        }
-        was_renamed(name_old = found[1, name],
-                    name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
+        was_renamed(name_old = found[1, fullname],
+                    name_new = microorganismsDT[col_id == found[1, col_id_new], fullname],
                    ref_old = found[1, ref],
-                    ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
-                    mo = microorganismsDT[tsn == found[1, tsn_new], mo])
+                    ref_new = microorganismsDT[col_id == found[1, col_id_new], ref],
+                    mo = microorganismsDT[col_id == found[1, col_id_new], mo])
        next
      }

@@ -673,9 +728,9 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
          }

          # (2) look again for old taxonomic names, now for G. species ----
-          found <- microorganisms.oldDT[name %like% c.x_withspaces_start_end
-                                        | name %like% d.x_withspaces_start_only
-                                        | name %like% e.x,]
+          found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end
+                                        | fullname %like% d.x_withspaces_start_only
+                                        | fullname %like% e.x,]
          if (NROW(found) > 0 & nchar(b.x_trimmed) >= 6) {
            if (property == "ref") {
              # when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
@@ -683,15 +738,15 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
              # mo_ref("Chlamydophila psittaci) = "Everett et al., 1999"
              x <- found[1, ref]
            } else {
-              x <- microorganismsDT[tsn == found[1, tsn_new], ..property][[1]]
+              x <- microorganismsDT[col_id == found[1, col_id_new], ..property][[1]]
            }
-            was_renamed(name_old = found[1, name],
-                        name_new = microorganismsDT[tsn == found[1, tsn_new], fullname],
+            was_renamed(name_old = found[1, fullname],
+                        name_new = microorganismsDT[col_id == found[1, col_id_new], fullname],
                        ref_old = found[1, ref],
-                        ref_new = microorganismsDT[tsn == found[1, tsn_new], ref],
-                        mo = microorganismsDT[tsn == found[1, tsn_new], mo])
+                        ref_new = microorganismsDT[col_id == found[1, col_id_new], ref],
+                        mo = microorganismsDT[col_id == found[1, col_id_new], mo])
            uncertainties <<- c(uncertainties,
-                                paste0("'", a.x_backup, "' >> ", found[1, name], " (TSN ", found[1, tsn], ")"))
+                                paste0("'", a.x_backup, "' >> ", found[1, fullname], " (Catalogue of Life ID ", found[1, col_id], ")"))
            return(x)
          }

@@ -835,25 +890,25 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,
  # Lancefield ----
  if (Lancefield == TRUE | Lancefield == "all") {
    # group A - S. pyogenes
-    x[x == microorganismsDT[mo == 'B_STRPTC_PYO', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRA', ..property][[1]][1L]
+    x[x == microorganismsDT[mo == 'B_STRPT_PYO', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRA', ..property][[1]][1L]
    # group B - S. agalactiae
-    x[x == microorganismsDT[mo == 'B_STRPTC_AGA', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRB', ..property][[1]][1L]
+    x[x == microorganismsDT[mo == 'B_STRPT_AGA', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRB', ..property][[1]][1L]
    # group C
    S_groupC <- microorganismsDT %>% filter(genus == "Streptococcus",
                                            species %in% c("equisimilis", "equi",
                                                           "zooepidemicus", "dysgalactiae")) %>%
      pull(property)
-    x[x %in% S_groupC] <- microorganismsDT[mo == 'B_STRPTC_GRC', ..property][[1]][1L]
+    x[x %in% S_groupC] <- microorganismsDT[mo == 'B_STRPT_GRC', ..property][[1]][1L]
    if (Lancefield == "all") {
      # all Enterococci
-      x[x %like% "^(Enterococcus|B_ENTRC)"] <- microorganismsDT[mo == 'B_STRPTC_GRD', ..property][[1]][1L]
+      x[x %like% "^(Enterococcus|B_ENTRC)"] <- microorganismsDT[mo == 'B_STRPT_GRD', ..property][[1]][1L]
    }
    # group F - S. anginosus
-    x[x == microorganismsDT[mo == 'B_STRPTC_ANG', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRF', ..property][[1]][1L]
+    x[x == microorganismsDT[mo == 'B_STRPT_ANG', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRF', ..property][[1]][1L]
    # group H - S. sanguinis
-    x[x == microorganismsDT[mo == 'B_STRPTC_SAN', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRH', ..property][[1]][1L]
+    x[x == microorganismsDT[mo == 'B_STRPT_SAN', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRH', ..property][[1]][1L]
    # group K - S. salivarius
-    x[x == microorganismsDT[mo == 'B_STRPTC_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPTC_GRK', ..property][[1]][1L]
+    x[x == microorganismsDT[mo == 'B_STRPT_SAL', ..property][[1]][1L]] <- microorganismsDT[mo == 'B_STRPT_GRK', ..property][[1]][1L]
  }


@@ -875,8 +930,6 @@ exec_as.mo <- function(x, Becker = FALSE, Lancefield = FALSE,

  if (property == "mo") {
    class(x) <- "mo"
-  } else if (property == "tsn") {
-    x <- as.integer(x)
  }

  if (length(mo_renamed()) > 0) {
--- a/R/mo_property.R
+++ b/R/mo_property.R
@@ -38,7 +38,7 @@
 #' @rdname mo_property
 #' @name mo_property
 #' @return \itemize{
-#'   \item{An \code{integer} in case of \code{mo_TSN} and \code{mo_year}}
+#'   \item{An \code{integer} in case of \code{mo_year}}
 #'   \item{A \code{list} in case of \code{mo_taxonomy}}
 #'   \item{A \code{character} in all other cases}
 #' }
@@ -57,7 +57,6 @@
 #' mo_genus("E. coli")           # "Escherichia"
 #' mo_species("E. coli")         # "coli"
 #' mo_subspecies("E. coli")      # NA
-#' mo_TSN("E. coli")             # 285 (Taxonomic Serial Number)
 #'
 #' ## colloquial properties
 #' mo_fullname("E. coli")        # "Escherichia coli"
@@ -242,17 +241,15 @@ mo_type <- function(x, language = get_locale(), ...) {
 #' @rdname mo_property
 #' @export
 mo_gramstain <- function(x, language = get_locale(), ...) {
-  mo_translate(mo_validate(x = x, property = "gramstain", ...), language = language)
-}
-
-#' @rdname mo_property
-#' @export
-mo_TSN  <- function(x, ...) {
-  res <- mo_validate(x = x, property = "tsn", ...)
-  if (any(is.na(res))) {
-    warning("Some results do not have a TSN, because they are missing from ITIS and were added manually. See ?microorganisms.")
-  }
-  res
+  x.bak <- x
+  x <- mo_phylum(x, ...)
+  x[x %in% c("Actinobacteria",
+             "Chloroflexi",
+             "Firmicutes",
+             "Tenericutes")] <- "Gram positive"
+  x[x != "Gram positive"] <- "Gram negative"
+  x[mo_kingdom(x.bak) != "Bacteria"] <- NA_character_
+  mo_translate(x, language = language)
 }

 #' @rdname mo_property
@@ -284,7 +281,6 @@ mo_year <- function(x, ...) {
 mo_taxonomy <- function(x, ...) {
  x <- AMR::as.mo(x, ...)
  base::list(kingdom = mo_kingdom(x),
-             subkingdom = mo_subkingdom(x),
             phylum = mo_phylum(x),
             class = mo_class(x),
             order = mo_order(x),
@@ -472,11 +468,7 @@ mo_validate <- function(x, property, ...) {

  if (!"AMR" %in% base::.packages()) {
    library("AMR")
-    # These data.tables are available as data sets when the AMR package is loaded:
-    #   microorganismsDT        # this one is sorted by kingdom (B<F<P), prevalence, TSN
-    #   microorganisms.prevDT   # same as microorganismsDT, but with prevalence != 9999
-    #   microorganisms.unprevDT # same as microorganismsDT, but with prevalence == 9999
-    #   microorganisms.oldDT    # old taxonomic names, sorted by name (genus+species), TSN
+    # check onLoad() in R/zzz.R: data tables are created there.
  }

  if (!all(x %in% microorganismsDT[[property]])
@@ -486,8 +478,6 @@ mo_validate <- function(x, property, ...) {
  } else {
    if (property == "mo") {
      return(structure(x, class = "mo"))
-    } else if (property == "tsn") {
-      return(as.integer(x))
    } else {
      return(x)
    }
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -65,6 +65,69 @@
 #' @rdname AMR
 NULL

+#' @importFrom dplyr mutate
+#' @importFrom data.table as.data.table setkey
 .onLoad <- function(libname, pkgname) {
+  # get new functions not available in older versions of R
  backports::import(pkgname)
+
+  # register data
+  if (!all(c("microorganismsDT",
+             "microorganisms.prevDT",
+             "microorganisms.unprevDT",
+             "microorganisms.oldDT") %in% ls(envir = asNamespace("AMR")))) {
+
+    # packageStartupMessage("Loading taxonomic database...", appendLF = FALSE)
+
+    microorganismsDT <- AMR::microorganisms %>%
+      mutate(prevalent = ifelse(phylum %in% c("Proteobacteria",
+                                              "Firmicutes",
+                                              "Actinobacteria",
+                                              "Bacteroidetes")
+                                | genus %in% c("Candida",
+                                               "Aspergillus",
+                                               "Trichophyton",
+                                               "Giardia",
+                                               "Dientamoeba",
+                                               "Entamoeba"),
+                                0,
+                                1),
+             superprevalent = ifelse(
+               # most important Gram negatives
+               class == "Gammaproteobacteria"
+               # Streptococci and Staphylococci
+               | order %in% c("Lactobacillales",
+                              "Bacillales"),
+               0,
+               1)) %>%
+      as.data.table()
+    setkey(microorganismsDT, kingdom, superprevalent, prevalent, fullname)
+    microorganisms.superprevDT <- microorganismsDT[superprevalent == 0,]
+    microorganisms.prevDT <- microorganismsDT[superprevalent == 1 & prevalent == 0,]
+    microorganisms.unprevDT <- microorganismsDT[superprevalent == 1 & prevalent == 1,]
+    microorganisms.oldDT <- as.data.table(AMR::microorganisms.old)
+    setkey(microorganisms.oldDT, col_id, fullname)
+
+    assign(x = "microorganismsDT",
+           value = microorganismsDT,
+           envir = asNamespace("AMR"))
+
+    assign(x = "microorganisms.superprevDT",
+           value = microorganisms.superprevDT,
+           envir = asNamespace("AMR"))
+
+    assign(x = "microorganisms.prevDT",
+           value = microorganisms.prevDT,
+           envir = asNamespace("AMR"))
+
+    assign(x = "microorganisms.unprevDT",
+           value = microorganisms.unprevDT,
+           envir = asNamespace("AMR"))
+
+    assign(x = "microorganisms.oldDT",
+           value = microorganisms.oldDT,
+           envir = asNamespace("AMR"))
+
+    # packageStartupMessage("OK.", appendLF = TRUE)
+  }
 }