2018-06-08 12:06:54 +02:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
# https://gitlab.com/msberends/AMR #
2018-06-08 12:06:54 +02:00
# #
# LICENCE #
2019-01-02 23:24:07 +01:00
# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #
2018-06-08 12:06:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# #
# This R package was created for academic research and was publicly #
# released in the hope that it will be useful, but it comes WITHOUT #
# ANY WARRANTY OR LIABILITY. #
# Visit our website for more info: https://msberends.gitab.io/AMR. #
2018-06-08 12:06:54 +02:00
# ==================================================================== #
2018-08-31 13:36:19 +02:00
#' Transform to microorganism ID
2018-06-08 12:06:54 +02:00
#'
2018-09-24 23:33:29 +02:00
#' Use this function to determine a valid microorganism ID (\code{mo}). Determination is done using Artificial Intelligence (AI) and the complete taxonomic kingdoms \emph{Bacteria}, \emph{Fungi} and \emph{Protozoa} (see Source), so the input can be almost anything: a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), an abbreviation known in the field (like \code{"MRSA"}), or just a genus. You could also \code{\link{select}} a genus and species column, zie Examples.
2018-09-04 11:33:30 +02:00
#' @param x a character vector or a \code{data.frame} with one or two columns
#' @param Becker a logical to indicate whether \emph{Staphylococci} should be categorised into Coagulase Negative \emph{Staphylococci} ("CoNS") and Coagulase Positive \emph{Staphylococci} ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} [1].
#'
#' This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS".
2018-09-24 23:33:29 +02:00
#' @param Lancefield a logical to indicate whether beta-haemolytic \emph{Streptococci} should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield [2]. These \emph{Streptococci} will be categorised in their first group, e.g. \emph{Streptococcus dysgalactiae} will be group C, although officially it was also categorised into groups G and L.
2018-09-04 11:33:30 +02:00
#'
#' This excludes \emph{Enterococci} at default (who are in group D), use \code{Lancefield = "all"} to also categorise all \emph{Enterococci} as group D.
2018-12-07 12:04:55 +01:00
#' @param allow_uncertain a logical to indicate whether the input should be checked for less possible results, see Details
2019-01-21 15:53:01 +01:00
#' @param reference_df a \code{data.frame} to use for extra reference when translating \code{x} to a valid \code{mo}. See \code{\link{set_mo_source}} and \code{\link{get_mo_source}} to automate the usage of your own codes (e.g. used in your analysis or organisation).
2018-08-31 13:36:19 +02:00
#' @rdname as.mo
#' @aliases mo
#' @keywords mo Becker becker Lancefield lancefield guess
2018-09-24 23:33:29 +02:00
#' @details
2018-10-01 11:39:43 +02:00
#' A microbial ID from this package (class: \code{mo}) typically looks like these examples:\cr
2018-09-24 23:33:29 +02:00
#' \preformatted{
2018-12-07 12:04:55 +01:00
#' Code Full name
#' --------------- --------------------------------------
#' B_KLBSL Klebsiella
#' B_KLBSL_PNE Klebsiella pneumoniae
#' B_KLBSL_PNE_RHI Klebsiella pneumoniae rhinoscleromatis
2018-09-24 23:33:29 +02:00
#' | | | |
#' | | | |
#' | | | ----> subspecies, a 3-4 letter acronym
#' | | ----> species, a 3-4 letter acronym
#' | ----> genus, a 5-7 letter acronym, mostly without vowels
2019-02-18 02:33:37 +01:00
#' ----> taxonomic kingdom: A (Archaea), B (Bacteria), C (Chromista),
#' F (Fungi), P (Protozoa) or V (Viruses)
2018-09-24 23:33:29 +02:00
#' }
2018-08-01 08:03:31 +02:00
#'
2018-09-04 11:33:30 +02:00
#' Use the \code{\link{mo_property}} functions to get properties based on the returned code, see Examples.
2018-08-28 13:51:13 +02:00
#'
2018-10-31 12:10:49 +01:00
#' This function uses Artificial Intelligence (AI) to help getting fast and logical results. It tries to find matches in this order:
#' \itemize{
2019-02-08 16:06:54 +01:00
#' \item{Taxonomic kingdom: it first searches in Bacteria, then Fungi, then Protozoa}
2018-10-31 12:10:49 +01:00
#' \item{Human pathogenic prevalence: it first searches in more prevalent microorganisms, then less prevalent ones}
2018-11-24 20:25:09 +01:00
#' \item{Valid MO codes and full names: it first searches in already valid MO code and known genus/species combinations}
2018-10-31 12:10:49 +01:00
#' \item{Breakdown of input values: from here it starts to breakdown input values to find possible matches}
#' }
#'
2018-12-07 12:04:55 +01:00
#' A couple of effects because of these rules:
2018-07-23 14:14:03 +02:00
#' \itemize{
#' \item{\code{"E. coli"} will return the ID of \emph{Escherichia coli} and not \emph{Entamoeba coli}, although the latter would alphabetically come first}
2018-09-16 16:43:29 +02:00
#' \item{\code{"H. influenzae"} will return the ID of \emph{Haemophilus influenzae} and not \emph{Haematobacter influenzae} for the same reason}
2018-07-23 14:14:03 +02:00
#' \item{Something like \code{"p aer"} will return the ID of \emph{Pseudomonas aeruginosa} and not \emph{Pasteurella aerogenes}}
2018-09-16 16:43:29 +02:00
#' \item{Something like \code{"stau"} or \code{"S aur"} will return the ID of \emph{Staphylococcus aureus} and not \emph{Staphylococcus auricularis}}
2018-07-23 14:14:03 +02:00
#' }
2018-10-01 11:39:43 +02:00
#' This means that looking up human pathogenic microorganisms takes less time than looking up human \strong{non}-pathogenic microorganisms.
2018-09-24 23:33:29 +02:00
#'
2019-02-08 16:06:54 +01:00
#' \strong{UNCERTAIN RESULTS} \cr
#' When using \code{allow_uncertain = TRUE} (which is the default setting), it will use additional rules if all previous AI rules failed to get valid results. These are:
#' \itemize{
#' \item{It tries to look for previously accepted (but now invalid) taxonomic names}
#' \item{It strips off values between brackets and the brackets itself, and re-evaluates the input with all previous rules}
#' \item{It strips off words from the end one by one and re-evaluates the input with all previous rules}
#' \item{It strips off words from the start one by one and re-evaluates the input with all previous rules}
2019-02-20 00:04:48 +01:00
#' \item{It tries to look for some manual changes which are not yet published to the Catalogue of Life (like \emph{Propionibacterium} not yet being \emph{Cutibacterium})}
2019-02-08 16:06:54 +01:00
#' }
#'
#' Examples:
2018-12-07 12:04:55 +01:00
#' \itemize{
2019-02-18 02:33:37 +01:00
#' \item{\code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPT_GRB}) needs review.}
2018-12-07 12:04:55 +01:00
#' \item{\code{"S. aureus - please mind: MRSA"}. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result \emph{Staphylococcus aureus} (\code{B_STPHY_AUR}) needs review.}
#' \item{\code{"D. spartina"}. This is the abbreviation of an old taxonomic name: \emph{Didymosphaeria spartinae} (the last "e" was missing from the input). This fungus was renamed to \emph{Leptosphaeria obiones}, so a warning will be thrown that this result (\code{F_LPTSP_OBI}) needs review.}
2019-02-08 16:06:54 +01:00
#' \item{\code{"Fluoroquinolone-resistant Neisseria gonorrhoeae"}. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result \emph{Neisseria gonorrhoeae} (\code{B_NESSR_GON}) needs review.}
2018-12-07 12:04:55 +01:00
#' }
#'
2019-02-08 16:06:54 +01:00
#' Use \code{mo_failures()} to get a vector with all values that could not be coerced to a valid value.
#'
#' Use \code{mo_uncertainties()} to get a vector with all values that were coerced to a valid value, but with uncertainty.
#'
#' Use \code{mo_renamed()} to get a vector with all values that could be coerced based on an old, previously accepted taxonomic name.
#'
2019-02-20 00:04:48 +01:00
#' @inheritSection catalogue_of_life Catalogue of Life
2018-10-31 12:10:49 +01:00
# (source as a section, so it can be inherited by other man pages)
2018-09-24 23:33:29 +02:00
#' @section Source:
2018-09-04 11:33:30 +02:00
#' [1] Becker K \emph{et al.} \strong{Coagulase-Negative Staphylococci}. 2014. Clin Microbiol Rev. 27(4): 870– 926. \url{https://dx.doi.org/10.1128/CMR.00109-13}
#'
#' [2] Lancefield RC \strong{A serological differentiation of human and other groups of hemolytic streptococci}. 1933. J Exp Med. 57(4): 571– 95. \url{https://dx.doi.org/10.1084/jem.57.4.571}
2018-09-24 23:33:29 +02:00
#'
2019-02-20 00:04:48 +01:00
#' [3] Catalogue of Life: Annual Checklist (public online database), \url{www.catalogueoflife.org}.
2018-06-08 12:06:54 +02:00
#' @export
2018-08-31 13:36:19 +02:00
#' @return Character (vector) with class \code{"mo"}. Unknown values will return \code{NA}.
2019-02-20 00:04:48 +01:00
#' @seealso \code{\link{microorganisms}} for the \code{data.frame} that is being used to determine ID's. \cr
2018-09-24 23:33:29 +02:00
#' The \code{\link{mo_property}} functions (like \code{\link{mo_genus}}, \code{\link{mo_gramstain}}) to get properties based on the returned code.
2019-01-02 23:24:07 +01:00
#' @inheritSection AMR Read more on our website!
2018-06-08 12:06:54 +02:00
#' @examples
2018-10-01 11:39:43 +02:00
#' # These examples all return "B_STPHY_AUR", the ID of S. aureus:
2018-08-31 13:36:19 +02:00
#' as.mo("stau")
#' as.mo("STAU")
#' as.mo("staaur")
#' as.mo("S. aureus")
#' as.mo("S aureus")
#' as.mo("Staphylococcus aureus")
2018-12-07 12:04:55 +01:00
#' as.mo("Staphylococcus aureus (MRSA)")
2018-08-31 13:36:19 +02:00
#' as.mo("MRSA") # Methicillin Resistant S. aureus
#' as.mo("VISA") # Vancomycin Intermediate S. aureus
#' as.mo("VRSA") # Vancomycin Resistant S. aureus
2018-06-08 12:06:54 +02:00
#'
2018-09-05 10:51:46 +02:00
#' as.mo("Streptococcus group A")
#' as.mo("GAS") # Group A Streptococci
#' as.mo("GBS") # Group B Streptococci
#'
2019-01-06 16:40:55 +01:00
#' as.mo("S. epidermidis") # will remain species: B_STPHY_EPI
#' as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CNS
2018-08-02 13:15:45 +02:00
#'
2019-02-18 02:33:37 +01:00
#' as.mo("S. pyogenes") # will remain species: B_STRPT_PYO
#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRA
2018-08-02 13:15:45 +02:00
#'
2018-08-31 13:36:19 +02:00
#' # Use mo_* functions to get a specific property based on `mo`
2018-09-24 23:33:29 +02:00
#' Ecoli <- as.mo("E. coli") # returns `B_ESCHR_COL`
2018-08-28 13:51:13 +02:00
#' mo_genus(Ecoli) # returns "Escherichia"
2018-09-24 23:33:29 +02:00
#' mo_gramstain(Ecoli) # returns "Gram negative"
#' # but it uses as.mo internally too, so you could also just use:
#' mo_genus("E. coli") # returns "Escherichia"
#'
2018-08-28 13:51:13 +02:00
#'
2018-06-08 12:06:54 +02:00
#' \dontrun{
2018-08-31 13:36:19 +02:00
#' df$mo <- as.mo(df$microorganism_name)
2018-06-08 12:06:54 +02:00
#'
#' # the select function of tidyverse is also supported:
2018-07-23 14:14:03 +02:00
#' library(dplyr)
2018-08-31 13:36:19 +02:00
#' df$mo <- df %>%
2018-07-23 14:14:03 +02:00
#' select(microorganism_name) %>%
2018-11-24 20:25:09 +01:00
#' as.mo()
2018-06-08 12:06:54 +02:00
#'
#' # and can even contain 2 columns, which is convenient for genus/species combinations:
2018-08-31 13:36:19 +02:00
#' df$mo <- df %>%
2018-07-23 14:14:03 +02:00
#' select(genus, species) %>%
2018-11-24 20:25:09 +01:00
#' as.mo()
#' # although this works easier and does the same:
2018-07-23 14:14:03 +02:00
#' df <- df %>%
2018-11-24 20:25:09 +01:00
#' mutate(mo = as.mo(paste(genus, species)))
2018-06-08 12:06:54 +02:00
#' }
2019-01-21 15:53:01 +01:00
as.mo <- function ( x , Becker = FALSE , Lancefield = FALSE , allow_uncertain = TRUE , reference_df = get_mo_source ( ) ) {
2018-12-06 14:36:39 +01:00
mo <- mo_validate ( x = x , property = " mo" ,
Becker = Becker , Lancefield = Lancefield ,
allow_uncertain = allow_uncertain , reference_df = reference_df )
structure ( .Data = mo , class = " mo" )
2018-09-27 23:23:48 +02:00
}
#' @rdname as.mo
#' @export
is.mo <- function ( x ) {
2018-12-22 22:39:34 +01:00
identical ( class ( x ) , " mo" )
2018-09-27 23:23:48 +02:00
}
2019-01-21 15:53:01 +01:00
#' @importFrom dplyr %>% pull left_join n_distinct progress_estimated filter
2018-10-31 12:10:49 +01:00
#' @importFrom data.table data.table as.data.table setkey
2019-02-08 16:06:54 +01:00
#' @importFrom crayon magenta red silver italic has_color
2018-12-06 14:36:39 +01:00
exec_as.mo <- function ( x , Becker = FALSE , Lancefield = FALSE ,
2019-01-21 15:53:01 +01:00
allow_uncertain = TRUE , reference_df = get_mo_source ( ) ,
2018-12-06 14:36:39 +01:00
property = " mo" , clear_options = TRUE ) {
2018-10-31 12:10:49 +01:00
2018-11-24 20:25:09 +01:00
if ( ! " AMR" %in% base :: .packages ( ) ) {
library ( " AMR" )
2019-02-18 02:33:37 +01:00
# check onLoad() in R/zzz.R: data tables are created there.
2018-11-24 20:25:09 +01:00
}
2018-10-31 12:10:49 +01:00
2018-12-06 14:36:39 +01:00
if ( clear_options == TRUE ) {
options ( mo_failures = NULL )
2019-02-08 16:06:54 +01:00
options ( mo_uncertainties = NULL )
2018-12-06 14:36:39 +01:00
options ( mo_renamed = NULL )
}
2018-06-08 12:06:54 +02:00
if ( NCOL ( x ) == 2 ) {
# support tidyverse selection like: df %>% select(colA, colB)
# paste these columns together
x_vector <- vector ( " character" , NROW ( x ) )
for ( i in 1 : NROW ( x ) ) {
x_vector [i ] <- paste ( pull ( x [i , ] , 1 ) , pull ( x [i , ] , 2 ) , sep = " " )
}
x <- x_vector
} else {
if ( NCOL ( x ) > 2 ) {
stop ( ' `x` can be 2 columns at most' , call. = FALSE )
}
2018-09-09 12:11:44 +02:00
x [is.null ( x ) ] <- NA
2018-07-23 14:14:03 +02:00
2018-06-08 12:06:54 +02:00
# support tidyverse selection like: df %>% select(colA)
2018-11-15 12:42:35 +01:00
if ( ! is.vector ( x ) & ! is.null ( dim ( x ) ) ) {
2018-06-08 12:06:54 +02:00
x <- pull ( x , 1 )
}
}
2018-12-14 10:52:20 +01:00
notes <- character ( 0 )
2019-02-08 16:06:54 +01:00
uncertainties <- character ( 0 )
2018-09-27 23:23:48 +02:00
failures <- character ( 0 )
x_input <- x
# only check the uniques, which is way faster
x <- unique ( x )
2018-11-30 12:05:59 +01:00
# remove empty values (to later fill them in again with NAs)
2018-10-19 00:17:03 +02:00
x <- x [ ! is.na ( x ) & ! is.null ( x ) & ! identical ( x , " " ) ]
2018-09-27 23:23:48 +02:00
2019-02-18 02:33:37 +01:00
2019-02-20 00:04:48 +01:00
# conversion of old MO codes from v0.5.0 (ITIS) to later versions (Catalogue of Life)
if ( any ( x %like% " ^[BFP]_[A-Z]{3,7}" ) ) {
leftpart <- gsub ( " ^([BFP]_[A-Z]{3,7}).*" , " \\1" , x )
if ( any ( leftpart %in% names ( mo_codes_v0.5.0 ) ) ) {
rightpart <- gsub ( " ^[BFP]_[A-Z]{3,7}(.*)" , " \\1" , x )
leftpart <- mo_codes_v0.5.0 [leftpart ]
x [ ! is.na ( leftpart ) ] <- paste0 ( leftpart [ ! is.na ( leftpart ) ] , rightpart [ ! is.na ( leftpart ) ] )
}
}
2019-02-18 02:33:37 +01:00
2018-10-01 11:39:43 +02:00
# defined df to check for
if ( ! is.null ( reference_df ) ) {
if ( ! is.data.frame ( reference_df ) | NCOL ( reference_df ) < 2 ) {
2018-10-09 15:41:44 +02:00
stop ( ' `reference_df` must be a data.frame with at least two columns.' , call. = FALSE )
2018-10-01 11:39:43 +02:00
}
2019-01-21 15:53:01 +01:00
if ( ! " mo" %in% colnames ( reference_df ) ) {
stop ( " `reference_df` must contain a column `mo` with values from the 'microorganisms' data set." , call. = FALSE )
}
reference_df <- reference_df %>% filter ( ! is.na ( mo ) )
# # remove factors, just keep characters
2018-10-01 11:39:43 +02:00
suppressWarnings (
reference_df [ ] <- lapply ( reference_df , as.character )
)
}
2019-01-21 15:53:01 +01:00
2019-01-08 16:23:45 +01:00
if ( all ( identical ( trimws ( x_input ) , " " ) | is.na ( x_input ) ) ) {
# all empty
if ( property == " mo" ) {
return ( structure ( rep ( NA_character_ , length ( x_input ) ) , class = " mo" ) )
} else {
return ( rep ( NA_character_ , length ( x_input ) ) )
}
2019-01-21 15:53:01 +01:00
} else if ( all ( x %in% reference_df [ , 1 ] )
& all ( reference_df [ , " mo" ] %in% microorganismsDT [ [ " mo" ] ] ) ) {
# all in reference df
2018-10-01 11:39:43 +02:00
colnames ( reference_df ) [1 ] <- " x"
suppressWarnings (
x <- data.frame ( x = x , stringsAsFactors = FALSE ) %>%
left_join ( reference_df , by = " x" ) %>%
2018-11-24 20:25:09 +01:00
left_join ( microorganisms , by = " mo" ) %>%
2018-10-01 11:39:43 +02:00
pull ( property )
)
2019-01-21 15:53:01 +01:00
} else if ( all ( x %in% microorganismsDT [ [ " mo" ] ] ) ) {
# existing mo codes when not looking for property "mo", like mo_genus("B_ESCHR_COL")
x <- microorganismsDT [data.table ( mo = x ) , on = " mo" , ..property ] [ [1 ] ]
} else if ( all ( toupper ( x ) %in% microorganisms.codes [ , " code" ] ) ) {
# commonly used MO codes
y <- as.data.table ( microorganisms.codes ) [data.table ( code = toupper ( x ) ) , on = " code" , ]
2018-10-31 12:10:49 +01:00
x <- microorganismsDT [data.table ( mo = y [ [ " mo" ] ] ) , on = " mo" , ..property ] [ [1 ] ]
} else if ( ! all ( x %in% microorganismsDT [ [property ] ] ) ) {
2018-09-24 23:33:29 +02:00
2018-09-27 23:23:48 +02:00
x_backup <- trimws ( x , which = " both" )
2018-11-24 20:25:09 +01:00
# remove spp and species
2019-02-08 16:06:54 +01:00
x <- trimws ( gsub ( " +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)" , " " , x_backup , ignore.case = TRUE ) , which = " both" )
2018-11-24 20:25:09 +01:00
x_species <- paste ( x , " species" )
2018-09-27 23:23:48 +02:00
# translate to English for supported languages of mo_property
2018-12-06 14:36:39 +01:00
x <- gsub ( " (Gruppe|gruppe|groep|grupo|gruppo|groupe)" , " group" , x , ignore.case = TRUE )
2018-09-27 23:23:48 +02:00
# remove 'empty' genus and species values
x <- gsub ( " (no MO)" , " " , x , fixed = TRUE )
2018-11-24 20:25:09 +01:00
# remove non-text in case of "E. coli" except dots and spaces
x <- gsub ( " [^.a-zA-Z0-9/ \\-]+" , " " , x )
2019-02-08 16:06:54 +01:00
# replace minus by a space
x <- gsub ( " -+" , " " , x )
# replace hemolytic by haemolytic
x <- gsub ( " ha?emoly" , " haemoly" , x )
# place minus back in streptococci
2019-02-18 02:33:37 +01:00
x <- gsub ( " (alpha|beta|gamma) ha?emoly" , " \\1-haemoly" , x )
2019-02-08 16:06:54 +01:00
# remove genus as first word
x <- gsub ( " ^Genus " , " " , x )
2018-11-24 20:25:09 +01:00
2018-09-27 23:23:48 +02:00
# but spaces before and after should be omitted
x <- trimws ( x , which = " both" )
x_trimmed <- x
x_trimmed_species <- paste ( x_trimmed , " species" )
2018-12-06 14:36:39 +01:00
x_trimmed_without_group <- gsub ( " group$" , " " , x_trimmed , ignore.case = TRUE )
# remove last part from "-" or "/"
x_trimmed_without_group <- gsub ( " (.*)[-/].*" , " \\1" , x_trimmed_without_group )
2018-11-24 20:25:09 +01:00
# replace space and dot by regex sign
x_withspaces <- gsub ( " [ .]+" , " .* " , x )
x <- gsub ( " [ .]+" , " .*" , x )
2018-09-27 23:23:48 +02:00
# add start en stop regex
x <- paste0 ( ' ^' , x , ' $' )
2019-02-08 16:06:54 +01:00
x_withspaces_start_only <- paste0 ( ' ^' , x_withspaces )
2019-02-21 18:55:52 +01:00
x_withspaces_end_only <- paste0 ( x_withspaces , ' $' )
2019-02-08 16:06:54 +01:00
x_withspaces_start_end <- paste0 ( ' ^' , x_withspaces , ' $' )
2018-09-27 23:23:48 +02:00
2018-12-06 14:36:39 +01:00
# cat(paste0('x "', x, '"\n'))
# cat(paste0('x_species "', x_species, '"\n'))
2019-02-08 16:06:54 +01:00
# cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n'))
2019-02-21 18:55:52 +01:00
# cat(paste0('x_withspaces_end_only "', x_withspaces_end_only, '"\n'))
2019-02-08 16:06:54 +01:00
# cat(paste0('x_withspaces_start_end "', x_withspaces_start_end, '"\n'))
2018-12-06 14:36:39 +01:00
# cat(paste0('x_backup "', x_backup, '"\n'))
# cat(paste0('x_trimmed "', x_trimmed, '"\n'))
# cat(paste0('x_trimmed_species "', x_trimmed_species, '"\n'))
# cat(paste0('x_trimmed_without_group "', x_trimmed_without_group, '"\n'))
2018-09-27 23:23:48 +02:00
2018-12-07 12:04:55 +01:00
progress <- progress_estimated ( n = length ( x ) , min_time = 3 )
2018-09-27 23:23:48 +02:00
for ( i in 1 : length ( x ) ) {
2018-12-07 12:04:55 +01:00
progress $ tick ( ) $ print ( )
2019-02-08 16:06:54 +01:00
if ( tolower ( x_trimmed [i ] ) %in% c ( " " , " xxx" , " other" , " none" , " unknown" ) ) {
# empty and nonsense values, ignore without warning ("xxx" is WHONET code for 'no growth')
2018-10-19 00:17:03 +02:00
x [i ] <- NA_character_
next
}
2019-02-08 16:06:54 +01:00
if ( nchar ( gsub ( " [^a-zA-Z]" , " " , x_trimmed [i ] ) ) < 3 ) {
2018-11-24 20:25:09 +01:00
# check if search term was like "A. species", then return first genus found with ^A
2019-02-08 16:06:54 +01:00
if ( x_backup [i ] %like% " [a-z]+ species" | x_backup [i ] %like% " [a-z] spp[.]?" ) {
2018-11-24 20:25:09 +01:00
# get mo code of first hit
2019-02-08 16:06:54 +01:00
found <- microorganismsDT [fullname %like% x_withspaces_start_only [i ] , mo ]
2018-11-24 20:25:09 +01:00
if ( length ( found ) > 0 ) {
2018-11-30 12:05:59 +01:00
mo_code <- found [1L ] %>% strsplit ( " _" ) %>% unlist ( ) %>% .[1 : 2 ] %>% paste ( collapse = " _" )
found <- microorganismsDT [mo == mo_code , ..property ] [ [1 ] ]
# return first genus that begins with x_trimmed, e.g. when "E. spp."
if ( length ( found ) > 0 ) {
x [i ] <- found [1L ]
next
}
2018-11-24 20:25:09 +01:00
}
}
2018-11-30 12:05:59 +01:00
# fewer than 3 chars and not looked for species, add as failure
x [i ] <- NA_character_
failures <- c ( failures , x_backup [i ] )
next
2018-09-27 23:23:48 +02:00
}
2018-09-24 23:33:29 +02:00
2019-02-08 16:06:54 +01:00
if ( x_trimmed [i ] %like% " virus" ) {
# there is no fullname like virus, so don't try to coerce it
2019-01-21 21:24:40 +01:00
x [i ] <- NA_character_
failures <- c ( failures , x_backup [i ] )
next
}
2018-09-27 23:23:48 +02:00
# translate known trivial abbreviations to genus + species ----
if ( ! is.na ( x_trimmed [i ] ) ) {
2019-01-21 21:24:40 +01:00
if ( toupper ( x_trimmed [i ] ) %in% c ( ' MRSA' , ' MSSA' , ' VISA' , ' VRSA' ) ) {
2018-10-31 12:10:49 +01:00
x [i ] <- microorganismsDT [mo == ' B_STPHY_AUR' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
2019-01-21 21:24:40 +01:00
if ( toupper ( x_trimmed [i ] ) %in% c ( ' MRSE' , ' MSSE' ) ) {
2018-10-31 12:10:49 +01:00
x [i ] <- microorganismsDT [mo == ' B_STPHY_EPI' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
2018-12-06 14:36:39 +01:00
if ( toupper ( x_trimmed [i ] ) == " VRE"
| x_trimmed [i ] %like% ' (enterococci|enterokok|enterococo)[a-z]*?$' ) {
2018-10-31 12:10:49 +01:00
x [i ] <- microorganismsDT [mo == ' B_ENTRC' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
2019-02-21 23:32:30 +01:00
if ( toupper ( x_trimmed [i ] ) %in% c ( " EHEC" , " EPEC" , " EIEC" , " STEC" , " ATEC" ) ) {
2019-02-08 16:06:54 +01:00
x [i ] <- microorganismsDT [mo == ' B_ESCHR_COL' , ..property ] [ [1 ] ] [1L ]
next
}
2018-09-27 23:23:48 +02:00
if ( toupper ( x_trimmed [i ] ) == ' MRPA' ) {
# multi resistant P. aeruginosa
2019-02-18 02:33:37 +01:00
x [i ] <- microorganismsDT [mo == ' B_PSDMN_AER' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
2018-10-12 16:35:18 +02:00
if ( toupper ( x_trimmed [i ] ) == ' CRS'
| toupper ( x_trimmed [i ] ) == ' CRSM' ) {
# co-trim resistant S. maltophilia
2018-10-31 12:10:49 +01:00
x [i ] <- microorganismsDT [mo == ' B_STNTR_MAL' , ..property ] [ [1 ] ] [1L ]
2018-10-12 16:35:18 +02:00
next
}
2018-09-27 23:23:48 +02:00
if ( toupper ( x_trimmed [i ] ) %in% c ( ' PISP' , ' PRSP' , ' VISP' , ' VRSP' ) ) {
# peni I, peni R, vanco I, vanco R: S. pneumoniae
2019-02-18 02:33:37 +01:00
x [i ] <- microorganismsDT [mo == ' B_STRPT_PNE' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
if ( toupper ( x_trimmed [i ] ) %like% ' ^G[ABCDFGHK]S$' ) {
2019-02-18 02:33:37 +01:00
# Streptococci, like GBS = Group B Streptococci (B_STRPT_GRB)
x [i ] <- microorganismsDT [mo == gsub ( " G([ABCDFGHK])S" , " B_STRPT_GR\\1" , x_trimmed [i ] , ignore.case = TRUE ) , ..property ] [ [1 ] ] [1L ]
2018-12-06 14:36:39 +01:00
next
}
if ( toupper ( x_trimmed [i ] ) %like% ' (streptococc|streptokok).* [ABCDFGHK]$' ) {
# Streptococci in different languages, like "estreptococos grupo B"
2019-02-18 02:33:37 +01:00
x [i ] <- microorganismsDT [mo == gsub ( " .*(streptococ|streptokok|estreptococ).* ([ABCDFGHK])$" , " B_STRPT_GR\\2" , x_trimmed [i ] , ignore.case = TRUE ) , ..property ] [ [1 ] ] [1L ]
2018-12-06 14:36:39 +01:00
next
}
if ( toupper ( x_trimmed [i ] ) %like% ' group [ABCDFGHK] (streptococ|streptokok|estreptococ)' ) {
# Streptococci in different languages, like "Group A Streptococci"
2019-02-18 02:33:37 +01:00
x [i ] <- microorganismsDT [mo == gsub ( " .*group ([ABCDFGHK]) (streptococ|streptokok|estreptococ).*" , " B_STRPT_GR\\1" , x_trimmed [i ] , ignore.case = TRUE ) , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
# CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ----
if ( tolower ( x [i ] ) %like% ' [ck]oagulas[ea] negatie?[vf]'
| tolower ( x_trimmed [i ] ) %like% ' [ck]oagulas[ea] negatie?[vf]'
| tolower ( x [i ] ) %like% ' [ck]o?ns[^a-z]?$' ) {
# coerce S. coagulase negative
2018-10-31 12:10:49 +01:00
x [i ] <- microorganismsDT [mo == ' B_STPHY_CNS' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
if ( tolower ( x [i ] ) %like% ' [ck]oagulas[ea] positie?[vf]'
| tolower ( x_trimmed [i ] ) %like% ' [ck]oagulas[ea] positie?[vf]'
| tolower ( x [i ] ) %like% ' [ck]o?ps[^a-z]?$' ) {
# coerce S. coagulase positive
2018-10-31 12:10:49 +01:00
x [i ] <- microorganismsDT [mo == ' B_STPHY_CPS' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
next
}
2018-12-06 14:36:39 +01:00
if ( tolower ( x [i ] ) %like% ' gram[ -]?neg.*'
| tolower ( x_trimmed [i ] ) %like% ' gram[ -]?neg.*' ) {
2018-11-02 10:27:57 +01:00
# coerce S. coagulase positive
x [i ] <- microorganismsDT [mo == ' B_GRAMN' , ..property ] [ [1 ] ] [1L ]
next
}
2018-12-06 14:36:39 +01:00
if ( tolower ( x [i ] ) %like% ' gram[ -]?pos.*'
| tolower ( x_trimmed [i ] ) %like% ' gram[ -]?pos.*' ) {
2018-11-02 10:27:57 +01:00
# coerce S. coagulase positive
x [i ] <- microorganismsDT [mo == ' B_GRAMP' , ..property ] [ [1 ] ] [1L ]
next
}
2018-12-06 14:36:39 +01:00
if ( grepl ( " [sS]almonella [A-Z][a-z]+ ?.*" , x_trimmed [i ] ) ) {
2019-02-08 16:06:54 +01:00
if ( x_trimmed [i ] %like% " Salmonella group" ) {
# Salmonella Group A to Z, just return S. species for now
x [i ] <- microorganismsDT [mo == ' B_SLMNL' , ..property ] [ [1 ] ] [1L ]
notes <- c ( notes ,
magenta ( paste0 ( " Note: " ,
italic ( " Salmonella" ) , " " , trimws ( gsub ( " Salmonella" , " " , x_trimmed [i ] ) ) ,
" was considered " ,
italic ( " Salmonella species" ) ,
" (B_SLMNL)" ) ) )
} else {
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
x [i ] <- microorganismsDT [mo == ' B_SLMNL_ENT' , ..property ] [ [1 ] ] [1L ]
notes <- c ( notes ,
magenta ( paste0 ( " Note: " ,
italic ( " Salmonella" ) , " " , trimws ( gsub ( " Salmonella" , " " , x_trimmed [i ] ) ) ,
" was considered a subspecies of " ,
italic ( " Salmonella enterica" ) ,
" (B_SLMNL_ENT)" ) ) )
}
2018-12-06 14:36:39 +01:00
next
}
2018-09-27 23:23:48 +02:00
}
2018-08-28 13:51:13 +02:00
2018-09-27 23:23:48 +02:00
# FIRST TRY FULLNAMES AND CODES
2018-10-01 11:39:43 +02:00
# if only genus is available, return only genus
2018-09-27 23:23:48 +02:00
if ( all ( ! c ( x [i ] , x_trimmed [i ] ) %like% " " ) ) {
2018-10-31 12:10:49 +01:00
found <- microorganismsDT [tolower ( fullname ) %in% tolower ( c ( x_species [i ] , x_trimmed_species [i ] ) ) , ..property ] [ [1 ] ]
2018-09-27 23:23:48 +02:00
if ( length ( found ) > 0 ) {
x [i ] <- found [1L ]
next
}
2019-02-08 16:06:54 +01:00
if ( nchar ( x_trimmed [i ] ) >= 6 ) {
found <- microorganismsDT [tolower ( fullname ) %like% paste0 ( x_withspaces_start_only [i ] , " [a-z]+ species" ) , ..property ] [ [1 ] ]
2018-09-27 23:23:48 +02:00
if ( length ( found ) > 0 ) {
x [i ] <- found [1L ]
next
}
}
2019-02-08 16:06:54 +01:00
# rest of genus only is in allow_uncertain part.
2018-09-27 23:23:48 +02:00
}
2018-08-28 13:51:13 +02:00
2018-10-01 11:39:43 +02:00
# TRY OTHER SOURCES ----
2019-01-21 15:53:01 +01:00
if ( toupper ( x_backup [i ] ) %in% microorganisms.codes [ , 1 ] ) {
mo_found <- microorganisms.codes [toupper ( x_backup [i ] ) == microorganisms.codes [ , 1 ] , " mo" ] [1L ]
2018-11-30 12:05:59 +01:00
if ( length ( mo_found ) > 0 ) {
x [i ] <- microorganismsDT [mo == mo_found , ..property ] [ [1 ] ] [1L ]
next
}
}
2018-11-24 20:25:09 +01:00
if ( ! is.null ( reference_df ) ) {
if ( x_backup [i ] %in% reference_df [ , 1 ] ) {
2019-01-21 15:53:01 +01:00
ref_mo <- reference_df [reference_df [ , 1 ] == x_backup [i ] , " mo" ]
2018-11-24 20:25:09 +01:00
if ( ref_mo %in% microorganismsDT [ , mo ] ) {
x [i ] <- microorganismsDT [mo == ref_mo , ..property ] [ [1 ] ] [1L ]
next
} else {
warning ( " Value '" , x_backup [i ] , " ' was found in reference_df, but '" , ref_mo , " ' is not a valid MO code." , call. = FALSE )
}
2018-10-01 11:39:43 +02:00
}
}
2019-02-21 18:55:52 +01:00
check_per_prevalence <- function ( data_to_check ,
a.x_backup ,
b.x_trimmed ,
c.x_trimmed_without_group ,
d.x_withspaces_start_end ,
e.x_withspaces_start_only ,
f.x_withspaces_end_only ) {
2019-02-18 02:33:37 +01:00
2019-02-21 18:55:52 +01:00
found <- data_to_check [tolower ( fullname ) %in% tolower ( c ( a.x_backup , b.x_trimmed ) ) , ..property ] [ [1 ] ]
# most probable: is exact match in fullname
2019-02-18 02:33:37 +01:00
if ( length ( found ) > 0 ) {
2019-02-21 18:55:52 +01:00
return ( found [1L ] )
2019-02-18 02:33:37 +01:00
}
2019-02-21 18:55:52 +01:00
found <- data_to_check [mo == toupper ( a.x_backup ) , ..property ] [ [1 ] ]
# is a valid mo
if ( length ( found ) > 0 ) {
return ( found [1L ] )
}
found <- data_to_check [tolower ( fullname ) == tolower ( c.x_trimmed_without_group ) , ..property ] [ [1 ] ]
if ( length ( found ) > 0 ) {
return ( found [1L ] )
}
2019-02-18 02:33:37 +01:00
2019-02-21 18:55:52 +01:00
# try any match keeping spaces ----
found <- data_to_check [fullname %like% d.x_withspaces_start_end , ..property ] [ [1 ] ]
if ( length ( found ) > 0 & nchar ( b.x_trimmed ) >= 6 ) {
return ( found [1L ] )
}
2018-12-06 14:36:39 +01:00
2019-02-21 18:55:52 +01:00
# try any match keeping spaces, not ending with $ ----
found <- data_to_check [fullname %like% paste0 ( trimws ( e.x_withspaces_start_only ) , " " ) , ..property ] [ [1 ] ]
if ( length ( found ) > 0 ) {
return ( found [1L ] )
}
found <- data_to_check [fullname %like% e.x_withspaces_start_only , ..property ] [ [1 ] ]
if ( length ( found ) > 0 & nchar ( b.x_trimmed ) >= 6 ) {
return ( found [1L ] )
}
2018-09-27 23:23:48 +02:00
2019-02-21 18:55:52 +01:00
# try any match keeping spaces, not start with ^ ----
found <- data_to_check [fullname %like% paste0 ( " " , trimws ( f.x_withspaces_end_only ) ) , ..property ] [ [1 ] ]
if ( length ( found ) > 0 ) {
return ( found [1L ] )
}
found <- data_to_check [fullname %like% f.x_withspaces_end_only , ..property ] [ [1 ] ]
if ( length ( found ) > 0 & nchar ( b.x_trimmed ) >= 6 ) {
return ( found [1L ] )
}
2018-09-27 23:23:48 +02:00
2019-02-21 18:55:52 +01:00
# try splitting of characters in the middle and then find ID ----
# only when text length is 6 or lower
# like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus
if ( nchar ( b.x_trimmed ) <= 6 ) {
x_length <- nchar ( b.x_trimmed )
x_split <- paste0 ( " ^" ,
b.x_trimmed %>% substr ( 1 , x_length / 2 ) ,
' .* ' ,
b.x_trimmed %>% substr ( ( x_length / 2 ) + 1 , x_length ) )
found <- data_to_check [fullname %like% x_split , ..property ] [ [1 ] ]
if ( length ( found ) > 0 ) {
return ( found [1L ] )
}
}
2018-09-27 23:23:48 +02:00
2019-02-21 18:55:52 +01:00
# try fullname without start and without nchar limit of >= 6 ----
# like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
found <- data_to_check [fullname %like% e.x_withspaces_start_only , ..property ] [ [1 ] ]
2018-10-29 17:26:17 +01:00
if ( length ( found ) > 0 ) {
2019-02-21 18:55:52 +01:00
return ( found [1L ] )
2018-10-29 17:26:17 +01:00
}
2018-06-08 12:06:54 +02:00
2019-02-21 18:55:52 +01:00
# didn't found any
return ( NA_character_ )
2018-11-02 10:27:57 +01:00
}
2018-09-27 23:23:48 +02:00
2019-02-21 18:55:52 +01:00
# FIRST TRY VERY PREVALENT IN HUMAN INFECTIONS ----
x [i ] <- check_per_prevalence ( data_to_check = microorganismsDT [prevalence == 1 ] ,
a.x_backup = x_backup [i ] ,
b.x_trimmed = x_trimmed [i ] ,
c.x_trimmed_without_group = x_trimmed_without_group [i ] ,
d.x_withspaces_start_end = x_withspaces_start_end [i ] ,
e.x_withspaces_start_only = x_withspaces_start_only [i ] ,
f.x_withspaces_end_only = x_withspaces_end_only [i ] )
if ( ! is.na ( x [i ] ) ) {
2018-09-24 23:33:29 +02:00
next
}
2019-02-21 18:55:52 +01:00
# THEN TRY PREVALENT IN HUMAN INFECTIONS ----
x [i ] <- check_per_prevalence ( data_to_check = microorganismsDT [prevalence == 2 ] ,
a.x_backup = x_backup [i ] ,
b.x_trimmed = x_trimmed [i ] ,
c.x_trimmed_without_group = x_trimmed_without_group [i ] ,
d.x_withspaces_start_end = x_withspaces_start_end [i ] ,
e.x_withspaces_start_only = x_withspaces_start_only [i ] ,
f.x_withspaces_end_only = x_withspaces_end_only [i ] )
if ( ! is.na ( x [i ] ) ) {
2018-09-27 23:23:48 +02:00
next
}
2019-02-21 18:55:52 +01:00
# THEN UNPREVALENT IN HUMAN INFECTIONS ----
x [i ] <- check_per_prevalence ( data_to_check = microorganismsDT [prevalence == 3 ] ,
a.x_backup = x_backup [i ] ,
b.x_trimmed = x_trimmed [i ] ,
c.x_trimmed_without_group = x_trimmed_without_group [i ] ,
d.x_withspaces_start_end = x_withspaces_start_end [i ] ,
e.x_withspaces_start_only = x_withspaces_start_only [i ] ,
f.x_withspaces_end_only = x_withspaces_end_only [i ] )
if ( ! is.na ( x [i ] ) ) {
2018-11-02 10:27:57 +01:00
next
}
2018-09-27 23:23:48 +02:00
# MISCELLANEOUS ----
# look for old taxonomic names ----
2019-02-18 02:33:37 +01:00
found <- microorganisms.oldDT [tolower ( fullname ) == tolower ( x_backup [i ] )
| fullname %like% x_withspaces_start_end [i ] , ]
2018-09-25 16:44:40 +02:00
if ( NROW ( found ) > 0 ) {
2019-02-18 02:33:37 +01:00
col_id_new <- found [1 , col_id_new ]
2018-11-09 13:11:54 +01:00
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
2019-02-21 23:32:30 +01:00
# mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning)
# mo_ref("Chlamydophila psittaci") = "Everett et al., 1999"
2018-11-09 13:11:54 +01:00
if ( property == " ref" ) {
x [i ] <- found [1 , ref ]
} else {
2019-02-18 02:33:37 +01:00
x [i ] <- microorganismsDT [col_id == found [1 , col_id_new ] , ..property ] [ [1 ] ]
2018-11-09 13:11:54 +01:00
}
2019-02-18 02:33:37 +01:00
was_renamed ( name_old = found [1 , fullname ] ,
name_new = microorganismsDT [col_id == found [1 , col_id_new ] , fullname ] ,
2019-02-08 16:06:54 +01:00
ref_old = found [1 , ref ] ,
2019-02-18 02:33:37 +01:00
ref_new = microorganismsDT [col_id == found [1 , col_id_new ] , ref ] ,
mo = microorganismsDT [col_id == found [1 , col_id_new ] , mo ] )
2018-09-25 16:44:40 +02:00
next
}
2018-09-27 23:23:48 +02:00
# check for uncertain results ----
if ( allow_uncertain == TRUE ) {
2018-12-14 10:52:20 +01:00
2019-02-21 23:32:30 +01:00
uncertain_fn <- function ( a.x_backup , b.x_trimmed , c.x_withspaces_start_end , d.x_withspaces_start_only ) {
2019-02-08 16:06:54 +01:00
# (1) look for genus only, part of name ----
if ( nchar ( b.x_trimmed ) > 4 & ! b.x_trimmed %like% " " ) {
if ( ! grepl ( " ^[A-Z][a-z]+" , b.x_trimmed , ignore.case = FALSE ) ) {
# not when input is like Genustext, because then Neospora would lead to Actinokineospora
found <- microorganismsDT [tolower ( fullname ) %like% paste ( b.x_trimmed , " species" ) , ..property ] [ [1 ] ]
if ( length ( found ) > 0 ) {
x [i ] <- found [1L ]
uncertainties <<- c ( uncertainties ,
paste0 ( " '" , a.x_backup , " ' >> " , microorganismsDT [mo == found [1L ] , fullname ] [ [1 ] ] , " (" , found [1L ] , " )" ) )
return ( x )
}
}
}
# (2) look again for old taxonomic names, now for G. species ----
2019-02-18 02:33:37 +01:00
found <- microorganisms.oldDT [fullname %like% c.x_withspaces_start_end
2019-02-21 23:32:30 +01:00
| fullname %like% d.x_withspaces_start_only ]
2018-12-14 11:44:15 +01:00
if ( NROW ( found ) > 0 & nchar ( b.x_trimmed ) >= 6 ) {
2018-12-14 10:52:20 +01:00
if ( property == " ref" ) {
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
# mo_ref("Chlamydia psittaci) = "Page, 1968" (with warning)
# mo_ref("Chlamydophila psittaci) = "Everett et al., 1999"
x <- found [1 , ref ]
} else {
2019-02-18 02:33:37 +01:00
x <- microorganismsDT [col_id == found [1 , col_id_new ] , ..property ] [ [1 ] ]
2018-12-14 10:52:20 +01:00
}
2019-02-18 02:33:37 +01:00
was_renamed ( name_old = found [1 , fullname ] ,
name_new = microorganismsDT [col_id == found [1 , col_id_new ] , fullname ] ,
2019-02-08 16:06:54 +01:00
ref_old = found [1 , ref ] ,
2019-02-18 02:33:37 +01:00
ref_new = microorganismsDT [col_id == found [1 , col_id_new ] , ref ] ,
mo = microorganismsDT [col_id == found [1 , col_id_new ] , mo ] )
2019-02-08 16:06:54 +01:00
uncertainties <<- c ( uncertainties ,
2019-02-18 02:33:37 +01:00
paste0 ( " '" , a.x_backup , " ' >> " , found [1 , fullname ] , " (Catalogue of Life ID " , found [1 , col_id ] , " )" ) )
2018-12-14 10:52:20 +01:00
return ( x )
2018-11-09 13:11:54 +01:00
}
2018-09-27 23:23:48 +02:00
2019-02-08 16:06:54 +01:00
# (3) strip values between brackets ----
2018-12-14 11:44:15 +01:00
a.x_backup_stripped <- gsub ( " ( [(].*[)])" , " " , a.x_backup )
a.x_backup_stripped <- trimws ( gsub ( " " , " " , a.x_backup_stripped , fixed = TRUE ) )
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup_stripped , clear_options = FALSE , allow_uncertain = FALSE ) ) )
if ( ! is.na ( found ) & nchar ( b.x_trimmed ) >= 6 ) {
2019-01-21 21:24:40 +01:00
found_result <- found
2018-12-14 10:52:20 +01:00
found <- microorganismsDT [mo == found , ..property ] [ [1 ] ]
2019-02-08 16:06:54 +01:00
uncertainties <<- c ( uncertainties ,
paste0 ( " '" , a.x_backup , " ' >> " , microorganismsDT [mo == found_result [1L ] , fullname ] [ [1 ] ] , " (" , found_result [1L ] , " )" ) )
2018-12-14 10:52:20 +01:00
return ( found [1L ] )
}
2018-12-06 14:36:39 +01:00
2019-02-08 16:06:54 +01:00
# (4) try to strip off one element from end and check the remains ----
2018-12-14 11:44:15 +01:00
x_strip <- a.x_backup %>% strsplit ( " " ) %>% unlist ( )
if ( length ( x_strip ) > 1 & nchar ( b.x_trimmed ) >= 6 ) {
for ( i in 1 : ( length ( x_strip ) - 1 ) ) {
x_strip_collapsed <- paste ( x_strip [1 : ( length ( x_strip ) - i ) ] , collapse = " " )
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , clear_options = FALSE , allow_uncertain = FALSE ) ) )
if ( ! is.na ( found ) ) {
2019-01-21 21:24:40 +01:00
found_result <- found
2018-12-14 11:44:15 +01:00
found <- microorganismsDT [mo == found , ..property ] [ [1 ] ]
2019-02-08 16:06:54 +01:00
uncertainties <<- c ( uncertainties ,
paste0 ( " '" , a.x_backup , " ' >> " , microorganismsDT [mo == found_result [1L ] , fullname ] [ [1 ] ] , " (" , found_result [1L ] , " )" ) )
return ( found [1L ] )
}
}
}
# (5) try to strip off one element from start and check the remains ----
x_strip <- a.x_backup %>% strsplit ( " " ) %>% unlist ( )
if ( length ( x_strip ) > 1 & nchar ( b.x_trimmed ) >= 6 ) {
for ( i in 2 : ( length ( x_strip ) ) ) {
x_strip_collapsed <- paste ( x_strip [i : length ( x_strip ) ] , collapse = " " )
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , clear_options = FALSE , allow_uncertain = FALSE ) ) )
if ( ! is.na ( found ) ) {
found_result <- found
found <- microorganismsDT [mo == found , ..property ] [ [1 ] ]
uncertainties <<- c ( uncertainties ,
paste0 ( " '" , a.x_backup , " ' >> " , microorganismsDT [mo == found_result [1L ] , fullname ] [ [1 ] ] , " (" , found_result [1L ] , " )" ) )
2018-12-14 11:44:15 +01:00
return ( found [1L ] )
2018-12-06 14:36:39 +01:00
}
}
}
2018-12-14 11:44:15 +01:00
2019-02-21 23:32:30 +01:00
# (6) not yet implemented taxonomic changes in Catalogue of Life ----
2019-01-21 21:24:40 +01:00
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( TEMPORARY_TAXONOMY ( b.x_trimmed ) , clear_options = FALSE , allow_uncertain = FALSE ) ) )
2019-01-08 16:23:45 +01:00
if ( ! is.na ( found ) ) {
2019-01-21 21:24:40 +01:00
found_result <- found
2019-01-08 16:23:45 +01:00
found <- microorganismsDT [mo == found , ..property ] [ [1 ] ]
2019-02-08 16:06:54 +01:00
warning ( silver ( paste0 ( ' Guessed with uncertainty: "' ,
a.x_backup , ' " >> ' , italic ( microorganismsDT [mo == found_result [1L ] , fullname ] [ [1 ] ] ) , " (" , found_result [1L ] , " )" ) ) ,
2019-01-08 16:23:45 +01:00
call. = FALSE , immediate. = FALSE )
2019-02-08 16:06:54 +01:00
uncertainties <<- c ( uncertainties ,
paste0 ( ' "' , a.x_backup , ' " >> ' , microorganismsDT [mo == found_result [1L ] , fullname ] [ [1 ] ] , " (" , found_result [1L ] , " )" ) )
2019-01-08 16:23:45 +01:00
return ( found [1L ] )
}
2018-12-14 11:44:15 +01:00
# didn't found in uncertain results too
return ( NA_character_ )
2018-12-06 14:36:39 +01:00
}
2018-12-14 10:52:20 +01:00
2019-02-21 23:32:30 +01:00
x [i ] <- uncertain_fn ( x_backup [i ] , x_trimmed [i ] , x_withspaces_start_end [i ] , x_withspaces_start_only [i ] )
2018-09-27 23:23:48 +02:00
if ( ! is.na ( x [i ] ) ) {
next
}
2018-06-08 12:06:54 +02:00
}
2018-09-27 23:23:48 +02:00
# not found ----
x [i ] <- NA_character_
failures <- c ( failures , x_backup [i ] )
}
2018-06-08 12:06:54 +02:00
}
2018-07-23 14:14:03 +02:00
2019-02-08 16:06:54 +01:00
# failures
2018-07-23 14:14:03 +02:00
failures <- failures [ ! failures %in% c ( NA , NULL , NaN ) ]
if ( length ( failures ) > 0 ) {
2018-12-06 14:36:39 +01:00
options ( mo_failures = sort ( unique ( failures ) ) )
2019-02-08 16:06:54 +01:00
plural <- c ( " value" , " it" )
2019-01-21 21:24:40 +01:00
if ( n_distinct ( failures ) > 1 ) {
2019-02-08 16:06:54 +01:00
plural <- c ( " values" , " them" )
2018-12-06 14:36:39 +01:00
}
2019-01-25 13:18:41 +01:00
total_failures <- length ( x_input [x_input %in% failures & ! x_input %in% c ( NA , NULL , NaN ) ] )
total_n <- length ( x_input [ ! x_input %in% c ( NA , NULL , NaN ) ] )
2019-02-08 16:06:54 +01:00
msg <- paste0 ( " \n" , n_distinct ( failures ) , " unique " , plural [1 ] ,
2019-01-25 13:18:41 +01:00
" (^= " , percent ( total_failures / total_n , round = 1 , force_zero = TRUE ) ,
" ) could not be coerced to a valid MO code" )
2019-01-21 21:24:40 +01:00
if ( n_distinct ( failures ) <= 10 ) {
msg <- paste0 ( msg , " : " , paste ( ' "' , unique ( failures ) , ' "' , sep = " " , collapse = ' , ' ) )
}
2019-02-08 16:06:54 +01:00
msg <- paste0 ( msg , " . Use mo_failures() to review " , plural [2 ] , " ." )
warning ( red ( msg ) ,
call. = FALSE ,
immediate. = TRUE ) # thus will always be shown, even if >= warnings
}
# uncertainties
if ( length ( uncertainties ) > 0 ) {
options ( mo_uncertainties = sort ( unique ( uncertainties ) ) )
plural <- c ( " value" , " it" )
if ( n_distinct ( failures ) > 1 ) {
plural <- c ( " values" , " them" )
}
msg <- paste0 ( " \nResults of " , n_distinct ( uncertainties ) , " input " , plural [1 ] ,
" guessed with uncertainty. Use mo_uncertainties() to review " , plural [2 ] , " ." )
2019-01-21 21:24:40 +01:00
warning ( red ( msg ) ,
call. = FALSE ,
immediate. = TRUE ) # thus will always be shown, even if >= warnings
2018-07-23 14:14:03 +02:00
}
2018-08-28 13:51:13 +02:00
2018-09-14 10:31:21 +02:00
# Becker ----
2018-09-01 21:19:46 +02:00
if ( Becker == TRUE | Becker == " all" ) {
# See Source. It's this figure:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4187637/figure/F3/
2018-10-31 12:10:49 +01:00
MOs_staph <- microorganismsDT [genus == " Staphylococcus" ]
2018-09-25 16:44:40 +02:00
setkey ( MOs_staph , species )
CoNS <- MOs_staph [species %in% c ( " arlettae" , " auricularis" , " capitis" ,
" caprae" , " carnosus" , " cohnii" , " condimenti" ,
" devriesei" , " epidermidis" , " equorum" ,
" fleurettii" , " gallinarum" , " haemolyticus" ,
" hominis" , " jettensis" , " kloosii" , " lentus" ,
" lugdunensis" , " massiliensis" , " microti" ,
" muscae" , " nepalensis" , " pasteuri" , " petrasii" ,
" pettenkoferi" , " piscifermentans" , " rostri" ,
" saccharolyticus" , " saprophyticus" , " sciuri" ,
" stepanovicii" , " simulans" , " succinus" ,
2018-09-27 23:23:48 +02:00
" vitulinus" , " warneri" , " xylosus" ) , ..property ] [ [1 ] ]
2018-09-25 16:44:40 +02:00
CoPS <- MOs_staph [species %in% c ( " simiae" , " agnetis" , " chromogenes" ,
" delphini" , " felis" , " lutrae" ,
" hyicus" , " intermedius" ,
" pseudintermedius" , " pseudointermedius" ,
2018-09-27 23:23:48 +02:00
" schleiferi" ) , ..property ] [ [1 ] ]
2018-10-31 12:10:49 +01:00
x [x %in% CoNS ] <- microorganismsDT [mo == ' B_STPHY_CNS' , ..property ] [ [1 ] ] [1L ]
x [x %in% CoPS ] <- microorganismsDT [mo == ' B_STPHY_CPS' , ..property ] [ [1 ] ] [1L ]
2018-09-01 21:19:46 +02:00
if ( Becker == " all" ) {
2018-10-31 12:10:49 +01:00
x [x == microorganismsDT [mo == ' B_STPHY_AUR' , ..property ] [ [1 ] ] [1L ] ] <- microorganismsDT [mo == ' B_STPHY_CPS' , ..property ] [ [1 ] ] [1L ]
2018-09-01 21:19:46 +02:00
}
}
2018-09-14 10:31:21 +02:00
# Lancefield ----
2018-09-04 11:33:30 +02:00
if ( Lancefield == TRUE | Lancefield == " all" ) {
2018-09-27 23:23:48 +02:00
# group A - S. pyogenes
2019-02-18 02:33:37 +01:00
x [x == microorganismsDT [mo == ' B_STRPT_PYO' , ..property ] [ [1 ] ] [1L ] ] <- microorganismsDT [mo == ' B_STRPT_GRA' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
# group B - S. agalactiae
2019-02-18 02:33:37 +01:00
x [x == microorganismsDT [mo == ' B_STRPT_AGA' , ..property ] [ [1 ] ] [1L ] ] <- microorganismsDT [mo == ' B_STRPT_GRB' , ..property ] [ [1 ] ] [1L ]
2018-09-01 21:19:46 +02:00
# group C
2018-10-31 12:10:49 +01:00
S_groupC <- microorganismsDT %>% filter ( genus == " Streptococcus" ,
species %in% c ( " equisimilis" , " equi" ,
" zooepidemicus" , " dysgalactiae" ) ) %>%
2018-09-27 23:23:48 +02:00
pull ( property )
2019-02-18 02:33:37 +01:00
x [x %in% S_groupC ] <- microorganismsDT [mo == ' B_STRPT_GRC' , ..property ] [ [1 ] ] [1L ]
2018-09-04 11:33:30 +02:00
if ( Lancefield == " all" ) {
2018-09-27 23:23:48 +02:00
# all Enterococci
2019-02-18 02:33:37 +01:00
x [x %like% " ^(Enterococcus|B_ENTRC)" ] <- microorganismsDT [mo == ' B_STRPT_GRD' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
}
# group F - S. anginosus
2019-02-18 02:33:37 +01:00
x [x == microorganismsDT [mo == ' B_STRPT_ANG' , ..property ] [ [1 ] ] [1L ] ] <- microorganismsDT [mo == ' B_STRPT_GRF' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
# group H - S. sanguinis
2019-02-18 02:33:37 +01:00
x [x == microorganismsDT [mo == ' B_STRPT_SAN' , ..property ] [ [1 ] ] [1L ] ] <- microorganismsDT [mo == ' B_STRPT_GRH' , ..property ] [ [1 ] ] [1L ]
2018-09-27 23:23:48 +02:00
# group K - S. salivarius
2019-02-18 02:33:37 +01:00
x [x == microorganismsDT [mo == ' B_STRPT_SAL' , ..property ] [ [1 ] ] [1L ] ] <- microorganismsDT [mo == ' B_STRPT_GRK' , ..property ] [ [1 ] ] [1L ]
2018-09-01 21:19:46 +02:00
}
2019-02-08 16:06:54 +01:00
# Wrap up ----------------------------------------------------------------
2018-10-19 13:53:31 +02:00
# comply to x, which is also unique and without empty values
x_input_unique_nonempty <- unique ( x_input [ ! is.na ( x_input ) & ! is.null ( x_input ) & ! identical ( x_input , " " ) ] )
2018-10-19 00:17:03 +02:00
2018-08-28 13:51:13 +02:00
# left join the found results to the original input values (x_input)
2018-10-19 13:53:31 +02:00
df_found <- data.frame ( input = as.character ( x_input_unique_nonempty ) ,
found = as.character ( x ) ,
2018-08-28 13:51:13 +02:00
stringsAsFactors = FALSE )
2018-09-27 23:23:48 +02:00
df_input <- data.frame ( input = as.character ( x_input ) ,
2018-08-28 13:51:13 +02:00
stringsAsFactors = FALSE )
2018-09-27 23:23:48 +02:00
x <- df_input %>%
left_join ( df_found ,
by = " input" ) %>%
pull ( found )
if ( property == " mo" ) {
class ( x ) <- " mo"
}
2019-02-08 16:06:54 +01:00
if ( length ( mo_renamed ( ) ) > 0 ) {
if ( has_color ( ) ) {
notes <- getOption ( " mo_renamed" )
} else {
notes <- mo_renamed ( )
}
2018-12-14 10:52:20 +01:00
notes <- sort ( notes )
for ( i in 1 : length ( notes ) ) {
2019-02-08 16:06:54 +01:00
base :: message ( blue ( paste ( " Note:" , notes [i ] ) ) )
2018-12-14 10:52:20 +01:00
}
}
2018-06-08 12:06:54 +02:00
x
}
2018-07-23 14:14:03 +02:00
2019-01-21 21:24:40 +01:00
TEMPORARY_TAXONOMY <- function ( x ) {
2019-01-08 16:23:45 +01:00
x [x %like% ' Cutibacterium' ] <- gsub ( ' Cutibacterium' , ' Propionibacterium' , x [x %like% ' Cutibacterium' ] )
2019-01-21 21:24:40 +01:00
x
2019-01-08 16:23:45 +01:00
}
2018-12-06 16:07:06 +01:00
#' @importFrom crayon blue italic
2019-02-08 16:06:54 +01:00
was_renamed <- function ( name_old , name_new , ref_old = " " , ref_new = " " , mo = " " ) {
2018-10-01 14:44:40 +02:00
if ( ! is.na ( ref_old ) ) {
ref_old <- paste0 ( " (" , ref_old , " )" )
} else {
ref_old <- " "
}
if ( ! is.na ( ref_new ) ) {
ref_new <- paste0 ( " (" , ref_new , " )" )
} else {
ref_new <- " "
}
2018-12-06 16:07:06 +01:00
if ( ! is.na ( mo ) ) {
mo <- paste0 ( " (" , mo , " )" )
} else {
mo <- " "
}
msg <- paste0 ( italic ( name_old ) , ref_old , " was renamed " , italic ( name_new ) , ref_new , mo )
msg <- gsub ( " et al." , italic ( " et al." ) , msg )
2019-02-08 16:06:54 +01:00
options ( mo_renamed = sort ( msg ) )
2018-09-25 16:44:40 +02:00
}
2018-08-31 13:36:19 +02:00
#' @exportMethod print.mo
#' @export
#' @noRd
print.mo <- function ( x , ... ) {
cat ( " Class 'mo'\n" )
2018-10-12 16:35:18 +02:00
x_names <- names ( x )
x <- as.character ( x )
names ( x ) <- x_names
print.default ( x , quote = FALSE )
2018-08-31 13:36:19 +02:00
}
2018-07-23 14:14:03 +02:00
2018-12-07 12:04:55 +01:00
#' @exportMethod summary.mo
#' @export
#' @noRd
summary.mo <- function ( object , ... ) {
# unique and top 1-3
x <- object
top_3 <- unname ( top_freq ( freq ( x ) , 3 ) )
c ( " Class" = " mo" ,
" <NA>" = length ( x [is.na ( x ) ] ) ,
" Unique" = dplyr :: n_distinct ( x [ ! is.na ( x ) ] ) ,
" #1" = top_3 [1 ] ,
" #2" = top_3 [2 ] ,
" #3" = top_3 [3 ] )
}
2018-08-31 13:36:19 +02:00
#' @exportMethod as.data.frame.mo
2018-07-23 14:14:03 +02:00
#' @export
2018-08-31 13:36:19 +02:00
#' @noRd
as.data.frame.mo <- function ( x , ... ) {
2018-10-31 12:10:49 +01:00
# same as as.data.frame.character but with removed stringsAsFactors, since it will be class "mo"
2018-08-31 13:36:19 +02:00
nm <- paste ( deparse ( substitute ( x ) , width.cutoff = 500L ) ,
collapse = " " )
if ( ! " nm" %in% names ( list ( ... ) ) ) {
as.data.frame.vector ( x , ... , nm = nm )
} else {
as.data.frame.vector ( x , ... )
}
}
#' @exportMethod pull.mo
#' @export
#' @importFrom dplyr pull
#' @noRd
pull.mo <- function ( .data , ... ) {
pull ( as.data.frame ( .data ) , ... )
2018-07-23 14:14:03 +02:00
}
2018-12-06 14:36:39 +01:00
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
2018-12-06 14:36:39 +01:00
#' @export
mo_failures <- function ( ) {
getOption ( " mo_failures" )
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
#' @export
mo_uncertainties <- function ( ) {
getOption ( " mo_uncertainties" )
}
#' @rdname as.mo
2018-12-06 14:36:39 +01:00
#' @export
mo_renamed <- function ( ) {
2019-02-08 16:06:54 +01:00
strip_style ( gsub ( " was renamed" , " >>" , getOption ( " mo_renamed" ) , fixed = TRUE ) )
2018-12-06 14:36:39 +01:00
}