mirror of
https://github.com/msberends/AMR.git
synced 2026-03-07 12:03:10 +01:00
1370 lines
63 KiB
R
Executable File
1370 lines
63 KiB
R
Executable File
# ==================================================================== #
|
|
# TITLE: #
|
|
# AMR: An R Package for Working with Antimicrobial Resistance Data #
|
|
# #
|
|
# SOURCE CODE: #
|
|
# https://github.com/msberends/AMR #
|
|
# #
|
|
# PLEASE CITE THIS SOFTWARE AS: #
|
|
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
|
|
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
|
|
# Journal of Statistical Software, 104(3), 1-31. #
|
|
# https://doi.org/10.18637/jss.v104.i03 #
|
|
# #
|
|
# Developed at the University of Groningen and the University Medical #
|
|
# Center Groningen in The Netherlands, in collaboration with many #
|
|
# colleagues from around the world, see our website. #
|
|
# #
|
|
# This R package is free software; you can freely use and distribute #
|
|
# it for both personal and commercial purposes under the terms of the #
|
|
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
|
|
# the Free Software Foundation. #
|
|
# We created this package for both routine data analysis and academic #
|
|
# research and it was publicly released in the hope that it will be #
|
|
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
|
|
# #
|
|
# Visit our website for the full manual and a complete tutorial about #
|
|
# how to conduct AMR data analysis: https://amr-for-r.org #
|
|
# ==================================================================== #
|
|
|
|
#' Transform Arbitrary Input to Valid Microbial Taxonomy
|
|
#'
|
|
#' Use this function to get a valid microorganism code ([`mo`]) based on arbitrary user input. Determination is done using intelligent rules and the complete taxonomic tree of the kingdoms `r vector_and(unique(microorganisms$kingdom[which(!grepl("(unknown|Fungi)", microorganisms$kingdom))]), quotes = FALSE)`, and most microbial species from the kingdom Fungi (see *Source*). The input can be almost anything: a full name (like `"Staphylococcus aureus"`), an abbreviated name (such as `"S. aureus"`), an abbreviation known in the field (such as `"MRSA"`), or just a genus. See *Examples*.
|
|
#' @param x A [character] vector or a [data.frame] with one or two columns.
|
|
#' @param Becker A [logical] to indicate whether staphylococci should be categorised into coagulase-negative staphylococci ("CoNS") and coagulase-positive staphylococci ("CoPS") instead of their own species, according to Karsten Becker *et al.* (see *Source*). Please see *Details* for a full list of staphylococcal species that will be converted.
|
|
#'
|
|
#' This excludes *Staphylococcus aureus* at default, use `Becker = "all"` to also categorise *S. aureus* as "CoPS".
|
|
#' @param Lancefield A [logical] to indicate whether a beta-haemolytic *Streptococcus* should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield (see *Source*). These streptococci will be categorised in their first group, e.g. *Streptococcus dysgalactiae* will be group C, although officially it was also categorised into groups G and L. . Please see *Details* for a full list of streptococcal species that will be converted.
|
|
#'
|
|
#' This excludes enterococci at default (who are in group D), use `Lancefield = "all"` to also categorise all enterococci as group D.
|
|
#' @param minimum_matching_score A numeric value to set as the lower limit for the [MO matching score][mo_matching_score()]. When left blank, this will be determined automatically based on the character length of `x`, its [taxonomic kingdom][microorganisms] and [human pathogenicity][mo_matching_score()].
|
|
#' @param keep_synonyms A [logical] to indicate if outdated, previously valid taxonomic names must be preserved and not be corrected to currently accepted names. Do note that the term "synonym" is in this case jargon from the field of microbial taxonomy - it is not in place to denote that e.g. "Streptococcus Group A" is a synonym of *S. pyogenes*. Though this is practically the case, taxonomically it is not as "Streptococcus Group A" is not even a valid taxonomic name.
|
|
#'
|
|
#' The default is `FALSE`, which will return a note if outdated taxonomic names were processed. The default can be set with the package option [`AMR_keep_synonyms`][AMR-options], i.e. `options(AMR_keep_synonyms = TRUE)` or `options(AMR_keep_synonyms = FALSE)`.
|
|
#' @param reference_df A [data.frame] to be used for extra reference when translating `x` to a valid [`mo`]. See [set_mo_source()] and [get_mo_source()] to automate the usage of your own codes (e.g. used in your analysis or organisation).
|
|
#' @param ignore_pattern A Perl-compatible [regular expression][base::regex] (case-insensitive) of which all matches in `x` must return `NA`. This can be convenient to exclude known non-relevant input and can also be set with the package option [`AMR_ignore_pattern`][AMR-options], e.g. `options(AMR_ignore_pattern = "(not reported|contaminated flora)")`.
|
|
#' @param cleaning_regex A Perl-compatible [regular expression][base::regex] (case-insensitive) to clean the input of `x`. Every matched part in `x` will be removed. At default, this is the outcome of [mo_cleaning_regex()], which removes texts between brackets and texts such as "species" and "serovar". The default can be set with the package option [`AMR_cleaning_regex`][AMR-options].
|
|
#' @param only_fungi A [logical] to indicate if only fungi must be found, making sure that e.g. misspellings always return records from the kingdom of Fungi. This can be set globally for [all microorganism functions][mo_property()] with the package option [`AMR_only_fungi`][AMR-options], i.e. `options(AMR_only_fungi = TRUE)`.
|
|
#' @param language Language to translate text like "no growth", which defaults to the system language (see [get_AMR_locale()]).
|
|
#' @param info A [logical] to indicate that info must be printed, e.g. a progress bar when more than 25 items are to be coerced, or a list with outdated taxonomic names. The default is `TRUE` only in interactive mode.
|
|
#' @param ... Other arguments passed on to functions.
|
|
#' @rdname as.mo
|
|
#' @aliases mo
|
|
#' @details
|
|
#' A microorganism (MO) code from this package (class: [`mo`]) is human-readable and typically looks like these examples:
|
|
#'
|
|
#' ```
|
|
#' Code Full name
|
|
#' --------------- --------------------------------------
|
|
#' B_KLBSL Klebsiella
|
|
#' B_KLBSL_PNMN Klebsiella pneumoniae
|
|
#' B_KLBSL_PNMN_RHNS Klebsiella pneumoniae rhinoscleromatis
|
|
#' | | | |
|
|
#' | | | |
|
|
#' | | | \---> subspecies, a 3-5 letter acronym
|
|
#' | | \----> species, a 3-6 letter acronym
|
|
#' | \----> genus, a 4-8 letter acronym
|
|
#' \----> kingdom: A (Archaea), AN (Animalia), B (Bacteria),
|
|
#' C (Chromista), F (Fungi), PL (Plantae),
|
|
#' P (Protozoa)
|
|
#' ```
|
|
#'
|
|
#' Values that cannot be coerced will be considered 'unknown' and will return the MO code `UNKNOWN` with a warning.
|
|
#'
|
|
#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see *Examples*.
|
|
#'
|
|
#' The [as.mo()] function uses a novel and scientifically validated (\doi{10.18637/jss.v104.i03}) matching score algorithm (see *Matching Score for Microorganisms* below) to match input against the [available microbial taxonomy][microorganisms] in this package. This implicates that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first.
|
|
#'
|
|
#' ### Coping with Uncertain Results
|
|
#'
|
|
#' Results of non-exact taxonomic input are based on their [matching score][mo_matching_score()]. The lowest allowed score can be set with the `minimum_matching_score` argument. At default this will be determined based on the character length of the input, the [taxonomic kingdom][microorganisms], and the [human pathogenicity][mo_matching_score()] of the taxonomic outcome. If values are matched with uncertainty, a message will be shown to suggest the user to inspect the results with [mo_uncertainties()], which returns a [data.frame] with all specifications.
|
|
#'
|
|
#' To increase the quality of matching, the `cleaning_regex` argument is used to clean the input. This must be a [regular expression][base::regex] that matches parts of the input that should be removed before the input is matched against the [available microbial taxonomy][microorganisms]. It will be matched Perl-compatible and case-insensitive. The default value of `cleaning_regex` is the outcome of the helper function [mo_cleaning_regex()].
|
|
#'
|
|
#' There are three helper functions that can be run after using the [as.mo()] function:
|
|
#' - Use [mo_uncertainties()] to get a [data.frame] that prints in a pretty format with all taxonomic names that were guessed. The output contains the matching score for all matches (see *Matching Score for Microorganisms* below).
|
|
#' - Use [mo_failures()] to get a [character] [vector] with all values that could not be coerced to a valid value.
|
|
#' - Use [mo_renamed()] to get a [data.frame] with all values that could be coerced based on outdated, previously accepted taxonomic names.
|
|
#'
|
|
#' ### For Mycologists
|
|
#'
|
|
#' The [matching score algorithm][mo_matching_score()] gives precedence to bacteria over fungi. If you are only analysing fungi, be sure to use `only_fungi = TRUE`, or better yet, add this to your code and run it once every session:
|
|
#'
|
|
#' ```r
|
|
#' options(AMR_only_fungi = TRUE)
|
|
#' ```
|
|
#'
|
|
#' This will make sure that no bacteria or other 'non-fungi' will be returned by [as.mo()], or any of the [`mo_*`][mo_property()] functions.
|
|
#'
|
|
#' ### Coagulase-negative and Coagulase-positive Staphylococci
|
|
#'
|
|
#' With `Becker = TRUE`, the following staphylococci will be converted to their corresponding coagulase group:
|
|
#'
|
|
#' * Coagulase-negative: `r vector_and(gsub("Staphylococcus", "S.", mo_name(MO_CONS[MO_CONS != "B_STPHY_CONS"], keep_synonyms = TRUE)), quotes = "*")`
|
|
#' * Coagulase-positive: `r vector_and(gsub("Staphylococcus", "S.", mo_name(MO_COPS[MO_COPS != "B_STPHY_COPS"], keep_synonyms = TRUE)), quotes = "*")`
|
|
#'
|
|
#' This is based on:
|
|
#'
|
|
#' * Becker K *et al.* (2014). **Coagulase-Negative Staphylococci.** *Clin Microbiol Rev.* 27(4): 870-926; \doi{10.1128/CMR.00109-13}
|
|
#' * Becker K *et al.* (2019). **Implications of identifying the recently defined members of the *S. aureus* complex, *S. argenteus* and *S. schweitzeri*: A position paper of members of the ESCMID Study Group for staphylococci and Staphylococcal Diseases (ESGS).** *Clin Microbiol Infect*; \doi{10.1016/j.cmi.2019.02.028}
|
|
#' * Becker K *et al.* (2020). **Emergence of coagulase-negative staphylococci.** *Expert Rev Anti Infect Ther.* 18(4):349-366; \doi{10.1080/14787210.2020.1730813}
|
|
#'
|
|
#' For newly named staphylococcal species, such as *S. brunensis* (2024) and *S. shinii* (2023), we looked up the scientific reference to make sure the species are considered for the correct coagulase group.
|
|
#'
|
|
#' ### Lancefield Groups in Streptococci
|
|
#'
|
|
#' With `Lancefield = TRUE`, the following streptococci will be converted to their corresponding Lancefield group:
|
|
#'
|
|
#' * `r paste(apply(aggregate(mo_name ~ mo_group_name, data = microorganisms.groups[microorganisms.groups$mo_group_name %like_case% "Streptococcus Group [A-Z]$", ], FUN = function(x) vector_and(gsub("Streptococcus", "S.", x, fixed = TRUE), quotes = "*", sort = TRUE)), 1, function(row) paste(row["mo_group_name"], ": ", row["mo_name"], sep = "")), collapse = "\n* ")`
|
|
#'
|
|
#' This is based on:
|
|
#'
|
|
#' * Lancefield RC (1933). **A serological differentiation of human and other groups of hemolytic streptococci.** *J Exp Med.* 57(4): 571-95; \doi{10.1084/jem.57.4.571}
|
|
#'
|
|
#' @inheritSection mo_matching_score Matching Score for Microorganisms
|
|
#'
|
|
# (source as a section here, so it can be inherited by other man pages)
|
|
#' @section Source:
|
|
#' * Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}
|
|
#' * `r TAXONOMY_VERSION$LPSN$citation` Accessed from <`r TAXONOMY_VERSION$LPSN$url`> on `r documentation_date(TAXONOMY_VERSION$LPSN$accessed_date)`.
|
|
#' * `r TAXONOMY_VERSION$MycoBank$citation` Accessed from <`r TAXONOMY_VERSION$MycoBank$url`> on `r documentation_date(TAXONOMY_VERSION$MycoBank$accessed_date)`.
|
|
#' * `r TAXONOMY_VERSION$GBIF$citation` Accessed from <`r TAXONOMY_VERSION$GBIF$url`> on `r documentation_date(TAXONOMY_VERSION$GBIF$accessed_date)`.
|
|
#' * `r TAXONOMY_VERSION$BacDive$citation` Accessed from <`r TAXONOMY_VERSION$BacDive$url`> on `r documentation_date(TAXONOMY_VERSION$BacDive$accessed_date)`.
|
|
#' * `r TAXONOMY_VERSION$SNOMED$citation` URL: <`r TAXONOMY_VERSION$SNOMED$url`>
|
|
#' * Bartlett A *et al.* (2022). **A comprehensive list of bacterial pathogens infecting humans** *Microbiology* 168:001269; \doi{10.1099/mic.0.001269}
|
|
#' @export
|
|
#' @return A [character] [vector] with additional class [`mo`]
|
|
#' @seealso [microorganisms] for the [data.frame] that is being used to determine ID's.
|
|
#'
|
|
#' The [`mo_*`][mo_property()] functions (such as [mo_genus()], [mo_gramstain()]) to get properties based on the returned code.
|
|
#' @inheritSection AMR Download Our Reference Data
|
|
#' @examples
|
|
#' \donttest{
|
|
#' # These examples all return "B_STPHY_AURS", the ID of S. aureus:
|
|
#' as.mo(c(
|
|
#' "sau", # WHONET code
|
|
#' "stau",
|
|
#' "STAU",
|
|
#' "staaur",
|
|
#' "S. aureus",
|
|
#' "S aureus",
|
|
#' "Sthafilokkockus aureus", # handles incorrect spelling
|
|
#' "Staphylococcus aureus (MRSA)",
|
|
#' "MRSA", # Methicillin Resistant S. aureus
|
|
#' "VISA", # Vancomycin Intermediate S. aureus
|
|
#' "VRSA", # Vancomycin Resistant S. aureus
|
|
#' 115329001 # SNOMED CT code
|
|
#' ))
|
|
#'
|
|
#' # Dyslexia is no problem - these all work:
|
|
#' as.mo(c(
|
|
#' "Ureaplasma urealyticum",
|
|
#' "Ureaplasma urealyticus",
|
|
#' "Ureaplasmium urealytica",
|
|
#' "Ureaplazma urealitycium"
|
|
#' ))
|
|
#'
|
|
#' # input will get cleaned up with the input given in the `cleaning_regex` argument,
|
|
#' # which defaults to `mo_cleaning_regex()`:
|
|
#' cat(mo_cleaning_regex(), "\n")
|
|
#'
|
|
#' as.mo("Streptococcus group A")
|
|
#'
|
|
#' as.mo("S. epidermidis") # will remain species: B_STPHY_EPDR
|
|
#' as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CONS
|
|
#'
|
|
#' as.mo("S. pyogenes") # will remain species: B_STRPT_PYGN
|
|
#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRPA
|
|
#'
|
|
#' # All mo_* functions use as.mo() internally too (see ?mo_property):
|
|
#' mo_genus("E. coli")
|
|
#' mo_gramstain("ESCO")
|
|
#' mo_is_intrinsic_resistant("ESCCOL", ab = "vanco")
|
|
#' }
|
|
as.mo <- function(x,
|
|
Becker = FALSE,
|
|
Lancefield = FALSE,
|
|
minimum_matching_score = NULL,
|
|
keep_synonyms = getOption("AMR_keep_synonyms", FALSE),
|
|
reference_df = get_mo_source(),
|
|
ignore_pattern = getOption("AMR_ignore_pattern", NULL),
|
|
cleaning_regex = getOption("AMR_cleaning_regex", mo_cleaning_regex()),
|
|
only_fungi = getOption("AMR_only_fungi", FALSE),
|
|
language = get_AMR_locale(),
|
|
info = interactive(),
|
|
...) {
|
|
meet_criteria(x, allow_class = c("mo", "data.frame", "list", "character", "numeric", "integer", "factor"), allow_NA = TRUE)
|
|
meet_criteria(Becker, allow_class = c("logical", "character"), has_length = 1)
|
|
meet_criteria(Lancefield, allow_class = c("logical", "character"), has_length = 1)
|
|
meet_criteria(minimum_matching_score, allow_class = c("numeric", "integer"), has_length = 1, allow_NULL = TRUE, is_positive_or_zero = TRUE, is_finite = TRUE)
|
|
meet_criteria(keep_synonyms, allow_class = "logical", has_length = 1)
|
|
meet_criteria(reference_df, allow_class = "data.frame", allow_NULL = TRUE)
|
|
meet_criteria(ignore_pattern, allow_class = "character", has_length = 1, allow_NULL = TRUE)
|
|
meet_criteria(cleaning_regex, allow_class = "character", has_length = 1, allow_NULL = TRUE)
|
|
meet_criteria(only_fungi, allow_class = "logical", has_length = 1)
|
|
language <- validate_language(language)
|
|
meet_criteria(info, allow_class = "logical", has_length = 1)
|
|
|
|
add_MO_lookup_to_AMR_env()
|
|
|
|
if (tryCatch(all(x %in% c(AMR_env$MO_lookup$mo, NA)), error = function(e) FALSE) &&
|
|
isFALSE(Becker) &&
|
|
isFALSE(Lancefield) &&
|
|
isTRUE(keep_synonyms)) {
|
|
# don't look into valid MO codes, just return them
|
|
# is.mo() won't work - MO codes might change between package versions
|
|
return(set_clean_class(x, new_class = c("mo", "character")))
|
|
}
|
|
|
|
# start off with replaced language-specific non-ASCII characters with ASCII characters
|
|
x <- parse_and_convert(x)
|
|
# replace mo codes used in older package versions
|
|
x <- replace_old_mo_codes(x, property = "mo")
|
|
# ignore cases that match the ignore pattern
|
|
x <- replace_ignore_pattern(x, ignore_pattern)
|
|
|
|
x_lower <- tolower(x)
|
|
|
|
# WHONET: xxx = no growth
|
|
x[x_lower %in% c("", "xxx", "na", "nan")] <- NA_character_
|
|
|
|
out <- rep(NA_character_, length(x))
|
|
|
|
# below we use base R's match(), known for powering '%in%', and incredibly fast!
|
|
|
|
# From reference_df ----
|
|
reference_df <- repair_reference_df(reference_df)
|
|
if (!is.null(reference_df)) {
|
|
out[x %in% reference_df[[1]]] <- reference_df[[2]][match(x[x %in% reference_df[[1]]], reference_df[[1]])]
|
|
}
|
|
# From MO code ----
|
|
out[is.na(out) & toupper(x) %in% AMR_env$MO_lookup$mo] <- toupper(x[is.na(out) & toupper(x) %in% AMR_env$MO_lookup$mo])
|
|
# From full name ----
|
|
out[is.na(out) & x_lower %in% AMR_env$MO_lookup$fullname_lower] <- AMR_env$MO_lookup$mo[match(x_lower[is.na(out) & x_lower %in% AMR_env$MO_lookup$fullname_lower], AMR_env$MO_lookup$fullname_lower)]
|
|
# one exception: "Fungi" matches the kingdom, but instead it should return the 'unknown' code for fungi
|
|
out[out == "F_[KNG]_FUNGI"] <- "F_FUNGUS"
|
|
# From known codes ----
|
|
ind <- is.na(out) & toupper(x) %in% AMR::microorganisms.codes$code
|
|
out[ind] <- AMR::microorganisms.codes$mo[match(toupper(x)[ind], AMR::microorganisms.codes$code)]
|
|
if (length(which(ind)) > 0 && isTRUE(info) && message_not_thrown_before("as.mo_microorganisms.codes", is.na(out), toupper(x))) {
|
|
message_(
|
|
"Retrieved value", ifelse(sum(ind) > 1, "s", ""),
|
|
" from the `microorganisms.codes` data set for ", vector_and(toupper(x)[ind]), "."
|
|
)
|
|
}
|
|
# From SNOMED ----
|
|
# based on this extremely fast gem: https://stackoverflow.com/a/11002456/4575331
|
|
snomeds <- unlist(AMR_env$MO_lookup$snomed)
|
|
snomeds <- snomeds[!is.na(snomeds)]
|
|
out[is.na(out) & x %in% snomeds] <- AMR_env$MO_lookup$mo[rep(seq_along(AMR_env$MO_lookup$snomed), vapply(FUN.VALUE = double(1), AMR_env$MO_lookup$snomed, length))[match(x[is.na(out) & x %in% snomeds], snomeds)]]
|
|
# From other familiar output ----
|
|
# such as Salmonella groups, colloquial names, etc.
|
|
out[is.na(out)] <- convert_colloquial_input(x[is.na(out)])
|
|
# From previous hits in this session ----
|
|
old <- out
|
|
out[is.na(out) & paste(x, minimum_matching_score, only_fungi) %in% AMR_env$mo_previously_coerced$x] <- AMR_env$mo_previously_coerced$mo[match(paste(x, minimum_matching_score, only_fungi)[is.na(out) & paste(x, minimum_matching_score, only_fungi) %in% AMR_env$mo_previously_coerced$x], AMR_env$mo_previously_coerced$x)]
|
|
new <- out
|
|
if (isTRUE(info) && message_not_thrown_before("as.mo", old, new, entire_session = TRUE) && any(is.na(old) & !is.na(new), na.rm = TRUE)) {
|
|
message_(
|
|
"Returning previously coerced value", ifelse(sum(is.na(old) & !is.na(new)) > 1, "s", ""),
|
|
" for ", vector_and(x[is.na(old) & !is.na(new)]), ". Run `mo_reset_session()` to reset this. This note will be shown once per session for this input."
|
|
)
|
|
}
|
|
|
|
# For all other input ----
|
|
if (any(is.na(out) & !is.na(x))) {
|
|
# reset uncertainties
|
|
AMR_env$mo_uncertainties <- AMR_env$mo_uncertainties[0, ]
|
|
AMR_env$mo_failures <- NULL
|
|
|
|
# Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc.
|
|
x[trimws2(x) %like% translate_AMR("no .*growth", language = language)] <- NA_character_
|
|
x[trimws2(x) %like% paste0("^(", translate_AMR("no|not", language = language), ") ")] <- NA_character_
|
|
|
|
# groups are in our taxonomic table with a capital G
|
|
x <- gsub(" group( |$)", " Group\\1", x, perl = TRUE)
|
|
|
|
# convert translations
|
|
x[x %like_case% "enter[o\u00F6]?[ck]o[ck](ken)?$"] <- gsub("(.* )?enter[o\u00F6]?[ck]o[ck](ken)?$", "enterococcus", x[x %like_case% "enter[o\u00F6]?[ck]o[ck](ken)?$"], perl = TRUE)
|
|
x[x %like_case% "strept[o\u00F6]?[ck]o[ck](ken)?$"] <- gsub("(.* )?strept[o\u00F6]?[ck]o[ck](ken)?$", "streptococcus", x[x %like_case% "strept[o\u00F6]?[ck]o[ck](ken)?$"], perl = TRUE)
|
|
x[x %like_case% "staph[yij]?[lo]*[ck]o[ck](ken)?$"] <- gsub("(.* )?staph[yij]?[lo]*[ck]o[ck](ken)?$", "staphylococcus", x[x %like_case% "staph[yij]?[lo]*[ck]o[ck](ken)?$"], perl = TRUE)
|
|
|
|
# run over all unique leftovers
|
|
x_unique <- unique(x[is.na(out) & !is.na(x)])
|
|
|
|
# set up progress bar
|
|
progress <- progress_ticker(n = length(x_unique), n_min = 10, print = info, title = "Converting microorganism input")
|
|
on.exit(close(progress))
|
|
|
|
msg <- character(0)
|
|
|
|
MO_lookup_current <- AMR_env$MO_lookup
|
|
if (isTRUE(only_fungi)) {
|
|
MO_lookup_current <- MO_lookup_current[MO_lookup_current$kingdom == "Fungi", , drop = FALSE]
|
|
}
|
|
|
|
# run it
|
|
x_coerced <- vapply(FUN.VALUE = character(1), x_unique, function(x_search) {
|
|
progress$tick()
|
|
|
|
# some required cleaning steps
|
|
x_out <- trimws2(x_search)
|
|
# this applies the `cleaning_regex` argument, which defaults to mo_cleaning_regex()
|
|
x_out <- gsub(cleaning_regex, " ", x_out, ignore.case = TRUE, perl = TRUE)
|
|
x_out <- trimws2(gsub(" +", " ", x_out, perl = TRUE))
|
|
x_search_cleaned <- x_out
|
|
x_out <- tolower(x_out)
|
|
# when x_search_cleaned are only capitals (such as in codes), make them lowercase to increase matching score
|
|
x_search_cleaned[x_search_cleaned == toupper(x_search_cleaned)] <- x_out[x_search_cleaned == toupper(x_search_cleaned)]
|
|
|
|
# first check if cleaning led to an exact result, case-insensitive
|
|
if (x_out %in% MO_lookup_current$fullname_lower) {
|
|
return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
|
|
}
|
|
|
|
# input must not be too short
|
|
if (nchar(x_out) < 3) {
|
|
return("UNKNOWN")
|
|
}
|
|
|
|
# take out the parts, split by space
|
|
x_parts <- strsplit(gsub("-", " ", x_out, fixed = TRUE), " ", fixed = TRUE)[[1]]
|
|
# do a pre-match on first character (and if it contains a space, first chars of first two terms)
|
|
if (length(x_parts) %in% c(2, 3)) {
|
|
# for genus + species + subspecies
|
|
if (paste(x_parts[1:2], collapse = " ") %in% MO_lookup_current$fullname_lower) {
|
|
filtr <- which(MO_lookup_current$fullname_lower %like% paste(x_parts[1:2], collapse = " "))
|
|
} else if (x_parts[1] %in% MO_lookup_current$genus_lower && !paste(x_parts[1:2], collapse = " ") %in% MO_lookup_current$fullname_lower) {
|
|
# for a known genus, but unknown (sub)species
|
|
filtr <- which(MO_lookup_current$genus_lower == x_parts[1])
|
|
minimum_matching_score <- 0.05
|
|
} else if (nchar(gsub("[^a-z]", "", x_parts[1], perl = TRUE)) <= 3) {
|
|
filtr <- which(MO_lookup_current$full_first == substr(x_parts[1], 1, 1) &
|
|
(MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
|
|
MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) |
|
|
MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1)))
|
|
} else {
|
|
filtr <- which(MO_lookup_current$full_first == substr(x_parts[1], 1, 1) |
|
|
MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
|
|
MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) |
|
|
MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1))
|
|
}
|
|
} else if (length(x_parts) > 3) {
|
|
first_chars <- paste0("(^| )[", paste(substr(x_parts, 1, 1), collapse = ""), "]")
|
|
filtr <- which(MO_lookup_current$full_first %like_case% first_chars)
|
|
} else if (nchar(x_out) == 3) {
|
|
# no space and 3 characters - probably a code such as SAU or ECO
|
|
msg <<- c(msg, paste0("Input \"", x_search, "\" was assumed to be a microorganism code - tried to match on \"", totitle(substr(x_out, 1, 1)), AMR_env$ellipsis_icon, " ", substr(x_out, 2, 3), AMR_env$ellipsis_icon, "\""))
|
|
filtr <- which(MO_lookup_current$fullname_lower %like_case% paste0("(^| )", substr(x_out, 1, 1), ".* ", substr(x_out, 2, 3)))
|
|
} else if (nchar(x_out) == 4) {
|
|
# no space and 4 characters - probably a code such as STAU or ESCO
|
|
msg <<- c(msg, paste0("Input \"", x_search, "\" was assumed to be a microorganism code - tried to match on \"", totitle(substr(x_out, 1, 2)), AMR_env$ellipsis_icon, " ", substr(x_out, 3, 4), AMR_env$ellipsis_icon, "\""))
|
|
filtr <- which(MO_lookup_current$fullname_lower %like_case% paste0("(^| )", substr(x_out, 1, 2), ".* ", substr(x_out, 3, 4)))
|
|
} else if (nchar(x_out) <= 6) {
|
|
# no space and 5-6 characters - probably a code such as STAAUR or ESCCOL
|
|
first_part <- paste0(substr(x_out, 1, 2), "[a-z]*", substr(x_out, 3, 3))
|
|
second_part <- substr(x_out, 4, nchar(x_out))
|
|
msg <<- c(msg, paste0("Input \"", x_search, "\" was assumed to be a microorganism code - tried to match on \"", gsub("[a-z]*", AMR_env$ellipsis_icon, totitle(first_part), fixed = TRUE), " ", second_part, AMR_env$ellipsis_icon, "\""))
|
|
filtr <- which(MO_lookup_current$fullname_lower %like_case% paste0("(^| )", first_part, ".* ", second_part))
|
|
} else {
|
|
# for genus or species or subspecies
|
|
filtr <- which(MO_lookup_current$full_first == substr(x_parts, 1, 1) |
|
|
MO_lookup_current$species_first == substr(x_parts, 1, 1) |
|
|
MO_lookup_current$subspecies_first == substr(x_parts, 1, 1))
|
|
}
|
|
if (length(filtr) == 0) {
|
|
mo_to_search <- MO_lookup_current$fullname
|
|
} else {
|
|
mo_to_search <- MO_lookup_current$fullname[filtr]
|
|
}
|
|
|
|
AMR_env$mo_to_search <- mo_to_search
|
|
# determine the matching score on the original search value
|
|
m <- mo_matching_score(x = x_search_cleaned, n = mo_to_search)
|
|
if (is.null(minimum_matching_score)) {
|
|
minimum_matching_score_current <- min(0.6, min(10, nchar(x_search_cleaned)) * 0.08)
|
|
# correct back for prevalence
|
|
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup_current$prevalence[match(mo_to_search, MO_lookup_current$fullname)]
|
|
# correct back for kingdom
|
|
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup_current$kingdom_index[match(mo_to_search, MO_lookup_current$fullname)]
|
|
minimum_matching_score_current <- pmax(minimum_matching_score_current, m)
|
|
if (length(x_parts) > 1 && all(m <= 0.55, na.rm = TRUE)) {
|
|
# if the highest score is 0.5, we have nothing serious - 0.5 is the lowest for pathogenic group 1
|
|
# make everything NA so the results will get removed below
|
|
# (we added length(x_parts) > 1 to exclude microbial codes from this rule, such as "STAU")
|
|
m[seq_len(length(m))] <- NA_real_
|
|
}
|
|
} else {
|
|
# minimum_matching_score was set, so remove everything below it
|
|
m[m < minimum_matching_score] <- NA_real_
|
|
minimum_matching_score_current <- minimum_matching_score
|
|
}
|
|
|
|
top_hits <- mo_to_search[order(m, decreasing = TRUE, na.last = NA)] # na.last = NA will remove the NAs
|
|
if (length(top_hits) == 0) {
|
|
warning_("No hits found for \"", x_search, "\" with minimum_matching_score = ", ifelse(is.null(minimum_matching_score), paste0("NULL (=", round(min(minimum_matching_score_current, na.rm = TRUE), 3), ")"), minimum_matching_score), ". Try setting this value lower or even to 0.", call = FALSE)
|
|
result_mo <- NA_character_
|
|
} else {
|
|
result_mo <- MO_lookup_current$mo[match(top_hits[1], MO_lookup_current$fullname)]
|
|
AMR_env$mo_uncertainties <- rbind_AMR(
|
|
AMR_env$mo_uncertainties,
|
|
data.frame(
|
|
original_input = x_search,
|
|
input = x_search_cleaned,
|
|
fullname = top_hits[1],
|
|
mo = result_mo,
|
|
candidates = ifelse(length(top_hits) > 1, paste(top_hits[2:min(99, length(top_hits))], collapse = ", "), ""),
|
|
minimum_matching_score = ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score),
|
|
keep_synonyms = keep_synonyms,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
)
|
|
# save to package env to save time for next time
|
|
AMR_env$mo_previously_coerced <- unique(rbind_AMR(
|
|
AMR_env$mo_previously_coerced,
|
|
data.frame(
|
|
x = paste(x_search, minimum_matching_score, only_fungi),
|
|
mo = result_mo,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
))
|
|
}
|
|
# the actual result:
|
|
as.character(result_mo)
|
|
})
|
|
|
|
# remove progress bar from console
|
|
close(progress)
|
|
# expand from unique again
|
|
out[is.na(out)] <- x_coerced[match(x[is.na(out)], x_unique)]
|
|
|
|
# Throw note about uncertainties ----
|
|
if (isTRUE(info) && NROW(AMR_env$mo_uncertainties) > 0) {
|
|
if (message_not_thrown_before("as.mo", "uncertainties", AMR_env$mo_uncertainties$original_input)) {
|
|
plural <- c("", "this")
|
|
if (length(AMR_env$mo_uncertainties$original_input) > 1) {
|
|
plural <- c("s", "these uncertainties")
|
|
}
|
|
if (length(AMR_env$mo_uncertainties$original_input) <= 3) {
|
|
examples <- vector_and(
|
|
paste0(
|
|
'"', AMR_env$mo_uncertainties$original_input,
|
|
'" (assumed ', italicise(AMR_env$mo_uncertainties$fullname), ")"
|
|
),
|
|
quotes = FALSE
|
|
)
|
|
} else {
|
|
examples <- paste0(nr2char(length(AMR_env$mo_uncertainties$original_input)), " microorganism", plural[1])
|
|
}
|
|
msg <- c(msg, paste0(
|
|
"Microorganism translation was uncertain for ", examples,
|
|
". Run `mo_uncertainties()` to review ", plural[2], ", or use `add_custom_microorganisms()` to add custom entries."
|
|
))
|
|
|
|
for (m in msg) {
|
|
message_(m)
|
|
}
|
|
}
|
|
}
|
|
} # end of loop over all yet unknowns
|
|
|
|
# Keep or replace synonyms ----
|
|
out_current <- synonym_mo_to_accepted_mo(out, fill_in_accepted = FALSE)
|
|
AMR_env$mo_renamed <- list(old = out[!is.na(out_current)])
|
|
if (isFALSE(keep_synonyms)) {
|
|
out[!is.na(out_current)] <- out_current[!is.na(out_current)]
|
|
if (isTRUE(info) && length(AMR_env$mo_renamed$old) > 0) {
|
|
print(mo_renamed(), extra_txt = " (use `keep_synonyms = TRUE` to leave uncorrected)")
|
|
}
|
|
} else if (is.null(getOption("AMR_keep_synonyms")) && length(AMR_env$mo_renamed$old) > 0 && message_not_thrown_before("as.mo", "keep_synonyms_warning", entire_session = TRUE)) {
|
|
# keep synonyms is TRUE, so check if any do have synonyms
|
|
warning_("Function `as.mo()` returned ", nr2char(length(unique(AMR_env$mo_renamed$old))), " outdated taxonomic name", ifelse(length(unique(AMR_env$mo_renamed$old)) > 1, "s", ""), ". Use `as.mo(..., keep_synonyms = FALSE)` to clean the input to currently accepted taxonomic names, or set the R option `AMR_keep_synonyms` to `FALSE`. This warning will be shown once per session.", call = FALSE)
|
|
}
|
|
|
|
# Apply Becker ----
|
|
if (!isTRUE(only_fungi) && (isTRUE(Becker) || Becker == "all")) {
|
|
# warn when species found that are not in:
|
|
# - Becker et al. 2014, PMID 25278577
|
|
# - Becker et al. 2019, PMID 30872103
|
|
# - Becker et al. 2020, PMID 32056452
|
|
|
|
# comment below code if all staphylococcal species are categorised as CoNS/CoPS
|
|
post_Becker <- paste(
|
|
"Staphylococcus",
|
|
c("caledonicus", "canis", "durrellii", "lloydii", "ratti", "roterodami", "singaporensis", "taiwanensis")
|
|
)
|
|
if (any(out %in% AMR_env$MO_lookup$mo[match(post_Becker, AMR_env$MO_lookup$fullname)])) {
|
|
if (message_not_thrown_before("as.mo", "becker")) {
|
|
warning_("in `as.mo()`: Becker ", font_italic("et al."), " (2014, 2019, 2020) does not contain these species named after their publication: ",
|
|
vector_and(font_italic(gsub("Staphylococcus", "S.", post_Becker, fixed = TRUE), collapse = NULL), quotes = FALSE),
|
|
". Categorisation to CoNS/CoPS was taken from the original scientific publication(s).",
|
|
immediate = TRUE, call = FALSE
|
|
)
|
|
}
|
|
}
|
|
|
|
# 'MO_CONS' and 'MO_COPS' are 'mo' vectors created in R/_pre_commit_checks.R
|
|
out[out %in% MO_CONS] <- "B_STPHY_CONS"
|
|
out[out %in% MO_COPS] <- "B_STPHY_COPS"
|
|
if (Becker == "all") {
|
|
out[out == "B_STPHY_AURS"] <- "B_STPHY_COPS"
|
|
}
|
|
}
|
|
|
|
# Apply Lancefield ----
|
|
if (!isTRUE(only_fungi) && (isTRUE(Lancefield) || Lancefield == "all")) {
|
|
# (using `%like_case%` to also match subspecies)
|
|
|
|
# group A - S. pyogenes
|
|
out[out %like_case% "^B_STRPT_PYGN(_|$)"] <- "B_STRPT_GRPA"
|
|
# group B - S. agalactiae
|
|
out[out %like_case% "^B_STRPT_AGLC(_|$)"] <- "B_STRPT_GRPB"
|
|
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
|
|
out[out %like_case% "^B_STRPT_(DYSG|EQUI)(_|$)"] <- "B_STRPT_GRPC"
|
|
if (Lancefield == "all") {
|
|
# group D - all enterococci
|
|
out[out %like_case% "^B_ENTRC(_|$)"] <- "B_STRPT_GRPD"
|
|
}
|
|
# group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
|
|
out[out %like_case% "^B_STRPT_(ANGN|CNST|INTR)(_|$)"] <- "B_STRPT_GRPF"
|
|
# group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
|
|
out[out %like_case% "^B_STRPT_(DYSG|CANS)(_|$)"] <- "B_STRPT_GRPG"
|
|
# group H - S. sanguinis
|
|
out[out %like_case% "^B_STRPT_SNGN(_|$)"] <- "B_STRPT_GRPH"
|
|
# group K - S. salivarius, incl. S. salivarius salivarius and S. salivarius thermophilus
|
|
out[out %like_case% "^B_STRPT_SLVR(_|$)"] <- "B_STRPT_GRPK"
|
|
# group L - only S. dysgalactiae which is also group C & G, so ignore it here
|
|
}
|
|
|
|
# All unknowns ----
|
|
out[is.na(out) & !is.na(x)] <- "UNKNOWN"
|
|
AMR_env$mo_failures <- unique(x[out == "UNKNOWN" & !toupper(x) %in% c("UNKNOWN", "CON", "UNK") & !x %like_case% "^[(]unknown [a-z]+[)]$" & !is.na(x)])
|
|
if (length(AMR_env$mo_failures) > 0) {
|
|
warning_("The following input could not be coerced and was returned as \"UNKNOWN\": ", vector_and(AMR_env$mo_failures, quotes = TRUE), ".\nYou can retrieve this list with `mo_failures()`.", call = FALSE)
|
|
}
|
|
|
|
# Return class ----
|
|
set_clean_class(out,
|
|
new_class = c("mo", "character")
|
|
)
|
|
}
|
|
|
|
# OTHER DOCUMENTED FUNCTIONS ----------------------------------------------
|
|
|
|
#' @rdname as.mo
|
|
#' @export
|
|
is.mo <- function(x) {
|
|
inherits(x, "mo")
|
|
}
|
|
|
|
#' @rdname as.mo
|
|
#' @export
|
|
mo_uncertainties <- function() {
|
|
set_clean_class(AMR_env$mo_uncertainties, new_class = c("mo_uncertainties", "data.frame"))
|
|
}
|
|
|
|
#' @rdname as.mo
|
|
#' @export
|
|
mo_renamed <- function() {
|
|
add_MO_lookup_to_AMR_env()
|
|
x <- AMR_env$mo_renamed
|
|
|
|
x$new <- synonym_mo_to_accepted_mo(x$old)
|
|
mo_old <- AMR_env$MO_lookup$fullname[match(x$old, AMR_env$MO_lookup$mo)]
|
|
mo_new <- AMR_env$MO_lookup$fullname[match(x$new, AMR_env$MO_lookup$mo)]
|
|
ref_old <- AMR_env$MO_lookup$ref[match(x$old, AMR_env$MO_lookup$mo)]
|
|
ref_new <- AMR_env$MO_lookup$ref[match(x$new, AMR_env$MO_lookup$mo)]
|
|
|
|
df_renamed <- data.frame(
|
|
old = mo_old,
|
|
new = mo_new,
|
|
ref_old = ref_old,
|
|
ref_new = ref_new,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
df_renamed <- unique(df_renamed)
|
|
df_renamed <- df_renamed[order(df_renamed$old), , drop = FALSE]
|
|
set_clean_class(df_renamed, new_class = c("mo_renamed", "data.frame"))
|
|
}
|
|
|
|
#' @rdname as.mo
|
|
#' @export
|
|
mo_failures <- function() {
|
|
AMR_env$mo_failures
|
|
}
|
|
|
|
#' @rdname as.mo
|
|
#' @export
|
|
mo_reset_session <- function() {
|
|
if (NROW(AMR_env$mo_previously_coerced) > 0) {
|
|
message_("Reset ", nr2char(NROW(AMR_env$mo_previously_coerced)), " previously matched input value", ifelse(NROW(AMR_env$mo_previously_coerced) > 1, "s", ""), ".")
|
|
AMR_env$mo_previously_coerced <- AMR_env$mo_previously_coerced[0, , drop = FALSE]
|
|
AMR_env$mo_uncertainties <- AMR_env$mo_uncertainties[0, , drop = FALSE]
|
|
} else {
|
|
message_("No previously matched input values to reset.")
|
|
}
|
|
}
|
|
|
|
#' @rdname as.mo
|
|
#' @export
|
|
mo_cleaning_regex <- function() {
|
|
parts_to_remove <- c(
|
|
"e?spp([^a-z]+|$)", "e?ssp([^a-z]+|$)", "e?ss([^a-z]+|$)", "e?sp([^a-z]+|$)", "e?subsp", "sube?species", "e?species",
|
|
"biovar[a-z]*", "biotype", "serovar[a-z]*", "var([^a-z]+|$)", "serogr.?up[a-z]*",
|
|
"titer", "dummy", "Ig[ADEGM]", " ?[a-z-]+[-](resistant|susceptible) ?"
|
|
)
|
|
|
|
paste0(
|
|
"(",
|
|
"[^A-Za-z- \\(\\)\\[\\]{}]+",
|
|
"|",
|
|
"([({]|\\[).+([})]|\\])",
|
|
"|(^| )(",
|
|
paste0(parts_to_remove[order(1 - nchar(parts_to_remove))], collapse = "|"),
|
|
"))"
|
|
)
|
|
}
|
|
|
|
#' @rdname as.mo
|
|
#' @details `NA_mo_` is a missing value of the new `mo` class, analogous to e.g. base \R's [`NA_character_`][base::NA].
|
|
#' @format NULL
|
|
#' @export
|
|
NA_mo_ <- set_clean_class(NA_character_,
|
|
new_class = c("mo", "character")
|
|
)
|
|
|
|
# UNDOCUMENTED METHODS ----------------------------------------------------
|
|
|
|
# this prevents the requirement for putting the dependency in Imports:
|
|
#' @rawNamespace if(getRversion() >= "3.0.0") S3method(pillar::pillar_shaft, mo)
|
|
pillar_shaft.mo <- function(x, ...) {
|
|
add_MO_lookup_to_AMR_env()
|
|
out <- trimws(format(x))
|
|
# grey out the kingdom (part until first "_")
|
|
out[!is.na(x)] <- gsub("^([A-Z]+_)(.*)", paste0(font_subtle("\\1"), "\\2"), out[!is.na(x)], perl = TRUE)
|
|
# and grey out every _
|
|
out[!is.na(x)] <- gsub("_", font_subtle("_"), out[!is.na(x)])
|
|
|
|
# markup NA and UNKNOWN
|
|
out[is.na(x)] <- font_na(" NA")
|
|
out[x == "UNKNOWN"] <- font_na(" UNKNOWN")
|
|
|
|
# markup manual codes
|
|
out[x %in% AMR_env$MO_lookup$mo & !x %in% AMR::microorganisms$mo] <- font_blue(out[x %in% AMR_env$MO_lookup$mo & !x %in% AMR::microorganisms$mo], collapse = NULL)
|
|
|
|
df <- tryCatch(get_current_data(arg_name = "x", call = 0),
|
|
error = function(e) NULL
|
|
)
|
|
if (!is.null(df)) {
|
|
mo_cols <- vapply(FUN.VALUE = logical(1), df, is.mo)
|
|
} else {
|
|
mo_cols <- NULL
|
|
}
|
|
|
|
all_mos <- c(AMR_env$MO_lookup$mo, NA)
|
|
if (!all(x %in% all_mos) ||
|
|
(!is.null(df) && !all(unlist(df[, which(mo_cols), drop = FALSE]) %in% all_mos))) {
|
|
# markup old mo codes
|
|
out[!x %in% all_mos] <- font_italic(
|
|
font_na(x[!x %in% all_mos],
|
|
collapse = NULL
|
|
),
|
|
collapse = NULL
|
|
)
|
|
# throw a warning with the affected column name(s)
|
|
if (!is.null(mo_cols)) {
|
|
col <- paste0("Column ", vector_or(colnames(df)[mo_cols], quotes = TRUE, sort = FALSE))
|
|
} else {
|
|
col <- "The data"
|
|
}
|
|
warning_(
|
|
col, " contains old MO codes (from a previous AMR package version). ",
|
|
"Please update your MO codes with `as.mo()`.",
|
|
call = FALSE
|
|
)
|
|
}
|
|
|
|
# add the names to the bugs as mouse-over!
|
|
if (in_rstudio()) {
|
|
out[!x %in% c("UNKNOWN", NA)] <- font_url(
|
|
url = paste0(
|
|
x[!x %in% c("UNKNOWN", NA)], ": ",
|
|
mo_name(x[!x %in% c("UNKNOWN", NA)], keep_synonyms = TRUE)
|
|
),
|
|
txt = out[!x %in% c("UNKNOWN", NA)]
|
|
)
|
|
}
|
|
|
|
# make it always fit exactly
|
|
max_char <- max(nchar(x))
|
|
if (is.na(max_char)) {
|
|
max_char <- 12
|
|
}
|
|
create_pillar_column(out,
|
|
align = "left",
|
|
width = max_char + ifelse(any(x %in% c(NA, "UNKNOWN")), 2, 0)
|
|
)
|
|
}
|
|
|
|
# this prevents the requirement for putting the dependency in Imports:
|
|
#' @rawNamespace if(getRversion() >= "3.0.0") S3method(pillar::type_sum, mo)
|
|
type_sum.mo <- function(x, ...) {
|
|
"mo"
|
|
}
|
|
|
|
# this prevents the requirement for putting the dependency in Imports:
|
|
#' @rawNamespace if(getRversion() >= "3.0.0") S3method(cleaner::freq, mo)
|
|
freq.mo <- function(x, ...) {
|
|
x_noNA <- as.mo(x[!is.na(x)]) # as.mo() to get the newest mo codes
|
|
grams <- mo_gramstain(x_noNA, language = NULL)
|
|
digits <- list(...)$digits
|
|
if (is.null(digits)) {
|
|
digits <- 2
|
|
}
|
|
cleaner::freq.default(
|
|
x = x,
|
|
...,
|
|
.add_header = list(
|
|
`Gram-negative` = paste0(
|
|
format(sum(grams == "Gram-negative", na.rm = TRUE),
|
|
big.mark = " ",
|
|
decimal.mark = "."
|
|
),
|
|
" (", percentage(sum(grams == "Gram-negative", na.rm = TRUE) / length(grams),
|
|
digits = digits
|
|
),
|
|
")"
|
|
),
|
|
`Gram-positive` = paste0(
|
|
format(sum(grams == "Gram-positive", na.rm = TRUE),
|
|
big.mark = " ",
|
|
decimal.mark = "."
|
|
),
|
|
" (", percentage(sum(grams == "Gram-positive", na.rm = TRUE) / length(grams),
|
|
digits = digits
|
|
),
|
|
")"
|
|
),
|
|
`Nr. of genera` = pm_n_distinct(mo_genus(x_noNA, language = NULL)),
|
|
`Nr. of species` = pm_n_distinct(paste(
|
|
mo_genus(x_noNA, language = NULL),
|
|
mo_species(x_noNA, language = NULL)
|
|
))
|
|
)
|
|
)
|
|
}
|
|
|
|
# this prevents the requirement for putting the dependency in Imports:
|
|
#' @rawNamespace if(getRversion() >= "3.0.0") S3method(skimr::get_skimmers, mo)
|
|
get_skimmers.mo <- function(column) {
|
|
mo <- as.mo(column, keep_synonyms = TRUE, language = NULL, info = FALSE)
|
|
mo <- mo[!is.na(mo)]
|
|
spp <- mo[mo_species(mo, keep_synonyms = TRUE, language = NULL, info = FALSE) != ""]
|
|
skimr::sfl(
|
|
skim_type = "mo",
|
|
n_unique = ~ length(unique(mo)),
|
|
gram_negative = ~ sum(mo_is_gram_negative(mo, keep_synonyms = TRUE, language = NULL, info = FALSE), na.rm = TRUE),
|
|
gram_positive = ~ sum(mo_is_gram_positive(mo, keep_synonyms = TRUE, language = NULL, info = FALSE), na.rm = TRUE),
|
|
yeast = ~ sum(mo_is_yeast(mo, keep_synonyms = TRUE, language = NULL, info = FALSE), na.rm = TRUE),
|
|
top_genus = ~ names(sort(-table(mo_genus(mo, keep_synonyms = TRUE, language = NULL, info = FALSE))))[1L],
|
|
top_species = ~ names(sort(-table(mo_name(spp, keep_synonyms = TRUE, language = NULL, info = FALSE))))[1L],
|
|
)
|
|
}
|
|
|
|
#' @method print mo
|
|
#' @export
|
|
#' @noRd
|
|
print.mo <- function(x, print.shortnames = FALSE, ...) {
|
|
add_MO_lookup_to_AMR_env()
|
|
cat("Class 'mo'\n")
|
|
x_names <- names(x)
|
|
if (is.null(x_names) & print.shortnames == TRUE) {
|
|
x_names <- tryCatch(mo_shortname(x, ...), error = function(e) NULL)
|
|
}
|
|
x <- as.character(x)
|
|
names(x) <- x_names
|
|
if (!all(x %in% c(AMR_env$MO_lookup$mo, NA))) {
|
|
warning_(
|
|
"Some MO codes are from a previous AMR package version. ",
|
|
"Please update the MO codes with `as.mo()`.",
|
|
call = FALSE
|
|
)
|
|
}
|
|
print.default(x, quote = FALSE)
|
|
}
|
|
|
|
#' @method summary mo
|
|
#' @export
|
|
#' @noRd
|
|
summary.mo <- function(object, ...) {
|
|
# unique and top 1-3
|
|
x <- object
|
|
top_3 <- names(sort(-table(x[!is.na(x)])))[1:3]
|
|
out <- c(
|
|
"Class" = "mo",
|
|
"<NA>" = length(x[is.na(x)]),
|
|
"Unique" = length(unique(x[!is.na(x)])),
|
|
"#1" = top_3[1],
|
|
"#2" = top_3[2],
|
|
"#3" = top_3[3]
|
|
)
|
|
class(out) <- c("summaryDefault", "table")
|
|
out
|
|
}
|
|
|
|
#' @method as.data.frame mo
|
|
#' @export
|
|
#' @noRd
|
|
as.data.frame.mo <- function(x, ...) {
|
|
add_MO_lookup_to_AMR_env()
|
|
if (!all(x %in% c(AMR_env$MO_lookup$mo, NA))) {
|
|
warning_(
|
|
"The data contains old MO codes (from a previous AMR package version). ",
|
|
"Please update your MO codes with `as.mo()`."
|
|
)
|
|
}
|
|
nm <- deparse1(substitute(x))
|
|
if (!"nm" %in% names(list(...))) {
|
|
as.data.frame.vector(x, ..., nm = nm)
|
|
} else {
|
|
as.data.frame.vector(x, ...)
|
|
}
|
|
}
|
|
|
|
#' @method [ mo
|
|
#' @export
|
|
#' @noRd
|
|
"[.mo" <- function(x, ...) {
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(x)
|
|
y
|
|
}
|
|
#' @method [[ mo
|
|
#' @export
|
|
#' @noRd
|
|
"[[.mo" <- function(x, ...) {
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(x)
|
|
y
|
|
}
|
|
#' @method [<- mo
|
|
#' @export
|
|
#' @noRd
|
|
"[<-.mo" <- function(i, j, ..., value) {
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(i)
|
|
# must only contain valid MOs
|
|
add_MO_lookup_to_AMR_env()
|
|
return_after_integrity_check(y, "microorganism code", as.character(AMR_env$MO_lookup$mo))
|
|
}
|
|
#' @method [[<- mo
|
|
#' @export
|
|
#' @noRd
|
|
"[[<-.mo" <- function(i, j, ..., value) {
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(i)
|
|
# must only contain valid MOs
|
|
add_MO_lookup_to_AMR_env()
|
|
return_after_integrity_check(y, "microorganism code", as.character(AMR_env$MO_lookup$mo))
|
|
}
|
|
#' @method c mo
|
|
#' @export
|
|
#' @noRd
|
|
c.mo <- function(...) {
|
|
x <- list(...)[[1L]]
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(x)
|
|
add_MO_lookup_to_AMR_env()
|
|
return_after_integrity_check(y, "microorganism code", as.character(AMR_env$MO_lookup$mo))
|
|
}
|
|
|
|
#' @method unique mo
|
|
#' @export
|
|
#' @noRd
|
|
unique.mo <- function(x, incomparables = FALSE, ...) {
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(x)
|
|
y
|
|
}
|
|
|
|
#' @method rep mo
|
|
#' @export
|
|
#' @noRd
|
|
rep.mo <- function(x, ...) {
|
|
y <- NextMethod()
|
|
attributes(y) <- attributes(x)
|
|
y
|
|
}
|
|
|
|
#' @method print mo_uncertainties
|
|
#' @export
|
|
#' @noRd
|
|
print.mo_uncertainties <- function(x, n = 10, ...) {
|
|
more_than_50 <- FALSE
|
|
if (NROW(x) == 0) {
|
|
cat(word_wrap("No uncertainties to show. Only uncertainties of the last call to `as.mo()` or any `mo_*()` function are stored.\n\n", add_fn = font_blue))
|
|
return(invisible(NULL))
|
|
} else if (NROW(x) > 50) {
|
|
more_than_50 <- TRUE
|
|
x <- x[1:50, , drop = FALSE]
|
|
}
|
|
|
|
cat(word_wrap("Matching scores are based on the resemblance between the input and the full taxonomic name, and the pathogenicity in humans. See `?mo_matching_score`.\n\n", add_fn = font_blue))
|
|
|
|
add_MO_lookup_to_AMR_env()
|
|
|
|
col_red <- function(x) font_rose_bg(x, collapse = NULL)
|
|
col_orange <- function(x) font_orange_bg(x, collapse = NULL)
|
|
col_yellow <- function(x) font_yellow_bg(x, collapse = NULL)
|
|
col_green <- function(x) font_green_bg(x, collapse = NULL)
|
|
|
|
if (has_colour()) {
|
|
cat(word_wrap("Colour keys: ",
|
|
col_red(" 0.000-0.549 "),
|
|
col_orange(" 0.550-0.649 "),
|
|
col_yellow(" 0.650-0.749 "),
|
|
col_green(" 0.750-1.000"),
|
|
add_fn = font_blue
|
|
), font_green_bg(" "), "\n", sep = "")
|
|
}
|
|
|
|
score_set_colour <- function(text, scores) {
|
|
# set colours to scores
|
|
text[scores >= 0.75] <- col_green(text[scores >= 0.75])
|
|
text[scores >= 0.65 & scores < 0.75] <- col_yellow(text[scores >= 0.65 & scores < 0.75])
|
|
text[scores >= 0.55 & scores < 0.65] <- col_orange(text[scores >= 0.55 & scores < 0.65])
|
|
text[scores < 0.55] <- col_red(text[scores < 0.55])
|
|
text
|
|
}
|
|
|
|
txt <- ""
|
|
any_maxed_out <- FALSE
|
|
for (i in seq_len(nrow(x))) {
|
|
if (x[i, ]$candidates != "") {
|
|
candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE))
|
|
if (length(candidates) > n) {
|
|
any_maxed_out <- TRUE
|
|
candidates <- candidates[seq_len(n)]
|
|
}
|
|
scores <- mo_matching_score(x = x[i, ]$input, n = candidates)
|
|
n_candidates <- length(candidates)
|
|
|
|
candidates_formatted <- italicise(candidates)
|
|
scores_formatted <- trimws(formatC(round(scores, 3), format = "f", digits = 3))
|
|
scores_formatted <- score_set_colour(scores_formatted, scores)
|
|
|
|
# sort on descending scores
|
|
candidates_formatted <- candidates_formatted[order(1 - scores)]
|
|
scores_formatted <- scores_formatted[order(1 - scores)]
|
|
|
|
candidates <- word_wrap(
|
|
paste0(
|
|
"Also matched: ",
|
|
vector_and(
|
|
paste0(
|
|
candidates_formatted,
|
|
font_blue(paste0(" (", scores_formatted, ")"), collapse = NULL)
|
|
),
|
|
quotes = FALSE, sort = FALSE
|
|
)
|
|
),
|
|
extra_indent = nchar("Also matched: "),
|
|
width = 0.9 * getOption("width", 100)
|
|
)
|
|
} else {
|
|
candidates <- ""
|
|
}
|
|
|
|
score <- mo_matching_score(
|
|
x = x[i, ]$input,
|
|
n = x[i, ]$fullname
|
|
)
|
|
score_formatted <- trimws(formatC(round(score, 3), format = "f", digits = 3))
|
|
txt <- paste(txt,
|
|
paste0(
|
|
paste0(
|
|
"", strrep(font_grey("-"), times = getOption("width", 100)), "\n",
|
|
'"', x[i, ]$original_input, '"',
|
|
" -> ",
|
|
paste0(
|
|
font_bold(italicise(x[i, ]$fullname)),
|
|
" (", x[i, ]$mo, ", ", score_set_colour(score_formatted, score), ")"
|
|
)
|
|
),
|
|
collapse = "\n"
|
|
),
|
|
ifelse(x[i, ]$mo %in% AMR_env$MO_lookup$mo[which(AMR_env$MO_lookup$status == "synonym")],
|
|
paste0(
|
|
strrep(" ", nchar(x[i, ]$original_input) + 6),
|
|
ifelse(x[i, ]$keep_synonyms == FALSE,
|
|
# Add note if result was coerced to accepted taxonomic name
|
|
font_red(paste0("This outdated taxonomic name was converted to ", font_italic(AMR_env$MO_lookup$fullname[match(synonym_mo_to_accepted_mo(x[i, ]$mo), AMR_env$MO_lookup$mo)], collapse = NULL), " (", synonym_mo_to_accepted_mo(x[i, ]$mo), ")."), collapse = NULL),
|
|
# Or add note if result is currently another taxonomic name
|
|
font_red(paste0(font_bold("Note: "), "The current name is ", font_italic(AMR_env$MO_lookup$fullname[match(synonym_mo_to_accepted_mo(x[i, ]$mo), AMR_env$MO_lookup$mo)], collapse = NULL), " (", AMR_env$MO_lookup$ref[match(synonym_mo_to_accepted_mo(x[i, ]$mo), AMR_env$MO_lookup$mo)], ")."), collapse = NULL)
|
|
)
|
|
),
|
|
""
|
|
),
|
|
candidates,
|
|
sep = "\n"
|
|
)
|
|
txt <- gsub("[\n]+", "\n", txt)
|
|
# remove first and last break
|
|
txt <- gsub("(^[\n]|[\n]$)", "", txt)
|
|
txt <- paste0("\n", txt, "\n")
|
|
}
|
|
|
|
cat(txt)
|
|
if (isTRUE(any_maxed_out)) {
|
|
cat(font_blue(word_wrap("\nOnly the first ", n, " other matches of each record are shown. Run `print(mo_uncertainties(), n = ...)` to view more entries, or save `mo_uncertainties()` to an object.")))
|
|
}
|
|
if (isTRUE(more_than_50)) {
|
|
cat(font_blue(word_wrap("\nOnly the first 50 uncertainties are shown. Run `View(mo_uncertainties())` to view all entries, or save `mo_uncertainties()` to an object.")))
|
|
}
|
|
}
|
|
|
|
#' @method print mo_renamed
|
|
#' @export
|
|
#' @noRd
|
|
print.mo_renamed <- function(x, extra_txt = "", n = 25, ...) {
|
|
if (NROW(x) == 0) {
|
|
cat(word_wrap("No renamed taxonomy to show. Only renamed taxonomy of the last call of `as.mo()` or any `mo_*()` function are stored.\n", add_fn = font_blue))
|
|
return(invisible(NULL))
|
|
}
|
|
|
|
x$ref_old[!is.na(x$ref_old)] <- paste0(" (", gsub("et al.", font_italic("et al."), x$ref_old[!is.na(x$ref_old)], fixed = TRUE), ")")
|
|
x$ref_new[!is.na(x$ref_new)] <- paste0(" (", gsub("et al.", font_italic("et al."), x$ref_new[!is.na(x$ref_new)], fixed = TRUE), ")")
|
|
x$ref_old[is.na(x$ref_old)] <- " (author unknown)"
|
|
x$ref_new[is.na(x$ref_new)] <- " (author unknown)"
|
|
|
|
rows <- seq_len(min(NROW(x), n))
|
|
|
|
message_(
|
|
"The following microorganism", ifelse(NROW(x) > 1, "s were", " was"), " taxonomically renamed", extra_txt, ":\n",
|
|
paste0(" ", AMR_env$bullet_icon, " ", font_italic(x$old[rows], collapse = NULL), x$ref_old[rows],
|
|
" -> ", font_italic(x$new[rows], collapse = NULL), x$ref_new[rows],
|
|
collapse = "\n"
|
|
),
|
|
ifelse(NROW(x) > n, paste0("\n\nOnly the first ", n, " (out of ", NROW(x), ") are shown. Run `print(mo_renamed(), n = ...)` to view more entries (might be slow), or save `mo_renamed()` to an object."), "")
|
|
)
|
|
}
|
|
|
|
# UNDOCUMENTED HELPER FUNCTIONS -------------------------------------------
|
|
|
|
convert_colloquial_input <- function(x) {
|
|
x.bak <- trimws2(x)
|
|
x <- trimws2(tolower(x))
|
|
out <- rep(NA_character_, length(x))
|
|
|
|
# Streptococci, like GBS = Group B Streptococci (B_STRPT_GRPB)
|
|
out[x %like_case% "^g[abcdefghijkl]s$"] <- gsub("g([abcdefghijkl])s",
|
|
"B_STRPT_GRP\\U\\1",
|
|
x[x %like_case% "^g[abcdefghijkl]s$"],
|
|
perl = TRUE
|
|
)
|
|
# Streptococci in different languages, like "estreptococos grupo B"
|
|
out[x %like_case% "strepto[ck]o[ck][a-zA-Z ]* [abcdefghijkl]$"] <- gsub(".*e?strepto[ck]o[ck].* ([abcdefghijkl])$",
|
|
"B_STRPT_GRP\\U\\1",
|
|
x[x %like_case% "strepto[ck]o[ck][a-zA-Z ]* [abcdefghijkl]$"],
|
|
perl = TRUE
|
|
)
|
|
out[x %like_case% "strep[a-z]* group [abcdefghijkl]$"] <- gsub(".* ([abcdefghijkl])$",
|
|
"B_STRPT_GRP\\U\\1",
|
|
x[x %like_case% "strep[a-z]* group [abcdefghijkl]$"],
|
|
perl = TRUE
|
|
)
|
|
out[x %like_case% "group [abcdefghijkl] strepto[ck]o[ck]"] <- gsub(".*group ([abcdefghijkl]) strepto[ck]o[ck].*",
|
|
"B_STRPT_GRP\\U\\1",
|
|
x[x %like_case% "group [abcdefghijkl] strepto[ck]o[ck]"],
|
|
perl = TRUE
|
|
)
|
|
out[x %like_case% "ha?emoly.*strep"] <- "B_STRPT_HAEM"
|
|
out[x %like_case% "(strepto.* [abcg, ]{2,4}$)"] <- "B_STRPT_ABCG"
|
|
out[x %like_case% "(strepto.* mil+er+i|^mgs[^a-z]*$)"] <- "B_STRPT_MILL"
|
|
out[x %like_case% "mil+er+i gr"] <- "B_STRPT_MILL"
|
|
out[x %like_case% "((strepto|^s).* viridans|^vgs[^a-z]*$)"] <- "B_STRPT_VIRI"
|
|
out[x %like_case% "(viridans.* (strepto|^s).*|^vgs[^a-z]*$)"] <- "B_STRPT_VIRI"
|
|
out[x %like_case% "meningo[ck]o[ck](ken)?$"] <- "B_NESSR_MNNG"
|
|
out[x %like_case% "pneumo[ck]o[ck](ken)?$"] <- "B_STRPT_PNMN"
|
|
|
|
# Salmonella in different languages, like "Salmonella grupo B"
|
|
out[x %like_case% "salmonella.* [abcdefgh]$"] <- gsub(".*salmonella.* ([abcdefgh])$",
|
|
"B_SLMNL_GRP\\U\\1",
|
|
x[x %like_case% "salmonella.* [abcdefgh]$"],
|
|
perl = TRUE
|
|
)
|
|
out[x %like_case% "group [abcdefgh] salmonella"] <- gsub(".*group ([abcdefgh]) salmonella*",
|
|
"B_SLMNL_GRP\\U\\1",
|
|
x[x %like_case% "group [abcdefgh] salmonella"],
|
|
perl = TRUE
|
|
)
|
|
|
|
# CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
|
|
out[x %like_case% "([ck]oagulas[ea].negatie?[vf]|^[ck]o?ns[^a-z]*$)"] <- "B_STPHY_CONS"
|
|
out[x %like_case% "([ck]oagulas[ea].positie?[vf]|^[ck]o?ps[^a-z]*$)"] <- "B_STPHY_COPS"
|
|
|
|
# Gram stains
|
|
out[x %like_case% "gram[ -]?neg.*"] <- "B_GRAMN"
|
|
out[x %like_case% "( |^)gram[-]( |$)"] <- "B_GRAMN"
|
|
out[x %like_case% "gram[ -]?pos.*"] <- "B_GRAMP"
|
|
out[x %like_case% "( |^)gram[+]( |$)"] <- "B_GRAMP"
|
|
out[x %like_case% "anaerob[a-z]+ .*gram[ -]?neg.*"] <- "B_ANAER-NEG"
|
|
out[x %like_case% "anaerob[a-z]+ .*gram[ -]?pos.*"] <- "B_ANAER-POS"
|
|
out[is.na(out) & x %like_case% "anaerob[a-z]+ (micro)?.*organism"] <- "B_ANAER"
|
|
out[is.na(out) & x %like_case% "anaerob[a-z]+ bacter"] <- "B_ANAER"
|
|
|
|
# coryneform bacteria
|
|
out[x %like_case% "^coryneform"] <- "B_CORYNF"
|
|
|
|
# yeasts and fungi
|
|
out[x %like_case% "(^| )yeast?"] <- "F_YEAST"
|
|
out[x %like_case% "(^| )fung(us|i)"] <- "F_FUNGUS"
|
|
|
|
# protozoa
|
|
out[x %like_case% "protozo"] <- "P_PROTOZOAN" # to hit it with most languages, and "protozo" does not occur in the microorganisms data set for anything else
|
|
|
|
# trivial names known to the field
|
|
out[x %like_case% "meningo[ck]o[ck]"] <- "B_NESSR_MNNG"
|
|
out[x %like_case% "gono[ck]o[ck]"] <- "B_NESSR_GNRR"
|
|
out[x %like_case% "pneumo[ck]o[ck]"] <- "B_STRPT_PNMN"
|
|
out[x %like_case% "hacek"] <- "B_HACEK"
|
|
out[x %like_case% "haemophilus" & x %like_case% "aggregatibacter" & x %like_case% "cardiobacterium" & x %like_case% "eikenella" & x %like_case% "kingella"] <- "B_HACEK"
|
|
out[x %like_case% "slow.* grow.* mycobact"] <- "B_MYCBC_SGM"
|
|
out[x %like_case% "rapid.* grow.* mycobact"] <- "B_MYCBC_RGM"
|
|
|
|
# unexisting names (con is the WHONET code for contamination)
|
|
out[x %in% c("con", "other", "none", "unknown") | x %like_case% "virus"] <- "UNKNOWN"
|
|
|
|
# WHONET has a lot of E. coli and Vibrio cholerae names
|
|
out[x %like_case% "escherichia coli"] <- "B_ESCHR_COLI"
|
|
out[x %like_case% "vibrio cholerae"] <- "B_VIBRI_CHLR"
|
|
|
|
out
|
|
}
|
|
|
|
italicise <- function(x) {
|
|
if (!has_colour()) {
|
|
return(x)
|
|
}
|
|
out <- font_italic(x, collapse = NULL)
|
|
# city-like serovars of Salmonella (start with a capital)
|
|
out[x %like_case% "Salmonella [A-Z]"] <- paste(
|
|
font_italic("Salmonella"),
|
|
gsub("Salmonella ", "", x[x %like_case% "Salmonella [A-Z]"])
|
|
)
|
|
# streptococcal groups
|
|
out[x %like_case% "Streptococcus [A-Z]"] <- paste(
|
|
font_italic("Streptococcus"),
|
|
gsub("Streptococcus ", "", x[x %like_case% "Streptococcus [A-Z]"])
|
|
)
|
|
# be sure not to make these italic
|
|
out <- gsub("([ -]*)(Group|group|Complex|complex)(\033\\[23m)?", "\033[23m\\1\\2", out, perl = TRUE)
|
|
out <- gsub("(\033\\[3m)?(Beta[-]haemolytic|Coagulase[-](postive|negative)) ", "\\2 \033[3m", out, perl = TRUE)
|
|
out
|
|
}
|
|
|
|
nr2char <- function(x) {
|
|
if (x %in% c(1:10)) {
|
|
v <- c(
|
|
"one" = 1, "two" = 2, "three" = 3, "four" = 4, "five" = 5,
|
|
"six" = 6, "seven" = 7, "eight" = 8, "nine" = 9, "ten" = 10
|
|
)
|
|
names(v[x])
|
|
} else {
|
|
x
|
|
}
|
|
}
|
|
|
|
parse_and_convert <- function(x) {
|
|
if (tryCatch(is.character(x) && all(Encoding(x) == "unknown", na.rm = TRUE), error = function(e) FALSE)) {
|
|
out <- x
|
|
} else {
|
|
out <- tryCatch(
|
|
{
|
|
if (!is.null(dim(x))) {
|
|
if (NCOL(x) > 2) {
|
|
stop("a maximum of two columns is allowed", call. = FALSE)
|
|
} else if (NCOL(x) == 2) {
|
|
# support Tidyverse selection like: df %>% select(colA, colB)
|
|
# paste these columns together
|
|
x <- as.data.frame(x, stringsAsFactors = FALSE)
|
|
colnames(x) <- c("A", "B")
|
|
x <- paste(x$A, x$B)
|
|
} else {
|
|
# support Tidyverse selection like: df %>% select(colA)
|
|
x <- as.data.frame(x, stringsAsFactors = FALSE)[[1]]
|
|
}
|
|
}
|
|
parsed <- iconv(as.character(x), to = "UTF-8")
|
|
parsed[is.na(parsed) & !is.na(x)] <- iconv(x[is.na(parsed) & !is.na(x)], from = "Latin1", to = "ASCII//TRANSLIT")
|
|
parsed <- gsub('"', "", parsed, fixed = TRUE)
|
|
parsed
|
|
},
|
|
error = function(e) stop(conditionMessage(e), call. = FALSE)
|
|
) # this will also be thrown when running `as.mo(no_existing_object)`
|
|
}
|
|
out <- trimws2(out)
|
|
out <- gsub(" +", " ", out, perl = TRUE)
|
|
out <- gsub(" ?/ ? ", "/", out, perl = TRUE)
|
|
out
|
|
}
|
|
|
|
replace_old_mo_codes <- function(x, property) {
|
|
# this function transform old MO codes to current codes, such as:
|
|
# B_ESCH_COL (AMR v0.5.0) -> B_ESCHR_COLI
|
|
ind <- x %like_case% "^[A-Z]_[A-Z_]+$" & !x %in% AMR_env$MO_lookup$mo
|
|
if (any(ind, na.rm = TRUE)) {
|
|
add_MO_lookup_to_AMR_env()
|
|
# get the ones that match
|
|
affected <- x[ind]
|
|
affected_unique <- unique(affected)
|
|
all_direct_matches <- TRUE
|
|
# find their new codes, once per code
|
|
solved_unique <- unlist(lapply(
|
|
strsplit(affected_unique, ""),
|
|
function(m) {
|
|
kingdom <- paste0("^", m[1])
|
|
name <- m[3:length(m)]
|
|
name[name == "_"] <- " "
|
|
name <- tolower(paste0(name, ".*", collapse = ""))
|
|
name <- gsub(" .*", " ", name, fixed = TRUE)
|
|
name <- paste0("^", name)
|
|
results <- AMR_env$MO_lookup$mo[AMR_env$MO_lookup$kingdom %like_case% kingdom &
|
|
AMR_env$MO_lookup$fullname_lower %like_case% name]
|
|
if (length(results) > 1) {
|
|
all_direct_matches <<- FALSE
|
|
}
|
|
results[1L]
|
|
}
|
|
), use.names = FALSE)
|
|
solved <- solved_unique[match(affected, affected_unique)]
|
|
# assign on places where a match was found
|
|
x[ind] <- solved
|
|
n_matched <- length(affected[!is.na(affected)])
|
|
n_solved <- length(affected[!is.na(solved)])
|
|
n_unsolved <- length(affected[is.na(solved)])
|
|
n_unique <- length(affected_unique[!is.na(affected_unique)])
|
|
if (n_unique < n_matched) {
|
|
n_unique <- paste0(n_unique, " unique, ")
|
|
} else {
|
|
n_unique <- ""
|
|
}
|
|
if (property != "mo") {
|
|
warning_(
|
|
"in `mo_", property, "()`: the input contained ", n_matched,
|
|
" old MO code", ifelse(n_matched == 1, "", "s"),
|
|
" (", n_unique, "from a previous AMR package version). ",
|
|
"Please update your MO codes with `as.mo()` to increase speed."
|
|
)
|
|
} else {
|
|
warning_(
|
|
"in `as.mo()`: the input contained ", n_matched,
|
|
" old MO code", ifelse(n_matched == 1, "", "s"),
|
|
" (", n_unique, "from a previous AMR package version). ",
|
|
n_solved, " old MO code", ifelse(n_solved == 1, "", "s"),
|
|
ifelse(n_solved == 1, " was", " were"),
|
|
ifelse(all_direct_matches, " updated ", font_bold(" guessed ")),
|
|
"to ", ifelse(n_solved == 1, "a ", ""),
|
|
"currently used MO code", ifelse(n_solved == 1, "", "s"),
|
|
ifelse(n_unsolved > 0,
|
|
paste0(" and ", n_unsolved, " old MO code", ifelse(n_unsolved == 1, "", "s"), " could not be updated."),
|
|
"."
|
|
)
|
|
)
|
|
}
|
|
}
|
|
x
|
|
}
|
|
|
|
replace_ignore_pattern <- function(x, ignore_pattern) {
|
|
if (!is.null(ignore_pattern) && !identical(trimws2(ignore_pattern), "")) {
|
|
ignore_cases <- x %like% ignore_pattern
|
|
if (sum(ignore_cases) > 0) {
|
|
message_(
|
|
"The following input was ignored by `ignore_pattern = \"", ignore_pattern, "\"`: ",
|
|
vector_and(x[ignore_cases], quotes = TRUE)
|
|
)
|
|
x[ignore_cases] <- NA_character_
|
|
}
|
|
}
|
|
x
|
|
}
|
|
|
|
repair_reference_df <- function(reference_df) {
|
|
if (is.null(reference_df)) {
|
|
return(NULL)
|
|
}
|
|
# has valid own reference_df
|
|
reference_df <- reference_df %pm>%
|
|
pm_filter(!is.na(mo))
|
|
|
|
# keep only first two columns, second must be mo
|
|
if (colnames(reference_df)[1] == "mo") {
|
|
reference_df <- reference_df %pm>% pm_select(2, "mo")
|
|
} else {
|
|
reference_df <- reference_df %pm>% pm_select(1, "mo")
|
|
}
|
|
|
|
# remove factors, just keep characters
|
|
colnames(reference_df)[1] <- "x"
|
|
reference_df[, "x"] <- as.character(reference_df[, "x", drop = TRUE])
|
|
reference_df[, "mo"] <- as.character(reference_df[, "mo", drop = TRUE])
|
|
|
|
# some MO codes might be old
|
|
reference_df[, "mo"] <- as.mo(reference_df[, "mo", drop = TRUE], reference_df = NULL)
|
|
reference_df
|
|
}
|
|
|
|
get_mo_uncertainties <- function() {
|
|
remember <- list(
|
|
uncertainties = AMR_env$mo_uncertainties,
|
|
failures = AMR_env$mo_failures
|
|
)
|
|
# empty them, otherwise e.g. mo_shortname("Chlamydophila psittaci") will give 3 notes
|
|
AMR_env$mo_uncertainties <- NULL
|
|
AMR_env$mo_failures <- NULL
|
|
remember
|
|
}
|
|
|
|
load_mo_uncertainties <- function(metadata) {
|
|
AMR_env$mo_uncertainties <- metadata$uncertainties
|
|
AMR_env$mo_failures <- metadata$failures
|
|
}
|
|
|
|
synonym_mo_to_accepted_mo <- function(x, fill_in_accepted = FALSE, dataset = AMR_env$MO_lookup) {
|
|
# `dataset` is an argument so that it can be used in the regeneration of the microorganisms data set
|
|
if (identical(dataset, AMR_env$MO_lookup)) {
|
|
add_MO_lookup_to_AMR_env()
|
|
dataset <- AMR_env$MO_lookup
|
|
}
|
|
|
|
out <- x
|
|
is_still_synonym <- dataset$status[match(out, dataset$mo)] == "synonym"
|
|
limit <- 0
|
|
while (any(is_still_synonym, na.rm = TRUE) && limit < 5) {
|
|
limit <- limit + 1
|
|
|
|
# make sure to get the latest name, e.g. Fusarium pulicaris robiniae was first renamed to Fusarium roseum, then to Fusarium sambucinum
|
|
# we need the MO of Fusarium pulicaris robiniae to return the MO of Fusarium sambucinum
|
|
must_be_corrected <- !is.na(is_still_synonym) & is_still_synonym
|
|
x_gbif <- dataset$gbif_renamed_to[match(out, dataset$mo)]
|
|
x_mycobank <- dataset$mycobank_renamed_to[match(out, dataset$mo)]
|
|
x_lpsn <- dataset$lpsn_renamed_to[match(out, dataset$mo)]
|
|
|
|
out[must_be_corrected & !is.na(x_gbif)] <- dataset$mo[match(x_gbif[must_be_corrected & !is.na(x_gbif)], dataset$gbif)]
|
|
out[must_be_corrected & !is.na(x_mycobank)] <- dataset$mo[match(x_mycobank[must_be_corrected & !is.na(x_mycobank)], dataset$mycobank)]
|
|
out[must_be_corrected & !is.na(x_lpsn)] <- dataset$mo[match(x_lpsn[must_be_corrected & !is.na(x_lpsn)], dataset$lpsn)]
|
|
|
|
is_still_synonym <- dataset$status[match(out, dataset$mo)] == "synonym"
|
|
}
|
|
|
|
x_no_synonym <- dataset$status[match(x, dataset$mo)] != "synonym"
|
|
out[x_no_synonym] <- NA_character_
|
|
if (isTRUE(fill_in_accepted)) {
|
|
out[!is.na(x_no_synonym) & x_no_synonym] <- x[!is.na(x_no_synonym) & x_no_synonym]
|
|
}
|
|
|
|
out[is.na(match(x, dataset$mo))] <- NA_character_
|
|
out
|
|
}
|