1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-16 07:21:37 +01:00
AMR/R/mo.R

1059 lines
47 KiB
R
Raw Normal View History

# ==================================================================== #
# TITLE #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
# #
# CITE AS #
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance #
# Data. Journal of Statistical Software, 104(3), 1-31. #
# doi:10.18637/jss.v104.i03 #
# #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
2020-10-26 12:23:03 +01:00
# Diagnostics & Advice, and University Medical Center Groningen. #
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
# ==================================================================== #
2021-06-01 15:33:06 +02:00
#' Transform Input to a Microorganism Code
#'
2022-09-16 23:15:23 +02:00
#' Use this function to determine a valid microorganism code ([`mo`]). Determination is done using intelligent rules and the complete taxonomic kingdoms `r vector_and(unique(microorganisms$kingdom[which(!grepl("(unknown|Fungi)", microorganisms$kingdom))]), quotes = FALSE)`, and most microbial species from the kingdom Fungi (see *Source*). The input can be almost anything: a full name (like `"Staphylococcus aureus"`), an abbreviated name (such as `"S. aureus"`), an abbreviation known in the field (such as `"MRSA"`), or just a genus. See *Examples*.
2021-05-12 18:15:03 +02:00
#' @param x a [character] vector or a [data.frame] with one or two columns
2022-10-03 14:34:45 +02:00
#' @param Becker a [logical] to indicate whether staphylococci should be categorised into coagulase-negative staphylococci ("CoNS") and coagulase-positive staphylococci ("CoPS") instead of their own species, according to Karsten Becker *et al.* (see Source).
#'
#' This excludes *Staphylococcus aureus* at default, use `Becker = "all"` to also categorise *S. aureus* as "CoPS".
2022-10-03 14:34:45 +02:00
#' @param Lancefield a [logical] to indicate whether a beta-haemolytic *Streptococcus* should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield (see Source). These streptococci will be categorised in their first group, e.g. *Streptococcus dysgalactiae* will be group C, although officially it was also categorised into groups G and L.
#'
#' This excludes enterococci at default (who are in group D), use `Lancefield = "all"` to also categorise all enterococci as group D.
2022-09-16 23:15:23 +02:00
#' @param minimum_matching_score a numeric value to set as the lower limit for the [MO matching score][mo_matching_score()]. When left blank, this will be determined automatically based on the character length of `x`, its [taxonomic kingdom][microorganisms] and [human pathogenicity][mo_matching_score()].
2022-10-03 14:34:45 +02:00
#' @param keep_synonyms a [logical] to indicate if old, previously valid taxonomic names must be preserved and not be corrected to currently accepted names. The default is `FALSE`, which will return a note if old taxonomic names were processed. The default can be set with `options(AMR_keep_synonyms = TRUE)` or `options(AMR_keep_synonyms = FALSE)`.
#' @param reference_df a [data.frame] to be used for extra reference when translating `x` to a valid [`mo`]. See [set_mo_source()] and [get_mo_source()] to automate the usage of your own codes (e.g. used in your analysis or organisation).
2022-10-04 11:20:58 +02:00
#' @param ignore_pattern a [regular expression][base::regex] (case-insensitive) of which all matches in `x` must return `NA`. This can be convenient to exclude known non-relevant input and can also be set with the option `AMR_ignore_pattern`, e.g. `options(AMR_ignore_pattern = "(not reported|contaminated flora)")`.
#' @param remove_from_input a [regular expression][base::regex] (case-insensitive) to clean the input of `x`. Everything matched in `x` will be removed. At default, this is the outcome of [mo_cleaning_regex()], which removes texts between brackets and texts such as "species" and "serovar".
#' @param language language to translate text like "no growth", which defaults to the system language (see [get_AMR_locale()])
#' @param info a [logical] to indicate if a progress bar should be printed if more than 25 items are to be coerced, defaults to `TRUE` only in interactive mode
2020-12-22 00:51:17 +01:00
#' @param ... other arguments passed on to functions
2018-08-31 13:36:19 +02:00
#' @rdname as.mo
#' @aliases mo
2018-09-24 23:33:29 +02:00
#' @details
2021-06-01 15:33:06 +02:00
#' A microorganism (MO) code from this package (class: [`mo`]) is human readable and typically looks like these examples:
#' ```
2019-09-18 15:46:09 +02:00
#' Code Full name
#' --------------- --------------------------------------
#' B_KLBSL Klebsiella
#' B_KLBSL_PNMN Klebsiella pneumoniae
#' B_KLBSL_PNMN_RHNS Klebsiella pneumoniae rhinoscleromatis
#' | | | |
#' | | | |
2022-10-04 11:20:58 +02:00
#' | | | \---> subspecies, a 3-5 letter acronym
#' | | \----> species, a 3-6 letter acronym
#' | \----> genus, a 4-8 letter acronym
#' \----> taxonomic kingdom: A (Archaea), AN (Animalia), B (Bacteria),
2022-09-16 23:15:23 +02:00
#' F (Fungi), PL (Plantae), P (Protozoa)
#' ```
2018-08-01 08:03:31 +02:00
#'
2022-10-04 11:20:58 +02:00
#' Values that cannot be coerced will be considered 'unknown' and will be returned as the MO code `UNKNOWN` with a warning.
2019-03-02 22:47:04 +01:00
#'
#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see *Examples*.
2022-10-04 11:20:58 +02:00
#'
#' The [as.mo()] function uses a novel [matching score algorithm][mo_matching_score()] (see *Matching Score for Microorganisms* below) to match input against the [available microbial taxonomy][microoganisms] in this package. This will lead to the effect that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first. The algorithm uses data from the List of Prokaryotic names with Standing in Nomenclature (LPSN) and the Global Biodiversity Information Facility (GBIF) (see [microorganisms]).
2020-10-26 12:23:03 +01:00
#'
2022-09-19 11:57:21 +02:00
#' ### Coping with Uncertain Results
2020-10-26 12:23:03 +01:00
#'
2022-10-04 11:20:58 +02:00
#' Results of non-exact taxonomic input are based on their [matching score][mo_matching_score()]. The lowest allowed score can be set with the `minimum_matching_score` argument. At default this will be determined based on the character length of the input, and the [taxonomic kingdom][microorganisms] and [human pathogenicity][mo_matching_score()] of the taxonomic outcome. If values are matched with uncertainty, a message will be shown to suggest the user to evaluate the results with [mo_uncertainties()], which returns a [data.frame] with all specifications.
2022-10-03 14:34:45 +02:00
#'
2022-10-04 11:20:58 +02:00
#' To increase the quality of matching, the `remove_from_input` argument can be used to clean the input (i.e., `x`). This must be a [regular expression][base::regex] that matches parts of the input that should be removed before the input is matched against the [available microbial taxonomy][microoganisms]. It will be matched Perl-compatible and case-insensitive. The default value of `remove_from_input` is the outcome of the helper function [mo_cleaning_regex()].
2020-10-26 12:23:03 +01:00
#'
#' There are three helper functions that can be run after using the [as.mo()] function:
#' - Use [mo_uncertainties()] to get a [data.frame] that prints in a pretty format with all taxonomic names that were guessed. The output contains the matching score for all matches (see *Matching Score for Microorganisms* below).
#' - Use [mo_failures()] to get a [character] [vector] with all values that could not be coerced to a valid value.
#' - Use [mo_renamed()] to get a [data.frame] with all values that could be coerced based on old, previously accepted taxonomic names.
2019-02-08 16:06:54 +01:00
#'
2022-09-19 11:57:21 +02:00
#' ### Microbial Prevalence of Pathogens in Humans
2020-10-26 12:23:03 +01:00
#'
2022-09-16 23:15:23 +02:00
#' The coercion rules consider the prevalence of microorganisms in humans grouped into three groups, which is available as the `prevalence` columns in the [microorganisms] data set. The grouping into human pathogenic prevalence is explained in the section *Matching Score for Microorganisms* below.
#' @inheritSection mo_matching_score Matching Score for Microorganisms
2022-10-03 14:34:45 +02:00
#'
# (source as a section here, so it can be inherited by other man pages)
2018-09-24 23:33:29 +02:00
#' @section Source:
2022-10-03 14:34:45 +02:00
#' 1. Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}
#' 2. Becker K *et al.* (2014). **Coagulase-Negative Staphylococci.** *Clin Microbiol Rev.* 27(4): 870-926; \doi{10.1128/CMR.00109-13}
#' 3. Becker K *et al.* (2019). **Implications of identifying the recently defined members of the *S. aureus* complex, *S. argenteus* and *S. schweitzeri*: A position paper of members of the ESCMID Study Group for staphylococci and Staphylococcal Diseases (ESGS).** *Clin Microbiol Infect*; \doi{10.1016/j.cmi.2019.02.028}
#' 4. Becker K *et al.* (2020). **Emergence of coagulase-negative staphylococci** *Expert Rev Anti Infect Ther.* 18(4):349-366; \doi{10.1080/14787210.2020.1730813}
#' 5. Lancefield RC (1933). **A serological differentiation of human and other groups of hemolytic streptococci**. *J Exp Med.* 57(4): 571-95; \doi{10.1084/jem.57.4.571}
#' 6. Berends MS *et al.* (2022). **Trends in Occurrence and Phenotypic Resistance of Coagulase-Negative Staphylococci (CoNS) Found in Human Blood in the Northern Netherlands between 2013 and 2019** *Microorganisms* 10(9), 1801; \doi{10.3390/microorganisms10091801}
#' 7. `r TAXONOMY_VERSION$LPSN$citation` Accessed from <`r TAXONOMY_VERSION$LPSN$url`> on `r documentation_date(TAXONOMY_VERSION$LPSN$accessed_date)`.
#' 8. `r TAXONOMY_VERSION$GBIF$citation` Accessed from <`r TAXONOMY_VERSION$GBIF$url`> on `r documentation_date(TAXONOMY_VERSION$GBIF$accessed_date)`.
#' 9. `r TAXONOMY_VERSION$SNOMED$citation` URL: <`r TAXONOMY_VERSION$SNOMED$url`>
#' @export
#' @return A [character] [vector] with additional class [`mo`]
#' @seealso [microorganisms] for the [data.frame] that is being used to determine ID's.
2020-10-26 12:23:03 +01:00
#'
2020-12-17 16:22:25 +01:00
#' The [`mo_*`][mo_property()] functions (such as [mo_genus()], [mo_gramstain()]) to get properties based on the returned code.
#' @inheritSection AMR Reference Data Publicly Available
#' @examples
#' \donttest{
2019-09-18 15:46:09 +02:00
#' # These examples all return "B_STPHY_AURS", the ID of S. aureus:
2022-09-16 23:15:23 +02:00
#' as.mo(c(
#' "sau", # WHONET code
#' "stau",
#' "STAU",
#' "staaur",
#' "S. aureus",
#' "S aureus",
#' "Staphylococcus aureus",
#' "Staphylococcus aureus (MRSA,",
#' "Zthafilokkoockus oureuz", # handles incorrect spelling
#' "MRSA", # Methicillin Resistant S. aureus
#' "VISA", # Vancomycin Intermediate S. aureus
#' "VRSA", # Vancomycin Resistant S. aureus
2022-09-19 11:57:21 +02:00
#' 115329001 # SNOMED CT code
#' ))
2020-10-26 12:23:03 +01:00
#'
2019-03-18 14:29:41 +01:00
#' # Dyslexia is no problem - these all work:
2022-09-16 23:15:23 +02:00
#' as.mo(c(
#' "Ureaplasma urealyticum",
#' "Ureaplasma urealyticus",
#' "Ureaplasmium urealytica",
#' "Ureaplazma urealitycium"
#' ))
2019-03-18 14:29:41 +01:00
#'
2018-09-05 10:51:46 +02:00
#' as.mo("Streptococcus group A")
#'
2022-08-28 10:31:50 +02:00
#' as.mo("S. epidermidis") # will remain species: B_STPHY_EPDR
#' as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CONS
#'
2022-08-28 10:31:50 +02:00
#' as.mo("S. pyogenes") # will remain species: B_STRPT_PYGN
2019-09-18 15:46:09 +02:00
#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRPA
#'
2019-03-18 14:29:41 +01:00
#' # All mo_* functions use as.mo() internally too (see ?mo_property):
2022-09-19 11:57:21 +02:00
#' mo_genus("E. coli")
#' mo_gramstain("ESCO")
#' mo_is_intrinsic_resistant("ESCCOL", ab = "vanco")
#' }
2020-10-26 12:23:03 +01:00
as.mo <- function(x,
Becker = FALSE,
Lancefield = FALSE,
2022-09-16 23:15:23 +02:00
minimum_matching_score = NULL,
2022-10-03 14:34:45 +02:00
keep_synonyms = getOption("AMR_keep_synonyms", FALSE),
reference_df = get_mo_source(),
2022-09-19 11:57:21 +02:00
ignore_pattern = getOption("AMR_ignore_pattern", NULL),
2022-10-04 11:20:58 +02:00
remove_from_input = mo_cleaning_regex(),
language = get_AMR_locale(),
info = interactive(),
2019-11-23 12:39:57 +01:00
...) {
2020-10-20 21:00:57 +02:00
meet_criteria(x, allow_class = c("mo", "data.frame", "list", "character", "numeric", "integer", "factor"), allow_NA = TRUE)
meet_criteria(Becker, allow_class = c("logical", "character"), has_length = 1)
meet_criteria(Lancefield, allow_class = c("logical", "character"), has_length = 1)
2022-09-19 11:57:21 +02:00
meet_criteria(keep_synonyms, allow_class = "logical", has_length = 1)
2022-09-16 23:15:23 +02:00
meet_criteria(minimum_matching_score, allow_class = c("numeric", "integer"), has_length = 1, allow_NULL = TRUE)
meet_criteria(reference_df, allow_class = "data.frame", allow_NULL = TRUE)
meet_criteria(ignore_pattern, allow_class = "character", has_length = 1, allow_NULL = TRUE)
2022-09-16 23:15:23 +02:00
language <- validate_language(language)
meet_criteria(info, allow_class = "logical", has_length = 1)
2022-10-03 14:34:45 +02:00
2022-10-04 11:20:58 +02:00
if (tryCatch(all(x %in% c(MO_lookup$mo, NA)) &&
2022-10-03 14:34:45 +02:00
isFALSE(Becker) &&
2022-08-28 10:31:50 +02:00
isFALSE(Lancefield), error = function(e) FALSE)) {
# don't look into valid MO codes, just return them
2020-11-10 16:35:56 +01:00
# is.mo() won't work - MO codes might change between package versions
return(set_clean_class(x, new_class = c("mo", "character")))
}
2022-10-03 14:34:45 +02:00
2022-08-28 10:31:50 +02:00
2020-04-13 21:09:56 +02:00
# start off with replaced language-specific non-ASCII characters with ASCII characters
2020-04-14 15:10:09 +02:00
x <- parse_and_convert(x)
# replace mo codes used in older package versions
2020-07-22 12:29:51 +02:00
x <- replace_old_mo_codes(x, property = "mo")
# ignore cases that match the ignore pattern
x <- replace_ignore_pattern(x, ignore_pattern)
2022-08-28 10:31:50 +02:00
2019-06-02 19:23:19 +02:00
# WHONET: xxx = no growth
2022-09-16 23:15:23 +02:00
x[tolower(x) %in% c("", "xxx", "na", "nan")] <- NA_character_
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
out <- rep(NA_character_, length(x))
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# below we use base R's match(), known for powering '%in%', and incredibly fast!
2022-09-16 23:15:23 +02:00
# From reference_df ----
reference_df <- repair_reference_df(reference_df)
if (!is.null(reference_df)) {
out[x %in% reference_df[[1]]] <- reference_df[[2]][match(x[x %in% reference_df[[1]]], reference_df[[1]])]
}
# From MO code ----
2022-10-04 11:20:58 +02:00
out[is.na(out) & x %in% MO_lookup$mo] <- x[is.na(out) & x %in% MO_lookup$mo]
2022-09-16 23:15:23 +02:00
# From full name ----
2022-10-04 11:20:58 +02:00
out[is.na(out) & x %in% MO_lookup$fullname] <- MO_lookup$mo[match(x[is.na(out) & x %in% MO_lookup$fullname], MO_lookup$fullname)]
2022-09-16 23:15:23 +02:00
# From known codes ----
out[is.na(out) & x %in% AMR::microorganisms.codes$code] <- AMR::microorganisms.codes$mo[match(x[is.na(out) & x %in% AMR::microorganisms.codes$code], AMR::microorganisms.codes$code)]
# From SNOMED ----
2022-10-03 14:34:45 +02:00
if (any(is.na(out) & !is.na(x)) && any(is.na(out) & x %in% unlist(microorganisms$snomed), na.rm = TRUE)) {
2022-09-16 23:15:23 +02:00
# found this extremely fast gem here: https://stackoverflow.com/a/11002456/4575331
out[is.na(out) & x %in% unlist(microorganisms$snomed)] <- microorganisms$mo[rep(seq_along(microorganisms$snomed), vapply(FUN.VALUE = double(1), microorganisms$snomed, length))[match(x[is.na(out) & x %in% unlist(microorganisms$snomed)], unlist(microorganisms$snomed))]]
}
2022-09-23 12:55:52 +02:00
# From other familiar output ----
# such as Salmonella groups, colloquial names, etc.
out[is.na(out)] <- convert_colloquial_input(x[is.na(out)])
2022-09-16 23:15:23 +02:00
# From previous hits in this session ----
old <- out
2022-10-04 11:20:58 +02:00
out[is.na(out) & paste(x, minimum_matching_score) %in% pkg_env$mo_previously_coerced$x] <- pkg_env$mo_previously_coerced$mo[match(paste(x, minimum_matching_score)[is.na(out) & paste(x, minimum_matching_score) %in% pkg_env$mo_previously_coerced$x], pkg_env$mo_previously_coerced$x)]
2022-09-16 23:15:23 +02:00
new <- out
2022-10-03 14:34:45 +02:00
if (isTRUE(info) && message_not_thrown_before("as.mo", old, new, entire_session = TRUE) && any(is.na(old) & !is.na(new), na.rm = TRUE)) {
2022-09-16 23:15:23 +02:00
message_(
"Returning previously coerced value", ifelse(sum(is.na(old) & !is.na(new)) > 1, "s", ""),
" for ", vector_and(x[is.na(old) & !is.na(new)]), ". Run `mo_reset_session()` to reset this."
)
2021-02-21 20:15:09 +01:00
}
2022-10-03 14:34:45 +02:00
2022-09-16 23:15:23 +02:00
# For all other input ----
if (any(is.na(out) & !is.na(x))) {
# reset uncertainties
pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, ]
2022-09-27 12:16:39 +02:00
pkg_env$mo_failures <- NULL
2022-09-16 23:15:23 +02:00
# Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc.
x[trimws2(x) %like% translate_into_language("no .*growth", language = language)] <- NA_character_
x[trimws2(x) %like% paste0("^(", translate_into_language("no|not", language = language), ") ")] <- NA_character_
# run over all unique leftovers
x_unique <- unique(x[is.na(out) & !is.na(x)])
# set up progress bar
progress <- progress_ticker(n = length(x_unique), n_min = 10, print = info)
on.exit(close(progress))
# run it
2022-09-27 12:16:39 +02:00
x_coerced <- vapply(FUN.VALUE = character(1), x_unique, function(x_search) {
2022-09-16 23:15:23 +02:00
progress$tick()
2022-10-04 11:20:58 +02:00
print(x_search)
2022-09-27 12:16:39 +02:00
# some required cleaning steps
2022-10-03 14:34:45 +02:00
x_out <- trimws2(x_search)
2022-10-04 11:20:58 +02:00
# this applies the `remove_from_input` argument, which defaults to mo_cleaning_regex()
x_out <- gsub(remove_from_input, " ", x_out, ignore.case = TRUE, perl = TRUE)
x_out <- trimws2(gsub(" +", " ", x_out, perl = TRUE))
2022-09-27 12:16:39 +02:00
x_search_cleaned <- x_out
x_out <- tolower(x_out)
2022-10-04 11:20:58 +02:00
print(x_out)
2022-10-03 14:34:45 +02:00
2022-09-27 12:16:39 +02:00
# take out the parts, split by space
2022-09-16 23:15:23 +02:00
x_parts <- strsplit(gsub("-", " ", x_out, fixed = TRUE), " ", fixed = TRUE)[[1]]
2022-09-27 12:16:39 +02:00
2022-09-16 23:15:23 +02:00
# do a pre-match on first character (and if it contains a space, first chars of first two terms)
2022-09-27 12:16:39 +02:00
if (length(x_parts) %in% c(2, 3)) {
# for genus + species + subspecies
2022-10-04 11:20:58 +02:00
filtr <- which(MO_lookup$full_first == substr(x_parts[1], 1, 1) & MO_lookup$species_first == substr(x_parts[2], 1, 1))
2022-09-27 12:16:39 +02:00
} else if (length(x_parts) > 3) {
2022-09-16 23:15:23 +02:00
first_chars <- paste0("(^| )", "[", paste(substr(x_parts, 1, 1), collapse = ""), "]")
2022-10-04 11:20:58 +02:00
filtr <- which(MO_lookup$full_first %like_case% first_chars)
2022-09-16 23:15:23 +02:00
} else if (nchar(x_out) == 4) {
# no space and 4 characters - probably a code such as STAU or ESCO!
if (isTRUE(info)) {
message_("Input \"", x_search, "\" is assumed to be a microorganism code - trying to match on ", vector_and(c(substr(x_out, 1, 2), substr(x_out, 3, 4)), sort = FALSE))
}
2022-10-04 11:20:58 +02:00
filtr <- which(MO_lookup$fullname_lower %like_case% paste0("(^| )", substr(x_out, 1, 2), ".* ", substr(x_out, 3, 4)))
2022-09-16 23:15:23 +02:00
} else if (nchar(x_out) <= 6) {
# no space and 5-6 characters - probably a code such as STAAUR or ESCCOL!
first_part <- paste0(substr(x_out, 1, 2), "[a-z]*", substr(x_out, 3, 3))
second_part <- substr(x_out, 4, nchar(x_out))
if (isTRUE(info)) {
message_("Input \"", x_search, "\" is assumed to be a microorganism code - trying to match on ", vector_and(c(gsub("[a-z]*", "(...)", first_part, fixed = TRUE), second_part), sort = FALSE))
}
2022-10-04 11:20:58 +02:00
filtr <- which(MO_lookup$fullname_lower %like_case% paste0("(^| )", first_part, ".* ", second_part))
2020-09-12 13:54:21 +02:00
} else {
2022-10-04 11:20:58 +02:00
filtr <- which(MO_lookup$full_first == substr(x_out, 1, 1))
2020-09-12 13:54:21 +02:00
}
2022-09-16 23:15:23 +02:00
if (length(filtr) == 0) {
2022-10-04 11:20:58 +02:00
mo_to_search <- MO_lookup$fullname
2021-02-21 20:15:09 +01:00
} else {
2022-10-04 11:20:58 +02:00
mo_to_search <- MO_lookup$fullname[filtr]
2020-09-12 13:54:21 +02:00
}
2022-09-16 23:15:23 +02:00
pkg_env$mo_to_search <- mo_to_search
# determine the matching score on the original search value
2022-09-27 12:16:39 +02:00
m <- mo_matching_score(x = x_search_cleaned, n = mo_to_search)
2022-09-16 23:15:23 +02:00
if (is.null(minimum_matching_score)) {
2022-10-04 11:20:58 +02:00
minimum_matching_score_current <- min(0.6, min(10, nchar(x_search_cleaned)) * 0.08)
2022-09-16 23:15:23 +02:00
# correct back for prevalence
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup$prevalence[match(mo_to_search, MO_lookup$fullname)]
# correct back for kingdom
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup$kingdom_index[match(mo_to_search, MO_lookup$fullname)]
} else {
minimum_matching_score_current <- minimum_matching_score
2019-02-23 16:02:31 +01:00
}
2022-09-16 23:15:23 +02:00
m[m < minimum_matching_score_current] <- NA_real_
2022-09-27 12:16:39 +02:00
2022-09-16 23:15:23 +02:00
top_hits <- mo_to_search[order(m, decreasing = TRUE, na.last = NA)] # na.last = NA will remove the NAs
if (length(top_hits) == 0) {
2022-09-27 12:16:39 +02:00
warning_("No hits found for \"", x_search, "\" with minimum_matching_score = ", ifelse(is.null(minimum_matching_score), paste0("NULL (=", round(min(minimum_matching_score_current, na.rm = TRUE), 3), ")"), minimum_matching_score), ". Try setting this value lower or even to 0.")
2022-09-16 23:15:23 +02:00
result_mo <- NA_character_
} else {
result_mo <- MO_lookup$mo[match(top_hits[1], MO_lookup$fullname)]
pkg_env$mo_uncertainties <- rbind(pkg_env$mo_uncertainties,
data.frame(
minimum_matching_score = ifelse(is.null(minimum_matching_score), "NULL", minimum_matching_score),
2022-09-27 12:16:39 +02:00
original_input = x_search,
input = x_search_cleaned,
2022-09-16 23:15:23 +02:00
fullname = top_hits[1],
mo = result_mo,
candidates = ifelse(length(top_hits) > 1, paste(top_hits[2:min(26, length(top_hits))], collapse = ", "), ""),
stringsAsFactors = FALSE
),
stringsAsFactors = FALSE
2022-08-28 10:31:50 +02:00
)
2022-09-16 23:15:23 +02:00
# save to package env to save time for next time
pkg_env$mo_previously_coerced <- unique(rbind(pkg_env$mo_previously_coerced,
data.frame(
2022-10-04 11:20:58 +02:00
x = paste(x_search, minimum_matching_score),
2022-09-16 23:15:23 +02:00
mo = result_mo,
stringsAsFactors = FALSE
),
stringsAsFactors = FALSE
2022-08-28 10:31:50 +02:00
))
}
2022-09-16 23:15:23 +02:00
# the actual result:
2022-09-27 12:16:39 +02:00
as.character(result_mo)
2022-09-16 23:15:23 +02:00
})
# remove progress bar from console
close(progress)
# expand from unique again
2022-09-27 12:16:39 +02:00
out[is.na(out)] <- x_coerced[match(x[is.na(out)], x_unique)]
2022-09-16 23:15:23 +02:00
# Throw note about uncertainties ----
if (isTRUE(info) && NROW(pkg_env$mo_uncertainties) > 0) {
2022-09-27 12:16:39 +02:00
if (message_not_thrown_before("as.mo", "uncertainties", pkg_env$mo_uncertainties$original_input)) {
2022-09-16 23:15:23 +02:00
plural <- c("", "this")
2022-09-27 12:16:39 +02:00
if (length(pkg_env$mo_uncertainties$original_input) > 1) {
2022-09-16 23:15:23 +02:00
plural <- c("s", "these uncertainties")
}
2022-09-27 12:16:39 +02:00
if (length(pkg_env$mo_uncertainties$original_input) <= 3) {
2022-09-16 23:15:23 +02:00
examples <- vector_and(paste0(
2022-09-27 12:16:39 +02:00
'"', pkg_env$mo_uncertainties$original_input,
2022-09-16 23:15:23 +02:00
'" (assumed ', font_italic(pkg_env$mo_uncertainties$fullname, collapse = NULL), ")"
),
quotes = FALSE
)
} else {
2022-09-27 12:16:39 +02:00
examples <- paste0(nr2char(length(pkg_env$mo_uncertainties$original_input)), " microorganism", plural[1])
2021-02-18 23:23:14 +01:00
}
2022-08-28 10:31:50 +02:00
msg <- paste0(
2022-09-16 23:15:23 +02:00
"Microorganism translation was uncertain for ", examples,
". Run `mo_uncertainties()` to review ", plural[2], "."
2022-08-28 10:31:50 +02:00
)
2022-09-16 23:15:23 +02:00
message_(msg)
2021-02-18 23:23:14 +01:00
}
2022-09-16 23:15:23 +02:00
}
} # end of loop over all yet unknowns
# Keep or replace synonyms ----
2022-09-19 11:57:21 +02:00
gbif_matches <- AMR::microorganisms$gbif_renamed_to[match(out, AMR::microorganisms$mo)]
gbif_matches[!gbif_matches %in% AMR::microorganisms$gbif] <- NA
2022-09-19 11:57:21 +02:00
lpsn_matches <- AMR::microorganisms$lpsn_renamed_to[match(out, AMR::microorganisms$mo)]
lpsn_matches[!lpsn_matches %in% AMR::microorganisms$lpsn] <- NA
pkg_env$mo_renamed <- list(old = out[!is.na(gbif_matches) | !is.na(lpsn_matches)],
gbif_matches = gbif_matches[!is.na(gbif_matches) | !is.na(lpsn_matches)],
lpsn_matches = lpsn_matches[!is.na(gbif_matches) | !is.na(lpsn_matches)])
2022-09-16 23:15:23 +02:00
if (isFALSE(keep_synonyms)) {
out[which(!is.na(gbif_matches))] <- AMR::microorganisms$mo[match(gbif_matches[which(!is.na(gbif_matches))], AMR::microorganisms$gbif)]
out[which(!is.na(lpsn_matches))] <- AMR::microorganisms$mo[match(lpsn_matches[which(!is.na(lpsn_matches))], AMR::microorganisms$lpsn)]
2022-10-03 14:34:45 +02:00
if (isTRUE(info) && length(pkg_env$mo_renamed$old) > 0) {
print(mo_renamed(), extra_txt = " (use `keep_synonyms = TRUE` to leave uncorrected)")
}
} else if (is.null(getOption("AMR_keep_synonyms")) && length(pkg_env$mo_renamed$old) > 0 && message_not_thrown_before("as.mo", "keep_synonyms_warning", entire_session = TRUE)) {
2022-09-19 11:57:21 +02:00
# keep synonyms is TRUE, so check if any do have synonyms
2022-09-23 12:55:52 +02:00
warning_("Function `as.mo()` returned some old taxonomic names. Use `as.mo(..., keep_synonyms = FALSE)` to clean the input to currently accepted taxonomic names, or set the R option `AMR_keep_synonyms` to `FALSE`. This warning will be shown once per session.")
2018-07-23 14:14:03 +02:00
}
2022-09-23 12:55:52 +02:00
2022-09-16 23:15:23 +02:00
# Apply Becker ----
if (isTRUE(Becker) || Becker == "all") {
2020-10-20 21:00:57 +02:00
# warn when species found that are not in:
# - Becker et al. 2014, PMID 25278577
# - Becker et al. 2019, PMID 30872103
# - Becker et al. 2020, PMID 32056452
2022-08-28 10:31:50 +02:00
# comment below code if all staphylococcal species are categorised as CoNS/CoPS
2022-09-16 23:15:23 +02:00
post_Becker <- paste(
"Staphylococcus",
c("caledonicus", "canis", "durrellii", "lloydii", "ratti", "roterodami", "singaporensis", "taiwanensis")
)
if (any(out %in% AMR::microorganisms$mo[match(post_Becker, AMR::microorganisms$fullname)])) {
if (message_not_thrown_before("as.mo", "becker")) {
warning_("in `as.mo()`: Becker ", font_italic("et al."), " (2014, 2019, 2020) does not contain these species named after their publication: ",
2022-09-16 23:15:23 +02:00
vector_and(font_italic(gsub("Staphylococcus", "S.", post_Becker, fixed = TRUE), collapse = NULL), quotes = FALSE),
2022-08-28 10:31:50 +02:00
". Categorisation to CoNS/CoPS was taken from the original scientific publication(s).",
immediate = TRUE
)
}
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# 'MO_CONS' and 'MO_COPS' are <mo> vectors created in R/_pre_commit_hook.R
out[out %in% MO_CONS] <- "B_STPHY_CONS"
out[out %in% MO_COPS] <- "B_STPHY_COPS"
2018-09-01 21:19:46 +02:00
if (Becker == "all") {
2022-09-16 23:15:23 +02:00
out[out == "B_STPHY_AURS"] <- "B_STPHY_COPS"
2018-09-01 21:19:46 +02:00
}
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# Apply Lancefield ----
if (isTRUE(Lancefield) || Lancefield == "all") {
# group A - S. pyogenes
2022-09-16 23:15:23 +02:00
out[out == "B_STRPT_PYGN"] <- "B_STRPT_GRPA"
# group B - S. agalactiae
2022-09-16 23:15:23 +02:00
out[out == "B_STRPT_AGLC"] <- "B_STRPT_GRPB"
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
out[out %like_case% "^B_STRPT_(DYSG|EQUI)(_|$)"] <- "B_STRPT_GRPC"
if (Lancefield == "all") {
2022-09-16 23:15:23 +02:00
# group D - all enterococci
out[out %like_case% "^B_ENTRC(_|$)"] <- "B_STRPT_GRPD"
}
2022-09-16 23:15:23 +02:00
# group F - S. anginosus, incl. S. anginosus anginosus and S. anginosus whileyi
out[out %like_case% "^B_STRPT_ANGN(_|$)"] <- "B_STRPT_GRPF"
# group G - only S. dysgalactiae which is also group C, so ignore it here
# group H - S. sanguinis
2022-09-16 23:15:23 +02:00
out[out == "B_STRPT_SNGN"] <- "B_STRPT_GRPH"
# group K - S. salivarius, incl. S. salivarius salivariuss and S. salivarius thermophilus
out[out %like_case% "^B_STRPT_SLVR(_|$)"] <- "B_STRPT_GRPK"
# group L - only S. dysgalactiae which is also group C, so ignore it here
2018-09-01 21:19:46 +02:00
}
2022-09-27 12:16:39 +02:00
# All unknowns ----
out[is.na(out) & !is.na(x)] <- "UNKNOWN"
pkg_env$mo_failures <- unique(x[out == "UNKNOWN" & x != "UNKNOWN" & !is.na(x)])
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# Return class ----
set_clean_class(out,
new_class = c("mo", "character")
2022-08-28 10:31:50 +02:00
)
2018-09-25 16:44:40 +02:00
}
2022-09-16 23:15:23 +02:00
#' @rdname as.mo
#' @export
is.mo <- function(x) {
inherits(x, "mo")
2019-08-20 11:40:54 +02:00
}
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
pillar_shaft.mo <- function(x, ...) {
out <- format(x)
# grey out the kingdom (part until first "_")
out[!is.na(x)] <- gsub("^([A-Z]+_)(.*)", paste0(font_subtle("\\1"), "\\2"), out[!is.na(x)], perl = TRUE)
2020-08-26 11:33:54 +02:00
# and grey out every _
out[!is.na(x)] <- gsub("_", font_subtle("_"), out[!is.na(x)])
2022-08-28 10:31:50 +02:00
2020-08-26 11:33:54 +02:00
# markup NA and UNKNOWN
out[is.na(x)] <- font_na(" NA")
out[x == "UNKNOWN"] <- font_na(" UNKNOWN")
2022-08-28 10:31:50 +02:00
df <- tryCatch(get_current_data(arg_name = "x", call = 0),
2022-08-28 10:31:50 +02:00
error = function(e) NULL
)
if (!is.null(df)) {
mo_cols <- vapply(FUN.VALUE = logical(1), df, is.mo)
} else {
mo_cols <- NULL
}
2022-08-28 10:31:50 +02:00
2022-10-03 14:34:45 +02:00
if (!all(x %in% c(AMR::microorganisms$mo, NA)) ||
2022-09-19 11:57:21 +02:00
(!is.null(df) && !all(unlist(df[, which(mo_cols), drop = FALSE]) %in% AMR::microorganisms$mo))) {
2021-04-07 08:37:42 +02:00
# markup old mo codes
2022-09-19 11:57:21 +02:00
out[!x %in% AMR::microorganisms$mo] <- font_italic(font_na(x[!x %in% AMR::microorganisms$mo],
2022-08-28 10:31:50 +02:00
collapse = NULL
),
collapse = NULL
)
# throw a warning with the affected column name(s)
if (!is.null(mo_cols)) {
col <- paste0("Column ", vector_or(colnames(df)[mo_cols], quotes = TRUE, sort = FALSE))
2021-04-07 08:37:42 +02:00
} else {
col <- "The data"
}
2022-08-28 10:31:50 +02:00
warning_(
col, " contains old MO codes (from a previous AMR package version). ",
"Please update your MO codes with `as.mo()`."
)
2021-04-07 08:37:42 +02:00
}
2022-08-28 10:31:50 +02:00
2020-08-26 11:33:54 +02:00
# make it always fit exactly
max_char <- max(nchar(x))
if (is.na(max_char)) {
2022-09-16 23:15:23 +02:00
max_char <- 12
}
create_pillar_column(out,
2022-08-28 10:31:50 +02:00
align = "left",
width = max_char + ifelse(any(x %in% c(NA, "UNKNOWN")), 2, 0)
)
2020-08-26 11:33:54 +02:00
}
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
type_sum.mo <- function(x, ...) {
"mo"
}
# will be exported using s3_register() in R/zzz.R
freq.mo <- function(x, ...) {
x_noNA <- as.mo(x[!is.na(x)]) # as.mo() to get the newest mo codes
grams <- mo_gramstain(x_noNA, language = NULL)
digits <- list(...)$digits
if (is.null(digits)) {
digits <- 2
}
2020-12-17 16:22:25 +01:00
cleaner::freq.default(
x = x,
...,
.add_header = list(
`Gram-negative` = paste0(
format(sum(grams == "Gram-negative", na.rm = TRUE),
2022-08-28 10:31:50 +02:00
big.mark = ",",
decimal.mark = "."
),
2020-12-17 16:22:25 +01:00
" (", percentage(sum(grams == "Gram-negative", na.rm = TRUE) / length(grams),
2022-08-28 10:31:50 +02:00
digits = digits
),
")"
),
2020-12-17 16:22:25 +01:00
`Gram-positive` = paste0(
format(sum(grams == "Gram-positive", na.rm = TRUE),
2022-08-28 10:31:50 +02:00
big.mark = ",",
decimal.mark = "."
),
2020-12-17 16:22:25 +01:00
" (", percentage(sum(grams == "Gram-positive", na.rm = TRUE) / length(grams),
2022-08-28 10:31:50 +02:00
digits = digits
),
")"
),
2020-12-17 16:22:25 +01:00
`Nr. of genera` = pm_n_distinct(mo_genus(x_noNA, language = NULL)),
2022-08-28 10:31:50 +02:00
`Nr. of species` = pm_n_distinct(paste(
mo_genus(x_noNA, language = NULL),
mo_species(x_noNA, language = NULL)
))
)
)
}
2020-09-28 01:08:55 +02:00
# will be exported using s3_register() in R/zzz.R
get_skimmers.mo <- function(column) {
2020-12-17 16:22:25 +01:00
skimr::sfl(
2020-09-28 01:08:55 +02:00
skim_type = "mo",
2022-08-28 10:31:50 +02:00
unique_total = ~ length(unique(stats::na.omit(.))),
gram_negative = ~ sum(mo_is_gram_negative(.), na.rm = TRUE),
gram_positive = ~ sum(mo_is_gram_positive(.), na.rm = TRUE),
top_genus = ~ names(sort(-table(mo_genus(stats::na.omit(.), language = NULL))))[1L],
top_species = ~ names(sort(-table(mo_name(stats::na.omit(.), language = NULL))))[1L]
2020-09-28 01:08:55 +02:00
)
}
2020-05-28 16:48:55 +02:00
#' @method print mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2020-08-26 11:33:54 +02:00
print.mo <- function(x, print.shortnames = FALSE, ...) {
2020-05-27 16:37:49 +02:00
cat("Class <mo>\n")
2018-10-12 16:35:18 +02:00
x_names <- names(x)
2020-08-26 11:33:54 +02:00
if (is.null(x_names) & print.shortnames == TRUE) {
x_names <- tryCatch(mo_shortname(x, ...), error = function(e) NULL)
}
2018-10-12 16:35:18 +02:00
x <- as.character(x)
names(x) <- x_names
2022-10-03 14:34:45 +02:00
if (!all(x %in% c(AMR::microorganisms$mo, NA))) {
2022-08-28 10:31:50 +02:00
warning_(
"Some MO codes are from a previous AMR package version. ",
"Please update the MO codes with `as.mo()`."
)
}
2018-10-12 16:35:18 +02:00
print.default(x, quote = FALSE)
2018-08-31 13:36:19 +02:00
}
2018-07-23 14:14:03 +02:00
2020-05-28 16:48:55 +02:00
#' @method summary mo
2018-12-07 12:04:55 +01:00
#' @export
#' @noRd
summary.mo <- function(object, ...) {
# unique and top 1-3
2022-09-23 12:55:52 +02:00
x <- object
top_3 <- names(sort(-table(x[!is.na(x)])))[1:3]
out <- c(
2022-08-28 10:31:50 +02:00
"Class" = "mo",
"<NA>" = length(x[is.na(x)]),
2022-09-23 12:55:52 +02:00
"Unique" = length(unique(x[!is.na(x)])),
2022-08-28 10:31:50 +02:00
"#1" = top_3[1],
"#2" = top_3[2],
"#3" = top_3[3]
)
2022-09-23 12:55:52 +02:00
class(out) <- c("summaryDefault", "table")
out
2018-12-07 12:04:55 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method as.data.frame mo
2018-07-23 14:14:03 +02:00
#' @export
2018-08-31 13:36:19 +02:00
#' @noRd
2020-05-19 13:18:01 +02:00
as.data.frame.mo <- function(x, ...) {
2022-10-03 14:34:45 +02:00
if (!all(x %in% c(AMR::microorganisms$mo, NA))) {
2022-08-28 10:31:50 +02:00
warning_(
"The data contains old MO codes (from a previous AMR package version). ",
"Please update your MO codes with `as.mo()`."
)
2021-04-07 08:37:42 +02:00
}
2020-05-19 12:08:49 +02:00
nm <- deparse1(substitute(x))
2018-08-31 13:36:19 +02:00
if (!"nm" %in% names(list(...))) {
2021-04-07 08:37:42 +02:00
as.data.frame.vector(x, ..., nm = nm)
2018-08-31 13:36:19 +02:00
} else {
2021-04-07 08:37:42 +02:00
as.data.frame.vector(x, ...)
2018-08-31 13:36:19 +02:00
}
}
2020-05-28 16:48:55 +02:00
#' @method [ mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2019-08-14 14:57:06 +02:00
"[.mo" <- function(x, ...) {
y <- NextMethod()
2019-08-14 14:57:06 +02:00
attributes(y) <- attributes(x)
y
}
2020-05-28 16:48:55 +02:00
#' @method [[ mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
"[[.mo" <- function(x, ...) {
2019-08-14 14:57:06 +02:00
y <- NextMethod()
attributes(y) <- attributes(x)
2019-08-14 14:57:06 +02:00
y
}
2020-05-28 16:48:55 +02:00
#' @method [<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
"[<-.mo" <- function(i, j, ..., value) {
2019-08-14 14:57:06 +02:00
y <- NextMethod()
attributes(y) <- attributes(i)
2020-04-13 21:09:56 +02:00
# must only contain valid MOs
2022-09-16 23:15:23 +02:00
return_after_integrity_check(y, "microorganism code", as.character(AMR::microorganisms$mo))
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method [[<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
"[[<-.mo" <- function(i, j, ..., value) {
2019-08-14 14:57:06 +02:00
y <- NextMethod()
attributes(y) <- attributes(i)
2020-04-13 21:09:56 +02:00
# must only contain valid MOs
2022-09-16 23:15:23 +02:00
return_after_integrity_check(y, "microorganism code", as.character(AMR::microorganisms$mo))
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method c mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
c.mo <- function(...) {
x <- list(...)[[1L]]
2019-08-14 14:57:06 +02:00
y <- NextMethod()
attributes(y) <- attributes(x)
2022-09-16 23:15:23 +02:00
return_after_integrity_check(y, "microorganism code", as.character(AMR::microorganisms$mo))
2018-07-23 14:14:03 +02:00
}
2018-12-06 14:36:39 +01:00
#' @method unique mo
#' @export
#' @noRd
unique.mo <- function(x, incomparables = FALSE, ...) {
y <- NextMethod()
attributes(y) <- attributes(x)
y
}
2021-02-22 20:21:33 +01:00
#' @method rep mo
#' @export
#' @noRd
rep.mo <- function(x, ...) {
y <- NextMethod()
attributes(y) <- attributes(x)
y
}
#' @rdname as.mo
#' @export
mo_failures <- function() {
pkg_env$mo_failures
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
#' @export
mo_uncertainties <- function() {
set_clean_class(pkg_env$mo_uncertainties, new_class = c("mo_uncertainties", "data.frame"))
2019-02-28 13:56:28 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method print mo_uncertainties
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
print.mo_uncertainties <- function(x, ...) {
2019-03-12 12:19:27 +01:00
if (NROW(x) == 0) {
2022-10-03 14:34:45 +02:00
cat(word_wrap("No uncertainties to show. Only uncertainties of the last call of `as.mo()` or any `mo_*()` function are stored.\n", add_fn = font_blue))
return(invisible(NULL))
2019-03-12 12:19:27 +01:00
}
2022-10-04 11:20:58 +02:00
cat(word_wrap("Matching scores are based on the resemblance between the input and the full taxonomic name, and the pathogenicity in humans. See `?mo_matching_score`.\n\n", add_fn = font_blue))
2022-09-16 23:15:23 +02:00
if (has_colour()) {
cat(word_wrap("Colour keys: ",
font_red_bg(" 0.000-0.499 "),
font_orange_bg(" 0.500-0.599 "),
font_yellow_bg(" 0.600-0.699 "),
font_green_bg(" 0.700-1.000"),
add_fn = font_blue
), font_green_bg(" "), "\n", sep = "")
}
score_set_colour <- function(text, scores) {
# set colours to scores
text[scores >= 0.7] <- font_green_bg(text[scores >= 0.7], collapse = NULL)
text[scores >= 0.6 & scores < 0.7] <- font_yellow_bg(text[scores >= 0.6 & scores < 0.7], collapse = NULL)
text[scores >= 0.5 & scores < 0.6] <- font_orange_bg(text[scores >= 0.5 & scores < 0.6], collapse = NULL)
text[scores < 0.5] <- font_red_bg(text[scores < 0.5], collapse = NULL)
text
}
2022-08-28 10:31:50 +02:00
txt <- ""
2019-10-11 17:21:02 +02:00
for (i in seq_len(nrow(x))) {
2020-09-14 12:21:23 +02:00
if (x[i, ]$candidates != "") {
candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE))
2020-09-28 11:00:59 +02:00
scores <- mo_matching_score(x = x[i, ]$input, n = candidates)
2020-09-14 12:21:23 +02:00
n_candidates <- length(candidates)
2022-08-28 10:31:50 +02:00
candidates_formatted <- font_italic(candidates, collapse = NULL)
scores_formatted <- trimws(formatC(round(scores, 3), format = "f", digits = 3))
2022-09-16 23:15:23 +02:00
scores_formatted <- score_set_colour(scores_formatted, scores)
2022-08-28 10:31:50 +02:00
# sort on descending scores
candidates_formatted <- candidates_formatted[order(1 - scores)]
scores_formatted <- scores_formatted[order(1 - scores)]
2022-08-28 10:31:50 +02:00
candidates <- word_wrap(paste0(
"Also matched: ",
vector_and(paste0(
candidates_formatted,
font_blue(paste0(" (", scores_formatted, ")"), collapse = NULL)
),
quotes = FALSE, sort = FALSE
),
2022-09-16 23:15:23 +02:00
ifelse(n_candidates == 25,
font_grey(" [showing first 25]"),
2022-08-28 10:31:50 +02:00
""
)
),
extra_indent = nchar("Also matched: ")
)
} else {
candidates <- ""
}
2022-09-16 23:15:23 +02:00
score <- mo_matching_score(
x = x[i, ]$input,
n = x[i, ]$fullname
)
score_formatted <- trimws(formatC(round(score, 3), format = "f", digits = 3))
txt <- paste(txt,
2022-09-27 12:16:39 +02:00
paste0(
paste0(
'"', x[i, ]$original_input, '"',
" -> ",
paste0(
font_bold(font_italic(x[i, ]$fullname)),
ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""),
" (", x[i, ]$mo, ", ", score_set_colour(score_formatted, score), ")"
)
),
collapse = "\n"
),
ifelse(x[i, ]$original_input != x[i, ]$input, paste0(strrep(" ", nchar(x[i, ]$original_input) + 6), "Based on input \"", x[i, ]$input, "\""), ""),
candidates,
sep = "\n"
2022-08-28 10:31:50 +02:00
)
txt <- paste0(gsub("\n\n", "\n", txt), "\n\n")
2019-02-27 11:36:12 +01:00
}
cat(txt)
2019-02-08 16:06:54 +01:00
}
#' @rdname as.mo
#' @export
mo_renamed <- function() {
2022-10-03 14:34:45 +02:00
x <- pkg_env$mo_renamed
x$new <- ifelse(is.na(x$lpsn_matches),
AMR::microorganisms$mo[match(x$gbif_matches, AMR::microorganisms$gbif)],
AMR::microorganisms$mo[match(x$lpsn_matches, AMR::microorganisms$lpsn)])
mo_old <- AMR::microorganisms$fullname[match(x$old, AMR::microorganisms$mo)]
mo_new <- AMR::microorganisms$fullname[match(x$new, AMR::microorganisms$mo)]
ref_old <- AMR::microorganisms$ref[match(x$old, AMR::microorganisms$mo)]
ref_new <- AMR::microorganisms$ref[match(x$new, AMR::microorganisms$mo)]
df_renamed <- data.frame(old = mo_old,
new = mo_new,
ref_old = ref_old,
ref_new = ref_new,
stringsAsFactors = FALSE)
df_renamed <- unique(df_renamed)
df_renamed <- df_renamed[order(df_renamed$old), , drop = FALSE]
set_clean_class(df_renamed, new_class = c("mo_renamed", "data.frame"))
}
#' @method print mo_renamed
#' @export
#' @noRd
2022-10-03 14:34:45 +02:00
print.mo_renamed <- function(x, extra_txt = "", n = 25, ...) {
if (NROW(x) == 0) {
cat(word_wrap("No renamed taxonomy to show. Only renamed taxonomy of the last call of `as.mo()` or any `mo_*()` function are stored.\n", add_fn = font_blue))
return(invisible(NULL))
}
2022-10-03 14:34:45 +02:00
x$ref_old[!is.na(x$ref_old)] <- paste0(" (", gsub("et al.", font_italic("et al."), x$ref_old[!is.na(x$ref_old)], fixed = TRUE), ")")
x$ref_new[!is.na(x$ref_new)] <- paste0(" (", gsub("et al.", font_italic("et al."), x$ref_new[!is.na(x$ref_new)], fixed = TRUE), ")")
rows <- seq_len(min(NROW(x), n))
message_(
2022-10-03 14:34:45 +02:00
"The following microorganism", ifelse(NROW(x) > 1, "s were", " was"), " taxonomically renamed", extra_txt, ":\n",
paste0(" \u2022 ", font_italic(x$old[rows], collapse = NULL), x$ref_old[rows],
" -> ", font_italic(x$new[rows], collapse = NULL), x$ref_new[rows],
collapse = "\n"
2022-10-03 14:34:45 +02:00
),
ifelse(NROW(x) > n, paste0("\n\nOnly the first ", n, " (out of ", NROW(x), ") are shown. Run `print(mo_renamed(), n = ...)` to view more entries (might be slow), or save `mo_renamed()` to an object."), "")
)
}
2022-09-16 23:15:23 +02:00
#' @rdname as.mo
#' @export
mo_reset_session <- function() {
if (NROW(pkg_env$mo_previously_coerced) > 0) {
message_("Reset ", NROW(pkg_env$mo_previously_coerced), " previously matched input values.")
pkg_env$mo_previously_coerced <- pkg_env$mo_previously_coerced[0, , drop = FALSE]
2022-09-27 12:16:39 +02:00
pkg_env$mo_uncertainties <- pkg_env$mo_uncertainties[0, , drop = FALSE]
2022-09-16 23:15:23 +02:00
} else {
message_("No previously matched input values to reset.")
}
}
2022-10-04 11:20:58 +02:00
#' @rdname as.mo
#' @export
mo_cleaning_regex <- function() {
paste0(
"(",
"[^A-Za-z- \\(\\)\\[\\]{}]+",
"|",
"([({]|\\[).+([})]|\\])",
"|",
"(^| )(e?spp|e?ssp|e?ss|e?sp|e?subsp|sube?species|biovar|biotype|serovar|e?species)( |$))")
}
2019-02-27 11:36:12 +01:00
nr2char <- function(x) {
if (x %in% c(1:10)) {
2022-08-28 10:31:50 +02:00
v <- c(
"one" = 1, "two" = 2, "three" = 3, "four" = 4, "five" = 5,
"six" = 6, "seven" = 7, "eight" = 8, "nine" = 9, "ten" = 10
)
2019-02-27 11:36:12 +01:00
names(v[x])
} else {
x
}
2018-12-06 14:36:39 +01:00
}
2019-03-15 13:57:25 +01:00
2022-09-23 14:56:00 +02:00
get_mo_uncertainties <- function() {
2022-09-23 12:55:52 +02:00
remember <- list(uncertainties = pkg_env$mo_uncertainties)
# empty them, otherwise e.g. mo_shortname("Chlamydophila psittaci") will give 3 notes
2020-12-27 00:07:00 +01:00
pkg_env$mo_uncertainties <- NULL
2020-05-27 16:37:49 +02:00
remember
}
2022-09-23 14:56:00 +02:00
load_mo_uncertainties <- function(metadata) {
2020-12-27 00:07:00 +01:00
pkg_env$mo_uncertainties <- metadata$uncertainties
}
2019-11-15 15:25:03 +01:00
2020-04-14 15:10:09 +02:00
parse_and_convert <- function(x) {
2022-09-23 15:47:31 +02:00
if (tryCatch(is.character(x) && all(Encoding(x) == "unknown", na.rm = TRUE), error = function(e) FALSE)) {
2022-09-27 12:16:39 +02:00
return(trimws2(x))
2022-09-23 12:55:52 +02:00
}
2022-08-28 10:31:50 +02:00
tryCatch(
{
if (!is.null(dim(x))) {
if (NCOL(x) > 2) {
stop("a maximum of two columns is allowed", call. = FALSE)
} else if (NCOL(x) == 2) {
# support Tidyverse selection like: df %>% select(colA, colB)
# paste these columns together
x <- as.data.frame(x, stringsAsFactors = FALSE)
colnames(x) <- c("A", "B")
x <- paste(x$A, x$B)
} else {
# support Tidyverse selection like: df %>% select(colA)
x <- as.data.frame(x, stringsAsFactors = FALSE)[[1]]
}
2020-04-14 15:10:09 +02:00
}
2022-08-28 10:31:50 +02:00
parsed <- iconv(as.character(x), to = "UTF-8")
parsed[is.na(parsed) & !is.na(x)] <- iconv(x[is.na(parsed) & !is.na(x)], from = "Latin1", to = "ASCII//TRANSLIT")
parsed <- gsub('"', "", parsed, fixed = TRUE)
parsed <- gsub(" +", " ", parsed, perl = TRUE)
parsed
},
error = function(e) stop(e$message, call. = FALSE)
) # this will also be thrown when running `as.mo(no_existing_object)`
2022-09-27 12:16:39 +02:00
trimws2(parsed)
2020-04-13 21:09:56 +02:00
}
2020-05-16 13:05:47 +02:00
2020-07-22 12:29:51 +02:00
replace_old_mo_codes <- function(x, property) {
# this function transform old MO codes to current codes, such as:
# B_ESCH_COL (AMR v0.5.0) -> B_ESCHR_COLI
2022-09-19 11:57:21 +02:00
ind <- x %like_case% "^[A-Z]_[A-Z_]+$" & !x %in% AMR::microorganisms$mo
2022-09-23 12:55:52 +02:00
if (any(ind, na.rm = TRUE)) {
# get the ones that match
2021-05-30 22:14:38 +02:00
affected <- x[ind]
affected_unique <- unique(affected)
all_direct_matches <- TRUE
# find their new codes, once per code
2022-08-28 10:31:50 +02:00
solved_unique <- unlist(lapply(
strsplit(affected_unique, ""),
function(m) {
kingdom <- paste0("^", m[1])
name <- m[3:length(m)]
name[name == "_"] <- " "
name <- tolower(paste0(name, ".*", collapse = ""))
name <- gsub(" .*", " ", name, fixed = TRUE)
name <- paste0("^", name)
results <- MO_lookup$mo[MO_lookup$kingdom %like_case% kingdom &
MO_lookup$fullname_lower %like_case% name]
if (length(results) > 1) {
all_direct_matches <<- FALSE
}
results[1L]
}
), use.names = FALSE)
2021-05-30 22:14:38 +02:00
solved <- solved_unique[match(affected, affected_unique)]
# assign on places where a match was found
2021-05-30 22:14:38 +02:00
x[ind] <- solved
n_matched <- length(affected[!is.na(affected)])
n_solved <- length(affected[!is.na(solved)])
n_unsolved <- length(affected[is.na(solved)])
2021-05-30 22:14:38 +02:00
n_unique <- length(affected_unique[!is.na(affected_unique)])
2021-06-01 15:33:06 +02:00
if (n_unique < n_matched) {
n_unique <- paste0(n_unique, " unique, ")
} else {
n_unique <- ""
}
2020-07-22 12:29:51 +02:00
if (property != "mo") {
2022-08-28 10:31:50 +02:00
warning_(
"in `mo_", property, "()`: the input contained ", n_matched,
" old MO code", ifelse(n_matched == 1, "", "s"),
" (", n_unique, "from a previous AMR package version). ",
"Please update your MO codes with `as.mo()` to increase speed."
)
2020-10-26 12:23:03 +01:00
} else {
2022-08-28 10:31:50 +02:00
warning_(
"in `as.mo()`: the input contained ", n_matched,
" old MO code", ifelse(n_matched == 1, "", "s"),
" (", n_unique, "from a previous AMR package version). ",
n_solved, " old MO code", ifelse(n_solved == 1, "", "s"),
ifelse(n_solved == 1, " was", " were"),
ifelse(all_direct_matches, " updated ", font_bold(" guessed ")),
"to ", ifelse(n_solved == 1, "a ", ""),
"currently used MO code", ifelse(n_solved == 1, "", "s"),
ifelse(n_unsolved > 0,
paste0(" and ", n_unsolved, " old MO code", ifelse(n_unsolved == 1, "", "s"), " could not be updated."),
"."
)
)
2020-07-22 12:29:51 +02:00
}
}
x
}
replace_ignore_pattern <- function(x, ignore_pattern) {
if (!is.null(ignore_pattern) && !identical(trimws2(ignore_pattern), "")) {
ignore_cases <- x %like% ignore_pattern
if (sum(ignore_cases) > 0) {
2022-08-28 10:31:50 +02:00
message_(
"The following input was ignored by `ignore_pattern = \"", ignore_pattern, "\"`: ",
vector_and(x[ignore_cases], quotes = TRUE)
)
x[ignore_cases] <- NA_character_
}
}
x
}
2020-11-05 01:11:49 +01:00
repair_reference_df <- function(reference_df) {
2022-08-28 19:17:12 +02:00
if (is.null(reference_df)) {
return(NULL)
}
2020-11-05 01:11:49 +01:00
# has valid own reference_df
reference_df <- reference_df %pm>%
pm_filter(!is.na(mo))
2022-08-28 10:31:50 +02:00
2020-11-05 01:11:49 +01:00
# keep only first two columns, second must be mo
if (colnames(reference_df)[1] == "mo") {
reference_df <- reference_df %pm>% pm_select(2, "mo")
} else {
reference_df <- reference_df %pm>% pm_select(1, "mo")
}
2022-08-28 10:31:50 +02:00
2020-11-05 01:11:49 +01:00
# remove factors, just keep characters
colnames(reference_df)[1] <- "x"
2020-11-10 16:35:56 +01:00
reference_df[, "x"] <- as.character(reference_df[, "x", drop = TRUE])
reference_df[, "mo"] <- as.character(reference_df[, "mo", drop = TRUE])
2022-08-28 10:31:50 +02:00
# some MO codes might be old
2020-11-10 16:35:56 +01:00
reference_df[, "mo"] <- as.mo(reference_df[, "mo", drop = TRUE])
2020-11-05 01:11:49 +01:00
reference_df
}
2022-09-23 12:55:52 +02:00
convert_colloquial_input <- function(x) {
2022-09-25 14:26:08 +02:00
x.bak <- trimws2(x)
x <- trimws2(tolower(x))
2022-09-23 12:55:52 +02:00
out <- rep(NA_character_, length(x))
# Streptococci, like GBS = Group B Streptococci (B_STRPT_GRPB)
out[x %like_case% "^g[abcdfghkl]s$"] <- gsub("g([abcdfghkl])s",
"B_STRPT_GRP\\U\\1",
x[x %like_case% "^g[abcdfghkl]s$"],
perl = TRUE)
# Streptococci in different languages, like "estreptococos grupo B"
out[x %like_case% "strepto[ck]o[ck].* [abcdfghkl]$"] <- gsub(".*e?strepto[ck]o[ck].* ([abcdfghkl])$",
"B_STRPT_GRP\\U\\1",
x[x %like_case% "strepto[ck]o[ck].* [abcdfghkl]$"],
perl = TRUE)
out[x %like_case% "group [abcdfghkl] strepto[ck]o[ck]"] <- gsub(".*group ([abcdfghkl]) strepto[ck]o[ck].*",
"B_STRPT_GRP\\U\\1",
x[x %like_case% "group [abcdfghkl] strepto[ck]o[ck]"],
perl = TRUE)
out[x %like_case% "ha?emoly.*strep"] <- "B_STRPT_HAEM"
out[x %like_case% "(strepto.* mil+er+i|^mgs[^a-z]*$)"] <- "B_STRPT_MILL"
out[x %like_case% "((strepto|^s).* viridans|^vgs[^a-z]*$)"] <- "B_STRPT_VIRI"
# CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
out[x %like_case% "([ck]oagulas[ea].negatie?[vf]|^[ck]o?ns[^a-z]*$)"] <- "B_STPHY_CONS"
out[x %like_case% "([ck]oagulas[ea].positie?[vf]|^[ck]o?ps[^a-z]*$)"] <- "B_STPHY_COPS"
# Gram stains
out[x %like_case% "gram[ -]?neg.*|negatie?[vf]"] <- "B_GRAMN"
out[x %like_case% "gram[ -]?pos.*|positie?[vf]"] <- "B_GRAMP"
2022-09-25 14:26:08 +02:00
# yeasts and fungi
out[x %like_case% "^yeast?"] <- "F_YEAST"
out[x %like_case% "^fung(us|i)"] <- "F_FUNGUS"
# Salmonella city names, starting with capital species name - they are all S. enterica
2022-09-23 12:55:52 +02:00
out[x.bak %like_case% "[sS]almonella [A-Z][a-z]+ ?.*" & x %unlike% "typhi"] <- "B_SLMNL_ENTR"
# trivial names known to the field
out[x %like_case% "meningo[ck]o[ck]"] <- "B_NESSR_MNNG"
out[x %like_case% "gono[ck]o[ck]"] <- "B_NESSR_GNRR"
out[x %like_case% "pneumo[ck]o[ck]"] <- "B_STRPT_PNMN"
2022-09-25 14:26:08 +02:00
# unexisting names (xxx and con are WHONET codes)
2022-09-27 12:16:39 +02:00
out[x %in% c("con", "other", "none", "unknown") | x %like_case% "virus"] <- "UNKNOWN"
2022-09-25 14:26:08 +02:00
2022-09-23 12:55:52 +02:00
out
}