2018-06-08 12:06:54 +02:00
# ==================================================================== #
2023-07-08 17:30:05 +02:00
# TITLE: #
2022-10-05 09:12:22 +02:00
# AMR: An R Package for Working with Antimicrobial Resistance Data #
2018-06-08 12:06:54 +02:00
# #
2023-07-08 17:30:05 +02:00
# SOURCE CODE: #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2018-06-08 12:06:54 +02:00
# #
2023-07-08 17:30:05 +02:00
# PLEASE CITE THIS SOFTWARE AS: #
2024-07-16 14:51:57 +02:00
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
2023-05-27 10:39:22 +02:00
# https://doi.org/10.18637/jss.v104.i03 #
2022-10-05 09:12:22 +02:00
# #
2022-12-27 15:16:15 +01:00
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
2018-06-08 12:06:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2018-06-08 12:06:54 +02:00
# ==================================================================== #
2023-02-22 14:38:57 +01:00
#' Transform Arbitrary Input to Valid Microbial Taxonomy
2018-06-08 12:06:54 +02:00
#'
2023-02-22 14:38:57 +01:00
#' Use this function to get a valid microorganism code ([`mo`]) based on arbitrary user input. Determination is done using intelligent rules and the complete taxonomic tree of the kingdoms `r vector_and(unique(microorganisms$kingdom[which(!grepl("(unknown|Fungi)", microorganisms$kingdom))]), quotes = FALSE)`, and most microbial species from the kingdom Fungi (see *Source*). The input can be almost anything: a full name (like `"Staphylococcus aureus"`), an abbreviated name (such as `"S. aureus"`), an abbreviation known in the field (such as `"MRSA"`), or just a genus. See *Examples*.
2021-05-12 18:15:03 +02:00
#' @param x a [character] vector or a [data.frame] with one or two columns
2023-02-22 14:38:57 +01:00
#' @param Becker a [logical] to indicate whether staphylococci should be categorised into coagulase-negative staphylococci ("CoNS") and coagulase-positive staphylococci ("CoPS") instead of their own species, according to Karsten Becker *et al.* (see *Source*). Please see *Details* for a full list of staphylococcal species that will be converted.
2018-09-04 11:33:30 +02:00
#'
2019-12-20 15:05:58 +01:00
#' This excludes *Staphylococcus aureus* at default, use `Becker = "all"` to also categorise *S. aureus* as "CoPS".
2023-02-22 14:38:57 +01:00
#' @param Lancefield a [logical] to indicate whether a beta-haemolytic *Streptococcus* should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield (see *Source*). These streptococci will be categorised in their first group, e.g. *Streptococcus dysgalactiae* will be group C, although officially it was also categorised into groups G and L. . Please see *Details* for a full list of streptococcal species that will be converted.
2018-09-04 11:33:30 +02:00
#'
2022-02-26 21:58:23 +01:00
#' This excludes enterococci at default (who are in group D), use `Lancefield = "all"` to also categorise all enterococci as group D.
2022-10-05 09:12:22 +02:00
#' @param minimum_matching_score a numeric value to set as the lower limit for the [MO matching score][mo_matching_score()]. When left blank, this will be determined automatically based on the character length of `x`, its [taxonomic kingdom][microorganisms] and [human pathogenicity][mo_matching_score()].
2023-02-22 14:38:57 +01:00
#' @param keep_synonyms a [logical] to indicate if old, previously valid taxonomic names must be preserved and not be corrected to currently accepted names. The default is `FALSE`, which will return a note if old taxonomic names were processed. The default can be set with the [package option][AMR-options] [`AMR_keep_synonyms`][AMR-options], i.e. `options(AMR_keep_synonyms = TRUE)` or `options(AMR_keep_synonyms = FALSE)`.
2020-09-18 16:05:53 +02:00
#' @param reference_df a [data.frame] to be used for extra reference when translating `x` to a valid [`mo`]. See [set_mo_source()] and [get_mo_source()] to automate the usage of your own codes (e.g. used in your analysis or organisation).
2023-02-22 14:38:57 +01:00
#' @param ignore_pattern a Perl-compatible [regular expression][base::regex] (case-insensitive) of which all matches in `x` must return `NA`. This can be convenient to exclude known non-relevant input and can also be set with the [package option][AMR-options] [`AMR_ignore_pattern`][AMR-options], e.g. `options(AMR_ignore_pattern = "(not reported|contaminated flora)")`.
#' @param cleaning_regex a Perl-compatible [regular expression][base::regex] (case-insensitive) to clean the input of `x`. Every matched part in `x` will be removed. At default, this is the outcome of [mo_cleaning_regex()], which removes texts between brackets and texts such as "species" and "serovar". The default can be set with the [package option][AMR-options] [`AMR_cleaning_regex`][AMR-options].
2024-09-19 11:44:56 +02:00
#' @param only_fungi a [logical] to indicate if only fungi must be found, making sure that e.g. misspellings always return records from the kingdom of Fungi. This can be set globally for [all microorganism functions][mo_property()] with the [package option][AMR-options] [`AMR_only_fungi`][AMR-options], i.e. `options(AMR_only_fungi = TRUE)`.
2021-12-12 09:42:03 +01:00
#' @param language language to translate text like "no growth", which defaults to the system language (see [get_AMR_locale()])
2024-09-19 11:44:56 +02:00
#' @param info a [logical] to indicate that info must be printed, e.g. a progress bar when more than 25 items are to be coerced, or a list with old taxonomic names. The default is `TRUE` only in interactive mode.
2020-12-22 00:51:17 +01:00
#' @param ... other arguments passed on to functions
2018-08-31 13:36:19 +02:00
#' @rdname as.mo
#' @aliases mo
2018-09-24 23:33:29 +02:00
#' @details
2024-09-19 11:44:56 +02:00
#' A microorganism (MO) code from this package (class: [`mo`]) is human-readable and typically looks like these examples:
#'
2019-11-28 22:32:17 +01:00
#' ```
2019-09-18 15:46:09 +02:00
#' Code Full name
#' --------------- --------------------------------------
#' B_KLBSL Klebsiella
#' B_KLBSL_PNMN Klebsiella pneumoniae
#' B_KLBSL_PNMN_RHNS Klebsiella pneumoniae rhinoscleromatis
#' | | | |
#' | | | |
2022-10-05 09:12:22 +02:00
#' | | | \---> subspecies, a 3-5 letter acronym
#' | | \----> species, a 3-6 letter acronym
#' | \----> genus, a 4-8 letter acronym
2024-09-19 11:44:56 +02:00
#' \----> kingdom: A (Archaea), AN (Animalia), B (Bacteria),
#' C (Chromista), F (Fungi), PL (Plantae),
#' P (Protozoa)
2019-11-28 22:32:17 +01:00
#' ```
2018-08-01 08:03:31 +02:00
#'
2024-09-19 11:44:56 +02:00
#' Values that cannot be coerced will be considered 'unknown' and will return the MO code `UNKNOWN` with a warning.
2019-03-02 22:47:04 +01:00
#'
2021-01-18 16:57:56 +01:00
#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see *Examples*.
2019-03-15 13:57:25 +01:00
#'
2024-09-19 11:44:56 +02:00
#' The [as.mo()] function uses a novel and scientifically validated (\doi{10.18637/jss.v104.i03}) matching score algorithm (see *Matching Score for Microorganisms* below) to match input against the [available microbial taxonomy][microorganisms] in this package. This implicates that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first.
2020-10-26 12:23:03 +01:00
#'
2022-10-05 09:12:22 +02:00
#' ### Coping with Uncertain Results
2020-10-26 12:23:03 +01:00
#'
2024-09-19 11:44:56 +02:00
#' Results of non-exact taxonomic input are based on their [matching score][mo_matching_score()]. The lowest allowed score can be set with the `minimum_matching_score` argument. At default this will be determined based on the character length of the input, the [taxonomic kingdom][microorganisms], and the [human pathogenicity][mo_matching_score()] of the taxonomic outcome. If values are matched with uncertainty, a message will be shown to suggest the user to inspect the results with [mo_uncertainties()], which returns a [data.frame] with all specifications.
2020-10-26 12:23:03 +01:00
#'
2024-09-19 11:44:56 +02:00
#' To increase the quality of matching, the `cleaning_regex` argument is used to clean the input. This must be a [regular expression][base::regex] that matches parts of the input that should be removed before the input is matched against the [available microbial taxonomy][microorganisms]. It will be matched Perl-compatible and case-insensitive. The default value of `cleaning_regex` is the outcome of the helper function [mo_cleaning_regex()].
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' There are three helper functions that can be run after using the [as.mo()] function:
2021-01-18 16:57:56 +01:00
#' - Use [mo_uncertainties()] to get a [data.frame] that prints in a pretty format with all taxonomic names that were guessed. The output contains the matching score for all matches (see *Matching Score for Microorganisms* below).
2020-09-18 16:05:53 +02:00
#' - Use [mo_failures()] to get a [character] [vector] with all values that could not be coerced to a valid value.
#' - Use [mo_renamed()] to get a [data.frame] with all values that could be coerced based on old, previously accepted taxonomic names.
2019-02-08 16:06:54 +01:00
#'
2024-09-19 11:44:56 +02:00
#' ### For Mycologists
#'
#' The [matching score algorithm][mo_matching_score()] gives precedence to bacteria over fungi. If you are only analysing fungi, be sure to use `only_fungi = TRUE`, or better yet, add this to your code and run it once every session:
#'
#' ```r
#' options(AMR_only_fungi = TRUE)
#' ```
#'
#' This will make sure that no bacteria or other 'non-fungi' will be returned by [as.mo()], or any of the [`mo_*`][mo_property()] functions.
2020-10-26 12:23:03 +01:00
#'
2024-09-19 11:44:56 +02:00
#' ### Coagulase-negative and Coagulase-positive Staphylococci
#'
#' With `Becker = TRUE`, the following staphylococci will be converted to their corresponding coagulase group:
#'
#' * Coagulase-negative: `r vector_and(gsub("Staphylococcus", "S.", mo_name(MO_CONS[MO_CONS != "B_STPHY_CONS"], keep_synonyms = TRUE)), quotes = "*")`
#' * Coagulase-positive: `r vector_and(gsub("Staphylococcus", "S.", mo_name(MO_COPS[MO_COPS != "B_STPHY_COPS"], keep_synonyms = TRUE)), quotes = "*")`
#'
#' This is based on:
#'
#' * Becker K *et al.* (2014). **Coagulase-Negative Staphylococci.** *Clin Microbiol Rev.* 27(4): 870-926; \doi{10.1128/CMR.00109-13}
#' * Becker K *et al.* (2019). **Implications of identifying the recently defined members of the *S. aureus* complex, *S. argenteus* and *S. schweitzeri*: A position paper of members of the ESCMID Study Group for staphylococci and Staphylococcal Diseases (ESGS).** *Clin Microbiol Infect*; \doi{10.1016/j.cmi.2019.02.028}
#' * Becker K *et al.* (2020). **Emergence of coagulase-negative staphylococci.** *Expert Rev Anti Infect Ther.* 18(4):349-366; \doi{10.1080/14787210.2020.1730813}
#'
#' For newly named staphylococcal species, such as *S. brunensis* (2024) and *S. shinii* (2023), we looked up the scientific reference to make sure the species are considered for the correct coagulase group.
#'
#' ### Lancefield Groups in Streptococci
#'
#' With `Lancefield = TRUE`, the following streptococci will be converted to their corresponding Lancefield group:
#'
#' * `r paste(apply(aggregate(mo_name ~ mo_group_name, data = microorganisms.groups[microorganisms.groups$mo_group_name %like_case% "Streptococcus Group [A-Z]$", ], FUN = function(x) vector_and(gsub("Streptococcus", "S.", x, fixed = TRUE), quotes = "*", sort = TRUE)), 1, function(row) paste(row["mo_group_name"], ": ", row["mo_name"], sep = "")), collapse = "\n* ")`
#'
#' This is based on:
#'
#' * Lancefield RC (1933). **A serological differentiation of human and other groups of hemolytic streptococci.** *J Exp Med.* 57(4): 571-95; \doi{10.1084/jem.57.4.571}
#'
2021-01-18 16:57:56 +01:00
#' @inheritSection mo_matching_score Matching Score for Microorganisms
2022-10-05 09:12:22 +02:00
#'
# (source as a section here, so it can be inherited by other man pages)
2018-09-24 23:33:29 +02:00
#' @section Source:
2024-09-19 11:44:56 +02:00
#' * Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}
#' * `r TAXONOMY_VERSION$LPSN$citation` Accessed from <`r TAXONOMY_VERSION$LPSN$url`> on `r documentation_date(TAXONOMY_VERSION$LPSN$accessed_date)`.
#' * `r TAXONOMY_VERSION$MycoBank$citation` Accessed from <`r TAXONOMY_VERSION$MycoBank$url`> on `r documentation_date(TAXONOMY_VERSION$MycoBank$accessed_date)`.
#' * `r TAXONOMY_VERSION$GBIF$citation` Accessed from <`r TAXONOMY_VERSION$GBIF$url`> on `r documentation_date(TAXONOMY_VERSION$GBIF$accessed_date)`.
#' * `r TAXONOMY_VERSION$BacDive$citation` Accessed from <`r TAXONOMY_VERSION$BacDive$url`> on `r documentation_date(TAXONOMY_VERSION$BacDive$accessed_date)`.
#' * `r TAXONOMY_VERSION$SNOMED$citation` URL: <`r TAXONOMY_VERSION$SNOMED$url`>
#' * Bartlett A *et al.* (2022). **A comprehensive list of bacterial pathogens infecting humans** *Microbiology* 168:001269; \doi{10.1099/mic.0.001269}
2018-06-08 12:06:54 +02:00
#' @export
2020-09-18 16:05:53 +02:00
#' @return A [character] [vector] with additional class [`mo`]
#' @seealso [microorganisms] for the [data.frame] that is being used to determine ID's.
2020-10-26 12:23:03 +01:00
#'
2020-12-17 16:22:25 +01:00
#' The [`mo_*`][mo_property()] functions (such as [mo_genus()], [mo_gramstain()]) to get properties based on the returned code.
2021-01-18 16:57:56 +01:00
#' @inheritSection AMR Reference Data Publicly Available
2018-06-08 12:06:54 +02:00
#' @examples
2019-07-02 16:48:52 +02:00
#' \donttest{
2019-09-18 15:46:09 +02:00
#' # These examples all return "B_STPHY_AURS", the ID of S. aureus:
2022-10-05 09:12:22 +02:00
#' as.mo(c(
#' "sau", # WHONET code
#' "stau",
#' "STAU",
#' "staaur",
#' "S. aureus",
#' "S aureus",
2022-10-06 11:33:30 +02:00
#' "Sthafilokkockus aureus", # handles incorrect spelling
#' "Staphylococcus aureus (MRSA)",
2022-10-05 09:12:22 +02:00
#' "MRSA", # Methicillin Resistant S. aureus
#' "VISA", # Vancomycin Intermediate S. aureus
#' "VRSA", # Vancomycin Resistant S. aureus
#' 115329001 # SNOMED CT code
#' ))
2020-10-26 12:23:03 +01:00
#'
2019-03-18 14:29:41 +01:00
#' # Dyslexia is no problem - these all work:
2022-10-05 09:12:22 +02:00
#' as.mo(c(
#' "Ureaplasma urealyticum",
#' "Ureaplasma urealyticus",
#' "Ureaplasmium urealytica",
#' "Ureaplazma urealitycium"
#' ))
2023-07-10 13:41:52 +02:00
#'
#' # input will get cleaned up with the input given in the `cleaning_regex` argument,
#' # which defaults to `mo_cleaning_regex()`:
#' cat(mo_cleaning_regex(), "\n")
2019-03-18 14:29:41 +01:00
#'
2018-09-05 10:51:46 +02:00
#' as.mo("Streptococcus group A")
#'
2022-08-28 10:31:50 +02:00
#' as.mo("S. epidermidis") # will remain species: B_STPHY_EPDR
#' as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CONS
2018-08-02 13:15:45 +02:00
#'
2022-08-28 10:31:50 +02:00
#' as.mo("S. pyogenes") # will remain species: B_STRPT_PYGN
2019-09-18 15:46:09 +02:00
#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRPA
2018-08-02 13:15:45 +02:00
#'
2019-03-18 14:29:41 +01:00
#' # All mo_* functions use as.mo() internally too (see ?mo_property):
2022-10-05 09:12:22 +02:00
#' mo_genus("E. coli")
#' mo_gramstain("ESCO")
#' mo_is_intrinsic_resistant("ESCCOL", ab = "vanco")
2018-06-08 12:06:54 +02:00
#' }
2020-10-26 12:23:03 +01:00
as.mo <- function ( x ,
Becker = FALSE ,
Lancefield = FALSE ,
2022-10-05 09:12:22 +02:00
minimum_matching_score = NULL ,
keep_synonyms = getOption ( " AMR_keep_synonyms" , FALSE ) ,
2020-09-03 12:31:48 +02:00
reference_df = get_mo_source ( ) ,
2022-10-05 09:12:22 +02:00
ignore_pattern = getOption ( " AMR_ignore_pattern" , NULL ) ,
2023-02-22 14:38:57 +01:00
cleaning_regex = getOption ( " AMR_cleaning_regex" , mo_cleaning_regex ( ) ) ,
2024-09-19 11:44:56 +02:00
only_fungi = getOption ( " AMR_only_fungi" , FALSE ) ,
2021-12-12 09:42:03 +01:00
language = get_AMR_locale ( ) ,
2021-04-20 10:46:17 +02:00
info = interactive ( ) ,
2019-11-23 12:39:57 +01:00
... ) {
2020-10-20 21:00:57 +02:00
meet_criteria ( x , allow_class = c ( " mo" , " data.frame" , " list" , " character" , " numeric" , " integer" , " factor" ) , allow_NA = TRUE )
2020-10-19 17:09:19 +02:00
meet_criteria ( Becker , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( Lancefield , allow_class = c ( " logical" , " character" ) , has_length = 1 )
2023-02-10 16:18:00 +01:00
meet_criteria ( minimum_matching_score , allow_class = c ( " numeric" , " integer" ) , has_length = 1 , allow_NULL = TRUE , is_positive_or_zero = TRUE , is_finite = TRUE )
2023-02-22 14:38:57 +01:00
meet_criteria ( keep_synonyms , allow_class = " logical" , has_length = 1 )
2020-10-19 17:09:19 +02:00
meet_criteria ( reference_df , allow_class = " data.frame" , allow_NULL = TRUE )
meet_criteria ( ignore_pattern , allow_class = " character" , has_length = 1 , allow_NULL = TRUE )
2023-02-22 14:38:57 +01:00
meet_criteria ( cleaning_regex , allow_class = " character" , has_length = 1 , allow_NULL = TRUE )
2024-09-19 11:44:56 +02:00
meet_criteria ( only_fungi , allow_class = " logical" , has_length = 1 )
2022-10-05 09:12:22 +02:00
language <- validate_language ( language )
2021-04-20 10:46:17 +02:00
meet_criteria ( info , allow_class = " logical" , has_length = 1 )
2023-07-08 17:30:05 +02:00
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2023-06-22 15:10:59 +02:00
if ( tryCatch ( all ( x %in% c ( AMR_env $ MO_lookup $ mo , NA ) ) , error = function ( e ) FALSE ) &&
isFALSE ( Becker ) &&
isFALSE ( Lancefield ) &&
isTRUE ( keep_synonyms ) ) {
2020-09-12 08:49:01 +02:00
# don't look into valid MO codes, just return them
2020-11-10 16:35:56 +01:00
# is.mo() won't work - MO codes might change between package versions
2020-11-16 16:57:55 +01:00
return ( set_clean_class ( x , new_class = c ( " mo" , " character" ) ) )
2020-09-12 08:49:01 +02:00
}
2023-07-08 17:30:05 +02:00
2020-04-13 21:09:56 +02:00
# start off with replaced language-specific non-ASCII characters with ASCII characters
2020-04-14 15:10:09 +02:00
x <- parse_and_convert ( x )
2020-07-22 10:24:23 +02:00
# replace mo codes used in older package versions
2020-07-22 12:29:51 +02:00
x <- replace_old_mo_codes ( x , property = " mo" )
2020-09-03 12:31:48 +02:00
# ignore cases that match the ignore pattern
x <- replace_ignore_pattern ( x , ignore_pattern )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
x_lower <- tolower ( x )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# WHONET: xxx = no growth
x [x_lower %in% c ( " " , " xxx" , " na" , " nan" ) ] <- NA_character_
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
out <- rep ( NA_character_ , length ( x ) )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# below we use base R's match(), known for powering '%in%', and incredibly fast!
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# From reference_df ----
reference_df <- repair_reference_df ( reference_df )
if ( ! is.null ( reference_df ) ) {
out [x %in% reference_df [ [1 ] ] ] <- reference_df [ [2 ] ] [match ( x [x %in% reference_df [ [1 ] ] ] , reference_df [ [1 ] ] ) ]
2020-09-12 13:54:21 +02:00
}
2022-10-05 09:12:22 +02:00
# From MO code ----
2022-12-27 15:16:15 +01:00
out [is.na ( out ) & toupper ( x ) %in% AMR_env $ MO_lookup $ mo ] <- toupper ( x [is.na ( out ) & toupper ( x ) %in% AMR_env $ MO_lookup $ mo ] )
2022-10-05 09:12:22 +02:00
# From full name ----
2022-10-14 13:02:50 +02:00
out [is.na ( out ) & x_lower %in% AMR_env $ MO_lookup $ fullname_lower ] <- AMR_env $ MO_lookup $ mo [match ( x_lower [is.na ( out ) & x_lower %in% AMR_env $ MO_lookup $ fullname_lower ] , AMR_env $ MO_lookup $ fullname_lower ) ]
2022-10-05 09:12:22 +02:00
# one exception: "Fungi" matches the kingdom, but instead it should return the 'unknown' code for fungi
out [out == " F_[KNG]_FUNGI" ] <- " F_FUNGUS"
# From known codes ----
out [is.na ( out ) & toupper ( x ) %in% AMR :: microorganisms.codes $ code ] <- AMR :: microorganisms.codes $ mo [match ( toupper ( x ) [is.na ( out ) & toupper ( x ) %in% AMR :: microorganisms.codes $ code ] , AMR :: microorganisms.codes $ code ) ]
# From SNOMED ----
2023-05-08 13:04:18 +02:00
# based on this extremely fast gem: https://stackoverflow.com/a/11002456/4575331
snomeds <- unlist ( AMR_env $ MO_lookup $ snomed )
snomeds <- snomeds [ ! is.na ( snomeds ) ]
out [is.na ( out ) & x %in% snomeds ] <- AMR_env $ MO_lookup $ mo [rep ( seq_along ( AMR_env $ MO_lookup $ snomed ) , vapply ( FUN.VALUE = double ( 1 ) , AMR_env $ MO_lookup $ snomed , length ) ) [match ( x [is.na ( out ) & x %in% snomeds ] , snomeds ) ] ]
2022-10-05 09:12:22 +02:00
# From other familiar output ----
# such as Salmonella groups, colloquial names, etc.
out [is.na ( out ) ] <- convert_colloquial_input ( x [is.na ( out ) ] )
# From previous hits in this session ----
old <- out
2024-09-19 11:44:56 +02:00
out [is.na ( out ) & paste ( x , minimum_matching_score , only_fungi ) %in% AMR_env $ mo_previously_coerced $ x ] <- AMR_env $ mo_previously_coerced $ mo [match ( paste ( x , minimum_matching_score , only_fungi ) [is.na ( out ) & paste ( x , minimum_matching_score , only_fungi ) %in% AMR_env $ mo_previously_coerced $ x ] , AMR_env $ mo_previously_coerced $ x ) ]
2022-10-05 09:12:22 +02:00
new <- out
if ( isTRUE ( info ) && message_not_thrown_before ( " as.mo" , old , new , entire_session = TRUE ) && any ( is.na ( old ) & ! is.na ( new ) , na.rm = TRUE ) ) {
message_ (
" Returning previously coerced value" , ifelse ( sum ( is.na ( old ) & ! is.na ( new ) ) > 1 , " s" , " " ) ,
2023-01-14 17:10:10 +01:00
" for " , vector_and ( x [is.na ( old ) & ! is.na ( new ) ] ) , " . Run `mo_reset_session()` to reset this. This note will be shown once per session for this input."
2022-10-05 09:12:22 +02:00
)
2018-10-01 11:39:43 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# For all other input ----
if ( any ( is.na ( out ) & ! is.na ( x ) ) ) {
# reset uncertainties
AMR_env $ mo_uncertainties <- AMR_env $ mo_uncertainties [0 , ]
AMR_env $ mo_failures <- NULL
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc.
x [trimws2 ( x ) %like% translate_into_language ( " no .*growth" , language = language ) ] <- NA_character_
x [trimws2 ( x ) %like% paste0 ( " ^(" , translate_into_language ( " no|not" , language = language ) , " ) " ) ] <- NA_character_
2023-07-08 17:30:05 +02:00
2023-01-14 19:50:25 +01:00
# groups are in our taxonomic table with a capital G
2023-01-14 17:10:10 +01:00
x <- gsub ( " group( |$)" , " Group\\1" , x , perl = TRUE )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# run over all unique leftovers
x_unique <- unique ( x [is.na ( out ) & ! is.na ( x ) ] )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# set up progress bar
2024-02-24 15:16:52 +01:00
progress <- progress_ticker ( n = length ( x_unique ) , n_min = 10 , print = info , title = " Converting microorganism input" )
2022-10-05 09:12:22 +02:00
on.exit ( close ( progress ) )
2023-07-08 17:30:05 +02:00
2022-10-22 22:00:15 +02:00
msg <- character ( 0 )
2023-07-08 17:30:05 +02:00
2024-09-19 11:44:56 +02:00
MO_lookup_current <- AMR_env $ MO_lookup
if ( isTRUE ( only_fungi ) ) {
MO_lookup_current <- MO_lookup_current [MO_lookup_current $ kingdom == " Fungi" , , drop = FALSE ]
}
2022-10-05 09:12:22 +02:00
# run it
x_coerced <- vapply ( FUN.VALUE = character ( 1 ) , x_unique , function ( x_search ) {
progress $ tick ( )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# some required cleaning steps
x_out <- trimws2 ( x_search )
2023-02-22 14:38:57 +01:00
# this applies the `cleaning_regex` argument, which defaults to mo_cleaning_regex()
x_out <- gsub ( cleaning_regex , " " , x_out , ignore.case = TRUE , perl = TRUE )
2022-10-05 09:12:22 +02:00
x_out <- trimws2 ( gsub ( " +" , " " , x_out , perl = TRUE ) )
x_search_cleaned <- x_out
x_out <- tolower ( x_out )
2023-06-22 15:10:59 +02:00
# when x_search_cleaned are only capitals (such as in codes), make them lowercase to increase matching score
x_search_cleaned [x_search_cleaned == toupper ( x_search_cleaned ) ] <- x_out [x_search_cleaned == toupper ( x_search_cleaned ) ]
2023-07-08 17:30:05 +02:00
2022-10-22 22:00:15 +02:00
# first check if cleaning led to an exact result, case-insensitive
2024-09-19 11:44:56 +02:00
if ( x_out %in% MO_lookup_current $ fullname_lower ) {
return ( as.character ( MO_lookup_current $ mo [match ( x_out , MO_lookup_current $ fullname_lower ) ] ) )
2022-10-22 22:00:15 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# input must not be too short
if ( nchar ( x_out ) < 3 ) {
return ( " UNKNOWN" )
2019-09-15 22:57:30 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# take out the parts, split by space
x_parts <- strsplit ( gsub ( " -" , " " , x_out , fixed = TRUE ) , " " , fixed = TRUE ) [ [1 ] ]
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# do a pre-match on first character (and if it contains a space, first chars of first two terms)
2023-05-26 16:10:01 +02:00
if ( length ( x_parts ) %in% c ( 2 , 3 ) ) {
2022-10-05 09:12:22 +02:00
# for genus + species + subspecies
2024-09-19 11:44:56 +02:00
if ( paste ( x_parts [1 : 2 ] , collapse = " " ) %in% MO_lookup_current $ fullname_lower ) {
filtr <- which ( MO_lookup_current $ fullname_lower %like% paste ( x_parts [1 : 2 ] , collapse = " " ) )
2024-06-14 22:39:01 +02:00
} else if ( nchar ( gsub ( " [^a-z]" , " " , x_parts [1 ] , perl = TRUE ) ) <= 3 ) {
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ full_first == substr ( x_parts [1 ] , 1 , 1 ) &
( MO_lookup_current $ species_first == substr ( x_parts [2 ] , 1 , 1 ) |
MO_lookup_current $ subspecies_first == substr ( x_parts [2 ] , 1 , 1 ) |
MO_lookup_current $ subspecies_first == substr ( x_parts [3 ] , 1 , 1 ) ) )
2023-05-26 19:20:21 +02:00
} else {
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ full_first == substr ( x_parts [1 ] , 1 , 1 ) |
MO_lookup_current $ species_first == substr ( x_parts [2 ] , 1 , 1 ) |
MO_lookup_current $ subspecies_first == substr ( x_parts [2 ] , 1 , 1 ) |
MO_lookup_current $ subspecies_first == substr ( x_parts [3 ] , 1 , 1 ) )
2023-05-26 19:20:21 +02:00
}
2022-10-05 09:12:22 +02:00
} else if ( length ( x_parts ) > 3 ) {
2023-05-26 19:20:21 +02:00
first_chars <- paste0 ( " (^| )[" , paste ( substr ( x_parts , 1 , 1 ) , collapse = " " ) , " ]" )
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ full_first %like_case% first_chars )
2023-06-22 15:10:59 +02:00
} else if ( nchar ( x_out ) == 3 ) {
# no space and 3 characters - probably a code such as SAU or ECO
2023-06-22 15:24:18 +02:00
msg <<- c ( msg , paste0 ( " Input \"" , x_search , " \" was assumed to be a microorganism code - tried to match on \"" , totitle ( substr ( x_out , 1 , 1 ) ) , AMR_env $ dots , " " , substr ( x_out , 2 , 3 ) , AMR_env $ dots , " \"" ) )
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ fullname_lower %like_case% paste0 ( " (^| )" , substr ( x_out , 1 , 1 ) , " .* " , substr ( x_out , 2 , 3 ) ) )
2022-10-05 09:12:22 +02:00
} else if ( nchar ( x_out ) == 4 ) {
2023-01-14 17:10:10 +01:00
# no space and 4 characters - probably a code such as STAU or ESCO
2023-06-22 15:24:18 +02:00
msg <<- c ( msg , paste0 ( " Input \"" , x_search , " \" was assumed to be a microorganism code - tried to match on \"" , totitle ( substr ( x_out , 1 , 2 ) ) , AMR_env $ dots , " " , substr ( x_out , 3 , 4 ) , AMR_env $ dots , " \"" ) )
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ fullname_lower %like_case% paste0 ( " (^| )" , substr ( x_out , 1 , 2 ) , " .* " , substr ( x_out , 3 , 4 ) ) )
2022-10-05 09:12:22 +02:00
} else if ( nchar ( x_out ) <= 6 ) {
2023-01-14 17:10:10 +01:00
# no space and 5-6 characters - probably a code such as STAAUR or ESCCOL
2022-10-05 09:12:22 +02:00
first_part <- paste0 ( substr ( x_out , 1 , 2 ) , " [a-z]*" , substr ( x_out , 3 , 3 ) )
second_part <- substr ( x_out , 4 , nchar ( x_out ) )
2023-06-22 15:24:18 +02:00
msg <<- c ( msg , paste0 ( " Input \"" , x_search , " \" was assumed to be a microorganism code - tried to match on \"" , gsub ( " [a-z]*" , AMR_env $ dots , totitle ( first_part ) , fixed = TRUE ) , " " , second_part , AMR_env $ dots , " \"" ) )
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ fullname_lower %like_case% paste0 ( " (^| )" , first_part , " .* " , second_part ) )
2022-10-05 09:12:22 +02:00
} else {
2023-05-26 16:10:01 +02:00
# for genus or species or subspecies
2024-09-19 11:44:56 +02:00
filtr <- which ( MO_lookup_current $ full_first == substr ( x_parts , 1 , 1 ) |
MO_lookup_current $ species_first == substr ( x_parts , 1 , 1 ) |
MO_lookup_current $ subspecies_first == substr ( x_parts , 1 , 1 ) )
2020-01-27 19:14:23 +01:00
}
2022-10-05 09:12:22 +02:00
if ( length ( filtr ) == 0 ) {
2024-09-19 11:44:56 +02:00
mo_to_search <- MO_lookup_current $ fullname
2022-10-05 09:12:22 +02:00
} else {
2024-09-19 11:44:56 +02:00
mo_to_search <- MO_lookup_current $ fullname [filtr ]
2019-09-18 15:46:09 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
AMR_env $ mo_to_search <- mo_to_search
# determine the matching score on the original search value
m <- mo_matching_score ( x = x_search_cleaned , n = mo_to_search )
if ( is.null ( minimum_matching_score ) ) {
minimum_matching_score_current <- min ( 0.6 , min ( 10 , nchar ( x_search_cleaned ) ) * 0.08 )
# correct back for prevalence
2024-09-19 11:44:56 +02:00
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup_current $ prevalence [match ( mo_to_search , MO_lookup_current $ fullname ) ]
2022-10-05 09:12:22 +02:00
# correct back for kingdom
2024-09-19 11:44:56 +02:00
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup_current $ kingdom_index [match ( mo_to_search , MO_lookup_current $ fullname ) ]
2023-06-22 15:10:59 +02:00
minimum_matching_score_current <- pmax ( minimum_matching_score_current , m )
2023-06-22 15:24:18 +02:00
if ( length ( x_parts ) > 1 && all ( m <= 0.55 , na.rm = TRUE ) ) {
2023-06-22 15:10:59 +02:00
# if the highest score is 0.5, we have nothing serious - 0.5 is the lowest for pathogenic group 1
# make everything NA so the results will get removed below
2023-06-22 15:24:18 +02:00
# (we added length(x_parts) > 1 to exclude microbial codes from this rule, such as "STAU")
2023-06-22 15:10:59 +02:00
m [seq_len ( length ( m ) ) ] <- NA_real_
}
2022-10-05 09:12:22 +02:00
} else {
2023-06-22 15:10:59 +02:00
# minimum_matching_score was set, so remove everything below it
m [m < minimum_matching_score ] <- NA_real_
2022-10-05 09:12:22 +02:00
minimum_matching_score_current <- minimum_matching_score
2019-02-23 16:02:31 +01:00
}
2023-07-08 17:30:05 +02:00
2023-01-23 15:01:21 +01:00
top_hits <- mo_to_search [order ( m , decreasing = TRUE , na.last = NA ) ] # na.last = NA will remove the NAs
2022-10-05 09:12:22 +02:00
if ( length ( top_hits ) == 0 ) {
2022-10-22 22:00:15 +02:00
warning_ ( " No hits found for \"" , x_search , " \" with minimum_matching_score = " , ifelse ( is.null ( minimum_matching_score ) , paste0 ( " NULL (=" , round ( min ( minimum_matching_score_current , na.rm = TRUE ) , 3 ) , " )" ) , minimum_matching_score ) , " . Try setting this value lower or even to 0." , call = FALSE )
2022-10-05 09:12:22 +02:00
result_mo <- NA_character_
} else {
2024-09-19 11:44:56 +02:00
result_mo <- MO_lookup_current $ mo [match ( top_hits [1 ] , MO_lookup_current $ fullname ) ]
2023-03-12 13:02:37 +01:00
AMR_env $ mo_uncertainties <- rbind_AMR (
2023-02-12 17:10:48 +01:00
AMR_env $ mo_uncertainties ,
2022-10-05 09:12:22 +02:00
data.frame (
original_input = x_search ,
input = x_search_cleaned ,
fullname = top_hits [1 ] ,
mo = result_mo ,
2023-02-26 21:26:58 +01:00
candidates = ifelse ( length ( top_hits ) > 1 , paste ( top_hits [2 : min ( 99 , length ( top_hits ) ) ] , collapse = " , " ) , " " ) ,
2022-10-05 09:12:22 +02:00
minimum_matching_score = ifelse ( is.null ( minimum_matching_score ) , " NULL" , minimum_matching_score ) ,
keep_synonyms = keep_synonyms ,
stringsAsFactors = FALSE
2023-02-12 17:10:48 +01:00
)
)
2022-10-05 09:12:22 +02:00
# save to package env to save time for next time
2023-03-12 13:02:37 +01:00
AMR_env $ mo_previously_coerced <- unique ( rbind_AMR (
2023-02-12 17:10:48 +01:00
AMR_env $ mo_previously_coerced ,
2022-10-05 09:12:22 +02:00
data.frame (
2024-09-19 11:44:56 +02:00
x = paste ( x_search , minimum_matching_score , only_fungi ) ,
2022-10-05 09:12:22 +02:00
mo = result_mo ,
stringsAsFactors = FALSE
2023-02-12 17:10:48 +01:00
)
) )
2019-09-15 22:57:30 +02:00
}
2022-10-05 09:12:22 +02:00
# the actual result:
as.character ( result_mo )
} )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# remove progress bar from console
close ( progress )
# expand from unique again
out [is.na ( out ) ] <- x_coerced [match ( x [is.na ( out ) ] , x_unique ) ]
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Throw note about uncertainties ----
if ( isTRUE ( info ) && NROW ( AMR_env $ mo_uncertainties ) > 0 ) {
if ( message_not_thrown_before ( " as.mo" , " uncertainties" , AMR_env $ mo_uncertainties $ original_input ) ) {
plural <- c ( " " , " this" )
if ( length ( AMR_env $ mo_uncertainties $ original_input ) > 1 ) {
plural <- c ( " s" , " these uncertainties" )
2021-02-18 23:23:14 +01:00
}
2022-10-05 09:12:22 +02:00
if ( length ( AMR_env $ mo_uncertainties $ original_input ) <= 3 ) {
2023-01-23 15:01:21 +01:00
examples <- vector_and (
paste0 (
' "' , AMR_env $ mo_uncertainties $ original_input ,
' " (assumed ' , italicise ( AMR_env $ mo_uncertainties $ fullname ) , " )"
) ,
quotes = FALSE
2022-10-05 09:12:22 +02:00
)
} else {
examples <- paste0 ( nr2char ( length ( AMR_env $ mo_uncertainties $ original_input ) ) , " microorganism" , plural [1 ] )
2021-02-18 23:23:14 +01:00
}
2022-10-22 22:00:15 +02:00
msg <- c ( msg , paste0 (
2022-10-05 09:12:22 +02:00
" Microorganism translation was uncertain for " , examples ,
2023-01-23 15:01:21 +01:00
" . Run `mo_uncertainties()` to review " , plural [2 ] , " , or use `add_custom_microorganisms()` to add custom entries."
2022-10-22 22:00:15 +02:00
) )
2023-07-08 17:30:05 +02:00
2022-10-22 22:00:15 +02:00
for ( m in msg ) {
2022-10-30 14:31:45 +01:00
message_ ( m )
2022-10-22 22:00:15 +02:00
}
2021-02-18 23:23:14 +01:00
}
2018-09-27 23:23:48 +02:00
}
2022-10-05 09:12:22 +02:00
} # end of loop over all yet unknowns
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Keep or replace synonyms ----
2024-07-17 14:29:55 +02:00
out_current <- synonym_mo_to_accepted_mo ( out , fill_in_accepted = FALSE )
AMR_env $ mo_renamed <- list ( old = out [ ! is.na ( out_current ) ] )
2022-10-05 09:12:22 +02:00
if ( isFALSE ( keep_synonyms ) ) {
2024-07-17 14:29:55 +02:00
out [ ! is.na ( out_current ) ] <- out_current [ ! is.na ( out_current ) ]
2022-10-05 09:12:22 +02:00
if ( isTRUE ( info ) && length ( AMR_env $ mo_renamed $ old ) > 0 ) {
print ( mo_renamed ( ) , extra_txt = " (use `keep_synonyms = TRUE` to leave uncorrected)" )
}
} else if ( is.null ( getOption ( " AMR_keep_synonyms" ) ) && length ( AMR_env $ mo_renamed $ old ) > 0 && message_not_thrown_before ( " as.mo" , " keep_synonyms_warning" , entire_session = TRUE ) ) {
# keep synonyms is TRUE, so check if any do have synonyms
2022-10-22 22:00:15 +02:00
warning_ ( " Function `as.mo()` returned " , nr2char ( length ( unique ( AMR_env $ mo_renamed $ old ) ) ) , " old taxonomic name" , ifelse ( length ( unique ( AMR_env $ mo_renamed $ old ) ) > 1 , " s" , " " ) , " . Use `as.mo(..., keep_synonyms = FALSE)` to clean the input to currently accepted taxonomic names, or set the R option `AMR_keep_synonyms` to `FALSE`. This warning will be shown once per session." , call = FALSE )
2018-07-23 14:14:03 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Apply Becker ----
2024-09-19 11:44:56 +02:00
if ( ! isTRUE ( only_fungi ) && ( isTRUE ( Becker ) || Becker == " all" ) ) {
2020-10-20 21:00:57 +02:00
# warn when species found that are not in:
# - Becker et al. 2014, PMID 25278577
# - Becker et al. 2019, PMID 30872103
# - Becker et al. 2020, PMID 32056452
2023-07-08 17:30:05 +02:00
2021-10-06 13:23:57 +02:00
# comment below code if all staphylococcal species are categorised as CoNS/CoPS
2022-10-05 09:12:22 +02:00
post_Becker <- paste (
" Staphylococcus" ,
c ( " caledonicus" , " canis" , " durrellii" , " lloydii" , " ratti" , " roterodami" , " singaporensis" , " taiwanensis" )
)
2022-12-27 15:16:15 +01:00
if ( any ( out %in% AMR_env $ MO_lookup $ mo [match ( post_Becker , AMR_env $ MO_lookup $ fullname ) ] ) ) {
2021-12-11 13:41:31 +01:00
if ( message_not_thrown_before ( " as.mo" , " becker" ) ) {
2022-03-02 15:38:55 +01:00
warning_ ( " in `as.mo()`: Becker " , font_italic ( " et al." ) , " (2014, 2019, 2020) does not contain these species named after their publication: " ,
2023-07-08 17:30:05 +02:00
vector_and ( font_italic ( gsub ( " Staphylococcus" , " S." , post_Becker , fixed = TRUE ) , collapse = NULL ) , quotes = FALSE ) ,
" . Categorisation to CoNS/CoPS was taken from the original scientific publication(s)." ,
immediate = TRUE , call = FALSE
2022-08-28 10:31:50 +02:00
)
2021-10-06 13:23:57 +02:00
}
}
2023-07-08 17:30:05 +02:00
2024-04-23 10:55:48 +02:00
# 'MO_CONS' and 'MO_COPS' are 'mo' vectors created in R/_pre_commit_checks.R
2022-10-05 09:12:22 +02:00
out [out %in% MO_CONS ] <- " B_STPHY_CONS"
out [out %in% MO_COPS ] <- " B_STPHY_COPS"
2018-09-01 21:19:46 +02:00
if ( Becker == " all" ) {
2022-10-05 09:12:22 +02:00
out [out == " B_STPHY_AURS" ] <- " B_STPHY_COPS"
2018-09-01 21:19:46 +02:00
}
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Apply Lancefield ----
2024-09-19 11:44:56 +02:00
if ( ! isTRUE ( only_fungi ) && ( isTRUE ( Lancefield ) || Lancefield == " all" ) ) {
2022-12-16 16:10:43 +01:00
# (using `%like_case%` to also match subspecies)
2023-07-08 17:30:05 +02:00
2018-09-27 23:23:48 +02:00
# group A - S. pyogenes
2022-12-16 16:10:43 +01:00
out [out %like_case% " ^B_STRPT_PYGN(_|$)" ] <- " B_STRPT_GRPA"
2018-09-27 23:23:48 +02:00
# group B - S. agalactiae
2022-12-16 16:10:43 +01:00
out [out %like_case% " ^B_STRPT_AGLC(_|$)" ] <- " B_STRPT_GRPB"
2022-10-05 09:12:22 +02:00
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
out [out %like_case% " ^B_STRPT_(DYSG|EQUI)(_|$)" ] <- " B_STRPT_GRPC"
2018-09-04 11:33:30 +02:00
if ( Lancefield == " all" ) {
2022-10-05 09:12:22 +02:00
# group D - all enterococci
out [out %like_case% " ^B_ENTRC(_|$)" ] <- " B_STRPT_GRPD"
2018-09-27 23:23:48 +02:00
}
2023-07-12 16:04:48 +02:00
# group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
out [out %like_case% " ^B_STRPT_(ANGN|CNST|INTR)(_|$)" ] <- " B_STRPT_GRPF"
2022-12-16 16:10:43 +01:00
# group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
out [out %like_case% " ^B_STRPT_(DYSG|CANS)(_|$)" ] <- " B_STRPT_GRPG"
2018-09-27 23:23:48 +02:00
# group H - S. sanguinis
2022-12-16 16:10:43 +01:00
out [out %like_case% " ^B_STRPT_SNGN(_|$)" ] <- " B_STRPT_GRPH"
2023-07-12 16:04:48 +02:00
# group K - S. salivarius, incl. S. salivarius salivarius and S. salivarius thermophilus
2022-10-05 09:12:22 +02:00
out [out %like_case% " ^B_STRPT_SLVR(_|$)" ] <- " B_STRPT_GRPK"
2022-12-16 16:10:43 +01:00
# group L - only S. dysgalactiae which is also group C & G, so ignore it here
2018-09-27 23:23:48 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# All unknowns ----
out [is.na ( out ) & ! is.na ( x ) ] <- " UNKNOWN"
2023-07-08 17:30:05 +02:00
AMR_env $ mo_failures <- unique ( x [out == " UNKNOWN" & ! toupper ( x ) %in% c ( " UNKNOWN" , " CON" , " UNK" ) & ! x %like_case% " ^[(]unknown [a-z]+[)]$" & ! is.na ( x ) ] )
2022-10-05 09:12:22 +02:00
if ( length ( AMR_env $ mo_failures ) > 0 ) {
2022-10-22 22:00:15 +02:00
warning_ ( " The following input could not be coerced and was returned as \"UNKNOWN\": " , vector_and ( AMR_env $ mo_failures , quotes = TRUE ) , " .\nYou can retrieve this list with `mo_failures()`." , call = FALSE )
2018-12-14 10:52:20 +01:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Return class ----
set_clean_class ( out ,
2023-07-08 17:30:05 +02:00
new_class = c ( " mo" , " character" )
2022-10-05 09:12:22 +02:00
)
}
2022-08-28 10:31:50 +02:00
2022-10-05 09:12:22 +02:00
# OTHER DOCUMENTED FUNCTIONS ----------------------------------------------
2022-08-28 10:31:50 +02:00
2022-10-05 09:12:22 +02:00
#' @rdname as.mo
#' @export
is.mo <- function ( x ) {
inherits ( x , " mo" )
2018-06-08 12:06:54 +02:00
}
2018-07-23 14:14:03 +02:00
2022-10-05 09:12:22 +02:00
#' @rdname as.mo
#' @export
mo_uncertainties <- function ( ) {
set_clean_class ( AMR_env $ mo_uncertainties , new_class = c ( " mo_uncertainties" , " data.frame" ) )
2019-03-02 22:47:04 +01:00
}
2022-10-05 09:12:22 +02:00
#' @rdname as.mo
#' @export
mo_renamed <- function ( ) {
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2022-10-05 09:12:22 +02:00
x <- AMR_env $ mo_renamed
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
x $ new <- synonym_mo_to_accepted_mo ( x $ old )
2022-12-27 15:16:15 +01:00
mo_old <- AMR_env $ MO_lookup $ fullname [match ( x $ old , AMR_env $ MO_lookup $ mo ) ]
mo_new <- AMR_env $ MO_lookup $ fullname [match ( x $ new , AMR_env $ MO_lookup $ mo ) ]
ref_old <- AMR_env $ MO_lookup $ ref [match ( x $ old , AMR_env $ MO_lookup $ mo ) ]
ref_new <- AMR_env $ MO_lookup $ ref [match ( x $ new , AMR_env $ MO_lookup $ mo ) ]
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
df_renamed <- data.frame (
old = mo_old ,
new = mo_new ,
ref_old = ref_old ,
ref_new = ref_new ,
2022-08-28 10:31:50 +02:00
stringsAsFactors = FALSE
)
2022-10-05 09:12:22 +02:00
df_renamed <- unique ( df_renamed )
df_renamed <- df_renamed [order ( df_renamed $ old ) , , drop = FALSE ]
set_clean_class ( df_renamed , new_class = c ( " mo_renamed" , " data.frame" ) )
}
#' @rdname as.mo
#' @export
mo_failures <- function ( ) {
AMR_env $ mo_failures
2018-09-25 16:44:40 +02:00
}
2022-10-05 09:12:22 +02:00
#' @rdname as.mo
#' @export
mo_reset_session <- function ( ) {
if ( NROW ( AMR_env $ mo_previously_coerced ) > 0 ) {
message_ ( " Reset " , nr2char ( NROW ( AMR_env $ mo_previously_coerced ) ) , " previously matched input value" , ifelse ( NROW ( AMR_env $ mo_previously_coerced ) > 1 , " s" , " " ) , " ." )
AMR_env $ mo_previously_coerced <- AMR_env $ mo_previously_coerced [0 , , drop = FALSE ]
AMR_env $ mo_uncertainties <- AMR_env $ mo_uncertainties [0 , , drop = FALSE ]
2019-08-20 11:40:54 +02:00
} else {
2022-10-05 09:12:22 +02:00
message_ ( " No previously matched input values to reset." )
2019-08-20 11:40:54 +02:00
}
2022-10-05 09:12:22 +02:00
}
#' @rdname as.mo
#' @export
mo_cleaning_regex <- function ( ) {
2023-07-10 13:41:52 +02:00
parts_to_remove <- c ( " e?spp([^a-z]+|$)" , " e?ssp([^a-z]+|$)" , " e?ss([^a-z]+|$)" , " e?sp([^a-z]+|$)" , " e?subsp" , " sube?species" , " e?species" ,
" biovar[a-z]*" , " biotype" , " serovar[a-z]*" , " var([^a-z]+|$)" , " serogr.?up[a-z]*" ,
2024-09-19 11:44:56 +02:00
" titer" , " dummy" , " Ig[ADEGM]" , " ?[a-z-]+[-](resistant|susceptible) ?" )
2022-10-05 09:12:22 +02:00
paste0 (
" (" ,
" [^A-Za-z- \\(\\)\\[\\]{}]+" ,
" |" ,
" ([({]|\\[).+([})]|\\])" ,
2023-07-10 13:41:52 +02:00
" |(^| )(" ,
paste0 ( parts_to_remove [order ( 1 - nchar ( parts_to_remove ) ) ] , collapse = " |" ) ,
" ))" )
2019-08-20 11:40:54 +02:00
}
2022-10-05 09:12:22 +02:00
# UNDOCUMENTED METHODS ----------------------------------------------------
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
pillar_shaft.mo <- function ( x , ... ) {
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2023-07-12 16:27:43 +02:00
out <- trimws ( format ( x ) )
2020-08-26 11:33:54 +02:00
# grey out the kingdom (part until first "_")
2020-09-25 14:44:50 +02:00
out [ ! is.na ( x ) ] <- gsub ( " ^([A-Z]+_)(.*)" , paste0 ( font_subtle ( " \\1" ) , " \\2" ) , out [ ! is.na ( x ) ] , perl = TRUE )
2020-08-26 11:33:54 +02:00
# and grey out every _
2020-08-28 21:55:47 +02:00
out [ ! is.na ( x ) ] <- gsub ( " _" , font_subtle ( " _" ) , out [ ! is.na ( x ) ] )
2023-07-08 17:30:05 +02:00
2020-08-26 11:33:54 +02:00
# markup NA and UNKNOWN
2020-08-28 21:55:47 +02:00
out [is.na ( x ) ] <- font_na ( " NA" )
out [x == " UNKNOWN" ] <- font_na ( " UNKNOWN" )
2023-07-08 17:30:05 +02:00
2022-12-27 15:16:15 +01:00
# markup manual codes
out [x %in% AMR_env $ MO_lookup $ mo & ! x %in% AMR :: microorganisms $ mo ] <- font_blue ( out [x %in% AMR_env $ MO_lookup $ mo & ! x %in% AMR :: microorganisms $ mo ] , collapse = NULL )
2023-07-08 17:30:05 +02:00
2021-06-22 12:16:42 +02:00
df <- tryCatch ( get_current_data ( arg_name = " x" , call = 0 ) ,
2023-07-08 17:30:05 +02:00
error = function ( e ) NULL
2022-08-28 10:31:50 +02:00
)
2021-06-14 22:04:04 +02:00
if ( ! is.null ( df ) ) {
mo_cols <- vapply ( FUN.VALUE = logical ( 1 ) , df , is.mo )
} else {
mo_cols <- NULL
}
2023-07-08 17:30:05 +02:00
2022-12-27 15:16:15 +01:00
all_mos <- c ( AMR_env $ MO_lookup $ mo , NA )
2022-10-22 22:00:15 +02:00
if ( ! all ( x %in% all_mos ) ||
2023-07-08 17:30:05 +02:00
( ! is.null ( df ) && ! all ( unlist ( df [ , which ( mo_cols ) , drop = FALSE ] ) %in% all_mos ) ) ) {
2021-04-07 08:37:42 +02:00
# markup old mo codes
2023-01-23 15:01:21 +01:00
out [ ! x %in% all_mos ] <- font_italic (
2023-02-09 13:07:39 +01:00
font_na ( x [ ! x %in% all_mos ] ,
2023-07-08 17:30:05 +02:00
collapse = NULL
2023-01-23 15:01:21 +01:00
) ,
2022-08-28 10:31:50 +02:00
collapse = NULL
)
2021-06-14 22:04:04 +02:00
# throw a warning with the affected column name(s)
if ( ! is.null ( mo_cols ) ) {
col <- paste0 ( " Column " , vector_or ( colnames ( df ) [mo_cols ] , quotes = TRUE , sort = FALSE ) )
2021-04-07 08:37:42 +02:00
} else {
col <- " The data"
}
2022-08-28 10:31:50 +02:00
warning_ (
col , " contains old MO codes (from a previous AMR package version). " ,
2022-10-30 14:31:45 +01:00
" Please update your MO codes with `as.mo()`." ,
call = FALSE
2022-08-28 10:31:50 +02:00
)
2021-04-07 08:37:42 +02:00
}
2023-07-08 17:30:05 +02:00
2023-07-10 16:43:46 +02:00
# add the names to the bugs as mouse-over!
if ( tryCatch ( isTRUE ( getExportedValue ( " ansi_has_hyperlink_support" , ns = asNamespace ( " cli" ) ) ( ) ) , error = function ( e ) FALSE ) ) {
2023-07-12 16:27:43 +02:00
out [ ! x %in% c ( " UNKNOWN" , NA ) ] <- font_url ( url = paste0 ( x [ ! x %in% c ( " UNKNOWN" , NA ) ] , " : " ,
mo_name ( x [ ! x %in% c ( " UNKNOWN" , NA ) ] , keep_synonyms = TRUE ) ) ,
2023-07-10 16:43:46 +02:00
txt = out [ ! x %in% c ( " UNKNOWN" , NA ) ] )
}
2020-08-26 11:33:54 +02:00
# make it always fit exactly
2020-12-24 23:29:10 +01:00
max_char <- max ( nchar ( x ) )
if ( is.na ( max_char ) ) {
2022-10-05 09:12:22 +02:00
max_char <- 12
2020-12-24 23:29:10 +01:00
}
2020-08-28 21:55:47 +02:00
create_pillar_column ( out ,
2023-07-08 17:30:05 +02:00
align = " left" ,
width = max_char + ifelse ( any ( x %in% c ( NA , " UNKNOWN" ) ) , 2 , 0 )
2022-08-28 10:31:50 +02:00
)
2020-08-26 11:33:54 +02:00
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
type_sum.mo <- function ( x , ... ) {
" mo"
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
freq.mo <- function ( x , ... ) {
x_noNA <- as.mo ( x [ ! is.na ( x ) ] ) # as.mo() to get the newest mo codes
grams <- mo_gramstain ( x_noNA , language = NULL )
digits <- list ( ... ) $ digits
if ( is.null ( digits ) ) {
digits <- 2
}
2020-12-17 16:22:25 +01:00
cleaner :: freq.default (
x = x ,
... ,
.add_header = list (
`Gram-negative` = paste0 (
format ( sum ( grams == " Gram-negative" , na.rm = TRUE ) ,
2023-07-08 17:30:05 +02:00
big.mark = " " ,
decimal.mark = " ."
2022-08-28 10:31:50 +02:00
) ,
2020-12-17 16:22:25 +01:00
" (" , percentage ( sum ( grams == " Gram-negative" , na.rm = TRUE ) / length ( grams ) ,
2023-07-08 17:30:05 +02:00
digits = digits
2022-08-28 10:31:50 +02:00
) ,
" )"
) ,
2020-12-17 16:22:25 +01:00
`Gram-positive` = paste0 (
format ( sum ( grams == " Gram-positive" , na.rm = TRUE ) ,
2023-07-08 17:30:05 +02:00
big.mark = " " ,
decimal.mark = " ."
2022-08-28 10:31:50 +02:00
) ,
2020-12-17 16:22:25 +01:00
" (" , percentage ( sum ( grams == " Gram-positive" , na.rm = TRUE ) / length ( grams ) ,
2023-07-08 17:30:05 +02:00
digits = digits
2022-08-28 10:31:50 +02:00
) ,
" )"
) ,
2023-02-09 13:07:39 +01:00
`Nr. of genera` = pm_n_distinct ( mo_genus ( x_noNA , language = NULL ) ) ,
`Nr. of species` = pm_n_distinct ( paste (
2022-08-28 10:31:50 +02:00
mo_genus ( x_noNA , language = NULL ) ,
mo_species ( x_noNA , language = NULL )
) )
)
)
2020-08-28 21:55:47 +02:00
}
2020-09-28 01:08:55 +02:00
# will be exported using s3_register() in R/zzz.R
get_skimmers.mo <- function ( column ) {
2020-12-17 16:22:25 +01:00
skimr :: sfl (
2020-09-28 01:08:55 +02:00
skim_type = " mo" ,
2022-08-28 10:31:50 +02:00
unique_total = ~ length ( unique ( stats :: na.omit ( .) ) ) ,
gram_negative = ~ sum ( mo_is_gram_negative ( .) , na.rm = TRUE ) ,
gram_positive = ~ sum ( mo_is_gram_positive ( .) , na.rm = TRUE ) ,
top_genus = ~ names ( sort ( - table ( mo_genus ( stats :: na.omit ( .) , language = NULL ) ) ) ) [1L ] ,
top_species = ~ names ( sort ( - table ( mo_name ( stats :: na.omit ( .) , language = NULL ) ) ) ) [1L ]
2020-09-28 01:08:55 +02:00
)
}
2020-05-28 16:48:55 +02:00
#' @method print mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2020-08-26 11:33:54 +02:00
print.mo <- function ( x , print.shortnames = FALSE , ... ) {
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2022-10-19 11:47:57 +02:00
cat ( " Class 'mo'\n" )
2018-10-12 16:35:18 +02:00
x_names <- names ( x )
2020-08-26 11:33:54 +02:00
if ( is.null ( x_names ) & print.shortnames == TRUE ) {
x_names <- tryCatch ( mo_shortname ( x , ... ) , error = function ( e ) NULL )
}
2018-10-12 16:35:18 +02:00
x <- as.character ( x )
names ( x ) <- x_names
2022-12-27 15:16:15 +01:00
if ( ! all ( x %in% c ( AMR_env $ MO_lookup $ mo , NA ) ) ) {
2022-08-28 10:31:50 +02:00
warning_ (
" Some MO codes are from a previous AMR package version. " ,
2022-10-30 14:31:45 +01:00
" Please update the MO codes with `as.mo()`." ,
call = FALSE
2022-08-28 10:31:50 +02:00
)
2021-05-03 13:06:43 +02:00
}
2018-10-12 16:35:18 +02:00
print.default ( x , quote = FALSE )
2018-08-31 13:36:19 +02:00
}
2018-07-23 14:14:03 +02:00
2020-05-28 16:48:55 +02:00
#' @method summary mo
2018-12-07 12:04:55 +01:00
#' @export
#' @noRd
summary.mo <- function ( object , ... ) {
# unique and top 1-3
2022-10-05 09:12:22 +02:00
x <- object
top_3 <- names ( sort ( - table ( x [ ! is.na ( x ) ] ) ) ) [1 : 3 ]
out <- c (
2022-08-28 10:31:50 +02:00
" Class" = " mo" ,
" <NA>" = length ( x [is.na ( x ) ] ) ,
2022-10-05 09:12:22 +02:00
" Unique" = length ( unique ( x [ ! is.na ( x ) ] ) ) ,
2022-08-28 10:31:50 +02:00
" #1" = top_3 [1 ] ,
" #2" = top_3 [2 ] ,
" #3" = top_3 [3 ]
)
2022-10-05 09:12:22 +02:00
class ( out ) <- c ( " summaryDefault" , " table" )
out
2018-12-07 12:04:55 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method as.data.frame mo
2018-07-23 14:14:03 +02:00
#' @export
2018-08-31 13:36:19 +02:00
#' @noRd
2020-05-19 13:18:01 +02:00
as.data.frame.mo <- function ( x , ... ) {
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2022-12-27 15:16:15 +01:00
if ( ! all ( x %in% c ( AMR_env $ MO_lookup $ mo , NA ) ) ) {
2022-08-28 10:31:50 +02:00
warning_ (
" The data contains old MO codes (from a previous AMR package version). " ,
" Please update your MO codes with `as.mo()`."
)
2021-04-07 08:37:42 +02:00
}
2020-05-19 12:08:49 +02:00
nm <- deparse1 ( substitute ( x ) )
2018-08-31 13:36:19 +02:00
if ( ! " nm" %in% names ( list ( ... ) ) ) {
2021-04-07 08:37:42 +02:00
as.data.frame.vector ( x , ... , nm = nm )
2018-08-31 13:36:19 +02:00
} else {
2021-04-07 08:37:42 +02:00
as.data.frame.vector ( x , ... )
2018-08-31 13:36:19 +02:00
}
}
2020-05-28 16:48:55 +02:00
#' @method [ mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2019-08-14 14:57:06 +02:00
" [.mo" <- function ( x , ... ) {
2019-08-12 14:48:09 +02:00
y <- NextMethod ( )
2019-08-14 14:57:06 +02:00
attributes ( y ) <- attributes ( x )
y
}
2020-05-28 16:48:55 +02:00
#' @method [[ mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [[.mo" <- function ( x , ... ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( x )
2019-08-14 14:57:06 +02:00
y
}
2020-05-28 16:48:55 +02:00
#' @method [<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [<-.mo" <- function ( i , j , ... , value ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-04-13 21:09:56 +02:00
# must only contain valid MOs
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2022-12-27 15:16:15 +01:00
return_after_integrity_check ( y , " microorganism code" , as.character ( AMR_env $ MO_lookup $ mo ) )
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method [[<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [[<-.mo" <- function ( i , j , ... , value ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-04-13 21:09:56 +02:00
# must only contain valid MOs
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2022-12-27 15:16:15 +01:00
return_after_integrity_check ( y , " microorganism code" , as.character ( AMR_env $ MO_lookup $ mo ) )
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method c mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2021-05-03 13:06:43 +02:00
c.mo <- function ( ... ) {
x <- list ( ... ) [ [1L ] ]
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2022-12-27 15:16:15 +01:00
return_after_integrity_check ( y , " microorganism code" , as.character ( AMR_env $ MO_lookup $ mo ) )
2018-07-23 14:14:03 +02:00
}
2018-12-06 14:36:39 +01:00
2020-09-25 14:44:50 +02:00
#' @method unique mo
#' @export
#' @noRd
unique.mo <- function ( x , incomparables = FALSE , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
y
}
2021-02-22 20:21:33 +01:00
#' @method rep mo
#' @export
#' @noRd
rep.mo <- function ( x , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
y
}
2020-05-28 16:48:55 +02:00
#' @method print mo_uncertainties
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
2023-02-26 21:26:58 +01:00
print.mo_uncertainties <- function ( x , n = 10 , ... ) {
2023-05-26 16:10:01 +02:00
more_than_50 <- FALSE
2019-03-12 12:19:27 +01:00
if ( NROW ( x ) == 0 ) {
2023-06-22 15:10:59 +02:00
cat ( word_wrap ( " No uncertainties to show. Only uncertainties of the last call to `as.mo()` or any `mo_*()` function are stored.\n\n" , add_fn = font_blue ) )
2022-10-05 09:12:22 +02:00
return ( invisible ( NULL ) )
2023-05-26 16:10:01 +02:00
} else if ( NROW ( x ) > 50 ) {
more_than_50 <- TRUE
x <- x [1 : 50 , , drop = FALSE ]
2022-10-05 09:12:22 +02:00
}
2023-07-08 17:30:05 +02:00
2023-01-07 01:51:19 +01:00
cat ( word_wrap ( " Matching scores are based on the resemblance between the input and the full taxonomic name, and the pathogenicity in humans. See `?mo_matching_score`.\n\n" , add_fn = font_blue ) )
2023-07-08 17:30:05 +02:00
2023-01-30 17:24:03 +01:00
add_MO_lookup_to_AMR_env ( )
2023-07-08 17:30:05 +02:00
2023-07-10 13:41:52 +02:00
col_red <- function ( x ) font_rose_bg ( x , collapse = NULL )
col_orange <- function ( x ) font_orange_bg ( x , collapse = NULL )
col_yellow <- function ( x ) font_yellow_bg ( x , collapse = NULL )
col_green <- function ( x ) font_green_bg ( x , collapse = NULL )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
if ( has_colour ( ) ) {
cat ( word_wrap ( " Colour keys: " ,
2023-07-08 17:30:05 +02:00
col_red ( " 0.000-0.549 " ) ,
col_orange ( " 0.550-0.649 " ) ,
col_yellow ( " 0.650-0.749 " ) ,
col_green ( " 0.750-1.000" ) ,
add_fn = font_blue
2022-10-05 09:12:22 +02:00
) , font_green_bg ( " " ) , " \n" , sep = " " )
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
score_set_colour <- function ( text , scores ) {
# set colours to scores
2023-06-22 15:10:59 +02:00
text [scores >= 0.75 ] <- col_green ( text [scores >= 0.75 ] )
text [scores >= 0.65 & scores < 0.75 ] <- col_yellow ( text [scores >= 0.65 & scores < 0.75 ] )
text [scores >= 0.55 & scores < 0.65 ] <- col_orange ( text [scores >= 0.55 & scores < 0.65 ] )
text [scores < 0.55 ] <- col_red ( text [scores < 0.55 ] )
2022-10-05 09:12:22 +02:00
text
2019-03-12 12:19:27 +01:00
}
2023-07-08 17:30:05 +02:00
2021-08-16 21:54:34 +02:00
txt <- " "
2023-02-26 21:26:58 +01:00
any_maxed_out <- FALSE
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( nrow ( x ) ) ) {
2020-09-14 12:21:23 +02:00
if ( x [i , ] $ candidates != " " ) {
candidates <- unlist ( strsplit ( x [i , ] $ candidates , " , " , fixed = TRUE ) )
2023-02-26 21:26:58 +01:00
if ( length ( candidates ) > n ) {
any_maxed_out <- TRUE
candidates <- candidates [seq_len ( n ) ]
}
2020-09-28 11:00:59 +02:00
scores <- mo_matching_score ( x = x [i , ] $ input , n = candidates )
2020-09-14 12:21:23 +02:00
n_candidates <- length ( candidates )
2023-07-08 17:30:05 +02:00
2022-12-17 14:31:33 +01:00
candidates_formatted <- italicise ( candidates )
2021-08-16 21:54:34 +02:00
scores_formatted <- trimws ( formatC ( round ( scores , 3 ) , format = " f" , digits = 3 ) )
2022-10-05 09:12:22 +02:00
scores_formatted <- score_set_colour ( scores_formatted , scores )
2023-07-08 17:30:05 +02:00
2021-08-16 21:54:34 +02:00
# sort on descending scores
candidates_formatted <- candidates_formatted [order ( 1 - scores ) ]
scores_formatted <- scores_formatted [order ( 1 - scores ) ]
2023-07-08 17:30:05 +02:00
2023-01-23 15:01:21 +01:00
candidates <- word_wrap (
paste0 (
" Also matched: " ,
vector_and (
paste0 (
candidates_formatted ,
font_blue ( paste0 ( " (" , scores_formatted , " )" ) , collapse = NULL )
) ,
quotes = FALSE , sort = FALSE
)
2022-08-28 10:31:50 +02:00
) ,
2023-01-23 15:01:21 +01:00
extra_indent = nchar ( " Also matched: " ) ,
width = 0.9 * getOption ( " width" , 100 )
2022-08-28 10:31:50 +02:00
)
2020-09-12 08:49:01 +02:00
} else {
candidates <- " "
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
score <- mo_matching_score (
x = x [i , ] $ input ,
n = x [i , ] $ fullname
)
score_formatted <- trimws ( formatC ( round ( score , 3 ) , format = " f" , digits = 3 ) )
2021-08-16 21:54:34 +02:00
txt <- paste ( txt ,
2023-07-08 17:30:05 +02:00
paste0 (
paste0 (
" " , strrep ( font_grey ( " -" ) , times = getOption ( " width" , 100 ) ) , " \n" ,
' "' , x [i , ] $ original_input , ' "' ,
" -> " ,
paste0 (
font_bold ( italicise ( x [i , ] $ fullname ) ) ,
" (" , x [i , ] $ mo , " , " , score_set_colour ( score_formatted , score ) , " )"
)
) ,
collapse = " \n"
) ,
# Add note if result was coerced to accepted taxonomic name
ifelse ( x [i , ] $ keep_synonyms == FALSE & x [i , ] $ mo %in% AMR_env $ MO_lookup $ mo [which ( AMR_env $ MO_lookup $ status == " synonym" ) ] ,
paste0 (
strrep ( " " , nchar ( x [i , ] $ original_input ) + 6 ) ,
2024-09-19 11:44:56 +02:00
font_red ( paste0 ( " This outdated taxonomic name was converted to " , font_italic ( AMR_env $ MO_lookup $ fullname [match ( synonym_mo_to_accepted_mo ( x [i , ] $ mo ) , AMR_env $ MO_lookup $ mo ) ] , collapse = NULL ) , " (" , synonym_mo_to_accepted_mo ( x [i , ] $ mo ) , " )." ) , collapse = NULL )
2023-07-08 17:30:05 +02:00
) ,
" "
) ,
candidates ,
sep = " \n"
2022-08-28 10:31:50 +02:00
)
2022-10-06 11:33:30 +02:00
txt <- gsub ( " [\n]+" , " \n" , txt )
# remove first and last break
txt <- gsub ( " (^[\n]|[\n]$)" , " " , txt )
txt <- paste0 ( " \n" , txt , " \n" )
2019-02-27 11:36:12 +01:00
}
2023-07-08 17:30:05 +02:00
2021-08-16 21:54:34 +02:00
cat ( txt )
2023-02-26 21:26:58 +01:00
if ( isTRUE ( any_maxed_out ) ) {
cat ( font_blue ( word_wrap ( " \nOnly the first " , n , " other matches of each record are shown. Run `print(mo_uncertainties(), n = ...)` to view more entries, or save `mo_uncertainties()` to an object." ) ) )
}
2023-05-26 16:10:01 +02:00
if ( isTRUE ( more_than_50 ) ) {
cat ( font_blue ( word_wrap ( " \nOnly the first 50 uncertainties are shown. Run `View(mo_uncertainties())` to view all entries, or save `mo_uncertainties()` to an object." ) ) )
}
2019-02-08 16:06:54 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method print mo_renamed
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
2022-10-05 09:12:22 +02:00
print.mo_renamed <- function ( x , extra_txt = " " , n = 25 , ... ) {
2019-09-15 22:57:30 +02:00
if ( NROW ( x ) == 0 ) {
2022-10-05 09:12:22 +02:00
cat ( word_wrap ( " No renamed taxonomy to show. Only renamed taxonomy of the last call of `as.mo()` or any `mo_*()` function are stored.\n" , add_fn = font_blue ) )
return ( invisible ( NULL ) )
2019-09-15 22:57:30 +02:00
}
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
x $ ref_old [ ! is.na ( x $ ref_old ) ] <- paste0 ( " (" , gsub ( " et al." , font_italic ( " et al." ) , x $ ref_old [ ! is.na ( x $ ref_old ) ] , fixed = TRUE ) , " )" )
x $ ref_new [ ! is.na ( x $ ref_new ) ] <- paste0 ( " (" , gsub ( " et al." , font_italic ( " et al." ) , x $ ref_new [ ! is.na ( x $ ref_new ) ] , fixed = TRUE ) , " )" )
x $ ref_old [is.na ( x $ ref_old ) ] <- " (author unknown)"
x $ ref_new [is.na ( x $ ref_new ) ] <- " (author unknown)"
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
rows <- seq_len ( min ( NROW ( x ) , n ) )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
message_ (
" The following microorganism" , ifelse ( NROW ( x ) > 1 , " s were" , " was" ) , " taxonomically renamed" , extra_txt , " :\n" ,
2022-12-17 14:31:33 +01:00
paste0 ( " " , AMR_env $ bullet_icon , " " , font_italic ( x $ old [rows ] , collapse = NULL ) , x $ ref_old [rows ] ,
2023-07-08 17:30:05 +02:00
" -> " , font_italic ( x $ new [rows ] , collapse = NULL ) , x $ ref_new [rows ] ,
collapse = " \n"
2022-10-05 09:12:22 +02:00
) ,
ifelse ( NROW ( x ) > n , paste0 ( " \n\nOnly the first " , n , " (out of " , NROW ( x ) , " ) are shown. Run `print(mo_renamed(), n = ...)` to view more entries (might be slow), or save `mo_renamed()` to an object." ) , " " )
)
}
# UNDOCUMENTED HELPER FUNCTIONS -------------------------------------------
convert_colloquial_input <- function ( x ) {
x.bak <- trimws2 ( x )
x <- trimws2 ( tolower ( x ) )
out <- rep ( NA_character_ , length ( x ) )
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Streptococci, like GBS = Group B Streptococci (B_STRPT_GRPB)
2023-07-08 17:30:05 +02:00
out [x %like_case% " ^g[abcdefghijkl]s$" ] <- gsub ( " g([abcdefghijkl])s" ,
" B_STRPT_GRP\\U\\1" ,
x [x %like_case% " ^g[abcdefghijkl]s$" ] ,
perl = TRUE
2022-10-05 09:12:22 +02:00
)
# Streptococci in different languages, like "estreptococos grupo B"
2023-07-10 13:41:52 +02:00
out [x %like_case% " strepto[ck]o[ck][a-zA-Z ]* [abcdefghijkl]$" ] <- gsub ( " .*e?strepto[ck]o[ck].* ([abcdefghijkl])$" ,
2023-07-08 17:30:05 +02:00
" B_STRPT_GRP\\U\\1" ,
2023-07-10 13:41:52 +02:00
x [x %like_case% " strepto[ck]o[ck][a-zA-Z ]* [abcdefghijkl]$" ] ,
2023-07-08 17:30:05 +02:00
perl = TRUE
2022-10-05 09:12:22 +02:00
)
2023-04-15 09:32:13 +02:00
out [x %like_case% " strep[a-z]* group [abcdefghijkl]$" ] <- gsub ( " .* ([abcdefghijkl])$" ,
2023-07-08 17:30:05 +02:00
" B_STRPT_GRP\\U\\1" ,
x [x %like_case% " strep[a-z]* group [abcdefghijkl]$" ] ,
perl = TRUE
2023-01-07 14:53:14 +01:00
)
2023-04-15 09:32:13 +02:00
out [x %like_case% " group [abcdefghijkl] strepto[ck]o[ck]" ] <- gsub ( " .*group ([abcdefghijkl]) strepto[ck]o[ck].*" ,
2023-07-08 17:30:05 +02:00
" B_STRPT_GRP\\U\\1" ,
x [x %like_case% " group [abcdefghijkl] strepto[ck]o[ck]" ] ,
perl = TRUE
2022-10-05 09:12:22 +02:00
)
out [x %like_case% " ha?emoly.*strep" ] <- " B_STRPT_HAEM"
2023-07-10 13:41:52 +02:00
out [x %like_case% " (strepto.* [abcg, ]{2,4}$)" ] <- " B_STRPT_ABCG"
2022-10-05 09:12:22 +02:00
out [x %like_case% " (strepto.* mil+er+i|^mgs[^a-z]*$)" ] <- " B_STRPT_MILL"
out [x %like_case% " mil+er+i gr" ] <- " B_STRPT_MILL"
out [x %like_case% " ((strepto|^s).* viridans|^vgs[^a-z]*$)" ] <- " B_STRPT_VIRI"
out [x %like_case% " (viridans.* (strepto|^s).*|^vgs[^a-z]*$)" ] <- " B_STRPT_VIRI"
2023-07-08 17:30:05 +02:00
2022-12-17 14:31:33 +01:00
# Salmonella in different languages, like "Salmonella grupo B"
2023-04-15 09:32:13 +02:00
out [x %like_case% " salmonella.* [abcdefgh]$" ] <- gsub ( " .*salmonella.* ([abcdefgh])$" ,
2023-07-08 17:30:05 +02:00
" B_SLMNL_GRP\\U\\1" ,
x [x %like_case% " salmonella.* [abcdefgh]$" ] ,
perl = TRUE
2022-12-17 14:31:33 +01:00
)
2023-04-15 09:32:13 +02:00
out [x %like_case% " group [abcdefgh] salmonella" ] <- gsub ( " .*group ([abcdefgh]) salmonella*" ,
2023-07-08 17:30:05 +02:00
" B_SLMNL_GRP\\U\\1" ,
x [x %like_case% " group [abcdefgh] salmonella" ] ,
perl = TRUE
2022-12-17 14:31:33 +01:00
)
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
out [x %like_case% " ([ck]oagulas[ea].negatie?[vf]|^[ck]o?ns[^a-z]*$)" ] <- " B_STPHY_CONS"
out [x %like_case% " ([ck]oagulas[ea].positie?[vf]|^[ck]o?ps[^a-z]*$)" ] <- " B_STPHY_COPS"
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# Gram stains
2022-10-22 22:00:15 +02:00
out [x %like_case% " gram[ -]?neg.*" ] <- " B_GRAMN"
2022-12-17 14:31:33 +01:00
out [x %like_case% " ( |^)gram[-]( |$)" ] <- " B_GRAMN"
2022-10-22 22:00:15 +02:00
out [x %like_case% " gram[ -]?pos.*" ] <- " B_GRAMP"
2022-12-17 14:31:33 +01:00
out [x %like_case% " ( |^)gram[+]( |$)" ] <- " B_GRAMP"
2023-04-17 11:26:19 +02:00
out [x %like_case% " anaerob[a-z]+ .*gram[ -]?neg.*" ] <- " B_ANAER-NEG"
out [x %like_case% " anaerob[a-z]+ .*gram[ -]?pos.*" ] <- " B_ANAER-POS"
2022-10-22 22:00:15 +02:00
out [is.na ( out ) & x %like_case% " anaerob[a-z]+ (micro)?.*organism" ] <- " B_ANAER"
2024-07-16 15:55:58 +02:00
out [is.na ( out ) & x %like_case% " anaerob[a-z]+ bacter" ] <- " B_ANAER"
2023-04-17 11:26:19 +02:00
2023-07-10 13:41:52 +02:00
# coryneform bacteria
out [x %like_case% " ^coryneform" ] <- " B_CORYNF"
2022-10-05 09:12:22 +02:00
# yeasts and fungi
2024-07-16 15:55:58 +02:00
out [x %like_case% " (^| )yeast?" ] <- " F_YEAST"
out [x %like_case% " (^| )fung(us|i)" ] <- " F_FUNGUS"
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
# trivial names known to the field
out [x %like_case% " meningo[ck]o[ck]" ] <- " B_NESSR_MNNG"
out [x %like_case% " gono[ck]o[ck]" ] <- " B_NESSR_GNRR"
out [x %like_case% " pneumo[ck]o[ck]" ] <- " B_STRPT_PNMN"
2023-07-10 13:41:52 +02:00
out [x %like_case% " hacek" ] <- " B_HACEK"
out [x %like_case% " haemophilus" & x %like_case% " aggregatibacter" & x %like_case% " cardiobacterium" & x %like_case% " eikenella" & x %like_case% " kingella" ] <- " B_HACEK"
out [x %like_case% " slow.* grow.* mycobact" ] <- " B_MYCBC_SGM"
out [x %like_case% " rapid.* grow.* mycobact" ] <- " B_MYCBC_RGM"
2023-04-17 11:26:19 +02:00
# unexisting names (con is the WHONET code for contamination)
2022-10-05 09:12:22 +02:00
out [x %in% c ( " con" , " other" , " none" , " unknown" ) | x %like_case% " virus" ] <- " UNKNOWN"
2023-07-08 17:30:05 +02:00
2022-10-22 22:00:15 +02:00
# WHONET has a lot of E. coli and Vibrio cholerae names
out [x %like_case% " escherichia coli" ] <- " B_ESCHR_COLI"
out [x %like_case% " vibrio cholerae" ] <- " B_VIBRI_CHLR"
2023-07-08 17:30:05 +02:00
2022-10-05 09:12:22 +02:00
out
2019-02-27 11:36:12 +01:00
}
2022-12-17 14:31:33 +01:00
italicise <- function ( x ) {
2023-04-17 11:26:19 +02:00
if ( ! has_colour ( ) ) {
return ( x )
}
2022-12-17 14:31:33 +01:00
out <- font_italic ( x , collapse = NULL )
2023-04-17 11:26:19 +02:00
# city-like serovars of Salmonella (start with a capital)
2023-01-23 15:01:21 +01:00
out [x %like_case% " Salmonella [A-Z]" ] <- paste (
font_italic ( " Salmonella" ) ,
gsub ( " Salmonella " , " " , x [x %like_case% " Salmonella [A-Z]" ] )
)
2023-04-17 11:26:19 +02:00
# streptococcal groups
2023-01-23 15:01:21 +01:00
out [x %like_case% " Streptococcus [A-Z]" ] <- paste (
font_italic ( " Streptococcus" ) ,
gsub ( " Streptococcus " , " " , x [x %like_case% " Streptococcus [A-Z]" ] )
)
2023-04-17 11:26:19 +02:00
# be sure not to make these italic
out <- gsub ( " ([ -]*)(Group|group|Complex|complex)(\033\\[23m)?" , " \033[23m\\1\\2" , out , perl = TRUE )
out <- gsub ( " (\033\\[3m)?(Beta[-]haemolytic|Coagulase[-](postive|negative)) " , " \\2 \033[3m" , out , perl = TRUE )
2022-12-17 14:31:33 +01:00
out
}
2019-02-27 11:36:12 +01:00
nr2char <- function ( x ) {
if ( x %in% c ( 1 : 10 ) ) {
2022-08-28 10:31:50 +02:00
v <- c (
" one" = 1 , " two" = 2 , " three" = 3 , " four" = 4 , " five" = 5 ,
" six" = 6 , " seven" = 7 , " eight" = 8 , " nine" = 9 , " ten" = 10
)
2019-02-27 11:36:12 +01:00
names ( v [x ] )
} else {
x
}
2018-12-06 14:36:39 +01:00
}
2019-03-15 13:57:25 +01:00
2020-04-14 15:10:09 +02:00
parse_and_convert <- function ( x ) {
2022-10-05 09:12:22 +02:00
if ( tryCatch ( is.character ( x ) && all ( Encoding ( x ) == " unknown" , na.rm = TRUE ) , error = function ( e ) FALSE ) ) {
2023-01-14 17:10:10 +01:00
out <- x
} else {
out <- tryCatch (
{
if ( ! is.null ( dim ( x ) ) ) {
if ( NCOL ( x ) > 2 ) {
stop ( " a maximum of two columns is allowed" , call. = FALSE )
} else if ( NCOL ( x ) == 2 ) {
# support Tidyverse selection like: df %>% select(colA, colB)
# paste these columns together
x <- as.data.frame ( x , stringsAsFactors = FALSE )
colnames ( x ) <- c ( " A" , " B" )
x <- paste ( x $ A , x $ B )
} else {
# support Tidyverse selection like: df %>% select(colA)
x <- as.data.frame ( x , stringsAsFactors = FALSE ) [ [1 ] ]
}
2022-08-28 10:31:50 +02:00
}
2023-01-14 17:10:10 +01:00
parsed <- iconv ( as.character ( x ) , to = " UTF-8" )
parsed [is.na ( parsed ) & ! is.na ( x ) ] <- iconv ( x [is.na ( parsed ) & ! is.na ( x ) ] , from = " Latin1" , to = " ASCII//TRANSLIT" )
parsed <- gsub ( ' "' , " " , parsed , fixed = TRUE )
parsed
} ,
error = function ( e ) stop ( e $ message , call. = FALSE )
) # this will also be thrown when running `as.mo(no_existing_object)`
}
out <- trimws2 ( out )
out <- gsub ( " +" , " " , out , perl = TRUE )
out <- gsub ( " ?/ ? " , " /" , out , perl = TRUE )
out
2020-04-13 21:09:56 +02:00
}
2020-05-16 13:05:47 +02:00
2020-07-22 12:29:51 +02:00
replace_old_mo_codes <- function ( x , property ) {
2021-10-06 13:23:57 +02:00
# this function transform old MO codes to current codes, such as:
# B_ESCH_COL (AMR v0.5.0) -> B_ESCHR_COLI
2022-12-27 15:16:15 +01:00
ind <- x %like_case% " ^[A-Z]_[A-Z_]+$" & ! x %in% AMR_env $ MO_lookup $ mo
2022-10-05 09:12:22 +02:00
if ( any ( ind , na.rm = TRUE ) ) {
2023-01-21 23:47:20 +01:00
add_MO_lookup_to_AMR_env ( )
2020-07-22 10:24:23 +02:00
# get the ones that match
2021-05-30 22:14:38 +02:00
affected <- x [ind ]
affected_unique <- unique ( affected )
all_direct_matches <- TRUE
# find their new codes, once per code
2022-08-28 10:31:50 +02:00
solved_unique <- unlist ( lapply (
strsplit ( affected_unique , " " ) ,
function ( m ) {
kingdom <- paste0 ( " ^" , m [1 ] )
name <- m [3 : length ( m ) ]
name [name == " _" ] <- " "
name <- tolower ( paste0 ( name , " .*" , collapse = " " ) )
name <- gsub ( " .*" , " " , name , fixed = TRUE )
name <- paste0 ( " ^" , name )
2022-10-14 13:02:50 +02:00
results <- AMR_env $ MO_lookup $ mo [AMR_env $ MO_lookup $ kingdom %like_case% kingdom &
2023-07-08 17:30:05 +02:00
AMR_env $ MO_lookup $ fullname_lower %like_case% name ]
2022-08-28 10:31:50 +02:00
if ( length ( results ) > 1 ) {
all_direct_matches <<- FALSE
}
results [1L ]
}
) , use.names = FALSE )
2021-05-30 22:14:38 +02:00
solved <- solved_unique [match ( affected , affected_unique ) ]
2020-07-22 10:24:23 +02:00
# assign on places where a match was found
2021-05-30 22:14:38 +02:00
x [ind ] <- solved
n_matched <- length ( affected [ ! is.na ( affected ) ] )
2021-10-06 13:23:57 +02:00
n_solved <- length ( affected [ ! is.na ( solved ) ] )
n_unsolved <- length ( affected [is.na ( solved ) ] )
2021-05-30 22:14:38 +02:00
n_unique <- length ( affected_unique [ ! is.na ( affected_unique ) ] )
2021-06-01 15:33:06 +02:00
if ( n_unique < n_matched ) {
n_unique <- paste0 ( n_unique , " unique, " )
} else {
n_unique <- " "
}
2020-07-22 12:29:51 +02:00
if ( property != " mo" ) {
2022-08-28 10:31:50 +02:00
warning_ (
" in `mo_" , property , " ()`: the input contained " , n_matched ,
" old MO code" , ifelse ( n_matched == 1 , " " , " s" ) ,
" (" , n_unique , " from a previous AMR package version). " ,
" Please update your MO codes with `as.mo()` to increase speed."
)
2020-10-26 12:23:03 +01:00
} else {
2022-08-28 10:31:50 +02:00
warning_ (
" in `as.mo()`: the input contained " , n_matched ,
" old MO code" , ifelse ( n_matched == 1 , " " , " s" ) ,
" (" , n_unique , " from a previous AMR package version). " ,
n_solved , " old MO code" , ifelse ( n_solved == 1 , " " , " s" ) ,
ifelse ( n_solved == 1 , " was" , " were" ) ,
ifelse ( all_direct_matches , " updated " , font_bold ( " guessed " ) ) ,
" to " , ifelse ( n_solved == 1 , " a " , " " ) ,
" currently used MO code" , ifelse ( n_solved == 1 , " " , " s" ) ,
ifelse ( n_unsolved > 0 ,
2023-07-08 17:30:05 +02:00
paste0 ( " and " , n_unsolved , " old MO code" , ifelse ( n_unsolved == 1 , " " , " s" ) , " could not be updated." ) ,
" ."
2022-08-28 10:31:50 +02:00
)
)
2020-07-22 12:29:51 +02:00
}
2020-07-22 10:24:23 +02:00
}
x
}
2020-09-03 12:31:48 +02:00
replace_ignore_pattern <- function ( x , ignore_pattern ) {
if ( ! is.null ( ignore_pattern ) && ! identical ( trimws2 ( ignore_pattern ) , " " ) ) {
ignore_cases <- x %like% ignore_pattern
if ( sum ( ignore_cases ) > 0 ) {
2022-08-28 10:31:50 +02:00
message_ (
" The following input was ignored by `ignore_pattern = \"" , ignore_pattern , " \"`: " ,
vector_and ( x [ignore_cases ] , quotes = TRUE )
)
2021-02-04 16:48:16 +01:00
x [ignore_cases ] <- NA_character_
2020-09-03 12:31:48 +02:00
}
}
x
}
2020-11-05 01:11:49 +01:00
repair_reference_df <- function ( reference_df ) {
2022-08-28 19:17:12 +02:00
if ( is.null ( reference_df ) ) {
return ( NULL )
}
2020-11-05 01:11:49 +01:00
# has valid own reference_df
2023-02-09 13:07:39 +01:00
reference_df <- reference_df %pm>%
pm_filter ( ! is.na ( mo ) )
2023-07-08 17:30:05 +02:00
2020-11-05 01:11:49 +01:00
# keep only first two columns, second must be mo
if ( colnames ( reference_df ) [1 ] == " mo" ) {
2023-02-09 13:07:39 +01:00
reference_df <- reference_df %pm>% pm_select ( 2 , " mo" )
2020-11-05 01:11:49 +01:00
} else {
2023-02-09 13:07:39 +01:00
reference_df <- reference_df %pm>% pm_select ( 1 , " mo" )
2020-11-05 01:11:49 +01:00
}
2023-07-08 17:30:05 +02:00
2020-11-05 01:11:49 +01:00
# remove factors, just keep characters
colnames ( reference_df ) [1 ] <- " x"
2020-11-10 16:35:56 +01:00
reference_df [ , " x" ] <- as.character ( reference_df [ , " x" , drop = TRUE ] )
reference_df [ , " mo" ] <- as.character ( reference_df [ , " mo" , drop = TRUE ] )
2023-07-08 17:30:05 +02:00
2021-05-03 13:06:43 +02:00
# some MO codes might be old
2023-06-26 13:52:02 +02:00
reference_df [ , " mo" ] <- as.mo ( reference_df [ , " mo" , drop = TRUE ] , reference_df = NULL )
2020-11-05 01:11:49 +01:00
reference_df
}
2021-01-25 21:58:00 +01:00
2022-10-05 09:12:22 +02:00
get_mo_uncertainties <- function ( ) {
2024-09-19 11:44:56 +02:00
remember <- list ( uncertainties = AMR_env $ mo_uncertainties ,
failures = AMR_env $ mo_failures )
2022-10-05 09:12:22 +02:00
# empty them, otherwise e.g. mo_shortname("Chlamydophila psittaci") will give 3 notes
AMR_env $ mo_uncertainties <- NULL
2024-09-19 11:44:56 +02:00
AMR_env $ mo_failures <- NULL
2022-10-05 09:12:22 +02:00
remember
}
load_mo_uncertainties <- function ( metadata ) {
AMR_env $ mo_uncertainties <- metadata $ uncertainties
2024-09-19 11:44:56 +02:00
AMR_env $ mo_failures <- metadata $ failures
2022-10-05 09:12:22 +02:00
}
2024-07-16 14:51:57 +02:00
synonym_mo_to_accepted_mo <- function ( x , fill_in_accepted = FALSE , dataset = AMR_env $ MO_lookup ) {
2024-07-17 14:29:55 +02:00
# `dataset` is an argument so that it can be used in the regeneration of the microorganisms data set
2024-07-16 14:51:57 +02:00
if ( identical ( dataset , AMR_env $ MO_lookup ) ) {
add_MO_lookup_to_AMR_env ( )
dataset <- AMR_env $ MO_lookup
}
2024-07-17 14:29:55 +02:00
out <- x
is_still_synonym <- dataset $ status [match ( out , dataset $ mo ) ] == " synonym"
limit <- 0
while ( any ( is_still_synonym , na.rm = TRUE ) && limit < 5 ) {
limit <- limit + 1
# make sure to get the latest name, e.g. Fusarium pulicaris robiniae was first renamed to Fusarium roseum, then to Fusarium sambucinum
# we need the MO of Fusarium pulicaris robiniae to return the MO of Fusarium sambucinum
2024-07-19 18:04:45 +02:00
must_be_corrected <- ! is.na ( is_still_synonym ) & is_still_synonym
x_gbif <- dataset $ gbif_renamed_to [match ( out , dataset $ mo ) ]
x_mycobank <- dataset $ mycobank_renamed_to [match ( out , dataset $ mo ) ]
x_lpsn <- dataset $ lpsn_renamed_to [match ( out , dataset $ mo ) ]
2024-07-17 14:29:55 +02:00
2024-07-19 18:04:45 +02:00
out [must_be_corrected & ! is.na ( x_gbif ) ] <- dataset $ mo [match ( x_gbif [must_be_corrected & ! is.na ( x_gbif ) ] , dataset $ gbif ) ]
out [must_be_corrected & ! is.na ( x_mycobank ) ] <- dataset $ mo [match ( x_mycobank [must_be_corrected & ! is.na ( x_mycobank ) ] , dataset $ mycobank ) ]
out [must_be_corrected & ! is.na ( x_lpsn ) ] <- dataset $ mo [match ( x_lpsn [must_be_corrected & ! is.na ( x_lpsn ) ] , dataset $ lpsn ) ]
2024-07-17 14:29:55 +02:00
is_still_synonym <- dataset $ status [match ( out , dataset $ mo ) ] == " synonym"
}
2024-07-16 14:51:57 +02:00
2024-07-17 14:29:55 +02:00
x_no_synonym <- dataset $ status [match ( x , dataset $ mo ) ] != " synonym"
out [x_no_synonym ] <- NA_character_
2022-10-10 10:12:08 +02:00
if ( isTRUE ( fill_in_accepted ) ) {
2024-07-17 14:29:55 +02:00
out [ ! is.na ( x_no_synonym ) & x_no_synonym ] <- x [ ! is.na ( x_no_synonym ) & x_no_synonym ]
2022-10-10 10:12:08 +02:00
}
2024-07-16 14:51:57 +02:00
out [is.na ( match ( x , dataset $ mo ) ) ] <- NA_character_
2022-10-10 10:12:08 +02:00
out
2021-01-25 21:58:00 +01:00
}