2018-06-08 12:06:54 +02:00
# ==================================================================== #
# TITLE #
2021-02-02 23:57:35 +01:00
# Antimicrobial Resistance (AMR) Data Analysis for R #
2018-06-08 12:06:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2018-06-08 12:06:54 +02:00
# #
# LICENCE #
2021-12-23 18:56:28 +01:00
# (c) 2018-2022 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
2020-10-26 12:23:03 +01:00
# Diagnostics & Advice, and University Medical Center Groningen. #
2018-06-08 12:06:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2018-06-08 12:06:54 +02:00
# ==================================================================== #
2021-06-01 15:33:06 +02:00
#' Transform Input to a Microorganism Code
2018-06-08 12:06:54 +02:00
#'
2022-09-16 23:15:23 +02:00
#' Use this function to determine a valid microorganism code ([`mo`]). Determination is done using intelligent rules and the complete taxonomic kingdoms `r vector_and(unique(microorganisms$kingdom[which(!grepl("(unknown|Fungi)", microorganisms$kingdom))]), quotes = FALSE)`, and most microbial species from the kingdom Fungi (see *Source*). The input can be almost anything: a full name (like `"Staphylococcus aureus"`), an abbreviated name (such as `"S. aureus"`), an abbreviation known in the field (such as `"MRSA"`), or just a genus. See *Examples*.
2021-05-12 18:15:03 +02:00
#' @param x a [character] vector or a [data.frame] with one or two columns
#' @param Becker a [logical] to indicate whether staphylococci should be categorised into coagulase-negative staphylococci ("CoNS") and coagulase-positive staphylococci ("CoPS") instead of their own species, according to Karsten Becker *et al.* (1,2,3).
2018-09-04 11:33:30 +02:00
#'
2019-12-20 15:05:58 +01:00
#' This excludes *Staphylococcus aureus* by default; use `Becker = "all"` to also categorise *S. aureus* as "CoPS".
2022-02-26 21:58:23 +01:00
#' @param Lancefield a [logical] to indicate whether a beta-haemolytic *Streptococcus* should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield (4). These streptococci will be categorised in their first group, e.g. *Streptococcus dysgalactiae* will be group C, although officially it was also categorised into groups G and L.
2018-09-04 11:33:30 +02:00
#'
2022-02-26 21:58:23 +01:00
#' This excludes enterococci by default (which are in group D); use `Lancefield = "all"` to also categorise all enterococci as group D.
2022-09-16 23:15:23 +02:00
#' @param minimum_matching_score a numeric value to set as the lower limit for the [MO matching score][mo_matching_score()]. When left blank, this will be determined automatically based on the character length of `x`, its [taxonomic kingdom][microorganisms] and [human pathogenicity][mo_matching_score()].
2021-01-18 16:57:56 +01:00
#' @param allow_uncertain a number between `0` (or `"none"`) and `3` (or `"all"`), or `TRUE` (= `2`) or `FALSE` (= `0`) to indicate whether the input should be checked for less probable results, see *Details*
2022-09-18 10:18:57 +02:00
#' @param keep_synonyms a [logical] to indicate if old, previously valid taxonomic names must be preserved and not be corrected to currently accepted names. The default is `FALSE` to always return the currently accepted names.
2020-09-18 16:05:53 +02:00
#' @param reference_df a [data.frame] to be used for extra reference when translating `x` to a valid [`mo`]. See [set_mo_source()] and [get_mo_source()] to automate the usage of your own codes (e.g. used in your analysis or organisation).
2020-09-03 12:31:48 +02:00
#' @param ignore_pattern a regular expression (case-insensitive) of which all matches in `x` must return `NA`. This can be convenient to exclude known non-relevant input and can also be set with the option `AMR_ignore_pattern`, e.g. `options(AMR_ignore_pattern = "(not reported|contaminated flora)")`.
2021-12-12 09:42:03 +01:00
#' @param language language to translate text like "no growth", which defaults to the system language (see [get_AMR_locale()])
2021-04-20 10:46:17 +02:00
#' @param info a [logical] to indicate if a progress bar should be printed if more than 25 items are to be coerced, defaults to `TRUE` only in interactive mode
2020-12-22 00:51:17 +01:00
#' @param ... other arguments passed on to functions
2018-08-31 13:36:19 +02:00
#' @rdname as.mo
#' @aliases mo
#' @keywords mo Becker becker Lancefield lancefield guess
2018-09-24 23:33:29 +02:00
#' @details
2021-01-18 16:57:56 +01:00
#' ## General Info
2020-10-26 12:23:03 +01:00
#'
2021-06-01 15:33:06 +02:00
#' A microorganism (MO) code from this package (class: [`mo`]) is human readable and typically looks like these examples:
2019-11-28 22:32:17 +01:00
#' ```
2019-09-18 15:46:09 +02:00
#' Code Full name
#' --------------- --------------------------------------
#' B_KLBSL Klebsiella
#' B_KLBSL_PNMN Klebsiella pneumoniae
#' B_KLBSL_PNMN_RHNS Klebsiella pneumoniae rhinoscleromatis
#' | | | |
#' | | | |
2020-09-03 12:31:48 +02:00
#' | | | \---> subspecies, a 4-5 letter acronym
#' | | \----> species, a 4-5 letter acronym
#' | \----> genus, a 5-7 letter acronym
#' \----> taxonomic kingdom: A (Archaea), AN (Animalia), B (Bacteria),
2022-09-16 23:15:23 +02:00
#' F (Fungi), PL (Plantae), P (Protozoa)
2019-11-28 22:32:17 +01:00
#' ```
2018-08-01 08:03:31 +02:00
#'
2020-07-22 10:24:23 +02:00
#' Values that cannot be coerced will be considered 'unknown' and will get the MO code `UNKNOWN`.
2019-03-02 22:47:04 +01:00
#'
2021-01-18 16:57:56 +01:00
#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see *Examples*.
2019-03-15 13:57:25 +01:00
#'
2022-09-16 23:15:23 +02:00
#' The algorithm uses data from the List of Prokaryotic names with Standing in Nomenclature (LPSN) and the Global Biodiversity Information Facility (GBIF) (see [microorganisms]).
2019-09-15 22:57:30 +02:00
#'
2019-11-28 22:32:17 +01:00
#' The [as.mo()] function uses several coercion rules for fast and logical results. It assesses the input matching criteria in the following order:
2020-10-26 12:23:03 +01:00
#'
2019-11-28 22:32:17 +01:00
#' 1. Human pathogenic prevalence: the function starts with more prevalent microorganisms, followed by less prevalent ones;
#' 2. Taxonomic kingdom: the function starts with determining Bacteria, then Fungi, then Protozoa, then others;
#' 3. Breakdown of input values to identify possible matches.
2018-09-24 23:33:29 +02:00
#'
2020-09-18 16:05:53 +02:00
#' This will lead to the effect that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first.
2020-10-26 12:23:03 +01:00
#'
2021-01-18 16:57:56 +01:00
#' ## Coping with Uncertain Results
2020-10-26 12:23:03 +01:00
#'
#' In addition, the [as.mo()] function can differentiate four levels of uncertainty to guess valid results:
2019-11-28 22:32:17 +01:00
#' - Uncertainty level 0: no additional rules are applied;
#' - Uncertainty level 1: allow previously accepted (but now invalid) taxonomic names and minor spelling errors;
#' - Uncertainty level 2: allow all of level 1, strip values between brackets, inverse the words of the input, strip off text elements from the end keeping at least two elements;
#' - Uncertainty level 3: allow all of level 1 and 2, strip off text elements from the end, allow any part of a taxonomic name.
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' The level of uncertainty can be set using the argument `allow_uncertain`. The default is `allow_uncertain = TRUE`, which is equal to uncertainty level 2. Using `allow_uncertain = FALSE` is equal to uncertainty level 0 and will skip all rules. You can also use e.g. `as.mo(..., allow_uncertain = 1)` to only allow up to level 1 uncertainty.
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' With the default setting (`allow_uncertain = TRUE`, level 2), below examples will lead to valid results:
2020-03-14 14:05:43 +01:00
#' - `"Streptococcus group B (known as S. agalactiae)"`. The text between brackets will be removed and a warning will be thrown that the result *Streptococcus group B* (``r as.mo("Streptococcus group B")``) needs review.
#' - `"S. aureus - please mind: MRSA"`. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result *Staphylococcus aureus* (``r as.mo("Staphylococcus aureus")``) needs review.
#' - `"Fluoroquinolone-resistant Neisseria gonorrhoeae"`. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result *Neisseria gonorrhoeae* (``r as.mo("Neisseria gonorrhoeae")``) needs review.
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' There are three helper functions that can be run after using the [as.mo()] function:
2021-01-18 16:57:56 +01:00
#' - Use [mo_uncertainties()] to get a [data.frame] that prints in a pretty format with all taxonomic names that were guessed. The output contains the matching score for all matches (see *Matching Score for Microorganisms* below).
2020-09-18 16:05:53 +02:00
#' - Use [mo_failures()] to get a [character] [vector] with all values that could not be coerced to a valid value.
#' - Use [mo_renamed()] to get a [data.frame] with all values that could be coerced based on old, previously accepted taxonomic names.
2019-02-08 16:06:54 +01:00
#'
2021-01-18 16:57:56 +01:00
#' ## Microbial Prevalence of Pathogens in Humans
2020-10-26 12:23:03 +01:00
#'
2022-09-16 23:15:23 +02:00
#' The coercion rules consider the prevalence of microorganisms in humans grouped into three groups, which is available as the `prevalence` column in the [microorganisms] data set. The grouping into human pathogenic prevalence is explained in the section *Matching Score for Microorganisms* below.
2021-01-18 16:57:56 +01:00
#' @inheritSection mo_matching_score Matching Score for Microorganisms
2019-05-20 12:00:18 +02:00
# (source as a section here, so it can be inherited by other man pages:)
2018-09-24 23:33:29 +02:00
#' @section Source:
2022-09-16 23:15:23 +02:00
#' 1. Becker K. *et al.* (2014). **Coagulase-Negative Staphylococci.** *Clin Microbiol Rev.* 27(4): 870-926; \doi{10.1128/CMR.00109-13}
#' 2. Becker K. *et al.* (2019). **Implications of identifying the recently defined members of the *S. aureus* complex, *S. argenteus* and *S. schweitzeri*: A position paper of members of the ESCMID Study Group for staphylococci and Staphylococcal Diseases (ESGS).** *Clin Microbiol Infect*; \doi{10.1016/j.cmi.2019.02.028}
#' 3. Becker K. *et al.* (2020). **Emergence of coagulase-negative staphylococci** *Expert Rev Anti Infect Ther.* 18(4):349-366; \doi{10.1080/14787210.2020.1730813}
#' 4. Lancefield R.C. (1933). **A serological differentiation of human and other groups of hemolytic streptococci**. *J Exp Med.* 57(4): 571-95; \doi{10.1084/jem.57.4.571}
#' 5. Berends M.S. *et al.* (2022). **Trends in Occurrence and Phenotypic Resistance of Coagulase-Negative Staphylococci (CoNS) Found in Human Blood in the Northern Netherlands between 2013 and 2019** *Microorganisms* 10(9), 1801; \doi{10.3390/microorganisms10091801}
#' 6. `r TAXONOMY_VERSION$LPSN$citation` Accessed from <`r TAXONOMY_VERSION$LPSN$url`> on `r documentation_date(TAXONOMY_VERSION$LPSN$accessed_date)`.
#' 7. `r TAXONOMY_VERSION$GBIF$citation` Accessed from <`r TAXONOMY_VERSION$GBIF$url`> on `r documentation_date(TAXONOMY_VERSION$GBIF$accessed_date)`.
#' 8. `r TAXONOMY_VERSION$SNOMED$citation` URL: <`r TAXONOMY_VERSION$SNOMED$url`>
2018-06-08 12:06:54 +02:00
#' @export
2020-09-18 16:05:53 +02:00
#' @return A [character] [vector] with additional class [`mo`]
#' @seealso [microorganisms] for the [data.frame] that is being used to determine ID's.
2020-10-26 12:23:03 +01:00
#'
2020-12-17 16:22:25 +01:00
#' The [`mo_*`][mo_property()] functions (such as [mo_genus()], [mo_gramstain()]) to get properties based on the returned code.
2021-01-18 16:57:56 +01:00
#' @inheritSection AMR Reference Data Publicly Available
2018-06-08 12:06:54 +02:00
#' @examples
2019-07-02 16:48:52 +02:00
#' \donttest{
2019-09-18 15:46:09 +02:00
#' # These examples all return "B_STPHY_AURS", the ID of S. aureus:
2022-09-16 23:15:23 +02:00
#' as.mo(c(
#' "sau", # WHONET code
#' "stau",
#' "STAU",
#' "staaur",
#' "S. aureus",
#' "S aureus",
#' "Staphylococcus aureus",
#' "Staphylococcus aureus (MRSA,",
#' "Zthafilokkoockus oureuz", # handles incorrect spelling
#' "MRSA", # Methicillin Resistant S. aureus
#' "VISA", # Vancomycin Intermediate S. aureus
#' "VRSA", # Vancomycin Resistant S. aureus
#' 115329001
#' )) # SNOMED CT code
2020-10-26 12:23:03 +01:00
#'
2019-03-18 14:29:41 +01:00
#' # Dyslexia is no problem - these all work:
2022-09-16 23:15:23 +02:00
#' as.mo(c(
#' "Ureaplasma urealyticum",
#' "Ureaplasma urealyticus",
#' "Ureaplasmium urealytica",
#' "Ureaplazma urealitycium"
#' ))
2019-03-18 14:29:41 +01:00
#'
2018-09-05 10:51:46 +02:00
#' as.mo("Streptococcus group A")
#'
2022-08-28 10:31:50 +02:00
#' as.mo("S. epidermidis") # will remain species: B_STPHY_EPDR
#' as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CONS
2018-08-02 13:15:45 +02:00
#'
2022-08-28 10:31:50 +02:00
#' as.mo("S. pyogenes") # will remain species: B_STRPT_PYGN
2019-09-18 15:46:09 +02:00
#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRPA
2018-08-02 13:15:45 +02:00
#'
2019-03-18 14:29:41 +01:00
#' # All mo_* functions use as.mo() internally too (see ?mo_property):
2022-09-16 23:15:23 +02:00
#' mo_genus("Esch coli")
#' mo_gramstain("E. coli")
#' mo_is_intrinsic_resistant("E. coli", "vanco")
2018-06-08 12:06:54 +02:00
#' }
2020-10-26 12:23:03 +01:00
as.mo <- function ( x ,
Becker = FALSE ,
Lancefield = FALSE ,
2022-09-16 23:15:23 +02:00
minimum_matching_score = NULL ,
2020-10-26 12:23:03 +01:00
allow_uncertain = TRUE ,
2022-09-16 23:15:23 +02:00
keep_synonyms = FALSE ,
2020-09-03 12:31:48 +02:00
reference_df = get_mo_source ( ) ,
ignore_pattern = getOption ( " AMR_ignore_pattern" ) ,
2021-12-12 09:42:03 +01:00
language = get_AMR_locale ( ) ,
2021-04-20 10:46:17 +02:00
info = interactive ( ) ,
2019-11-23 12:39:57 +01:00
... ) {
2020-10-20 21:00:57 +02:00
meet_criteria ( x , allow_class = c ( " mo" , " data.frame" , " list" , " character" , " numeric" , " integer" , " factor" ) , allow_NA = TRUE )
2020-10-19 17:09:19 +02:00
meet_criteria ( Becker , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( Lancefield , allow_class = c ( " logical" , " character" ) , has_length = 1 )
2022-09-16 23:15:23 +02:00
meet_criteria ( keep_synonyms , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( minimum_matching_score , allow_class = c ( " numeric" , " integer" ) , has_length = 1 , allow_NULL = TRUE )
2020-10-19 17:09:19 +02:00
meet_criteria ( reference_df , allow_class = " data.frame" , allow_NULL = TRUE )
meet_criteria ( ignore_pattern , allow_class = " character" , has_length = 1 , allow_NULL = TRUE )
2022-09-16 23:15:23 +02:00
language <- validate_language ( language )
2021-04-20 10:46:17 +02:00
meet_criteria ( info , allow_class = " logical" , has_length = 1 )
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
if ( tryCatch ( all ( x [ ! is.na ( x ) ] %in% AMR :: microorganisms $ mo ) &
2022-08-28 10:31:50 +02:00
isFALSE ( Becker ) &
2022-09-16 23:15:23 +02:00
isTRUE ( keep_synonyms ) &&
2022-08-28 10:31:50 +02:00
isFALSE ( Lancefield ) , error = function ( e ) FALSE ) ) {
2020-09-12 08:49:01 +02:00
# don't look into valid MO codes, just return them
2020-11-10 16:35:56 +01:00
# is.mo() won't work - MO codes might change between package versions
2020-11-16 16:57:55 +01:00
return ( set_clean_class ( x , new_class = c ( " mo" , " character" ) ) )
2020-09-12 08:49:01 +02:00
}
2022-08-28 10:31:50 +02:00
2020-04-13 21:09:56 +02:00
# start off with replaced language-specific non-ASCII characters with ASCII characters
2020-04-14 15:10:09 +02:00
x <- parse_and_convert ( x )
2020-07-22 10:24:23 +02:00
# replace mo codes used in older package versions
2020-07-22 12:29:51 +02:00
x <- replace_old_mo_codes ( x , property = " mo" )
2020-09-03 12:31:48 +02:00
# ignore cases that match the ignore pattern
x <- replace_ignore_pattern ( x , ignore_pattern )
2022-08-28 10:31:50 +02:00
2019-06-02 19:23:19 +02:00
# WHONET: xxx = no growth
2022-09-16 23:15:23 +02:00
x [tolower ( x ) %in% c ( " " , " xxx" , " na" , " nan" ) ] <- NA_character_
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
if ( tryCatch ( all ( x == " " | gsub ( " .*(unknown ).*" , " unknown name" , tolower ( x ) , perl = TRUE ) %in% MO_lookup $ fullname_lower , na.rm = TRUE ) &&
isFALSE ( Becker ) &&
isTRUE ( keep_synonyms ) &&
2022-08-28 10:31:50 +02:00
isFALSE ( Lancefield ) , error = function ( e ) FALSE ) ) {
2021-02-21 22:56:35 +01:00
# to improve speed, special case for taxonomically correct full names (case-insensitive)
2022-09-16 23:15:23 +02:00
return ( set_clean_class ( MO_lookup [match (
gsub ( " .*(unknown ).*" , " unknown name" ,
tolower ( x ) ,
perl = TRUE
) ,
MO_lookup $ fullname_lower
) , " mo" , drop = TRUE ] ,
new_class = c ( " mo" , " character" )
2022-08-28 10:31:50 +02:00
) )
2021-02-21 22:56:35 +01:00
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
out <- rep ( NA_character_ , length ( x ) )
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# below we use base R's match(), known for powering '%in%', and incredibly fast!
2018-09-27 23:23:48 +02:00
2022-09-16 23:15:23 +02:00
# From reference_df ----
reference_df <- repair_reference_df ( reference_df )
if ( ! is.null ( reference_df ) ) {
out [x %in% reference_df [ [1 ] ] ] <- reference_df [ [2 ] ] [match ( x [x %in% reference_df [ [1 ] ] ] , reference_df [ [1 ] ] ) ]
}
# From MO code ----
out [is.na ( out ) & x %in% AMR :: microorganisms $ mo ] <- x [is.na ( out ) & x %in% AMR :: microorganisms $ mo ]
# From full name ----
out [is.na ( out ) & x %in% AMR :: microorganisms $ fullname ] <- AMR :: microorganisms $ mo [match ( x [is.na ( out ) & x %in% AMR :: microorganisms $ fullname ] , AMR :: microorganisms $ fullname ) ]
# From known codes ----
out [is.na ( out ) & x %in% AMR :: microorganisms.codes $ code ] <- AMR :: microorganisms.codes $ mo [match ( x [is.na ( out ) & x %in% AMR :: microorganisms.codes $ code ] , AMR :: microorganisms.codes $ code ) ]
# From SNOMED ----
if ( any ( is.na ( out ) & x %in% unlist ( microorganisms $ snomed ) , na.rm = TRUE ) ) {
# found this extremely fast gem here: https://stackoverflow.com/a/11002456/4575331
out [is.na ( out ) & x %in% unlist ( microorganisms $ snomed ) ] <- microorganisms $ mo [rep ( seq_along ( microorganisms $ snomed ) , vapply ( FUN.VALUE = double ( 1 ) , microorganisms $ snomed , length ) ) [match ( x [is.na ( out ) & x %in% unlist ( microorganisms $ snomed ) ] , unlist ( microorganisms $ snomed ) ) ] ]
}
# From previous hits in this session ----
old <- out
out [is.na ( out ) & x %in% pkg_env $ mo_previously_coerced $ x ] <- pkg_env $ mo_previously_coerced $ mo [match ( x [is.na ( out ) & x %in% pkg_env $ mo_previously_coerced $ x ] , pkg_env $ mo_previously_coerced $ x ) ]
new <- out
if ( isTRUE ( info ) && message_not_thrown_before ( " as.mo" , old [seq_len ( min ( 100 , length ( old ) ) ) ] , new [seq_len ( min ( 100 , length ( new ) ) ) ] , entire_session = TRUE ) && any ( is.na ( old ) & ! is.na ( new ) , na.rm = TRUE ) ) {
message_ (
" Returning previously coerced value" , ifelse ( sum ( is.na ( old ) & ! is.na ( new ) ) > 1 , " s" , " " ) ,
" for " , vector_and ( x [is.na ( old ) & ! is.na ( new ) ] ) , " . Run `mo_reset_session()` to reset this."
)
2021-02-21 20:15:09 +01:00
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# For all other input ----
if ( any ( is.na ( out ) & ! is.na ( x ) ) ) {
# reset uncertainties
pkg_env $ mo_uncertainties <- pkg_env $ mo_uncertainties [0 , ]
# Laboratory systems: remove (translated) entries like "no growth", "not E. coli", etc.
x [trimws2 ( x ) %like% translate_into_language ( " no .*growth" , language = language ) ] <- NA_character_
x [trimws2 ( x ) %like% paste0 ( " ^(" , translate_into_language ( " no|not" , language = language ) , " ) " ) ] <- NA_character_
# run over all unique leftovers
x_unique <- unique ( x [is.na ( out ) & ! is.na ( x ) ] )
# set up progress bar
progress <- progress_ticker ( n = length ( x_unique ) , n_min = 10 , print = info )
on.exit ( close ( progress ) )
# run it
x_coerced <- lapply ( x_unique , function ( x_search ) {
progress $ tick ( )
x_out <- trimws ( tolower ( x_search ) )
x_parts <- strsplit ( gsub ( " -" , " " , x_out , fixed = TRUE ) , " " , fixed = TRUE ) [ [1 ] ]
# do a pre-match on first character (and if it contains a space, first chars of first two terms)
if ( length ( x_parts ) == 2 ) {
filtr <- which ( MO_lookup $ full_first == substr ( x_parts [1 ] , 1 , 1 ) & MO_lookup $ species_first == substr ( x_parts [2 ] , 1 , 1 ) )
} else if ( length ( x_parts ) > 2 ) {
first_chars <- paste0 ( " (^| )" , " [" , paste ( substr ( x_parts , 1 , 1 ) , collapse = " " ) , " ]" )
filtr <- which ( MO_lookup $ full_first %like_case% first_chars )
} else if ( nchar ( x_out ) == 4 ) {
# no space and 4 characters - probably a code such as STAU or ESCO!
if ( isTRUE ( info ) ) {
message_ ( " Input \"" , x_search , " \" is assumed to be a microorganism code - trying to match on " , vector_and ( c ( substr ( x_out , 1 , 2 ) , substr ( x_out , 3 , 4 ) ) , sort = FALSE ) )
}
filtr <- which ( MO_lookup $ fullname_lower %like_case% paste0 ( " (^| )" , substr ( x_out , 1 , 2 ) , " .* " , substr ( x_out , 3 , 4 ) ) )
} else if ( nchar ( x_out ) <= 6 ) {
# no space and 5-6 characters - probably a code such as STAAUR or ESCCOL!
first_part <- paste0 ( substr ( x_out , 1 , 2 ) , " [a-z]*" , substr ( x_out , 3 , 3 ) )
second_part <- substr ( x_out , 4 , nchar ( x_out ) )
if ( isTRUE ( info ) ) {
message_ ( " Input \"" , x_search , " \" is assumed to be a microorganism code - trying to match on " , vector_and ( c ( gsub ( " [a-z]*" , " (...)" , first_part , fixed = TRUE ) , second_part ) , sort = FALSE ) )
}
filtr <- which ( MO_lookup $ fullname_lower %like_case% paste0 ( " (^| )" , first_part , " .* " , second_part ) )
2020-09-12 13:54:21 +02:00
} else {
2022-09-16 23:15:23 +02:00
filtr <- which ( MO_lookup $ full_first == substr ( x_out , 1 , 1 ) )
2020-09-12 13:54:21 +02:00
}
2022-09-16 23:15:23 +02:00
if ( length ( filtr ) == 0 ) {
mo_to_search <- MO_lookup $ fullname
2021-02-21 20:15:09 +01:00
} else {
2022-09-16 23:15:23 +02:00
mo_to_search <- MO_lookup $ fullname [filtr ]
2020-09-12 13:54:21 +02:00
}
2022-09-16 23:15:23 +02:00
pkg_env $ mo_to_search <- mo_to_search
# determine the matching score on the original search value
m <- mo_matching_score ( x = x_search , n = mo_to_search )
if ( is.null ( minimum_matching_score ) ) {
minimum_matching_score_current <- min ( 0.7 , min ( 10 , nchar ( x_search ) ) * 0.08 )
# correct back for prevalence
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup $ prevalence [match ( mo_to_search , MO_lookup $ fullname ) ]
# correct back for kingdom
minimum_matching_score_current <- minimum_matching_score_current / MO_lookup $ kingdom_index [match ( mo_to_search , MO_lookup $ fullname ) ]
} else {
minimum_matching_score_current <- minimum_matching_score
2019-02-23 16:02:31 +01:00
}
2022-09-16 23:15:23 +02:00
m [m < minimum_matching_score_current ] <- NA_real_
top_hits <- mo_to_search [order ( m , decreasing = TRUE , na.last = NA ) ] # na.last = NA will remove the NAs
if ( length ( top_hits ) == 0 ) {
warning_ ( " No hits found for \"" , x_search , " \" with minimum_matching_score = " , ifelse ( is.null ( minimum_matching_score ) , " NULL" , minimum_matching_score ) , " . Try setting this value higher." )
result_mo <- NA_character_
} else {
result_mo <- MO_lookup $ mo [match ( top_hits [1 ] , MO_lookup $ fullname ) ]
pkg_env $ mo_uncertainties <- rbind ( pkg_env $ mo_uncertainties ,
data.frame (
minimum_matching_score = ifelse ( is.null ( minimum_matching_score ) , " NULL" , minimum_matching_score ) ,
input = x_search ,
fullname = top_hits [1 ] ,
mo = result_mo ,
candidates = ifelse ( length ( top_hits ) > 1 , paste ( top_hits [2 : min ( 26 , length ( top_hits ) ) ] , collapse = " , " ) , " " ) ,
stringsAsFactors = FALSE
) ,
stringsAsFactors = FALSE
2022-08-28 10:31:50 +02:00
)
2022-09-16 23:15:23 +02:00
# save to package env to save time for next time
pkg_env $ mo_previously_coerced <- unique ( rbind ( pkg_env $ mo_previously_coerced ,
data.frame (
x = x_search ,
mo = result_mo ,
stringsAsFactors = FALSE
) ,
stringsAsFactors = FALSE
2022-08-28 10:31:50 +02:00
) )
2019-09-15 22:57:30 +02:00
}
2022-09-16 23:15:23 +02:00
# the actual result:
result_mo
} )
# remove progress bar from console
close ( progress )
# expand from unique again
out [is.na ( out ) ] <- unlist ( x_coerced ) [match ( x [is.na ( out ) ] , x_unique ) ]
# Throw note about uncertainties ----
if ( isTRUE ( info ) && NROW ( pkg_env $ mo_uncertainties ) > 0 ) {
if ( message_not_thrown_before ( " as.mo" , " uncertainties" , pkg_env $ mo_uncertainties $ input ) ) {
plural <- c ( " " , " this" )
if ( length ( pkg_env $ mo_uncertainties $ input ) > 1 ) {
plural <- c ( " s" , " these uncertainties" )
}
if ( length ( pkg_env $ mo_uncertainties $ input ) <= 3 ) {
examples <- vector_and ( paste0 (
' "' , pkg_env $ mo_uncertainties $ input ,
' " (assumed ' , font_italic ( pkg_env $ mo_uncertainties $ fullname , collapse = NULL ) , " )"
) ,
quotes = FALSE
)
} else {
examples <- paste0 ( nr2char ( length ( pkg_env $ mo_uncertainties $ input ) ) , " microorganism" , plural [1 ] )
2021-02-18 23:23:14 +01:00
}
2022-08-28 10:31:50 +02:00
msg <- paste0 (
2022-09-16 23:15:23 +02:00
" Microorganism translation was uncertain for " , examples ,
" . Run `mo_uncertainties()` to review " , plural [2 ] , " ."
2022-08-28 10:31:50 +02:00
)
2022-09-16 23:15:23 +02:00
message_ ( msg )
2021-02-18 23:23:14 +01:00
}
2022-09-16 23:15:23 +02:00
}
} # end of loop over all yet unknowns
# Keep or replace synonyms ----
if ( isFALSE ( keep_synonyms ) ) {
out_old <- out
gbif_matches <- AMR :: microorganisms $ gbif_renamed_to [match ( out , AMR :: microorganisms $ mo ) ]
gbif_matches [ ! gbif_matches %in% AMR :: microorganisms $ gbif ] <- NA
out [which ( ! is.na ( gbif_matches ) ) ] <- AMR :: microorganisms $ mo [match ( gbif_matches [which ( ! is.na ( gbif_matches ) ) ] , AMR :: microorganisms $ gbif ) ]
lpsn_matches <- AMR :: microorganisms $ lpsn_renamed_to [match ( out , AMR :: microorganisms $ mo ) ]
lpsn_matches [ ! lpsn_matches %in% AMR :: microorganisms $ lpsn ] <- NA
out [which ( ! is.na ( lpsn_matches ) ) ] <- AMR :: microorganisms $ mo [match ( lpsn_matches [which ( ! is.na ( lpsn_matches ) ) ] , AMR :: microorganisms $ lpsn ) ]
if ( isTRUE ( info ) && ( any ( ! is.na ( gbif_matches ) ) || any ( ! is.na ( lpsn_matches ) ) ) && message_not_thrown_before ( " as.mo" , gbif_matches [which ( ! is.na ( gbif_matches ) ) ] [1 : 5 ] , lpsn_matches [which ( ! is.na ( lpsn_matches ) ) ] [1 : 5 ] ) && length ( c ( lpsn_matches , gbif_matches ) ) > 0 ) {
total_old <- out_old [which ( ! is.na ( gbif_matches ) | ! is.na ( lpsn_matches ) ) ]
total_new <- out [which ( ! is.na ( gbif_matches ) | ! is.na ( lpsn_matches ) ) ]
total_new <- total_new [ ! duplicated ( total_old ) ]
total_old <- total_old [ ! duplicated ( total_old ) ]
total_new <- total_new [order ( total_old ) ]
total_old <- total_old [order ( total_old ) ]
refs_old <- microorganisms $ ref [match ( total_old , microorganisms $ mo ) ]
refs_old [ ! is.na ( refs_old ) ] <- paste0 ( " (" , refs_old [ ! is.na ( refs_old ) ] , " )" )
refs_old [is.na ( refs_old ) ] <- " "
refs_new <- microorganisms $ ref [match ( total_new , microorganisms $ mo ) ]
refs_new [ ! is.na ( refs_new ) ] <- paste0 ( " (" , refs_new [ ! is.na ( refs_new ) ] , " )" )
refs_new [is.na ( refs_new ) ] <- " "
message_ (
" The following microorganism" , ifelse ( length ( total_old ) > 1 , " s were" , " was" ) , " taxonomically renamed (use `keep_synonyms = TRUE` to leave uncorrected):\n" ,
paste0 ( " " , microorganisms $ fullname [match ( total_old , microorganisms $ mo ) ] ,
refs_old ,
" -> " , microorganisms $ fullname [match ( total_new , microorganisms $ mo ) ] ,
refs_new ,
collapse = " \n"
)
)
2018-09-27 23:23:48 +02:00
}
2018-07-23 14:14:03 +02:00
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# Apply Becker ----
if ( isTRUE ( Becker ) || Becker == " all" ) {
2020-10-20 21:00:57 +02:00
# warn when species found that are not in:
# - Becker et al. 2014, PMID 25278577
# - Becker et al. 2019, PMID 30872103
# - Becker et al. 2020, PMID 32056452
2022-08-28 10:31:50 +02:00
2021-10-06 13:23:57 +02:00
# comment below code if all staphylococcal species are categorised as CoNS/CoPS
2022-09-16 23:15:23 +02:00
post_Becker <- paste (
" Staphylococcus" ,
c ( " caledonicus" , " canis" , " durrellii" , " lloydii" , " ratti" , " roterodami" , " singaporensis" , " taiwanensis" )
)
if ( any ( out %in% AMR :: microorganisms $ mo [match ( post_Becker , AMR :: microorganisms $ fullname ) ] ) ) {
2021-12-11 13:41:31 +01:00
if ( message_not_thrown_before ( " as.mo" , " becker" ) ) {
2022-03-02 15:38:55 +01:00
warning_ ( " in `as.mo()`: Becker " , font_italic ( " et al." ) , " (2014, 2019, 2020) does not contain these species named after their publication: " ,
2022-09-16 23:15:23 +02:00
vector_and ( font_italic ( gsub ( " Staphylococcus" , " S." , post_Becker , fixed = TRUE ) , collapse = NULL ) , quotes = FALSE ) ,
2022-08-28 10:31:50 +02:00
" . Categorisation to CoNS/CoPS was taken from the original scientific publication(s)." ,
immediate = TRUE
)
2021-10-06 13:23:57 +02:00
}
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# 'MO_CONS' and 'MO_COPS' are <mo> vectors created in R/_pre_commit_hook.R
out [out %in% MO_CONS ] <- " B_STPHY_CONS"
out [out %in% MO_COPS ] <- " B_STPHY_COPS"
2018-09-01 21:19:46 +02:00
if ( Becker == " all" ) {
2022-09-16 23:15:23 +02:00
out [out == " B_STPHY_AURS" ] <- " B_STPHY_COPS"
2018-09-01 21:19:46 +02:00
}
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# Apply Lancefield ----
if ( isTRUE ( Lancefield ) || Lancefield == " all" ) {
2018-09-27 23:23:48 +02:00
# group A - S. pyogenes
2022-09-16 23:15:23 +02:00
out [out == " B_STRPT_PYGN" ] <- " B_STRPT_GRPA"
2018-09-27 23:23:48 +02:00
# group B - S. agalactiae
2022-09-16 23:15:23 +02:00
out [out == " B_STRPT_AGLC" ] <- " B_STRPT_GRPB"
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
out [out %like_case% " ^B_STRPT_(DYSG|EQUI)(_|$)" ] <- " B_STRPT_GRPC"
2018-09-04 11:33:30 +02:00
if ( Lancefield == " all" ) {
2022-09-16 23:15:23 +02:00
# group D - all enterococci
out [out %like_case% " ^B_ENTRC(_|$)" ] <- " B_STRPT_GRPD"
2018-09-27 23:23:48 +02:00
}
2022-09-16 23:15:23 +02:00
# group F - S. anginosus, incl. S. anginosus anginosus and S. anginosus whileyi
out [out %like_case% " ^B_STRPT_ANGN(_|$)" ] <- " B_STRPT_GRPF"
# group G - only S. dysgalactiae which is also group C, so ignore it here
2018-09-27 23:23:48 +02:00
# group H - S. sanguinis
2022-09-16 23:15:23 +02:00
out [out == " B_STRPT_SNGN" ] <- " B_STRPT_GRPH"
# group K - S. salivarius, incl. S. salivarius salivarius and S. salivarius thermophilus
out [out %like_case% " ^B_STRPT_SLVR(_|$)" ] <- " B_STRPT_GRPK"
# group L - only S. dysgalactiae which is also group C, so ignore it here
2018-09-01 21:19:46 +02:00
}
2022-08-28 10:31:50 +02:00
2022-09-16 23:15:23 +02:00
# Return class ----
set_clean_class ( out ,
new_class = c ( " mo" , " character" )
2022-08-28 10:31:50 +02:00
)
2018-09-25 16:44:40 +02:00
}
2022-09-16 23:15:23 +02:00
#' @rdname as.mo
#' @export
is.mo <- function(x) {
  # class membership test: TRUE as soon as the class named below occurs
  # anywhere in the class vector of `x`, FALSE for every other object
  # NOTE(review): the class literal starts with a space in this copy of the
  # file - confirm it matches the class name that is assigned elsewhere
  inherits(x, what = " mo")
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
# Formats an <mo> vector for display as a pillar column.
#
# x:   the <mo> vector to format
# ...: accepted for the generic's signature, not used here
#
# Returns the result of create_pillar_column(). Throws a warning when `x`, or
# any <mo> column of the data set that is currently being printed, holds codes
# that do not occur in MO_lookup$mo.
pillar_shaft.mo <- function(x, ...) {
  out <- format(x)
  # grey out the kingdom (part until first "_")
  out[!is.na(x)] <- gsub(" ^([A-Z]+_)(.*)", paste0(font_subtle(" \\1"), " \\2"), out[!is.na(x)], perl = TRUE)
  # and grey out every _
  out[!is.na(x)] <- gsub(" _", font_subtle(" _"), out[!is.na(x)])

  # markup NA and UNKNOWN
  out[is.na(x)] <- font_na(" NA")
  out[x == " UNKNOWN"] <- font_na(" UNKNOWN")

  # try to get hold of the data set being printed, so that the warning below
  # can name the affected column(s); NULL when that is not possible
  df <- tryCatch(get_current_data(arg_name = " x", call = 0),
    error = function(e) NULL
  )
  if (!is.null(df)) {
    mo_cols <- vapply(FUN.VALUE = logical(1), df, is.mo)
  } else {
    mo_cols <- NULL
  }

  # both operands are single logicals, so the short-circuiting `||` is the
  # right operator here; the data set is only inspected when `x` itself is fine
  has_old_codes <- !all(x[!is.na(x)] %in% MO_lookup$mo) ||
    (!is.null(df) && !all(unlist(df[, which(mo_cols), drop = FALSE]) %in% MO_lookup$mo))
  if (has_old_codes) {
    # markup old mo codes
    out[!x %in% MO_lookup$mo] <- font_italic(
      font_na(x[!x %in% MO_lookup$mo],
        collapse = NULL
      ),
      collapse = NULL
    )
    # throw a warning with the affected column name(s)
    if (!is.null(mo_cols)) {
      col <- paste0(" Column ", vector_or(colnames(df)[mo_cols], quotes = TRUE, sort = FALSE))
    } else {
      col <- " The data"
    }
    warning_(
      col, " contains old MO codes (from a previous AMR package version). ",
      " Please update your MO codes with `as.mo()`."
    )
  }

  # make it always fit exactly
  max_char <- max(nchar(x))
  if (is.na(max_char)) {
    max_char <- 12
  }
  # two extra positions when NA or UNKNOWN values are present
  extra_width <- if (any(x %in% c(NA, " UNKNOWN"))) 2 else 0
  create_pillar_column(out,
    align = " left",
    width = max_char + extra_width
  )
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
type_sum.mo <- function(x, ...) {
  # constant type label; `x` and `...` only exist to satisfy the signature of
  # the generic and are never inspected
  type_label <- " mo"
  type_label
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
# Frequency table for <mo> vectors: delegates to cleaner::freq.default() and
# adds a header with Gram stain counts and the number of genera and species.
#
# x:   the <mo> vector
# ...: passed on to cleaner::freq.default(); a `digits` element, when present,
#      also sets the rounding of the percentages in the header (default 2)
freq.mo <- function(x, ...) {
  present <- as.mo(x[!is.na(x)]) # as.mo() to get the newest mo codes
  grams <- mo_gramstain(present, language = NULL)

  digits <- list(...)$digits
  if (is.null(digits)) {
    digits <- 2
  }

  # "<count> (<share of all non-missing values>)" for one Gram stain label
  gram_summary <- function(stain) {
    n_stain <- sum(grams == stain, na.rm = TRUE)
    paste0(
      format(n_stain, big.mark = " ,", decimal.mark = " ."),
      " (", percentage(n_stain / length(grams), digits = digits),
      " )"
    )
  }

  cleaner::freq.default(
    x = x,
    ...,
    .add_header = list(
      `Gram-negative` = gram_summary(" Gram-negative"),
      `Gram-positive` = gram_summary(" Gram-positive"),
      `Nr. of genera` = pm_n_distinct(mo_genus(present, language = NULL)),
      `Nr. of species` = pm_n_distinct(paste(
        mo_genus(present, language = NULL),
        mo_species(present, language = NULL)
      ))
    )
  )
}
2020-09-28 01:08:55 +02:00
# will be exported using s3_register() in R/zzz.R
# Summary functions that skimr applies to <mo> columns. `column` is part of
# the generic's signature and is not inspected.
get_skimmers.mo <- function(column) {
  # every skimmer is a one-sided formula in which `.` stands for the column
  count_unique <- ~ length(unique(stats::na.omit(.)))
  count_gram_negative <- ~ sum(mo_is_gram_negative(.), na.rm = TRUE)
  count_gram_positive <- ~ sum(mo_is_gram_positive(.), na.rm = TRUE)
  # sorting the negated table puts the most frequent entry first
  most_common_genus <- ~ names(sort(-table(mo_genus(stats::na.omit(.), language = NULL))))[1L]
  most_common_species <- ~ names(sort(-table(mo_name(stats::na.omit(.), language = NULL))))[1L]

  skimr::sfl(
    skim_type = " mo",
    unique_total = count_unique,
    gram_negative = count_gram_negative,
    gram_positive = count_gram_positive,
    top_genus = most_common_genus,
    top_species = most_common_species
  )
}
2020-05-28 16:48:55 +02:00
#' @method print mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2020-08-26 11:33:54 +02:00
print.mo <- function(x, print.shortnames = FALSE, ...) {
  # x:                the <mo> vector to print
  # print.shortnames: when TRUE and `x` has no names of its own, the values are
  #                   labelled with mo_shortname(x, ...)
  # ...:              passed on to mo_shortname()
  cat(" Class <mo>\n")
  x_names <- names(x)
  # scalar condition, so `&&`; isTRUE() turns NA or a longer
  # `print.shortnames` into "do not label" instead of an error in `if`
  if (is.null(x_names) && isTRUE(print.shortnames == TRUE)) {
    # labelling is best effort: any failure simply leaves the values unnamed
    x_names <- tryCatch(mo_shortname(x, ...), error = function(e) NULL)
  }
  x <- as.character(x)
  names(x) <- x_names
  if (!all(x[!is.na(x)] %in% MO_lookup$mo)) {
    warning_(
      " Some MO codes are from a previous AMR package version. ",
      " Please update the MO codes with `as.mo()`."
    )
  }
  print.default(x, quote = FALSE)
}
2018-07-23 14:14:03 +02:00
2020-05-28 16:48:55 +02:00
#' @method summary mo
2018-12-07 12:04:55 +01:00
#' @export
#' @noRd
summary.mo <- function(object, ...) {
  # force as.mo() again, the input could hold codes from an older pkg version
  x <- as.mo(object)

  # frequency of every code, most frequent first; the first three entries are
  # reported (positions that do not exist come out as NA)
  counts <- as.data.frame(table(x), responseName = " n", stringsAsFactors = FALSE)
  by_frequency <- order(-counts$n)
  top_3 <- counts[by_frequency, 1, drop = TRUE][seq_len(3)]

  n_missing <- length(x[is.na(x)])
  n_unique <- pm_n_distinct(x[!is.na(x)])

  value <- c(
    " Class" = " mo",
    " <NA>" = n_missing,
    " Unique" = n_unique,
    " #1" = top_3[1],
    " #2" = top_3[2],
    " #3" = top_3[3]
  )
  class(value) <- c(" summaryDefault", " table")
  value
}
2020-05-28 16:48:55 +02:00
#' @method as.data.frame mo
2018-07-23 14:14:03 +02:00
#' @export
2018-08-31 13:36:19 +02:00
#' @noRd
2020-05-19 13:18:01 +02:00
as.data.frame.mo <- function(x, ...) {
  # warn about codes that are not part of the current lookup table
  is_current <- x[!is.na(x)] %in% MO_lookup$mo
  if (!all(is_current)) {
    warning_(
      " The data contains old MO codes (from a previous AMR package version). ",
      " Please update your MO codes with `as.mo()`."
    )
  }

  # the expression the caller used for `x` becomes the column name, unless the
  # caller passes a name through `...`
  default_name <- deparse1(substitute(x))
  if (" nm" %in% names(list(...))) {
    return(as.data.frame.vector(x, ...))
  }
  as.data.frame.vector(x, ..., nm = default_name)
}
2020-05-28 16:48:55 +02:00
#' @method [ mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2019-08-14 14:57:06 +02:00
"[.mo" <- function(x, ...) {
  # subset with the next method, then put back the attributes of `x` (such as
  # its class) on the result
  y <- NextMethod()
  restored <- attributes(x)
  # the names must be those of the selected elements: copying the names of the
  # whole input fails when the lengths differ and mislabels the elements when
  # they happen to match (e.g. after reordering)
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  y
}
2020-05-28 16:48:55 +02:00
#' @method [[ mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
"[[.mo" <- function(x, ...) {
  # extract with the next method, then put back the attributes of `x` (such as
  # its class) on the result
  y <- NextMethod()
  restored <- attributes(x)
  # never copy the names of the whole input onto the extracted element: that
  # fails as soon as `x` is a named vector with more than one element
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  y
}
2020-05-28 16:48:55 +02:00
#' @method [<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
"[<-.mo" <- function(i, j, ..., value) {
  # note the argument names: `i` is the <mo> vector that is being modified
  y <- NextMethod()
  restored <- attributes(i)
  # keep the names as they are after the assignment; the names of the original
  # vector no longer fit when the assignment made the vector longer
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  # must only contain valid MOs
  return_after_integrity_check(y, " microorganism code", as.character(AMR::microorganisms$mo))
}
2020-05-28 16:48:55 +02:00
#' @method [[<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
"[[<-.mo" <- function(i, j, ..., value) {
  # note the argument names: `i` is the <mo> vector that is being modified
  y <- NextMethod()
  restored <- attributes(i)
  # keep the names as they are after the assignment; the names of the original
  # vector no longer fit when the assignment made the vector longer
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  # must only contain valid MOs
  return_after_integrity_check(y, " microorganism code", as.character(AMR::microorganisms$mo))
}
2020-05-28 16:48:55 +02:00
#' @method c mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2021-05-03 13:06:43 +02:00
c.mo <- function(...) {
  # the attributes (such as the class) of the first element are put back on
  # the combined vector
  x <- list(...)[[1L]]
  y <- NextMethod()
  restored <- attributes(x)
  # the names must be those of the combined vector: the names of the first
  # element alone are too short as soon as anything is appended to it
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  # must only contain valid MOs
  return_after_integrity_check(y, " microorganism code", as.character(AMR::microorganisms$mo))
}
2018-12-06 14:36:39 +01:00
2020-09-25 14:44:50 +02:00
#' @method unique mo
#' @export
#' @noRd
unique.mo <- function(x, incomparables = FALSE, ...) {
  # deduplicate with the next method, then put back the attributes of `x`
  # (such as its class) on the result
  y <- NextMethod()
  restored <- attributes(x)
  # do not copy the names of the whole input: the result is usually shorter,
  # which made the attribute assignment fail for named vectors
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  y
}
2021-02-22 20:21:33 +01:00
#' @method rep mo
#' @export
#' @noRd
rep.mo <- function(x, ...) {
  # repeat with the next method, then put back the attributes of `x` (such as
  # its class) on the result
  y <- NextMethod()
  restored <- attributes(x)
  # use the names of the repeated vector: the names of the input are too short
  # for it, which made the attribute assignment fail for named vectors
  restored[["names"]] <- names(y)
  attributes(y) <- restored
  y
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
2018-12-06 14:36:39 +01:00
#' @export
mo_failures <- function() {
  # hands out whatever is currently stored under `mo_failures` in the package
  # environment; NULL when nothing is stored
  pkg_env[["mo_failures"]]
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
#' @export
mo_uncertainties <- function() {
  stored <- pkg_env$mo_uncertainties
  # nothing stored: return NULL instead of an empty data set
  if (is.null(stored)) {
    return(NULL)
  }
  # the stored values are handed out as a data.frame that carries an extra
  # class, which selects the dedicated print method
  as_df <- as.data.frame(stored, stringsAsFactors = FALSE)
  set_clean_class(as_df,
    new_class = c(" mo_uncertainties", " data.frame")
  )
}
2020-05-28 16:48:55 +02:00
#' @method print mo_uncertainties
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
print.mo_uncertainties <- function(x, ...) {
  # Prints one paragraph per row of `x`: the input, the name and code it was
  # matched to with the matching score, and the other candidates with their
  # scores. Nothing is printed for an empty `x`.
  if (NROW(x) == 0) {
    return(NULL)
  }

  cat(word_wrap(" Matching scores are based on pathogenicity in humans and the resemblance between the input and the full taxonomic name. See `?mo_matching_score`.\n\n", add_fn = font_blue))
  if (has_colour()) {
    colour_key <- word_wrap(" Colour keys: ",
      font_red_bg(" 0.000-0.499 "),
      font_orange_bg(" 0.500-0.599 "),
      font_yellow_bg(" 0.600-0.699 "),
      font_green_bg(" 0.700-1.000"),
      add_fn = font_blue
    )
    cat(colour_key, font_green_bg(" "), " \n", sep = " ")
  }

  # gives every element of `text` the background colour that belongs to the
  # score at the same position
  score_set_colour <- function(text, scores) {
    is_green <- scores >= 0.7
    is_yellow <- scores >= 0.6 & scores < 0.7
    is_orange <- scores >= 0.5 & scores < 0.6
    is_red <- scores < 0.5
    text[is_green] <- font_green_bg(text[is_green], collapse = NULL)
    text[is_yellow] <- font_yellow_bg(text[is_yellow], collapse = NULL)
    text[is_orange] <- font_orange_bg(text[is_orange], collapse = NULL)
    text[is_red] <- font_red_bg(text[is_red], collapse = NULL)
    text
  }

  txt <- " "
  for (i in seq_len(nrow(x))) {
    entry <- x[i, ]

    if (entry$candidates != " ") {
      # the other candidates are stored as one delimited string
      also_matched <- unlist(strsplit(entry$candidates, " , ", fixed = TRUE))
      also_scores <- mo_matching_score(x = entry$input, n = also_matched)

      also_formatted <- font_italic(also_matched, collapse = NULL)
      also_scores_formatted <- trimws(formatC(round(also_scores, 3), format = " f", digits = 3))
      also_scores_formatted <- score_set_colour(also_scores_formatted, also_scores)

      # sort on descending scores
      ranking <- order(1 - also_scores)
      candidates <- word_wrap(
        paste0(
          " Also matched: ",
          vector_and(
            paste0(
              also_formatted[ranking],
              font_blue(paste0(" (", also_scores_formatted[ranking], " )"), collapse = NULL)
            ),
            quotes = FALSE, sort = FALSE
          ),
          ifelse(length(also_matched) == 25,
            font_grey(" [showing first 25]"),
            " "
          )
        ),
        extra_indent = nchar(" Also matched: ")
      )
    } else {
      candidates <- " "
    }

    # score of the match that was actually chosen
    score <- mo_matching_score(
      x = entry$input,
      n = entry$fullname
    )
    score_formatted <- trimws(formatC(round(score, 3), format = " f", digits = 3))

    matched_line <- paste0(
      ' "', entry$input, ' "',
      " -> ",
      paste0(
        font_bold(font_italic(entry$fullname)),
        ifelse(!is.na(entry$renamed_to), paste(" , renamed to", font_italic(entry$renamed_to)), " "),
        " (", entry$mo,
        " , ", score_set_colour(score_formatted, score),
        " ) "
      )
    )
    wrapped <- paste0(
      strwrap(matched_line,
        width = 0.98 * getOption(" width"),
        exdent = nchar(entry$input) + 6
      ),
      collapse = " \n"
    )

    txt <- paste(txt, wrapped, candidates, sep = " \n")
    txt <- paste0(gsub(" \n\n", " \n", txt), " \n\n")
  }
  cat(txt)
}
2022-09-16 23:15:23 +02:00
#' @rdname as.mo
#' @export
mo_reset_session <- function() {
  # number of input values that are currently remembered
  n_remembered <- NROW(pkg_env$mo_previously_coerced)
  if (n_remembered <= 0) {
    message_(" No previously matched input values to reset.")
  } else {
    message_(" Reset ", NROW(pkg_env$mo_previously_coerced), " previously matched input values.")
    # keep the structure (columns) but drop all rows
    pkg_env$mo_previously_coerced <- pkg_env$mo_previously_coerced[0, , drop = FALSE]
  }
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
2018-12-06 14:36:39 +01:00
#' @export
mo_renamed <- function() {
  stored <- pkg_env$mo_renamed
  if (!is.null(stored)) {
    # one row per old name
    items <- pm_distinct(stored, old_name, .keep_all = TRUE)
  } else {
    # nothing was stored: hand out an empty data set
    items <- data.frame(stringsAsFactors = FALSE)
  }
  # the extra class selects the dedicated print method
  items_df <- as.data.frame(items, stringsAsFactors = FALSE)
  set_clean_class(items_df,
    new_class = c(" mo_renamed", " data.frame")
  )
}
2020-05-28 16:48:55 +02:00
#' @method print mo_renamed
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
print.mo_renamed <- function(x, ...) {
  # Prints one message per row of `x`: old name (and reference), new name (and
  # reference) and the code. Nothing is printed for an empty `x`.
  if (NROW(x) == 0) {
    return(invisible())
  }

  # reduces a reference with gsub() and coerces what is left to an integer;
  # the result is NA when that is not a number
  ref_number <- function(ref) {
    as.integer(gsub(" [^0-9]", " ", ref))
  }

  for (i in seq_len(nrow(x))) {
    has_old_ref <- !x$old_ref[i] %in% c(" ", NA)
    has_new_ref <- !x$new_ref[i] %in% c(" ", NA)
    # the comparison is NA when one of the references holds no number (e.g. a
    # missing old reference); isTRUE() keeps that NA out of the message, where
    # it used to be printed literally
    renamed_back <- has_new_ref &&
      isTRUE(ref_number(x$new_ref[i]) < ref_number(x$old_ref[i]))

    message_(
      font_italic(x$old_name[i]),
      if (has_old_ref) {
        paste0(" (", gsub(" et al.", font_italic(" et al."), x$old_ref[i]), " )")
      } else {
        " "
      },
      " was renamed ",
      if (renamed_back) {
        font_bold(" back to ")
      } else {
        " "
      },
      font_italic(x$new_name[i]),
      if (has_new_ref) {
        paste0(" (", gsub(" et al.", font_italic(" et al."), x$new_ref[i]), " )")
      } else {
        " "
      },
      " [", x$mo[i], " ]"
    )
  }
}
# Spells out the numbers 1 to 10 as a word; any other input is returned as is.
nr2char <- function(x) {
  if (!x %in% c(1:10)) {
    return(x)
  }
  # the words are the names of the vector, so indexing by `x` and taking the
  # name of the result gives the word
  v <- structure(
    1:10,
    names = c(
      " one", " two", " three", " four", " five",
      " six", " seven", " eight", " nine", " ten"
    )
  )
  names(v[x])
}
2019-03-15 13:57:25 +01:00
# Replaces every match of the pattern below in `x`; text that consists only of
# letters, digits, spaces and hyphens does not match and is returned unchanged.
unregex <- function(x) {
  gsub(pattern = " [^a-zA-Z0-9 -]", replacement = " ", x = x)
}
2019-03-18 14:29:41 +01:00
2019-03-26 14:24:03 +01:00
# Translates the `allow_uncertain` argument to an uncertainty level.
# TRUE gives level 2; otherwise the two keywords below are replaced by a
# number, the value is coerced to an integer and it must be one of 0 to 3.
translate_allow_uncertain <- function(allow_uncertain) {
  if (isTRUE(allow_uncertain)) {
    # default to uncertainty level 2
    return(2)
  }

  # keywords are matched regardless of case
  allow_uncertain[tolower(allow_uncertain) == " none"] <- 0
  allow_uncertain[tolower(allow_uncertain) == " all"] <- 3
  allow_uncertain <- as.integer(allow_uncertain)

  stop_ifnot(allow_uncertain %in% c(0:3),
    ' `allow_uncertain` must be a number between 0 (or "none") and 3 (or "all"), or TRUE (= 2) or FALSE (= 0)',
    call = FALSE
  )
  allow_uncertain
}
2019-07-01 14:03:15 +02:00
# Takes the stored failures, uncertainties and renamed items out of the
# package environment: they are returned as a list and cleared afterwards.
get_mo_failures_uncertainties_renamed <- function() {
  snapshot <- list(
    failures = pkg_env$mo_failures,
    uncertainties = pkg_env$mo_uncertainties,
    renamed = pkg_env$mo_renamed
  )

  # empty them, otherwise mo_shortname("Chlamydophila psittaci") will give 3 notes
  pkg_env$mo_renamed <- NULL
  pkg_env$mo_uncertainties <- NULL
  pkg_env$mo_failures <- NULL

  snapshot
}
# Puts the `failures`, `uncertainties` and `renamed` elements of `metadata`
# back into the package environment.
load_mo_failures_uncertainties_renamed <- function(metadata) {
  pkg_env$mo_renamed <- metadata$renamed
  pkg_env$mo_uncertainties <- metadata$uncertainties
  pkg_env$mo_failures <- metadata$failures
}
2019-11-15 15:25:03 +01:00
2020-04-13 21:09:56 +02:00
# Collapses whitespace inside `x` according to the pattern below and then
# removes leading and trailing whitespace.
trimws2 <- function(x) {
  collapsed <- gsub(" [\\s]+", " ", x, perl = TRUE)
  trimws(collapsed)
}
2020-04-14 15:10:09 +02:00
# Turns the input into a cleaned character vector.
# Input with dimensions may have at most two columns: two columns are pasted
# together, a single column is extracted. The values are then converted with
# iconv(), two gsub() replacements are applied and the result is trimmed.
# Any error is thrown again without the call.
parse_and_convert <- function(x) {
  tryCatch(
    expr = {
      if (!is.null(dim(x))) {
        n_columns <- NCOL(x)
        if (n_columns > 2) {
          stop(" a maximum of two columns is allowed", call. = FALSE)
        }
        x <- as.data.frame(x, stringsAsFactors = FALSE)
        if (n_columns == 2) {
          # support Tidyverse selection like: df %>% select(colA, colB)
          # paste these columns together
          colnames(x) <- c(" A", " B")
          x <- paste(x$A, x$B)
        } else {
          # support Tidyverse selection like: df %>% select(colA)
          x <- x[[1]]
        }
      }

      parsed <- iconv(as.character(x), to = " UTF-8")
      # values that were lost in the conversion get a second attempt with an
      # explicit source encoding
      failed <- is.na(parsed) & !is.na(x)
      parsed[failed] <- iconv(x[failed], from = " Latin1", to = " ASCII//TRANSLIT")

      parsed <- gsub(' "', " ", parsed, fixed = TRUE)
      parsed <- gsub(" +", " ", parsed, perl = TRUE)
      trimws(parsed)
    },
    # this will also be thrown when running `as.mo(no_existing_object)`
    error = function(e) stop(e$message, call. = FALSE)
  )
}
2020-05-16 13:05:47 +02:00
2020-07-22 12:29:51 +02:00
# Transforms old MO codes to current codes, such as:
# B_ESCH_COL (AMR v0.5.0) -> B_ESCHR_COLI
#
# x:        input values; only values that look like a code (see the pattern
#           below) and do not occur in MO_lookup$mo are touched
# property: name of the property that was asked for; it only selects the
#           wording of the warning
#
# Returns `x` with the affected values replaced by the first code found for
# them (NA when nothing was found). A warning is thrown when anything was
# replaced.
replace_old_mo_codes <- function(x, property) {
  is_old <- x %like_case% " ^[A-Z]_[A-Z_]+$" & !x %in% MO_lookup$mo
  if (!any(is_old)) {
    return(x)
  }

  # get the ones that match
  affected <- x[is_old]
  affected_unique <- unique(affected)

  # find their new codes, once per code: the old code is split up and turned
  # into a pattern for the kingdom and a pattern for the full name
  matches <- lapply(
    strsplit(affected_unique, " "),
    function(parts) {
      kingdom <- paste0(" ^", parts[1])
      name <- parts[3:length(parts)]
      name[name == " _"] <- " "
      name <- tolower(paste0(name, " .*", collapse = " "))
      name <- gsub(" .*", " ", name, fixed = TRUE)
      name <- paste0(" ^", name)
      MO_lookup$mo[MO_lookup$kingdom %like_case% kingdom &
        MO_lookup$fullname_lower %like_case% name]
    }
  )
  # more than one hit for any code means that the first one is only a guess
  all_direct_matches <- !any(lengths(matches) > 1)
  solved_unique <- unlist(lapply(matches, function(hits) hits[1L]), use.names = FALSE)
  solved <- solved_unique[match(affected, affected_unique)]

  # assign on places where a match was found
  x[is_old] <- solved

  n_matched <- sum(!is.na(affected))
  n_solved <- sum(!is.na(solved))
  n_unsolved <- sum(is.na(solved))
  n_unique <- sum(!is.na(affected_unique))
  # only mention the number of unique codes when it adds information
  if (n_unique < n_matched) {
    n_unique <- paste0(n_unique, " unique, ")
  } else {
    n_unique <- " "
  }

  if (property == " mo") {
    warning_(
      " in `as.mo()`: the input contained ", n_matched,
      " old MO code", ifelse(n_matched == 1, " ", " s"),
      " (", n_unique, " from a previous AMR package version). ",
      n_solved, " old MO code", ifelse(n_solved == 1, " ", " s"),
      ifelse(n_solved == 1, " was", " were"),
      ifelse(all_direct_matches, " updated ", font_bold(" guessed ")),
      " to ", ifelse(n_solved == 1, " a ", " "),
      " currently used MO code", ifelse(n_solved == 1, " ", " s"),
      ifelse(n_unsolved > 0,
        paste0(" and ", n_unsolved, " old MO code", ifelse(n_unsolved == 1, " ", " s"), " could not be updated."),
        " ."
      )
    )
  } else {
    warning_(
      " in `mo_", property, " ()`: the input contained ", n_matched,
      " old MO code", ifelse(n_matched == 1, " ", " s"),
      " (", n_unique, " from a previous AMR package version). ",
      " Please update your MO codes with `as.mo()` to increase speed."
    )
  }
  x
}
2020-09-03 12:31:48 +02:00
# Sets every element of `x` that matches `ignore_pattern` to NA and reports
# which input was ignored. `x` is returned unchanged when no usable pattern
# is given.
replace_ignore_pattern <- function(x, ignore_pattern) {
  if (is.null(ignore_pattern) || identical(trimws2(ignore_pattern), " ")) {
    return(x)
  }

  ignore_cases <- x %like% ignore_pattern
  if (sum(ignore_cases) > 0) {
    message_(
      " The following input was ignored by `ignore_pattern = \"", ignore_pattern, " \"`: ",
      vector_and(x[ignore_cases], quotes = TRUE)
    )
    x[ignore_cases] <- NA_character_
  }
  x
}
2020-11-05 01:11:49 +01:00
# Brings a user-supplied reference data set into a two-column form: the first
# column holds the input values as characters, the second one the codes after
# they went through as.mo(). NULL is returned for NULL.
repair_reference_df <- function(reference_df) {
  if (is.null(reference_df)) {
    return(NULL)
  }

  # has valid own reference_df: rows without a code are of no use
  reference_df <- reference_df %pm>%
    pm_filter(!is.na(mo))

  # keep only two columns, the second must be the one with the codes: when the
  # codes come first, the other column is taken from position 2
  codes_come_first <- colnames(reference_df)[1] == " mo"
  if (codes_come_first) {
    reference_df <- reference_df %pm>% pm_select(2, " mo")
  } else {
    reference_df <- reference_df %pm>% pm_select(1, " mo")
  }

  # remove factors, just keep characters
  colnames(reference_df)[1] <- " x"
  input_values <- as.character(reference_df[, " x", drop = TRUE])
  reference_df[, " x"] <- input_values
  code_values <- as.character(reference_df[, " mo", drop = TRUE])
  reference_df[, " mo"] <- code_values

  # some MO codes might be old
  reference_df[, " mo"] <- as.mo(reference_df[, " mo", drop = TRUE])
  reference_df
}
2021-01-25 21:58:00 +01:00
# Removes `n` words from one side of every element of `text`.
#
# text: character vector; every element is split into words with strsplit()
# n:    number of words to remove
# side: pattern-matched against the two side patterns below to choose between
#       removing from the end or from the start
#
# Returns a character vector as long as `text`. An element that does not have
# more than `n` words, or a `side` that matches neither pattern, gives an
# empty string.
strip_words <- function(text, n, side = " right") {
  # `side` is the same for all elements, so match it only once
  from_right <- side %like% " ^r"
  from_left <- side %like% " ^l"

  out <- lapply(strsplit(text, " "), function(words) {
    n_words <- length(words)
    if (n_words <= n) {
      # nothing would be left; NULL is pasted to an empty string below
      return(NULL)
    }
    if (from_right) {
      words[seq_len(n_words - n)]
    } else if (from_left) {
      # drop the first `n` words; this used to drop exactly one word,
      # whatever `n` was
      words[(n + 1):n_words]
    }
  })
  vapply(FUN.VALUE = character(1), out, paste, collapse = " ")
}