2018-06-08 12:06:54 +02:00
# ==================================================================== #
# TITLE #
2021-02-02 23:57:35 +01:00
# Antimicrobial Resistance (AMR) Data Analysis for R #
2018-06-08 12:06:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2018-06-08 12:06:54 +02:00
# #
# LICENCE #
2020-12-27 00:30:28 +01:00
# (c) 2018-2021 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
2020-10-26 12:23:03 +01:00
# Diagnostics & Advice, and University Medical Center Groningen. #
2018-06-08 12:06:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2018-06-08 12:06:54 +02:00
# ==================================================================== #
2021-01-18 16:57:56 +01:00
#' Transform Input to a Microorganism ID
2018-06-08 12:06:54 +02:00
#'
2021-01-18 16:57:56 +01:00
#' Use this function to determine a valid microorganism ID ([`mo`]). Determination is done using intelligent rules and the complete taxonomic kingdoms Bacteria, Chromista, Protozoa, Archaea and most microbial species from the kingdom Fungi (see *Source*). The input can be almost anything: a full name (like `"Staphylococcus aureus"`), an abbreviated name (such as `"S. aureus"`), an abbreviation known in the field (such as `"MRSA"`), or just a genus. See *Examples*.
#' @inheritSection lifecycle Stable Lifecycle
2021-05-12 18:15:03 +02:00
#' @param x a [character] vector or a [data.frame] with one or two columns
#' @param Becker a [logical] to indicate whether staphylococci should be categorised into coagulase-negative staphylococci ("CoNS") and coagulase-positive staphylococci ("CoPS") instead of their own species, according to Karsten Becker *et al.* (1,2,3).
2018-09-04 11:33:30 +02:00
#'
2019-12-20 15:05:58 +01:00
#' This excludes *Staphylococcus aureus* at default, use `Becker = "all"` to also categorise *S. aureus* as "CoPS".
2021-05-12 18:15:03 +02:00
#' @param Lancefield a [logical] to indicate whether beta-haemolytic *Streptococci* should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield (4). These *Streptococci* will be categorised in their first group, e.g. *Streptococcus dysgalactiae* will be group C, although officially it was also categorised into groups G and L.
2018-09-04 11:33:30 +02:00
#'
2019-12-20 15:05:58 +01:00
#' This excludes *Enterococci* at default (who are in group D), use `Lancefield = "all"` to also categorise all *Enterococci* as group D.
2021-01-18 16:57:56 +01:00
#' @param allow_uncertain a number between `0` (or `"none"`) and `3` (or `"all"`), or `TRUE` (= `2`) or `FALSE` (= `0`) to indicate whether the input should be checked for less probable results, see *Details*
2020-09-18 16:05:53 +02:00
#' @param reference_df a [data.frame] to be used for extra reference when translating `x` to a valid [`mo`]. See [set_mo_source()] and [get_mo_source()] to automate the usage of your own codes (e.g. used in your analysis or organisation).
2020-09-03 12:31:48 +02:00
#' @param ignore_pattern a regular expression (case-insensitive) of which all matches in `x` must return `NA`. This can be convenient to exclude known non-relevant input and can also be set with the option `AMR_ignore_pattern`, e.g. `options(AMR_ignore_pattern = "(not reported|contaminated flora)")`.
2020-09-14 12:21:23 +02:00
#' @param language language to translate text like "no growth", which defaults to the system language (see [get_locale()])
2021-04-20 10:46:17 +02:00
#' @param info a [logical] to indicate if a progress bar should be printed if more than 25 items are to be coerced, defaults to `TRUE` only in interactive mode
2020-12-22 00:51:17 +01:00
#' @param ... other arguments passed on to functions
2018-08-31 13:36:19 +02:00
#' @rdname as.mo
#' @aliases mo
#' @keywords mo Becker becker Lancefield lancefield guess
2018-09-24 23:33:29 +02:00
#' @details
2021-01-18 16:57:56 +01:00
#' ## General Info
2020-10-26 12:23:03 +01:00
#'
2020-09-03 12:31:48 +02:00
#' A microorganism ID from this package (class: [`mo`]) is human readable and typically looks like these examples:
2019-11-28 22:32:17 +01:00
#' ```
2019-09-18 15:46:09 +02:00
#' Code Full name
#' --------------- --------------------------------------
#' B_KLBSL Klebsiella
#' B_KLBSL_PNMN Klebsiella pneumoniae
#' B_KLBSL_PNMN_RHNS Klebsiella pneumoniae rhinoscleromatis
#' | | | |
#' | | | |
2020-09-03 12:31:48 +02:00
#' | | | \---> subspecies, a 4-5 letter acronym
#' | | \----> species, a 4-5 letter acronym
#' | \----> genus, a 5-7 letter acronym
#' \----> taxonomic kingdom: A (Archaea), AN (Animalia), B (Bacteria),
2019-08-13 16:15:08 +02:00
#' C (Chromista), F (Fungi), P (Protozoa)
2019-11-28 22:32:17 +01:00
#' ```
2018-08-01 08:03:31 +02:00
#'
2020-07-22 10:24:23 +02:00
#' Values that cannot be coerced will be considered 'unknown' and will get the MO code `UNKNOWN`.
2019-03-02 22:47:04 +01:00
#'
2021-01-18 16:57:56 +01:00
#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see *Examples*.
2019-03-15 13:57:25 +01:00
#'
2019-11-28 22:32:17 +01:00
#' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see [microorganisms]).
2019-09-15 22:57:30 +02:00
#'
2019-11-28 22:32:17 +01:00
#' The [as.mo()] function uses several coercion rules for fast and logical results. It assesses the input matching criteria in the following order:
2020-10-26 12:23:03 +01:00
#'
2019-11-28 22:32:17 +01:00
#' 1. Human pathogenic prevalence: the function starts with more prevalent microorganisms, followed by less prevalent ones;
#' 2. Taxonomic kingdom: the function starts with determining Bacteria, then Fungi, then Protozoa, then others;
#' 3. Breakdown of input values to identify possible matches.
2018-09-24 23:33:29 +02:00
#'
2020-09-18 16:05:53 +02:00
#' This will lead to the effect that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first.
2020-10-26 12:23:03 +01:00
#'
2021-01-18 16:57:56 +01:00
#' ## Coping with Uncertain Results
2020-10-26 12:23:03 +01:00
#'
#' In addition, the [as.mo()] function can differentiate four levels of uncertainty to guess valid results:
2019-11-28 22:32:17 +01:00
#' - Uncertainty level 0: no additional rules are applied;
#' - Uncertainty level 1: allow previously accepted (but now invalid) taxonomic names and minor spelling errors;
#' - Uncertainty level 2: allow all of level 1, strip values between brackets, inverse the words of the input, strip off text elements from the end keeping at least two elements;
#' - Uncertainty level 3: allow all of level 1 and 2, strip off text elements from the end, allow any part of a taxonomic name.
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' The level of uncertainty can be set using the argument `allow_uncertain`. The default is `allow_uncertain = TRUE`, which is equal to uncertainty level 2. Using `allow_uncertain = FALSE` is equal to uncertainty level 0 and will skip all rules. You can also use e.g. `as.mo(..., allow_uncertain = 1)` to only allow up to level 1 uncertainty.
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' With the default setting (`allow_uncertain = TRUE`, level 2), below examples will lead to valid results:
2020-03-14 14:05:43 +01:00
#' - `"Streptococcus group B (known as S. agalactiae)"`. The text between brackets will be removed and a warning will be thrown that the result *Streptococcus group B* (``r as.mo("Streptococcus group B")``) needs review.
#' - `"S. aureus - please mind: MRSA"`. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result *Staphylococcus aureus* (``r as.mo("Staphylococcus aureus")``) needs review.
#' - `"Fluoroquinolone-resistant Neisseria gonorrhoeae"`. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result *Neisseria gonorrhoeae* (``r as.mo("Neisseria gonorrhoeae")``) needs review.
2020-10-26 12:23:03 +01:00
#'
2020-07-22 10:24:23 +02:00
#' There are three helper functions that can be run after using the [as.mo()] function:
2021-01-18 16:57:56 +01:00
#' - Use [mo_uncertainties()] to get a [data.frame] that prints in a pretty format with all taxonomic names that were guessed. The output contains the matching score for all matches (see *Matching Score for Microorganisms* below).
2020-09-18 16:05:53 +02:00
#' - Use [mo_failures()] to get a [character] [vector] with all values that could not be coerced to a valid value.
#' - Use [mo_renamed()] to get a [data.frame] with all values that could be coerced based on old, previously accepted taxonomic names.
2019-02-08 16:06:54 +01:00
#'
2021-01-18 16:57:56 +01:00
#' ## Microbial Prevalence of Pathogens in Humans
2020-10-26 12:23:03 +01:00
#'
2021-01-18 16:57:56 +01:00
#' The intelligent rules consider the prevalence of microorganisms in humans grouped into three groups, which is available as the `prevalence` columns in the [microorganisms] and [microorganisms.old] data sets. The grouping into human pathogenic prevalence is explained in the section *Matching Score for Microorganisms* below.
#' @inheritSection mo_matching_score Matching Score for Microorganisms
2019-02-20 00:04:48 +01:00
#' @inheritSection catalogue_of_life Catalogue of Life
2019-05-20 12:00:18 +02:00
# (source as a section here, so it can be inherited by other man pages:)
2018-09-24 23:33:29 +02:00
#' @section Source:
2021-01-06 11:16:17 +01:00
#' 1. Becker K *et al.* **Coagulase-Negative Staphylococci**. 2014. Clin Microbiol Rev. 27(4): 870– 926; \doi{10.1128/CMR.00109-13}
#' 2. Becker K *et al.* **Implications of identifying the recently defined members of the *S. aureus* complex, *S. argenteus* and *S. schweitzeri*: A position paper of members of the ESCMID Study Group for staphylococci and Staphylococcal Diseases (ESGS).** 2019. Clin Microbiol Infect; \doi{10.1016/j.cmi.2019.02.028}
#' 3. Becker K *et al.* **Emergence of coagulase-negative staphylococci** 2020. Expert Rev Anti Infect Ther. 18(4):349-366; \doi{10.1080/14787210.2020.1730813}
#' 4. Lancefield RC **A serological differentiation of human and other groups of hemolytic streptococci**. 1933. J Exp Med. 57(4): 571– 95; \doi{10.1084/jem.57.4.571}
2021-03-11 21:42:30 +01:00
#' 5. `r gsub("{year}", CATALOGUE_OF_LIFE$year, CATALOGUE_OF_LIFE$version, fixed = TRUE)`, <http://www.catalogueoflife.org>
#' 6. List of Prokaryotic names with Standing in Nomenclature (`r CATALOGUE_OF_LIFE$yearmonth_LPSN`), \doi{10.1099/ijsem.0.004332}
#' 7. `r SNOMED_VERSION$current_source`, retrieved from the `r SNOMED_VERSION$title`, OID `r SNOMED_VERSION$current_oid`, version `r SNOMED_VERSION$current_version`; url: <`r SNOMED_VERSION$url`>
2018-06-08 12:06:54 +02:00
#' @export
2020-09-18 16:05:53 +02:00
#' @return A [character] [vector] with additional class [`mo`]
#' @seealso [microorganisms] for the [data.frame] that is being used to determine ID's.
2020-10-26 12:23:03 +01:00
#'
2020-12-17 16:22:25 +01:00
#' The [`mo_*`][mo_property()] functions (such as [mo_genus()], [mo_gramstain()]) to get properties based on the returned code.
2021-01-18 16:57:56 +01:00
#' @inheritSection AMR Reference Data Publicly Available
#' @inheritSection AMR Read more on Our Website!
2018-06-08 12:06:54 +02:00
#' @examples
2019-07-02 16:48:52 +02:00
#' \donttest{
2019-09-18 15:46:09 +02:00
#' # These examples all return "B_STPHY_AURS", the ID of S. aureus:
2019-03-09 08:21:00 +01:00
#' as.mo("sau") # WHONET code
2018-08-31 13:36:19 +02:00
#' as.mo("stau")
#' as.mo("STAU")
#' as.mo("staaur")
#' as.mo("S. aureus")
#' as.mo("S aureus")
#' as.mo("Staphylococcus aureus")
2018-12-07 12:04:55 +01:00
#' as.mo("Staphylococcus aureus (MRSA)")
2019-11-15 15:25:03 +01:00
#' as.mo("Zthafilokkoockus oureuz") # handles incorrect spelling
2020-01-27 19:14:23 +01:00
#' as.mo("MRSA") # Methicillin Resistant S. aureus
#' as.mo("VISA") # Vancomycin Intermediate S. aureus
#' as.mo("VRSA") # Vancomycin Resistant S. aureus
#' as.mo(115329001) # SNOMED CT code
2020-10-26 12:23:03 +01:00
#'
2019-03-18 14:29:41 +01:00
#' # Dyslexia is no problem - these all work:
#' as.mo("Ureaplasma urealyticum")
#' as.mo("Ureaplasma urealyticus")
#' as.mo("Ureaplasmium urealytica")
#' as.mo("Ureaplazma urealitycium")
#'
2018-09-05 10:51:46 +02:00
#' as.mo("Streptococcus group A")
#' as.mo("GAS") # Group A Streptococci
#' as.mo("GBS") # Group B Streptococci
#'
2019-09-18 15:46:09 +02:00
#' as.mo("S. epidermidis") # will remain species: B_STPHY_EPDR
#' as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CONS
2018-08-02 13:15:45 +02:00
#'
2019-09-18 15:46:09 +02:00
#' as.mo("S. pyogenes") # will remain species: B_STRPT_PYGN
#' as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRPA
2018-08-02 13:15:45 +02:00
#'
2019-03-18 14:29:41 +01:00
#' # All mo_* functions use as.mo() internally too (see ?mo_property):
2020-11-16 11:03:24 +01:00
#' mo_genus("E. coli") # returns "Escherichia"
#' mo_gramstain("E. coli") # returns "Gram negative"
#' mo_is_intrinsic_resistant("E. coli", "vanco") # returns TRUE
2018-06-08 12:06:54 +02:00
#' }
2020-10-26 12:23:03 +01:00
as.mo <- function ( x ,
Becker = FALSE ,
Lancefield = FALSE ,
allow_uncertain = TRUE ,
2020-09-03 12:31:48 +02:00
reference_df = get_mo_source ( ) ,
ignore_pattern = getOption ( " AMR_ignore_pattern" ) ,
2020-09-14 12:21:23 +02:00
language = get_locale ( ) ,
2021-04-20 10:46:17 +02:00
info = interactive ( ) ,
2019-11-23 12:39:57 +01:00
... ) {
2020-10-20 21:00:57 +02:00
meet_criteria ( x , allow_class = c ( " mo" , " data.frame" , " list" , " character" , " numeric" , " integer" , " factor" ) , allow_NA = TRUE )
2020-10-19 17:09:19 +02:00
meet_criteria ( Becker , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( Lancefield , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( allow_uncertain , allow_class = c ( " logical" , " numeric" , " integer" ) , has_length = 1 )
meet_criteria ( reference_df , allow_class = " data.frame" , allow_NULL = TRUE )
meet_criteria ( ignore_pattern , allow_class = " character" , has_length = 1 , allow_NULL = TRUE )
meet_criteria ( language , has_length = 1 , is_in = c ( LANGUAGES_SUPPORTED , " " ) , allow_NULL = TRUE , allow_NA = TRUE )
2021-04-20 10:46:17 +02:00
meet_criteria ( info , allow_class = " logical" , has_length = 1 )
2020-02-14 19:54:13 +01:00
check_dataset_integrity ( )
2020-10-26 12:23:03 +01:00
2020-10-04 19:26:43 +02:00
if ( tryCatch ( all ( x [ ! is.na ( x ) ] %in% MO_lookup $ mo )
2020-09-12 08:49:01 +02:00
& isFALSE ( Becker )
& isFALSE ( Lancefield ) , error = function ( e ) FALSE ) ) {
# don't look into valid MO codes, just return them
2020-11-10 16:35:56 +01:00
# is.mo() won't work - MO codes might change between package versions
2020-11-16 16:57:55 +01:00
return ( set_clean_class ( x , new_class = c ( " mo" , " character" ) ) )
2020-09-12 08:49:01 +02:00
}
2020-10-26 12:23:03 +01:00
2020-04-13 21:09:56 +02:00
# start off with replaced language-specific non-ASCII characters with ASCII characters
2020-04-14 15:10:09 +02:00
x <- parse_and_convert ( x )
2020-07-22 10:24:23 +02:00
# replace mo codes used in older package versions
2020-07-22 12:29:51 +02:00
x <- replace_old_mo_codes ( x , property = " mo" )
2020-09-03 12:31:48 +02:00
# ignore cases that match the ignore pattern
x <- replace_ignore_pattern ( x , ignore_pattern )
2020-10-26 12:23:03 +01:00
2019-06-02 19:23:19 +02:00
# WHONET: xxx = no growth
x [tolower ( as.character ( paste0 ( x , " " ) ) ) %in% c ( " " , " xxx" , " na" , " nan" ) ] <- NA_character_
2020-09-14 12:21:23 +02:00
# Laboratory systems: remove (translated) entries like "no growth", etc.
x [trimws2 ( x ) %like% translate_AMR ( " no .*growth" , language = language ) ] <- NA_character_
x [trimws2 ( x ) %like% paste0 ( " ^(" , translate_AMR ( " no|not" , language = language ) , " ) [a-z]+" ) ] <- " UNKNOWN"
2019-03-26 14:24:03 +01:00
uncertainty_level <- translate_allow_uncertain ( allow_uncertain )
2021-02-21 22:56:35 +01:00
if ( tryCatch ( all ( x == " " | gsub ( " .*(unknown ).*" , " unknown name" , tolower ( x ) , perl = TRUE ) %in% MO_lookup $ fullname_lower , na.rm = TRUE )
& isFALSE ( Becker )
& isFALSE ( Lancefield ) , error = function ( e ) FALSE ) ) {
# to improve speed, special case for taxonomically correct full names (case-insensitive)
return ( MO_lookup [match ( gsub ( " .*(unknown ).*" , " unknown name" , tolower ( x ) , perl = TRUE ) , MO_lookup $ fullname_lower ) , " mo" , drop = TRUE ] )
}
2020-10-26 12:23:03 +01:00
2020-12-17 16:22:25 +01:00
if ( ! is.null ( reference_df )
2020-12-22 00:51:17 +01:00
&& check_validity_mo_source ( reference_df )
2020-11-05 01:11:49 +01:00
&& isFALSE ( Becker )
&& isFALSE ( Lancefield )
&& all ( x %in% unlist ( reference_df ) , na.rm = TRUE ) ) {
reference_df <- repair_reference_df ( reference_df )
2019-03-01 09:34:04 +01:00
suppressWarnings (
2020-09-18 16:05:53 +02:00
y <- data.frame ( x = x , stringsAsFactors = FALSE ) %pm>%
pm_left_join ( reference_df , by = " x" ) %pm>%
2020-11-05 01:11:49 +01:00
pm_pull ( mo )
2019-03-01 09:34:04 +01:00
)
2020-10-26 12:23:03 +01:00
2020-10-04 19:26:43 +02:00
} else if ( all ( x [ ! is.na ( x ) ] %in% MO_lookup $ mo )
2019-03-12 12:19:27 +01:00
& isFALSE ( Becker )
& isFALSE ( Lancefield ) ) {
2019-02-26 12:33:26 +01:00
y <- x
2020-10-26 12:23:03 +01:00
2019-03-18 14:29:41 +01:00
} else {
# will be checked for mo class in validation and uses exec_as.mo internally if necessary
y <- mo_validate ( x = x , property = " mo" ,
Becker = Becker , Lancefield = Lancefield ,
2020-10-26 12:23:03 +01:00
allow_uncertain = uncertainty_level ,
2020-09-14 12:21:23 +02:00
reference_df = reference_df ,
2020-09-03 12:31:48 +02:00
ignore_pattern = ignore_pattern ,
2020-09-14 12:21:23 +02:00
language = language ,
2021-04-20 10:46:17 +02:00
info = info ,
2019-05-10 16:44:59 +02:00
... )
2019-03-18 14:29:41 +01:00
}
2020-10-26 12:23:03 +01:00
2020-11-16 16:57:55 +01:00
set_clean_class ( y ,
new_class = c ( " mo" , " character" ) )
2018-09-27 23:23:48 +02:00
}
#' @rdname as.mo
#' @export
is.mo <- function ( x ) {
2020-01-31 23:27:38 +01:00
inherits ( x , " mo" )
2018-09-27 23:23:48 +02:00
}
2020-02-14 19:54:13 +01:00
# param property a column name of microorganisms
2021-05-12 18:15:03 +02:00
# param initial_search [logical] - is FALSE when coming from uncertain tries, which uses exec_as.mo internally too
# param dyslexia_mode [logical] - also check for characters that resemble others
# param debug [logical] - show different lookup texts while searching
# param reference_data_to_use [data.frame] - the data set to check for
2020-09-14 12:21:23 +02:00
# param actual_uncertainty - (only for initial_search = FALSE) the actual uncertainty level used in the function for score calculation (sometimes passed as 2 or 3 by uncertain_fn())
# param actual_input - (only for initial_search = FALSE) the actual, original input
# param language - used for translating "no growth", etc.
2019-03-15 13:57:25 +01:00
exec_as.mo <- function ( x ,
Becker = FALSE ,
Lancefield = FALSE ,
allow_uncertain = TRUE ,
reference_df = get_mo_source ( ) ,
2021-04-20 10:46:17 +02:00
info = interactive ( ) ,
2019-03-15 13:57:25 +01:00
property = " mo" ,
2019-03-15 17:36:42 +01:00
initial_search = TRUE ,
2019-08-12 19:07:15 +02:00
dyslexia_mode = FALSE ,
2019-09-15 22:57:30 +02:00
debug = FALSE ,
2020-09-03 12:31:48 +02:00
ignore_pattern = getOption ( " AMR_ignore_pattern" ) ,
2020-09-14 12:21:23 +02:00
reference_data_to_use = MO_lookup ,
actual_uncertainty = 1 ,
actual_input = NULL ,
language = get_locale ( ) ) {
2020-10-20 21:00:57 +02:00
meet_criteria ( x , allow_class = c ( " mo" , " data.frame" , " list" , " character" , " numeric" , " integer" , " factor" ) , allow_NA = TRUE )
2020-10-19 17:09:19 +02:00
meet_criteria ( Becker , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( Lancefield , allow_class = c ( " logical" , " character" ) , has_length = 1 )
meet_criteria ( allow_uncertain , allow_class = c ( " logical" , " numeric" , " integer" ) , has_length = 1 )
meet_criteria ( reference_df , allow_class = " data.frame" , allow_NULL = TRUE )
meet_criteria ( property , allow_class = " character" , has_length = 1 , is_in = colnames ( microorganisms ) )
meet_criteria ( initial_search , allow_class = " logical" , has_length = 1 )
meet_criteria ( dyslexia_mode , allow_class = " logical" , has_length = 1 )
meet_criteria ( debug , allow_class = " logical" , has_length = 1 )
meet_criteria ( ignore_pattern , allow_class = " character" , has_length = 1 , allow_NULL = TRUE )
meet_criteria ( reference_data_to_use , allow_class = " data.frame" )
meet_criteria ( actual_uncertainty , allow_class = " numeric" , has_length = 1 )
meet_criteria ( actual_input , allow_class = " character" , allow_NULL = TRUE )
meet_criteria ( language , has_length = 1 , is_in = c ( LANGUAGES_SUPPORTED , " " ) , allow_NULL = TRUE , allow_NA = TRUE )
2020-10-26 12:23:03 +01:00
2020-02-14 19:54:13 +01:00
check_dataset_integrity ( )
2020-12-24 23:29:10 +01:00
2021-02-21 20:15:09 +01:00
if ( isTRUE ( debug ) && initial_search == TRUE ) {
time_start_tracking ( )
}
2020-10-26 12:23:03 +01:00
lookup <- function ( needle ,
2020-09-14 12:21:23 +02:00
column = property ,
haystack = reference_data_to_use ,
n = 1 ,
2020-10-26 12:23:03 +01:00
debug_mode = debug ,
2020-09-14 12:21:23 +02:00
initial = initial_search ,
uncertainty = actual_uncertainty ,
input_actual = actual_input ) {
2020-10-26 12:23:03 +01:00
2020-09-14 12:21:23 +02:00
if ( ! is.null ( input_actual ) ) {
input <- input_actual
} else {
input <- tryCatch ( x_backup [i ] , error = function ( e ) " " )
}
2020-10-26 12:23:03 +01:00
2020-09-12 13:54:21 +02:00
# `column` can be NULL for all columns, or a selection
2021-05-12 18:15:03 +02:00
# returns a [character] (vector) - if `column` > length 1 then with columns as names
2020-09-12 13:54:21 +02:00
if ( isTRUE ( debug_mode ) ) {
2021-02-21 20:15:09 +01:00
cat ( font_silver ( " Looking up: " , substitute ( needle ) , collapse = " " ) ,
" \n " , time_track ( ) )
2020-09-12 13:54:21 +02:00
}
if ( length ( column ) == 1 ) {
res_df <- haystack [which ( eval ( substitute ( needle ) , envir = haystack , enclos = parent.frame ( ) ) ) , , drop = FALSE ]
2020-09-14 19:41:48 +02:00
if ( NROW ( res_df ) > 1 & uncertainty != -1 ) {
2020-09-14 12:21:23 +02:00
# sort the findings on matching score
2020-10-26 12:23:03 +01:00
scores <- mo_matching_score ( x = input ,
2020-09-26 16:51:17 +02:00
n = res_df [ , " fullname" , drop = TRUE ] )
2020-09-18 16:05:53 +02:00
res_df <- res_df [order ( scores , decreasing = TRUE ) , , drop = FALSE ]
2020-09-14 12:21:23 +02:00
}
2020-09-12 13:54:21 +02:00
res <- as.character ( res_df [ , column , drop = TRUE ] )
if ( length ( res ) == 0 ) {
2020-09-14 12:21:23 +02:00
if ( isTRUE ( debug_mode ) ) {
cat ( font_red ( " (no match)\n" ) )
}
2020-10-26 12:23:03 +01:00
NA_character_
2020-09-12 13:54:21 +02:00
} else {
2020-09-14 12:21:23 +02:00
if ( isTRUE ( debug_mode ) ) {
2021-02-21 20:15:09 +01:00
cat ( font_green ( paste0 ( " MATCH (" , NROW ( res_df ) , " results)\n" ) ) )
2020-09-14 12:21:23 +02:00
}
2020-09-14 19:41:48 +02:00
if ( ( length ( res ) > n | uncertainty > 1 ) & uncertainty != -1 ) {
# save the other possible results as well, but not for forced certain results (then uncertainty == -1)
2020-09-12 13:54:21 +02:00
uncertainties <<- rbind ( uncertainties ,
2020-09-14 12:21:23 +02:00
format_uncertainty_as_df ( uncertainty_level = uncertainty ,
input = input ,
2020-09-12 13:54:21 +02:00
result_mo = res_df [1 , " mo" , drop = TRUE ] ,
2020-11-11 16:49:27 +01:00
candidates = as.character ( res_df [ , " fullname" , drop = TRUE ] ) ) ,
stringsAsFactors = FALSE )
2020-09-12 13:54:21 +02:00
}
res [seq_len ( min ( n , length ( res ) ) ) ]
}
} else {
if ( is.null ( column ) ) {
column <- names ( haystack )
}
res <- haystack [which ( eval ( substitute ( needle ) , envir = haystack , enclos = parent.frame ( ) ) ) , , drop = FALSE ]
res <- res [seq_len ( min ( n , nrow ( res ) ) ) , column , drop = TRUE ]
if ( NROW ( res ) == 0 ) {
2021-02-21 20:15:09 +01:00
if ( isTRUE ( debug_mode ) ) {
cat ( font_red ( " (no rows)\n" ) )
}
2020-09-12 13:54:21 +02:00
res <- rep ( NA_character_ , length ( column ) )
2021-02-21 20:15:09 +01:00
} else {
if ( isTRUE ( debug_mode ) ) {
cat ( font_green ( paste0 ( " MATCH (" , NROW ( res ) , " rows)\n" ) ) )
}
2020-09-12 13:54:21 +02:00
}
res <- as.character ( res )
names ( res ) <- column
res
}
}
2020-10-26 12:23:03 +01:00
2020-04-13 21:09:56 +02:00
# start off with replaced language-specific non-ASCII characters with ASCII characters
2020-04-14 15:10:09 +02:00
x <- parse_and_convert ( x )
2020-07-22 10:24:23 +02:00
# replace mo codes used in older package versions
2020-07-22 12:29:51 +02:00
x <- replace_old_mo_codes ( x , property )
2020-09-03 12:31:48 +02:00
# ignore cases that match the ignore pattern
x <- replace_ignore_pattern ( x , ignore_pattern )
2020-10-26 12:23:03 +01:00
2019-06-02 19:23:19 +02:00
# WHONET: xxx = no growth
x [tolower ( as.character ( paste0 ( x , " " ) ) ) %in% c ( " " , " xxx" , " na" , " nan" ) ] <- NA_character_
2020-09-14 12:21:23 +02:00
# Laboratory systems: remove (translated) entries like "no growth", etc.
x [trimws2 ( x ) %like% translate_AMR ( " no .*growth" , language = language ) ] <- NA_character_
x [trimws2 ( x ) %like% paste0 ( " ^(" , translate_AMR ( " no|not" , language = language ) , " ) [a-z]+" ) ] <- " UNKNOWN"
2020-10-26 12:23:03 +01:00
2019-03-15 17:36:42 +01:00
if ( initial_search == TRUE ) {
2020-12-24 23:29:10 +01:00
# keep track of time - give some hints to improve speed if it takes a long time
start_time <- Sys.time ( )
2020-12-27 00:07:00 +01:00
pkg_env $ mo_failures <- NULL
pkg_env $ mo_uncertainties <- NULL
pkg_env $ mo_renamed <- NULL
2018-12-06 14:36:39 +01:00
}
2020-12-27 00:07:00 +01:00
pkg_env $ mo_renamed_last_run <- NULL
2020-10-26 12:23:03 +01:00
2020-09-12 08:49:01 +02:00
failures <- character ( 0 )
uncertainty_level <- translate_allow_uncertain ( allow_uncertain )
2019-08-20 11:40:54 +02:00
uncertainties <- data.frame ( uncertainty = integer ( 0 ) ,
input = character ( 0 ) ,
2019-02-27 11:36:12 +01:00
fullname = character ( 0 ) ,
2019-08-20 11:40:54 +02:00
renamed_to = character ( 0 ) ,
2020-09-12 08:49:01 +02:00
mo = character ( 0 ) ,
candidates = character ( 0 ) ,
2019-08-20 11:40:54 +02:00
stringsAsFactors = FALSE )
2020-10-26 12:23:03 +01:00
2018-09-27 23:23:48 +02:00
x_input <- x
2019-02-23 18:08:28 +01:00
# already strip leading and trailing spaces
2020-04-13 21:09:56 +02:00
x <- trimws ( x )
2018-09-27 23:23:48 +02:00
# only check the uniques, which is way faster
x <- unique ( x )
2018-11-30 12:05:59 +01:00
# remove empty values (to later fill them in again with NAs)
2019-06-02 19:23:19 +02:00
# ("xxx" is WHONET code for 'no growth')
2019-03-05 22:47:42 +01:00
x <- x [ ! is.na ( x )
& ! is.null ( x )
& ! identical ( x , " " )
2019-06-02 19:23:19 +02:00
& ! identical ( x , " xxx" ) ]
2020-10-26 12:23:03 +01:00
2018-10-01 11:39:43 +02:00
# defined df to check for
if ( ! is.null ( reference_df ) ) {
2020-12-22 00:51:17 +01:00
check_validity_mo_source ( reference_df )
2020-11-05 01:11:49 +01:00
reference_df <- repair_reference_df ( reference_df )
2018-10-01 11:39:43 +02:00
}
2021-02-21 20:15:09 +01:00
2019-02-23 18:08:28 +01:00
# all empty
2019-03-06 14:39:02 +01:00
if ( all ( identical ( trimws ( x_input ) , " " ) | is.na ( x_input ) | length ( x ) == 0 ) ) {
2019-01-08 16:23:45 +01:00
if ( property == " mo" ) {
2020-11-16 16:57:55 +01:00
return ( set_clean_class ( rep ( NA_character_ , length ( x_input ) ) ,
new_class = c ( " mo" , " character" ) ) )
2019-01-08 16:23:45 +01:00
} else {
return ( rep ( NA_character_ , length ( x_input ) ) )
}
2020-10-26 12:23:03 +01:00
2019-03-05 22:47:42 +01:00
} else if ( all ( x %in% reference_df [ , 1 ] [ [1 ] ] ) ) {
2019-01-21 15:53:01 +01:00
# all in reference df
2018-10-01 11:39:43 +02:00
colnames ( reference_df ) [1 ] <- " x"
suppressWarnings (
2020-09-03 12:31:48 +02:00
x <- MO_lookup [match ( reference_df [match ( x , reference_df $ x ) , " mo" , drop = TRUE ] , MO_lookup $ mo ) , property , drop = TRUE ]
2018-10-01 11:39:43 +02:00
)
2020-10-26 12:23:03 +01:00
2019-09-15 22:57:30 +02:00
} else if ( all ( x %in% reference_data_to_use $ mo ) ) {
2020-09-03 12:31:48 +02:00
x <- MO_lookup [match ( x , MO_lookup $ mo ) , property , drop = TRUE ]
2020-10-26 12:23:03 +01:00
2019-09-15 22:57:30 +02:00
} else if ( all ( tolower ( x ) %in% reference_data_to_use $ fullname_lower ) ) {
2019-02-23 21:49:02 +01:00
# we need special treatment for very prevalent full names, they are likely!
2019-02-23 18:08:28 +01:00
# e.g. as.mo("Staphylococcus aureus")
2020-09-03 12:31:48 +02:00
x <- MO_lookup [match ( tolower ( x ) , MO_lookup $ fullname_lower ) , property , drop = TRUE ]
2020-10-26 12:23:03 +01:00
2020-08-14 13:36:10 +02:00
} else if ( all ( x %in% reference_data_to_use $ fullname ) ) {
# we need special treatment for very prevalent full names, they are likely!
# e.g. as.mo("Staphylococcus aureus")
2020-09-03 12:31:48 +02:00
x <- MO_lookup [match ( x , MO_lookup $ fullname ) , property , drop = TRUE ]
2020-10-26 12:23:03 +01:00
2020-02-14 19:54:13 +01:00
} else if ( all ( toupper ( x ) %in% microorganisms.codes $ code ) ) {
2019-01-21 15:53:01 +01:00
# commonly used MO codes
2020-10-26 12:23:03 +01:00
x <- MO_lookup [match ( microorganisms.codes [match ( toupper ( x ) ,
microorganisms.codes $ code ) ,
2020-09-03 12:31:48 +02:00
" mo" ,
2020-10-26 12:23:03 +01:00
drop = TRUE ] ,
MO_lookup $ mo ) ,
2020-09-03 12:31:48 +02:00
property ,
drop = TRUE ]
2020-10-26 12:23:03 +01:00
2020-02-14 19:54:13 +01:00
} else if ( ! all ( x %in% microorganisms [ , property ] ) ) {
2020-10-26 12:23:03 +01:00
2019-09-15 22:57:30 +02:00
strip_whitespace <- function ( x , dyslexia_mode ) {
2019-05-28 16:50:40 +02:00
# all whitespaces (tab, new lines, etc.) should be one space
2020-12-27 20:32:40 +01:00
# and spaces before and after should be left blank
2020-04-13 21:09:56 +02:00
trimmed <- trimws2 ( x )
2019-09-15 22:57:30 +02:00
# also, make sure the trailing and leading characters are a-z or 0-9
# in case of non-regex
if ( dyslexia_mode == FALSE ) {
2020-09-25 14:44:50 +02:00
trimmed <- gsub ( " ^[^a-zA-Z0-9)(]+" , " " , trimmed , perl = TRUE )
trimmed <- gsub ( " [^a-zA-Z0-9)(]+$" , " " , trimmed , perl = TRUE )
2019-09-15 22:57:30 +02:00
}
trimmed
2019-05-28 16:50:40 +02:00
}
2020-10-26 12:23:03 +01:00
2019-10-23 14:48:25 +02:00
x_backup_untouched <- x
2019-09-15 22:57:30 +02:00
x <- strip_whitespace ( x , dyslexia_mode )
2021-02-18 23:23:14 +01:00
# translate 'unknown' names back to English
if ( any ( x %like% " unbekannt|onbekend|desconocid|sconosciut|iconnu|desconhecid" , na.rm = TRUE ) ) {
trns <- subset ( translations_file , pattern %like% " unknown" | affect_mo_name == TRUE )
2021-03-04 23:28:32 +01:00
langs <- LANGUAGES_SUPPORTED [LANGUAGES_SUPPORTED != " en" ]
for ( l in langs ) {
for ( i in seq_len ( nrow ( trns ) ) ) {
if ( ! is.na ( trns [i , l , drop = TRUE ] ) ) {
x <- gsub ( pattern = trns [i , l , drop = TRUE ] ,
replacement = trns $ pattern [i ] ,
x = x ,
ignore.case = TRUE ,
perl = TRUE )
}
}
}
2021-02-18 23:23:14 +01:00
}
2019-02-23 18:08:28 +01:00
x_backup <- x
2021-03-04 23:28:32 +01:00
2019-09-15 22:57:30 +02:00
# from here on case-insensitive
x <- tolower ( x )
2021-02-18 23:23:14 +01:00
x_backup [x %like_case% " ^(fungus|fungi)$" ] <- " (unknown fungus)" # will otherwise become the kingdom
x_backup [x_backup_untouched == " Fungi" ] <- " Fungi" # is literally the kingdom
# Fill in fullnames and MO codes at once
2021-02-21 20:15:09 +01:00
known_names <- tolower ( x_backup ) %in% MO_lookup $ fullname_lower
2021-02-21 22:56:35 +01:00
x [known_names ] <- MO_lookup [match ( tolower ( x_backup ) [known_names ] , MO_lookup $ fullname_lower ) , property , drop = TRUE ]
2021-02-21 20:15:09 +01:00
known_codes <- toupper ( x_backup ) %in% MO_lookup $ mo
x [known_codes ] <- MO_lookup [match ( toupper ( x_backup ) [known_codes ] , MO_lookup $ mo ) , property , drop = TRUE ]
2021-02-18 23:23:14 +01:00
already_known <- known_names | known_codes
2021-02-21 22:56:35 +01:00
2021-02-18 23:23:14 +01:00
# now only continue where the right taxonomic output is not already known
if ( any ( ! already_known ) ) {
x_known <- x [already_known ]
2021-02-22 20:21:33 +01:00
2021-02-18 23:23:14 +01:00
# remove spp and species
2021-02-22 20:21:33 +01:00
x <- gsub ( " +(spp.?|ssp.?|sp.? |ss ?.?|subsp.?|subspecies|biovar |serovar |species)" , " " , x )
x <- gsub ( " (spp.?|subsp.?|subspecies|biovar|serovar|species)" , " " , x )
2021-02-18 23:23:14 +01:00
x <- gsub ( " ^([a-z]{2,4})(spe.?)$" , " \\1" , x , perl = TRUE ) # when ending in SPE instead of SPP and preceded by 2-4 characters
x <- strip_whitespace ( x , dyslexia_mode )
x_backup_without_spp <- x
x_species <- paste ( x , " species" )
# translate to English for supported languages of mo_property
x <- gsub ( " (gruppe|groep|grupo|gruppo|groupe)" , " group" , x , perl = TRUE )
# no groups and complexes as ending
x <- gsub ( " (complex|group)$" , " " , x , perl = TRUE )
x <- gsub ( " (^|[^a-z])((an)?aero+b)[a-z]*" , " " , x , perl = TRUE )
x <- gsub ( " ^atyp[a-z]*" , " " , x , perl = TRUE )
x <- gsub ( " (vergroen)[a-z]*" , " viridans" , x , perl = TRUE )
x <- gsub ( " [a-z]*diff?erent[a-z]*" , " " , x , perl = TRUE )
x <- gsub ( " (hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*" , " yeast" , x , perl = TRUE )
x <- gsub ( " (schimmels?|mofo|molde|stampo|moisissure|fungi)[a-z]*" , " fungus" , x , perl = TRUE )
x <- gsub ( " fungus[ph|f]rya" , " fungiphrya" , x , perl = TRUE )
# no contamination
x <- gsub ( " (contamination|kontamination|mengflora|contaminaci.n|contamina..o)" , " " , x , perl = TRUE )
# remove non-text in case of "E. coli" except dots and spaces
x <- trimws ( gsub ( " [^.a-zA-Z0-9/ \\-]+" , " " , x , perl = TRUE ) )
# but make sure that dots are followed by a space
x <- gsub ( " [.] ?" , " . " , x , perl = TRUE )
# replace minus by a space
x <- gsub ( " -+" , " " , x , perl = TRUE )
# replace hemolytic by haemolytic
x <- gsub ( " ha?emoly" , " haemoly" , x , perl = TRUE )
# place minus back in streptococci
x <- gsub ( " (alpha|beta|gamma).?ha?emoly" , " \\1-haemoly" , x , perl = TRUE )
# remove genus as first word
x <- gsub ( " ^genus " , " " , x , perl = TRUE )
# remove 'uncertain'-like texts
x <- trimws ( gsub ( " (uncertain|susp[ie]c[a-z]+|verdacht)" , " " , x , perl = TRUE ) )
# allow characters that resemble others = dyslexia_mode ----
if ( dyslexia_mode == TRUE ) {
x <- tolower ( x )
2021-02-22 20:21:33 +01:00
x <- gsub ( " [iy]+" , " [iy]+" , x )
x <- gsub ( " (c|k|q|qu|s|z|x|ks)+" , " (c|k|q|qu|s|z|x|ks)+" , x )
x <- gsub ( " (ph|hp|f|v)+" , " (ph|hp|f|v)+" , x )
x <- gsub ( " (th|ht|t)+" , " (th|ht|t)+" , x )
x <- gsub ( " a+" , " a+" , x )
x <- gsub ( " u+" , " u+" , x )
2021-02-18 23:23:14 +01:00
# allow any ending of -um, -us, -ium, -icum, -ius, -icus, -ica, -ia and -a (needs perl for the negative backward lookup):
x <- gsub ( " (u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z])" ,
" (u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)" , x , perl = TRUE )
x <- gsub ( " (\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z])" ,
" (u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)" , x , perl = TRUE )
x <- gsub ( " (\\[iy\\]\\+u\\+m)(?![a-z])" ,
" (u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)" , x , perl = TRUE )
x <- gsub ( " (\\[iy\\]\\+a\\+)(?![a-z])" ,
" ([iy]*a+|[iy]+a*)" , x , perl = TRUE )
2021-02-22 20:21:33 +01:00
x <- gsub ( " e+" , " e+" , x )
x <- gsub ( " o+" , " o+" , x )
x <- gsub ( " (.)\\1+" , " \\1+" , x )
2021-02-18 23:23:14 +01:00
# allow multiplication of all other consonants
x <- gsub ( " ([bdgjlnrw]+)" , " \\1+" , x , perl = TRUE )
# allow ending in -en or -us
x <- gsub ( " e\\+n(?![a-z[])" , " (e+n|u+(c|k|q|qu|s|z|x|ks)+)" , x , perl = TRUE )
# if the input is longer than 10 characters, allow any forgotten consonant between all characters, as some might just have forgotten one...
# this will allow "Pasteurella damatis" to be correctly read as "Pasteurella dagmatis".
consonants <- paste ( letters [ ! letters %in% c ( " a" , " e" , " i" , " o" , " u" ) ] , collapse = " " )
x [nchar ( x_backup_without_spp ) > 10 ] <- gsub ( " [+]" , paste0 ( " +[" , consonants , " ]?" ) , x [nchar ( x_backup_without_spp ) > 10 ] )
2021-02-22 20:21:33 +01:00
# allow au and ou after all above regex implementations
2021-02-18 23:23:14 +01:00
x <- gsub ( " a+[bcdfghjklmnpqrstvwxyz]?u+[bcdfghjklmnpqrstvwxyz]?" , " (a+u+|o+u+)[bcdfghjklmnpqrstvwxyz]?" , x , fixed = TRUE )
x <- gsub ( " o+[bcdfghjklmnpqrstvwxyz]?u+[bcdfghjklmnpqrstvwxyz]?" , " (a+u+|o+u+)[bcdfghjklmnpqrstvwxyz]?" , x , fixed = TRUE )
2020-01-27 19:14:23 +01:00
}
2021-02-18 23:23:14 +01:00
x <- strip_whitespace ( x , dyslexia_mode )
# make sure to remove regex overkill (will lead to errors)
x <- gsub ( " ++" , " +" , x , fixed = TRUE )
x <- gsub ( " ?+" , " ?" , x , fixed = TRUE )
x_trimmed <- x
x_trimmed_species <- paste ( x_trimmed , " species" )
x_trimmed_without_group <- gsub ( " gro.u.p$" , " " , x_trimmed , perl = TRUE )
# remove last part from "-" or "/"
x_trimmed_without_group <- gsub ( " (.*)[-/].*" , " \\1" , x_trimmed_without_group )
# replace space and dot by regex sign
x_withspaces <- gsub ( " [ .]+" , " .* " , x , perl = TRUE )
x <- gsub ( " [ .]+" , " .*" , x , perl = TRUE )
# add start en stop regex
x <- paste0 ( " ^" , x , " $" )
x_withspaces_start_only <- paste0 ( " ^" , x_withspaces )
x_withspaces_end_only <- paste0 ( x_withspaces , " $" )
x_withspaces_start_end <- paste0 ( " ^" , x_withspaces , " $" )
if ( isTRUE ( debug ) ) {
cat ( paste0 ( font_blue ( " x" ) , ' "' , x , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_species" ) , ' "' , x_species , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_withspaces_start_only" ) , ' "' , x_withspaces_start_only , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_withspaces_end_only" ) , ' "' , x_withspaces_end_only , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_withspaces_start_end" ) , ' "' , x_withspaces_start_end , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_backup" ) , ' "' , x_backup , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_backup_without_spp" ) , ' "' , x_backup_without_spp , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_trimmed" ) , ' "' , x_trimmed , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_trimmed_species" ) , ' "' , x_trimmed_species , ' "\n' ) )
cat ( paste0 ( font_blue ( " x_trimmed_without_group" ) , ' "' , x_trimmed_without_group , ' "\n' ) )
2019-09-18 15:46:09 +02:00
}
2021-02-18 23:23:14 +01:00
if ( initial_search == TRUE ) {
2021-04-20 10:46:17 +02:00
progress <- progress_ticker ( n = length ( x [ ! already_known ] ) , n_min = 25 , print = info ) # start if n >= 25
2021-02-18 23:23:14 +01:00
on.exit ( close ( progress ) )
2019-02-23 16:02:31 +01:00
}
2021-02-18 23:23:14 +01:00
for ( i in which ( ! already_known ) ) {
if ( initial_search == TRUE ) {
progress $ tick ( )
2020-05-27 16:37:49 +02:00
}
2021-02-18 23:23:14 +01:00
# valid MO code ----
found <- lookup ( mo == toupper ( x_backup [i ] ) )
if ( ! is.na ( found ) ) {
x [i ] <- found [1L ]
next
}
# valid fullname ----
found <- lookup ( fullname_lower %in% gsub ( " [^a-zA-Z0-9_. -]" , " " , tolower ( c ( x_backup [i ] , x_backup_without_spp [i ] ) ) , perl = TRUE ) )
# added the gsub() for "(unknown fungus)", since fullname_lower does not contain brackets
if ( ! is.na ( found ) ) {
x [i ] <- found [1L ]
next
}
# old fullname ----
found <- lookup ( fullname_lower %in% tolower ( c ( x_backup [i ] , x_backup_without_spp [i ] ) ) ,
column = NULL , # all columns
haystack = MO.old_lookup )
if ( ! all ( is.na ( found ) ) ) {
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
# mo_ref() of "Chlamydia psittaci" will be "Page, 1968" (with warning)
# mo_ref() of "Chlamydophila psittaci" will be "Everett et al., 1999"
if ( property == " ref" ) {
x [i ] <- found [ " ref" ]
} else {
x [i ] <- lookup ( fullname == found [ " fullname_new" ] , haystack = MO_lookup )
}
pkg_env $ mo_renamed_last_run <- found [ " fullname" ]
was_renamed ( name_old = found [ " fullname" ] ,
name_new = lookup ( fullname == found [ " fullname_new" ] , " fullname" , haystack = MO_lookup ) ,
ref_old = found [ " ref" ] ,
ref_new = lookup ( fullname == found [ " fullname_new" ] , " ref" , haystack = MO_lookup ) ,
mo = lookup ( fullname == found [ " fullname_new" ] , " mo" , haystack = MO_lookup ) )
next
}
if ( x_backup [i ] %like_case% " \\(unknown [a-z]+\\)" | tolower ( x_backup_without_spp [i ] ) %in% c ( " other" , " none" , " unknown" ) ) {
# empty and nonsense values, ignore without warning
x [i ] <- lookup ( mo == " UNKNOWN" )
next
}
# exact SNOMED code ----
if ( x_backup [i ] %like_case% " ^[0-9]+$" ) {
snomed_found <- unlist ( lapply ( reference_data_to_use $ snomed ,
function ( s ) if ( x_backup [i ] %in% s ) {
TRUE
} else {
FALSE
} ) )
if ( sum ( snomed_found , na.rm = TRUE ) > 0 ) {
found <- reference_data_to_use [snomed_found == TRUE , property ] [ [1 ] ]
if ( ! is.na ( found ) ) {
x [i ] <- found [1L ]
next
}
}
}
# very probable: is G. species ----
found <- lookup ( g_species %in% gsub ( " [^a-z0-9/ \\-]+" , " " ,
tolower ( c ( x_backup [i ] , x_backup_without_spp [i ] ) ) , perl = TRUE ) )
if ( ! is.na ( found ) ) {
x [i ] <- found [1L ]
next
}
# WHONET and other common LIS codes ----
found <- microorganisms.codes [which ( microorganisms.codes $ code %in% toupper ( c ( x_backup_untouched [i ] , x_backup [i ] , x_backup_without_spp [i ] ) ) ) , " mo" , drop = TRUE ] [1L ]
if ( ! is.na ( found ) ) {
x [i ] <- lookup ( mo == found )
next
}
# user-defined reference ----
if ( ! is.null ( reference_df ) ) {
if ( x_backup [i ] %in% reference_df [ , 1 ] ) {
# already checked integrity of reference_df, all MOs are valid
ref_mo <- reference_df [reference_df [ , 1 ] == x_backup [i ] , " mo" ] [ [1L ] ]
x [i ] <- lookup ( mo == ref_mo )
2020-05-16 13:05:47 +02:00
next
}
2020-01-27 19:14:23 +01:00
}
2021-02-18 23:23:14 +01:00
# WHONET: xxx = no growth
if ( tolower ( as.character ( paste0 ( x_backup_without_spp [i ] , " " ) ) ) %in% c ( " " , " xxx" , " na" , " nan" ) ) {
x [i ] <- NA_character_
2020-05-16 13:05:47 +02:00
next
2019-09-23 17:32:05 +02:00
}
2021-02-18 23:23:14 +01:00
# check for very small input, but ignore the O antigens of E. coli
if ( nchar ( gsub ( " [^a-zA-Z]" , " " , x_trimmed [i ] ) ) < 3
2021-04-23 09:59:36 +02:00
& toupper ( x_backup_without_spp [i ] ) %unlike_case% " O?(26|103|104|104|111|121|145|157)" ) {
2021-02-18 23:23:14 +01:00
# fewer than 3 chars and not looked for species, add as failure
x [i ] <- lookup ( mo == " UNKNOWN" )
if ( initial_search == TRUE ) {
failures <- c ( failures , x_backup [i ] )
}
next
2019-03-15 17:36:42 +01:00
}
2021-02-18 23:23:14 +01:00
if ( x_backup_without_spp [i ] %like_case% " (virus|viridae)" ) {
# there is no fullname like virus or viridae, so don't try to coerce it
x [i ] <- NA_character_
2019-08-06 14:39:22 +02:00
next
2021-02-18 23:23:14 +01:00
}
# translate known trivial abbreviations to genus + species ----
2021-02-25 12:31:12 +01:00
if ( toupper ( x_backup_without_spp [i ] ) %in% c ( " MRSA" , " MSSA" , " VISA" , " VRSA" , " BORSA" , " GISA" )
| x_backup_without_spp [i ] %like_case% " (^| )(mrsa|mssa|visa|vrsa|borsa|gisa|la-?mrsa|ca-?mrsa)( |$)" ) {
2021-02-18 23:23:14 +01:00
x [i ] <- lookup ( fullname == " Staphylococcus aureus" , uncertainty = -1 )
2019-08-06 14:39:22 +02:00
next
}
2021-02-18 23:23:14 +01:00
if ( toupper ( x_backup_without_spp [i ] ) %in% c ( " MRSE" , " MSSE" )
| x_backup_without_spp [i ] %like_case% " (^| )(mrse|msse)( |$)" ) {
x [i ] <- lookup ( fullname == " Staphylococcus epidermidis" , uncertainty = -1 )
next
2021-01-12 22:08:04 +01:00
}
2021-02-18 23:23:14 +01:00
if ( toupper ( x_backup_without_spp [i ] ) == " VRE"
| x_backup_without_spp [i ] %like_case% " (^| )vre "
| x_backup_without_spp [i ] %like_case% " (enterococci|enterokok|enterococo)[a-z]*?$" ) {
x [i ] <- lookup ( genus == " Enterococcus" , uncertainty = -1 )
next
}
# support for:
# - AIEC (Adherent-Invasive E. coli)
# - ATEC (Atypical Entero-pathogenic E. coli)
# - DAEC (Diffusely Adhering E. coli)
# - EAEC (Entero-Aggresive E. coli)
# - EHEC (Entero-Haemorrhagic E. coli)
# - EIEC (Entero-Invasive E. coli)
# - EPEC (Entero-Pathogenic E. coli)
# - ETEC (Entero-Toxigenic E. coli)
# - NMEC (Neonatal Meningitis‐ causing E. coli)
# - STEC (Shiga-toxin producing E. coli)
# - UPEC (Uropathogenic E. coli)
if ( toupper ( x_backup_without_spp [i ] ) %in% c ( " AIEC" , " ATEC" , " DAEC" , " EAEC" , " EHEC" , " EIEC" , " EPEC" , " ETEC" , " NMEC" , " STEC" , " UPEC" )
# also support O-antigens of E. coli: O26, O103, O104, O111, O121, O145, O157
| x_backup_without_spp [i ] %like_case% " o?(26|103|104|111|121|145|157)" ) {
x [i ] <- lookup ( fullname == " Escherichia coli" , uncertainty = -1 )
next
}
if ( toupper ( x_backup_without_spp [i ] ) == " MRPA"
| x_backup_without_spp [i ] %like_case% " (^| )mrpa( |$)" ) {
# multi resistant P. aeruginosa
x [i ] <- lookup ( fullname == " Pseudomonas aeruginosa" , uncertainty = -1 )
next
}
if ( toupper ( x_backup_without_spp [i ] ) == " CRSM" ) {
# co-trim resistant S. maltophilia
x [i ] <- lookup ( fullname == " Stenotrophomonas maltophilia" , uncertainty = -1 )
next
}
if ( toupper ( x_backup_without_spp [i ] ) %in% c ( " PISP" , " PRSP" , " VISP" , " VRSP" )
| x_backup_without_spp [i ] %like_case% " (^| )(pisp|prsp|visp|vrsp)( |$)" ) {
# peni I, peni R, vanco I, vanco R: S. pneumoniae
x [i ] <- lookup ( fullname == " Streptococcus pneumoniae" , uncertainty = -1 )
next
}
if ( x_backup_without_spp [i ] %like_case% " ^g[abcdfghk]s$" ) {
# Streptococci, like GBS = Group B Streptococci (B_STRPT_GRPB)
x [i ] <- lookup ( mo == toupper ( gsub ( " g([abcdfghk])s" ,
" B_STRPT_GRP\\1" ,
2021-02-21 23:19:40 +01:00
x_backup_without_spp [i ] ,
perl = TRUE ) ) , uncertainty = -1 )
2021-02-18 23:23:14 +01:00
next
}
if ( x_backup_without_spp [i ] %like_case% " (streptococ|streptokok).* [abcdfghk]$" ) {
# Streptococci in different languages, like "estreptococos grupo B"
x [i ] <- lookup ( mo == toupper ( gsub ( " .*(streptococ|streptokok|estreptococ).* ([abcdfghk])$" ,
" B_STRPT_GRP\\2" ,
2021-02-21 23:19:40 +01:00
x_backup_without_spp [i ] ,
perl = TRUE ) ) , uncertainty = -1 )
2021-02-18 23:23:14 +01:00
next
}
if ( x_backup_without_spp [i ] %like_case% " group [abcdfghk] (streptococ|streptokok|estreptococ)" ) {
# Streptococci in different languages, like "Group A Streptococci"
x [i ] <- lookup ( mo == toupper ( gsub ( " .*group ([abcdfghk]) (streptococ|streptokok|estreptococ).*" ,
" B_STRPT_GRP\\1" ,
2021-02-21 23:19:40 +01:00
x_backup_without_spp [i ] ,
perl = TRUE ) ) , uncertainty = -1 )
2021-02-18 23:23:14 +01:00
next
}
if ( x_backup_without_spp [i ] %like_case% " haemoly.*strep" ) {
# Haemolytic streptococci in different languages
x [i ] <- lookup ( mo == " B_STRPT_HAEM" , uncertainty = -1 )
next
}
# CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese)
if ( x_backup_without_spp [i ] %like_case% " [ck]oagulas[ea] negatie?[vf]"
| x_trimmed [i ] %like_case% " [ck]oagulas[ea] negatie?[vf]"
| x_backup_without_spp [i ] %like_case% " [ck]o?ns[^a-z]?$" ) {
# coerce S. coagulase negative
x [i ] <- lookup ( mo == " B_STPHY_CONS" , uncertainty = -1 )
next
}
if ( x_backup_without_spp [i ] %like_case% " [ck]oagulas[ea] positie?[vf]"
| x_trimmed [i ] %like_case% " [ck]oagulas[ea] positie?[vf]"
| x_backup_without_spp [i ] %like_case% " [ck]o?ps[^a-z]?$" ) {
# coerce S. coagulase positive
x [i ] <- lookup ( mo == " B_STPHY_COPS" , uncertainty = -1 )
next
}
# streptococcal groups: milleri and viridans
if ( x_trimmed [i ] %like_case% " strepto.* mil+er+i"
| x_backup_without_spp [i ] %like_case% " strepto.* mil+er+i"
| x_backup_without_spp [i ] %like_case% " mgs[^a-z]?$" ) {
# Milleri Group Streptococcus (MGS)
x [i ] <- lookup ( mo == " B_STRPT_MILL" , uncertainty = -1 )
next
}
if ( x_trimmed [i ] %like_case% " strepto.* viridans"
| x_backup_without_spp [i ] %like_case% " strepto.* viridans"
| x_backup_without_spp [i ] %like_case% " vgs[^a-z]?$" ) {
# Viridans Group Streptococcus (VGS)
x [i ] <- lookup ( mo == " B_STRPT_VIRI" , uncertainty = -1 )
next
}
if ( x_backup_without_spp [i ] %like_case% " gram[ -]?neg.*"
| x_backup_without_spp [i ] %like_case% " negatie?[vf]"
| x_trimmed [i ] %like_case% " gram[ -]?neg.*" ) {
# coerce Gram negatives
x [i ] <- lookup ( mo == " B_GRAMN" , uncertainty = -1 )
next
}
if ( x_backup_without_spp [i ] %like_case% " gram[ -]?pos.*"
| x_backup_without_spp [i ] %like_case% " positie?[vf]"
| x_trimmed [i ] %like_case% " gram[ -]?pos.*" ) {
# coerce Gram positives
x [i ] <- lookup ( mo == " B_GRAMP" , uncertainty = -1 )
next
}
if ( x_backup_without_spp [i ] %like_case% " mycoba[ck]teri.[nm]?$" ) {
# coerce mycobacteria in multiple languages
x [i ] <- lookup ( genus == " Mycobacterium" , uncertainty = -1 )
next
}
if ( x_backup_without_spp [i ] %like_case% " salmonella [a-z]+ ?.*" ) {
if ( x_backup_without_spp [i ] %like_case% " salmonella group" ) {
# Salmonella Group A to Z, just return S. species for now
x [i ] <- lookup ( genus == " Salmonella" , uncertainty = -1 )
next
2021-02-21 23:19:40 +01:00
} else if ( x_backup [i ] %like_case% " [sS]almonella [A-Z][a-z]+ ?.*" &
2021-04-23 09:59:36 +02:00
x_backup [i ] %unlike% " t[iy](ph|f)[iy]" ) {
2021-02-18 23:23:14 +01:00
# Salmonella with capital letter species like "Salmonella Goettingen" - they're all S. enterica
# except for S. typhi, S. paratyphi, S. typhimurium
x [i ] <- lookup ( fullname == " Salmonella enterica" , uncertainty = -1 )
uncertainties <- rbind ( uncertainties ,
format_uncertainty_as_df ( uncertainty_level = 1 ,
input = x_backup [i ] ,
result_mo = lookup ( fullname == " Salmonella enterica" , " mo" , uncertainty = -1 ) ) ,
stringsAsFactors = FALSE )
next
2018-09-27 23:23:48 +02:00
}
2021-02-18 23:23:14 +01:00
}
# trivial names known to the field:
if ( " meningococcus" %like_case% x_trimmed [i ] ) {
# coerce Neisseria meningitidis
x [i ] <- lookup ( fullname == " Neisseria meningitidis" , uncertainty = -1 )
next
}
if ( " gonococcus" %like_case% x_trimmed [i ] ) {
# coerce Neisseria gonorrhoeae
x [i ] <- lookup ( fullname == " Neisseria gonorrhoeae" , uncertainty = -1 )
next
}
if ( " pneumococcus" %like_case% x_trimmed [i ] ) {
# coerce Streptococcus penumoniae
x [i ] <- lookup ( fullname == " Streptococcus pneumoniae" , uncertainty = -1 )
next
}
if ( x_backup [i ] %in% pkg_env $ mo_failed ) {
# previously failed already in this session ----
# (at this point the latest reference_df has also been checked)
x [i ] <- lookup ( mo == " UNKNOWN" )
if ( initial_search == TRUE ) {
failures <- c ( failures , x_backup [i ] )
}
next
}
# NOW RUN THROUGH DIFFERENT PREVALENCE LEVELS
check_per_prevalence <- function ( data_to_check ,
data.old_to_check ,
a.x_backup ,
b.x_trimmed ,
c.x_trimmed_without_group ,
d.x_withspaces_start_end ,
e.x_withspaces_start_only ,
f.x_withspaces_end_only ,
g.x_backup_without_spp ,
h.x_species ,
i.x_trimmed_species ) {
# FIRST TRY FULLNAMES AND CODES ----
# if only genus is available, return only genus
2021-04-23 09:59:36 +02:00
if ( all ( c ( x [i ] , b.x_trimmed ) %unlike_case% " " ) ) {
2021-02-18 23:23:14 +01:00
found <- lookup ( fullname_lower %in% c ( h.x_species , i.x_trimmed_species ) ,
2020-05-16 13:05:47 +02:00
haystack = data_to_check )
if ( ! is.na ( found ) ) {
2019-09-15 22:57:30 +02:00
x [i ] <- found [1L ]
return ( x [i ] )
}
2021-02-18 23:23:14 +01:00
if ( nchar ( g.x_backup_without_spp ) >= 6 ) {
found <- lookup ( fullname_lower %like_case% paste0 ( " ^" , unregex ( g.x_backup_without_spp ) , " [a-z]+" ) ,
haystack = data_to_check )
if ( ! is.na ( found ) ) {
x [i ] <- found [1L ]
return ( x [i ] )
}
}
# rest of genus only is in allow_uncertain part.
2019-09-15 22:57:30 +02:00
}
2021-02-18 23:23:14 +01:00
# allow no codes less than 4 characters long, was already checked for WHONET earlier
if ( nchar ( g.x_backup_without_spp ) < 4 ) {
x [i ] <- lookup ( mo == " UNKNOWN" )
if ( initial_search == TRUE ) {
failures <- c ( failures , a.x_backup )
}
return ( x [i ] )
2019-09-15 22:57:30 +02:00
}
2021-02-18 23:23:14 +01:00
# try probable: trimmed version of fullname ----
found <- lookup ( fullname_lower %in% tolower ( g.x_backup_without_spp ) ,
2020-12-03 16:59:04 +01:00
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
2021-02-18 23:23:14 +01:00
# try any match keeping spaces ----
if ( nchar ( g.x_backup_without_spp ) >= 6 ) {
found <- lookup ( fullname_lower %like_case% d.x_withspaces_start_end ,
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
}
# try any match keeping spaces, not ending with $ ----
found <- lookup ( fullname_lower %like_case% paste0 ( trimws ( e.x_withspaces_start_only ) , " " ) ,
2020-12-03 16:59:04 +01:00
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
2021-02-18 23:23:14 +01:00
if ( nchar ( g.x_backup_without_spp ) >= 6 ) {
found <- lookup ( fullname_lower %like_case% e.x_withspaces_start_only ,
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
}
# try any match keeping spaces, not start with ^ ----
found <- lookup ( fullname_lower %like_case% paste0 ( " " , trimws ( f.x_withspaces_end_only ) ) ,
2020-12-03 16:59:04 +01:00
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
2021-02-18 23:23:14 +01:00
# try a trimmed version
if ( nchar ( g.x_backup_without_spp ) >= 6 ) {
found <- lookup ( fullname_lower %like_case% b.x_trimmed |
fullname_lower %like_case% c.x_trimmed_without_group ,
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
}
# try splitting of characters in the middle and then find ID ----
# only when text length is 6 or lower
# like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus, staaur = S. aureus
if ( nchar ( g.x_backup_without_spp ) <= 6 ) {
x_length <- nchar ( g.x_backup_without_spp )
x_split <- paste0 ( " ^" ,
g.x_backup_without_spp %pm>% substr ( 1 , x_length / 2 ) ,
" .* " ,
g.x_backup_without_spp %pm>% substr ( ( x_length / 2 ) + 1 , x_length ) )
found <- lookup ( fullname_lower %like_case% x_split ,
haystack = data_to_check )
if ( ! is.na ( found ) ) {
return ( found [1L ] )
}
}
# try fullname without start and without nchar limit of >= 6 ----
# like "K. pneu rhino" >> "Klebsiella pneumoniae (rhinoscleromatis)" = KLEPNERH
found <- lookup ( fullname_lower %like_case% e.x_withspaces_start_only ,
2020-05-16 13:05:47 +02:00
haystack = data_to_check )
if ( ! is.na ( found ) ) {
2019-02-21 18:55:52 +01:00
return ( found [1L ] )
}
2021-02-18 23:23:14 +01:00
# MISCELLANEOUS ----
# look for old taxonomic names ----
found <- lookup ( fullname_lower %like_case% e.x_withspaces_start_only ,
column = NULL , # all columns
haystack = data.old_to_check )
if ( ! all ( is.na ( found ) ) ) {
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
# mo_ref() of "Chlamydia psittaci" will be "Page, 1968" (with warning)
# mo_ref() of "Chlamydophila psittaci" will be "Everett et al., 1999"
if ( property == " ref" ) {
x [i ] <- found [ " ref" ]
} else {
x [i ] <- lookup ( fullname == found [ " fullname_new" ] , haystack = MO_lookup )
2018-12-14 10:52:20 +01:00
}
2021-02-18 23:23:14 +01:00
pkg_env $ mo_renamed_last_run <- found [ " fullname" ]
was_renamed ( name_old = found [ " fullname" ] ,
name_new = lookup ( fullname == found [ " fullname_new" ] , " fullname" , haystack = MO_lookup ) ,
ref_old = found [ " ref" ] ,
ref_new = lookup ( fullname == found [ " fullname_new" ] , " ref" , haystack = MO_lookup ) ,
mo = lookup ( fullname == found [ " fullname_new" ] , " mo" , haystack = MO_lookup ) )
return ( x [i ] )
}
# check for uncertain results ----
uncertain_fn <- function ( a.x_backup ,
b.x_trimmed ,
d.x_withspaces_start_end ,
e.x_withspaces_start_only ,
f.x_withspaces_end_only ,
g.x_backup_without_spp ,
uncertain.reference_data_to_use ) {
if ( uncertainty_level == 0 ) {
# do not allow uncertainties
return ( NA_character_ )
2019-03-26 14:24:03 +01:00
}
2021-02-18 23:23:14 +01:00
# UNCERTAINTY LEVEL 1 ----
if ( uncertainty_level >= 1 ) {
now_checks_for_uncertainty_level <- 1
# (1) look again for old taxonomic names, now for G. species ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (1) look again for old taxonomic names, now for G. species\n" ) )
}
if ( isTRUE ( debug ) ) {
message ( " Running '" , d.x_withspaces_start_end , " ' and '" , e.x_withspaces_start_only , " '" )
}
found <- lookup ( fullname_lower %like_case% d.x_withspaces_start_end |
fullname_lower %like_case% e.x_withspaces_start_only ,
column = NULL , # all columns
haystack = data.old_to_check )
if ( ! all ( is.na ( found ) ) & nchar ( g.x_backup_without_spp ) >= 6 ) {
if ( property == " ref" ) {
# when property is "ref" (which is the case in mo_ref, mo_authors and mo_year), return the old value, so:
# mo_ref("Chlamydia psittaci") = "Page, 1968" (with warning)
# mo_ref("Chlamydophila psittaci") = "Everett et al., 1999"
x <- found [ " ref" ]
} else {
x <- lookup ( fullname == found [ " fullname_new" ] , haystack = MO_lookup )
}
was_renamed ( name_old = found [ " fullname" ] ,
name_new = lookup ( fullname == found [ " fullname_new" ] , " fullname" , haystack = MO_lookup ) ,
ref_old = found [ " ref" ] ,
ref_new = lookup ( fullname == found [ " fullname_new" ] , " ref" , haystack = MO_lookup ) ,
mo = lookup ( fullname == found [ " fullname_new" ] , " mo" , haystack = MO_lookup ) )
pkg_env $ mo_renamed_last_run <- found [ " fullname" ]
uncertainties <<- rbind ( uncertainties ,
format_uncertainty_as_df ( uncertainty_level = now_checks_for_uncertainty_level ,
input = a.x_backup ,
result_mo = lookup ( fullname == found [ " fullname_new" ] , " mo" , haystack = MO_lookup ) ) ,
stringsAsFactors = FALSE )
return ( x )
}
# (2) Try with misspelled input ----
# just rerun with dyslexia_mode = TRUE will used the extensive regex part above
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (2) Try with misspelled input\n" ) )
}
if ( isTRUE ( debug ) ) {
message ( " Running '" , a.x_backup , " '" )
}
# first try without dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 1 , actual_input = a.x_backup ) ) )
if ( empty_result ( found ) ) {
# then with dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 1 , actual_input = a.x_backup ) ) )
}
if ( ! empty_result ( found ) ) {
found_result <- found
uncertainties <<- rbind ( uncertainties ,
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
found <- lookup ( mo == found )
return ( found )
2019-09-15 22:57:30 +02:00
}
}
2021-02-18 23:23:14 +01:00
# UNCERTAINTY LEVEL 2 ----
if ( uncertainty_level >= 2 ) {
now_checks_for_uncertainty_level <- 2
# (3) look for genus only, part of name ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (3) look for genus only, part of name\n" ) )
}
2021-04-23 09:59:36 +02:00
if ( nchar ( g.x_backup_without_spp ) > 4 & b.x_trimmed %unlike_case% " " ) {
if ( b.x_trimmed %unlike_case% " ^[A-Z][a-z]+" ) {
2021-02-18 23:23:14 +01:00
if ( isTRUE ( debug ) ) {
message ( " Running '" , paste ( b.x_trimmed , " species" ) , " '" )
}
# not when input is like Genustext, because then Neospora would lead to Actinokineospora
found <- lookup ( fullname_lower %like_case% paste ( b.x_trimmed , " species" ) ,
haystack = uncertain.reference_data_to_use )
if ( ! is.na ( found ) ) {
found_result <- found
found <- lookup ( mo == found )
uncertainties <<- rbind ( uncertainties ,
format_uncertainty_as_df ( uncertainty_level = now_checks_for_uncertainty_level ,
input = a.x_backup ,
result_mo = found_result ) ,
stringsAsFactors = FALSE )
return ( found )
}
2019-09-15 22:57:30 +02:00
}
2021-02-18 23:23:14 +01:00
}
# (4) strip values between brackets ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (4) strip values between brackets\n" ) )
}
a.x_backup_stripped <- gsub ( " ( *[(].*[)] *)" , " " , a.x_backup , perl = TRUE )
a.x_backup_stripped <- trimws ( gsub ( " +" , " " , a.x_backup_stripped , perl = TRUE ) )
if ( isTRUE ( debug ) ) {
message ( " Running '" , a.x_backup_stripped , " '" )
}
# first try without dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup_stripped , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
if ( empty_result ( found ) ) {
# then with dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup_stripped , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
}
if ( ! empty_result ( found ) & nchar ( g.x_backup_without_spp ) >= 6 ) {
found_result <- found
uncertainties <<- rbind ( uncertainties ,
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
found <- lookup ( mo == found )
return ( found )
}
# (5) inverse input ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (5) inverse input\n" ) )
}
a.x_backup_inversed <- paste ( rev ( unlist ( strsplit ( a.x_backup , split = " " ) ) ) , collapse = " " )
if ( isTRUE ( debug ) ) {
message ( " Running '" , a.x_backup_inversed , " '" )
}
# first try without dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup_inversed , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
if ( empty_result ( found ) ) {
# then with dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( a.x_backup_inversed , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
}
if ( ! empty_result ( found ) & nchar ( g.x_backup_without_spp ) >= 6 ) {
found_result <- found
uncertainties <<- rbind ( uncertainties ,
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
found <- lookup ( mo == found )
return ( found )
}
# (6) try to strip off half an element from end and check the remains ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (6) try to strip off half an element from end and check the remains\n" ) )
}
x_strip <- a.x_backup %pm>% strsplit ( " [ .]" ) %pm>% unlist ( )
if ( length ( x_strip ) > 1 ) {
for ( i in seq_len ( length ( x_strip ) - 1 ) ) {
lastword <- x_strip [length ( x_strip ) - i + 1 ]
lastword_half <- substr ( lastword , 1 , as.integer ( nchar ( lastword ) / 2 ) )
# remove last half of the second term
x_strip_collapsed <- paste ( c ( x_strip [seq_len ( length ( x_strip ) - i ) ] , lastword_half ) , collapse = " " )
if ( nchar ( x_strip_collapsed ) >= 4 & nchar ( lastword_half ) > 2 ) {
if ( isTRUE ( debug ) ) {
message ( " Running '" , x_strip_collapsed , " '" )
}
# first try without dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
if ( empty_result ( found ) ) {
# then with dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
}
if ( ! empty_result ( found ) ) {
found_result <- found
uncertainties <<- rbind ( uncertainties ,
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
found <- lookup ( mo == found )
return ( found )
}
}
2019-09-15 22:57:30 +02:00
}
2019-08-12 19:07:15 +02:00
}
2021-02-18 23:23:14 +01:00
# (7) try to strip off one element from end and check the remains ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (7) try to strip off one element from end and check the remains\n" ) )
}
if ( length ( x_strip ) > 1 ) {
for ( i in seq_len ( length ( x_strip ) - 1 ) ) {
x_strip_collapsed <- paste ( x_strip [seq_len ( length ( x_strip ) - i ) ] , collapse = " " )
if ( nchar ( x_strip_collapsed ) >= 6 ) {
if ( isTRUE ( debug ) ) {
message ( " Running '" , x_strip_collapsed , " '" )
}
# first try without dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
if ( empty_result ( found ) ) {
# then with dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
}
if ( ! empty_result ( found ) ) {
found_result <- found
uncertainties <<- rbind ( uncertainties ,
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
found <- lookup ( mo == found )
return ( found )
}
}
}
}
# (8) check for unknown yeasts/fungi ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (8) check for unknown yeasts/fungi\n" ) )
}
if ( b.x_trimmed %like_case% " yeast" ) {
found <- " F_YEAST"
found_result <- found
found <- lookup ( mo == found )
uncertainties <<- rbind ( uncertainties ,
format_uncertainty_as_df ( uncertainty_level = now_checks_for_uncertainty_level ,
input = a.x_backup ,
result_mo = found_result ) ,
stringsAsFactors = FALSE )
return ( found )
}
2021-04-23 09:59:36 +02:00
if ( b.x_trimmed %like_case% " (fungus|fungi)" & b.x_trimmed %unlike_case% " fungiphrya" ) {
2021-02-18 23:23:14 +01:00
found <- " F_FUNGUS"
found_result <- found
found <- lookup ( mo == found )
uncertainties <<- rbind ( uncertainties ,
format_uncertainty_as_df ( uncertainty_level = now_checks_for_uncertainty_level ,
input = a.x_backup ,
result_mo = found_result ) ,
stringsAsFactors = FALSE )
return ( found )
}
# (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n" ) )
}
x_strip <- a.x_backup %pm>% strsplit ( " [ .]" ) %pm>% unlist ( )
if ( length ( x_strip ) > 1 & nchar ( g.x_backup_without_spp ) >= 6 ) {
for ( i in 2 : ( length ( x_strip ) ) ) {
x_strip_collapsed <- paste ( x_strip [i : length ( x_strip ) ] , collapse = " " )
2019-09-15 22:57:30 +02:00
if ( isTRUE ( debug ) ) {
message ( " Running '" , x_strip_collapsed , " '" )
}
# first try without dyslexia mode
2020-09-14 12:21:23 +02:00
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
2019-09-15 22:57:30 +02:00
if ( empty_result ( found ) ) {
# then with dyslexia mode
2020-09-14 12:21:23 +02:00
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 2 , actual_input = a.x_backup ) ) )
2019-09-15 22:57:30 +02:00
}
if ( ! empty_result ( found ) ) {
found_result <- found
2021-02-18 23:23:14 +01:00
# uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3)
if ( x_strip_collapsed %like_case% " " ) {
uncertainties <<- rbind ( uncertainties ,
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
found <- lookup ( mo == found )
return ( found )
}
2019-09-15 22:57:30 +02:00
}
2019-03-26 14:24:03 +01:00
}
2019-03-12 12:19:27 +01:00
}
}
2021-02-18 23:23:14 +01:00
# UNCERTAINTY LEVEL 3 ----
if ( uncertainty_level >= 3 ) {
now_checks_for_uncertainty_level <- 3
# (10) try to strip off one element from start and check the remains (any text size) ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (10) try to strip off one element from start and check the remains (any text size)\n" ) )
}
x_strip <- a.x_backup %pm>% strsplit ( " [ .]" ) %pm>% unlist ( )
if ( length ( x_strip ) > 1 & nchar ( g.x_backup_without_spp ) >= 6 ) {
for ( i in 2 : ( length ( x_strip ) ) ) {
x_strip_collapsed <- paste ( x_strip [i : length ( x_strip ) ] , collapse = " " )
2019-09-15 22:57:30 +02:00
if ( isTRUE ( debug ) ) {
message ( " Running '" , x_strip_collapsed , " '" )
}
# first try without dyslexia mode
2021-02-18 23:23:14 +01:00
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 3 , actual_input = a.x_backup ) ) )
2019-09-15 22:57:30 +02:00
if ( empty_result ( found ) ) {
# then with dyslexia mode
2021-02-18 23:23:14 +01:00
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 3 , actual_input = a.x_backup ) ) )
2019-09-15 22:57:30 +02:00
}
if ( ! empty_result ( found ) ) {
found_result <- found
uncertainties <<- rbind ( uncertainties ,
2020-11-11 16:49:27 +01:00
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
2020-09-14 12:21:23 +02:00
found <- lookup ( mo == found )
2020-05-16 13:05:47 +02:00
return ( found )
2019-09-15 22:57:30 +02:00
}
}
}
2021-02-18 23:23:14 +01:00
# (11) try to strip off one element from end and check the remains (any text size) ----
# (this is in fact 7 but without nchar limit of >=6)
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (11) try to strip off one element from end and check the remains (any text size)\n" ) )
}
if ( length ( x_strip ) > 1 ) {
for ( i in seq_len ( length ( x_strip ) - 1 ) ) {
x_strip_collapsed <- paste ( x_strip [seq_len ( length ( x_strip ) - i ) ] , collapse = " " )
if ( isTRUE ( debug ) ) {
message ( " Running '" , x_strip_collapsed , " '" )
}
# first try without dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = FALSE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 3 , actual_input = a.x_backup ) ) )
if ( empty_result ( found ) ) {
# then with dyslexia mode
found <- suppressMessages ( suppressWarnings ( exec_as.mo ( x_strip_collapsed , initial_search = FALSE , dyslexia_mode = TRUE , allow_uncertain = FALSE , debug = debug , reference_data_to_use = uncertain.reference_data_to_use , actual_uncertainty = 3 , actual_input = a.x_backup ) ) )
}
if ( ! empty_result ( found ) ) {
found_result <- found
2019-09-15 22:57:30 +02:00
uncertainties <<- rbind ( uncertainties ,
2020-11-11 16:49:27 +01:00
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
2020-09-14 19:41:48 +02:00
found <- lookup ( mo == found )
2020-05-16 13:05:47 +02:00
return ( found )
2019-06-27 11:57:45 +02:00
}
}
}
2021-02-18 23:23:14 +01:00
# (12) part of a name (very unlikely match) ----
if ( isTRUE ( debug ) ) {
cat ( font_bold ( " \n[ UNCERTAINTY LEVEL" , now_checks_for_uncertainty_level , " ] (12) part of a name (very unlikely match)\n" ) )
2019-03-26 14:24:03 +01:00
}
2021-02-18 23:23:14 +01:00
if ( isTRUE ( debug ) ) {
message ( " Running '" , f.x_withspaces_end_only , " '" )
}
if ( nchar ( g.x_backup_without_spp ) >= 6 ) {
found <- lookup ( fullname_lower %like_case% f.x_withspaces_end_only , column = " mo" )
if ( ! is.na ( found ) ) {
found_result <- lookup ( mo == found )
2019-03-26 14:24:03 +01:00
uncertainties <<- rbind ( uncertainties ,
2020-11-11 16:49:27 +01:00
attr ( found , which = " uncertainties" , exact = TRUE ) ,
stringsAsFactors = FALSE )
2020-09-14 19:41:48 +02:00
found <- lookup ( mo == found )
2020-05-16 13:05:47 +02:00
return ( found )
2019-03-12 12:19:27 +01:00
}
2019-02-08 16:06:54 +01:00
}
}
2021-02-18 23:23:14 +01:00
# didn't found in uncertain results too
return ( NA_character_ )
2019-08-12 19:07:15 +02:00
}
2021-02-18 23:23:14 +01:00
# uncertain results
x [i ] <- uncertain_fn ( a.x_backup = a.x_backup ,
b.x_trimmed = b.x_trimmed ,
d.x_withspaces_start_end = d.x_withspaces_start_end ,
e.x_withspaces_start_only = e.x_withspaces_start_only ,
f.x_withspaces_end_only = f.x_withspaces_end_only ,
g.x_backup_without_spp = g.x_backup_without_spp ,
uncertain.reference_data_to_use = MO_lookup )
if ( ! empty_result ( x [i ] ) ) {
return ( x [i ] )
}
# didn't found any
2019-09-15 22:57:30 +02:00
return ( NA_character_ )
}
2021-02-18 23:23:14 +01:00
# CHECK ALL IN ONE GO ----
x [i ] <- check_per_prevalence ( data_to_check = MO_lookup ,
data.old_to_check = MO.old_lookup ,
a.x_backup = x_backup [i ] ,
b.x_trimmed = x_trimmed [i ] ,
c.x_trimmed_without_group = x_trimmed_without_group [i ] ,
d.x_withspaces_start_end = x_withspaces_start_end [i ] ,
e.x_withspaces_start_only = x_withspaces_start_only [i ] ,
f.x_withspaces_end_only = x_withspaces_end_only [i ] ,
g.x_backup_without_spp = x_backup_without_spp [i ] ,
h.x_species = x_species [i ] ,
i.x_trimmed_species = x_trimmed_species [i ] )
2020-05-16 13:05:47 +02:00
if ( ! empty_result ( x [i ] ) ) {
2021-02-18 23:23:14 +01:00
next
}
# no results found: make them UNKNOWN ----
x [i ] <- lookup ( mo == " UNKNOWN" , uncertainty = -1 )
if ( initial_search == TRUE ) {
failures <- c ( failures , x_backup [i ] )
2018-12-06 14:36:39 +01:00
}
2019-09-15 22:57:30 +02:00
}
2021-02-18 23:23:14 +01:00
2019-03-18 14:29:41 +01:00
if ( initial_search == TRUE ) {
2021-02-18 23:23:14 +01:00
close ( progress )
2019-03-15 17:36:42 +01:00
}
2021-02-18 23:23:14 +01:00
2021-02-21 20:15:09 +01:00
if ( isTRUE ( debug ) && initial_search == TRUE ) {
cat ( " Ended search" , time_track ( ) , " \n" )
}
2021-02-18 23:23:14 +01:00
# handling failures ----
failures <- failures [ ! failures %in% c ( NA , NULL , NaN ) ]
if ( length ( failures ) > 0 & initial_search == TRUE ) {
pkg_env $ mo_failures <- sort ( unique ( failures ) )
pkg_env $ mo_failed <- c ( pkg_env $ mo_failed , pkg_env $ mo_failures )
plural <- c ( " value" , " it" , " was" )
if ( pm_n_distinct ( failures ) > 1 ) {
plural <- c ( " values" , " them" , " were" )
}
x_input_clean <- trimws2 ( x_input )
total_failures <- length ( x_input_clean [as.character ( x_input_clean ) %in% as.character ( failures ) & ! x_input %in% c ( NA , NULL , NaN ) ] )
total_n <- length ( x_input [ ! x_input %in% c ( NA , NULL , NaN ) ] )
msg <- paste0 ( nr2char ( pm_n_distinct ( failures ) ) , " unique " , plural [1 ] ,
" (covering " , percentage ( total_failures / total_n ) ,
" ) could not be coerced and " , plural [3 ] , " considered 'unknown'" )
if ( pm_n_distinct ( failures ) <= 10 ) {
msg <- paste0 ( msg , " : " , vector_and ( failures , quotes = TRUE ) )
}
msg <- paste0 ( msg ,
" .\nUse `mo_failures()` to review " , plural [2 ] , " . Edit the `allow_uncertain` argument if needed (see ?as.mo).\n" ,
" You can also use your own reference data with set_mo_source() or directly, e.g.:\n" ,
' as.mo("mycode", reference_df = data.frame(own = "mycode", mo = "' , MO_lookup $ mo [match ( " Escherichia coli" , MO_lookup $ fullname ) ] , ' "))\n' ,
' mo_name("mycode", reference_df = data.frame(own = "mycode", mo = "' , MO_lookup $ mo [match ( " Escherichia coli" , MO_lookup $ fullname ) ] , ' "))\n' )
warning_ ( paste0 ( " \n" , msg ) ,
add_fn = font_red ,
call = FALSE ,
immediate = TRUE ) # thus will always be shown, even if >= warnings
}
# handling uncertainties ----
if ( NROW ( uncertainties ) > 0 & initial_search == TRUE ) {
uncertainties <- as.list ( pm_distinct ( uncertainties , input , .keep_all = TRUE ) )
pkg_env $ mo_uncertainties <- uncertainties
plural <- c ( " " , " it" , " was" )
if ( length ( uncertainties $ input ) > 1 ) {
plural <- c ( " s" , " them" , " were" )
}
2021-03-05 15:36:39 +01:00
msg <- paste0 ( " Translation is uncertain of " , nr2char ( length ( uncertainties $ input ) ) , " microorganism" , plural [1 ] ,
" . Use `mo_uncertainties()` to review " , plural [2 ] , " ." )
2021-02-18 23:23:14 +01:00
message_ ( msg )
}
x [already_known ] <- x_known
2018-09-27 23:23:48 +02:00
}
2018-07-23 14:14:03 +02:00
}
2021-02-18 23:23:14 +01:00
2018-09-14 10:31:21 +02:00
# Becker ----
2018-09-01 21:19:46 +02:00
if ( Becker == TRUE | Becker == " all" ) {
2020-10-20 21:00:57 +02:00
# warn when species found that are not in:
# - Becker et al. 2014, PMID 25278577
# - Becker et al. 2019, PMID 30872103
# - Becker et al. 2020, PMID 32056452
2021-02-04 16:48:16 +01:00
post_Becker <- character ( 0 ) # 2020-10-20 currently all are mentioned in above papers (otherwise uncomment the section below)
2020-12-11 12:17:23 +01:00
# nolint start
# if (any(x %in% MO_lookup[which(MO_lookup$species %in% post_Becker), property])) {
# warning_("Becker ", font_italic("et al."), " (2014, 2019) does not contain these species named after their publication: ",
# font_italic(paste("S.",
# sort(mo_species(unique(x[x %in% MO_lookup[which(MO_lookup$species %in% post_Becker), property]]))),
# collapse = ", ")),
# ".",
# call = FALSE,
# immediate = TRUE)
# }
# nolint end
2020-10-26 12:23:03 +01:00
2020-09-29 23:35:46 +02:00
# 'MO_CONS' and 'MO_COPS' are <mo> vectors created in R/zzz.R
CoNS <- MO_lookup [which ( MO_lookup $ mo %in% MO_CONS ) , property , drop = TRUE ]
2020-09-14 19:41:48 +02:00
x [x %in% CoNS ] <- lookup ( mo == " B_STPHY_CONS" , uncertainty = -1 )
2020-10-26 12:23:03 +01:00
2020-09-29 23:35:46 +02:00
CoPS <- MO_lookup [which ( MO_lookup $ mo %in% MO_COPS ) , property , drop = TRUE ]
2020-09-14 19:41:48 +02:00
x [x %in% CoPS ] <- lookup ( mo == " B_STPHY_COPS" , uncertainty = -1 )
2020-10-26 12:23:03 +01:00
2018-09-01 21:19:46 +02:00
if ( Becker == " all" ) {
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( fullname %like_case% " ^Staphylococcus aureus" , n = Inf ) ] <- lookup ( mo == " B_STPHY_COPS" , uncertainty = -1 )
2018-09-01 21:19:46 +02:00
}
}
2020-10-26 12:23:03 +01:00
2018-09-14 10:31:21 +02:00
# Lancefield ----
2018-09-04 11:33:30 +02:00
if ( Lancefield == TRUE | Lancefield == " all" ) {
2018-09-27 23:23:48 +02:00
# group A - S. pyogenes
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( genus == " Streptococcus" & species == " pyogenes" , n = Inf ) ] <- lookup ( fullname == " Streptococcus group A" , uncertainty = -1 )
2018-09-27 23:23:48 +02:00
# group B - S. agalactiae
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( genus == " Streptococcus" & species == " agalactiae" , n = Inf ) ] <- lookup ( fullname == " Streptococcus group B" , uncertainty = -1 )
2018-09-01 21:19:46 +02:00
# group C
2020-05-16 13:05:47 +02:00
x [x %in% lookup ( genus == " Streptococcus" &
species %in% c ( " equisimilis" , " equi" , " zooepidemicus" , " dysgalactiae" ) ,
2020-09-14 19:41:48 +02:00
n = Inf ) ] <- lookup ( fullname == " Streptococcus group C" , uncertainty = -1 )
2018-09-04 11:33:30 +02:00
if ( Lancefield == " all" ) {
2018-09-27 23:23:48 +02:00
# all Enterococci
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( genus == " Enterococcus" , n = Inf ) ] <- lookup ( fullname == " Streptococcus group D" , uncertainty = -1 )
2018-09-27 23:23:48 +02:00
}
# group F - S. anginosus
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( genus == " Streptococcus" & species == " anginosus" , n = Inf ) ] <- lookup ( fullname == " Streptococcus group F" , uncertainty = -1 )
2018-09-27 23:23:48 +02:00
# group H - S. sanguinis
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( genus == " Streptococcus" & species == " sanguinis" , n = Inf ) ] <- lookup ( fullname == " Streptococcus group H" , uncertainty = -1 )
2018-09-27 23:23:48 +02:00
# group K - S. salivarius
2020-09-14 19:41:48 +02:00
x [x %in% lookup ( genus == " Streptococcus" & species == " salivarius" , n = Inf ) ] <- lookup ( fullname == " Streptococcus group K" , uncertainty = -1 )
2018-09-01 21:19:46 +02:00
}
2020-10-26 12:23:03 +01:00
2019-02-08 16:06:54 +01:00
# Wrap up ----------------------------------------------------------------
2020-10-26 12:23:03 +01:00
2018-10-19 13:53:31 +02:00
# comply to x, which is also unique and without empty values
2019-03-05 22:47:42 +01:00
x_input_unique_nonempty <- unique ( x_input [ ! is.na ( x_input )
& ! is.null ( x_input )
& ! identical ( x_input , " " )
2019-06-02 19:23:19 +02:00
& ! identical ( x_input , " xxx" ) ] )
2020-10-26 12:23:03 +01:00
2018-08-28 13:51:13 +02:00
# left join the found results to the original input values (x_input)
2018-10-19 13:53:31 +02:00
df_found <- data.frame ( input = as.character ( x_input_unique_nonempty ) ,
found = as.character ( x ) ,
2018-08-28 13:51:13 +02:00
stringsAsFactors = FALSE )
2018-09-27 23:23:48 +02:00
df_input <- data.frame ( input = as.character ( x_input ) ,
2018-08-28 13:51:13 +02:00
stringsAsFactors = FALSE )
2020-10-26 12:23:03 +01:00
2020-09-03 12:31:48 +02:00
# super fast using match() which is a lot faster than merge()
2020-05-16 13:05:47 +02:00
x <- df_found $ found [match ( df_input $ input , df_found $ input ) ]
2020-10-26 12:23:03 +01:00
2018-09-27 23:23:48 +02:00
if ( property == " mo" ) {
2020-11-16 16:57:55 +01:00
x <- set_clean_class ( x , new_class = c ( " mo" , " character" ) )
2018-09-27 23:23:48 +02:00
}
2021-03-04 23:28:32 +01:00
# keep track of time
end_time <- Sys.time ( )
2020-10-26 12:23:03 +01:00
2019-02-08 16:06:54 +01:00
if ( length ( mo_renamed ( ) ) > 0 ) {
2019-07-01 14:03:15 +02:00
print ( mo_renamed ( ) )
2018-12-14 10:52:20 +01:00
}
2020-10-26 12:23:03 +01:00
2020-09-14 19:41:48 +02:00
if ( initial_search == FALSE ) {
# we got here from uncertain_fn().
if ( NROW ( uncertainties ) == 0 ) {
# the stripped/transformed version of x_backup is apparently a full hit, like with: as.mo("Escherichia (hello there) coli")
2020-10-26 12:23:03 +01:00
uncertainties <- rbind ( uncertainties ,
format_uncertainty_as_df ( uncertainty_level = actual_uncertainty ,
input = actual_input ,
2020-09-14 19:41:48 +02:00
result_mo = x ,
2020-11-11 16:49:27 +01:00
candidates = " " ) ,
stringsAsFactors = FALSE )
2020-09-14 19:41:48 +02:00
}
2020-09-14 12:21:23 +02:00
# this will save the uncertain items as attribute, so they can be bound to `uncertainties` in the uncertain_fn() function
x <- structure ( x , uncertainties = uncertainties )
2020-12-24 23:29:10 +01:00
} else {
# keep track of time - give some hints to improve speed if it takes a long time
delta_time <- difftime ( end_time , start_time , units = " secs" )
if ( delta_time >= 30 ) {
2021-03-04 23:28:32 +01:00
message_ ( " Using `as.mo()` took " , round ( delta_time ) , " seconds, which is a long time. Some suggestions to improve speed include:" )
2020-12-24 23:29:10 +01:00
message_ ( word_wrap ( " - Try to use as many valid taxonomic names as possible for your input." ,
extra_indent = 2 ) ,
as_note = FALSE )
2021-01-24 14:48:56 +01:00
message_ ( word_wrap ( " - Save the output and use it as input for future calculations, e.g. create a new variable to your data using `as.mo()`. All functions in this package that rely on microorganism codes will automatically use that new column where possible. All `mo_*()` functions also do not require you to set their `x` argument as long as you have a column of class <mo>." ,
2020-12-24 23:29:10 +01:00
extra_indent = 2 ) ,
as_note = FALSE )
2021-01-18 16:57:56 +01:00
message_ ( word_wrap ( " - Use `set_mo_source()` to continually transform your organisation codes to microorganisms codes used by this package, see `?mo_source`." ,
2020-12-24 23:29:10 +01:00
extra_indent = 2 ) ,
as_note = FALSE )
}
2020-09-14 12:21:23 +02:00
}
2020-12-24 23:29:10 +01:00
2021-02-21 20:15:09 +01:00
if ( isTRUE ( debug ) && initial_search == TRUE ) {
cat ( " Finished function" , time_track ( ) , " \n" )
}
2018-06-08 12:06:54 +02:00
x
}
2018-07-23 14:14:03 +02:00
2019-03-02 22:47:04 +01:00
empty_result <- function ( x ) {
2019-03-12 12:19:27 +01:00
all ( x %in% c ( NA , " UNKNOWN" ) )
2019-03-02 22:47:04 +01:00
}
2019-02-08 16:06:54 +01:00
was_renamed <- function ( name_old , name_new , ref_old = " " , ref_new = " " , mo = " " ) {
2020-10-26 12:23:03 +01:00
newly_set <- data.frame ( old_name = name_old ,
2019-09-18 15:46:09 +02:00
old_ref = ref_old ,
2019-09-15 22:57:30 +02:00
new_name = name_new ,
new_ref = ref_new ,
2020-10-26 12:23:03 +01:00
mo = mo ,
2019-09-15 22:57:30 +02:00
stringsAsFactors = FALSE )
2020-12-27 00:07:00 +01:00
already_set <- pkg_env $ mo_renamed
2019-09-15 22:57:30 +02:00
if ( ! is.null ( already_set ) ) {
2020-12-27 00:07:00 +01:00
pkg_env $ mo_renamed = rbind ( already_set ,
2020-11-11 16:49:27 +01:00
newly_set ,
2020-12-17 16:22:25 +01:00
stringsAsFactors = FALSE )
2018-10-01 14:44:40 +02:00
} else {
2020-12-27 00:07:00 +01:00
pkg_env $ mo_renamed <- newly_set
2018-10-01 14:44:40 +02:00
}
2018-09-25 16:44:40 +02:00
}
2019-08-20 11:40:54 +02:00
format_uncertainty_as_df <- function ( uncertainty_level ,
input ,
2020-09-12 08:49:01 +02:00
result_mo ,
candidates = NULL ) {
2020-12-27 00:07:00 +01:00
if ( ! is.null ( pkg_env $ mo_renamed_last_run ) ) {
fullname <- pkg_env $ mo_renamed_last_run
pkg_env $ mo_renamed_last_run <- NULL
2020-09-12 08:49:01 +02:00
renamed_to <- MO_lookup [match ( result_mo , MO_lookup $ mo ) , " fullname" , drop = TRUE ] [1 ]
2019-08-20 11:40:54 +02:00
} else {
2020-09-12 08:49:01 +02:00
fullname <- MO_lookup [match ( result_mo , MO_lookup $ mo ) , " fullname" , drop = TRUE ] [1 ]
renamed_to <- NA_character_
2019-08-20 11:40:54 +02:00
}
2020-09-12 08:49:01 +02:00
data.frame ( uncertainty = uncertainty_level ,
input = input ,
fullname = fullname ,
renamed_to = renamed_to ,
mo = result_mo ,
2020-09-14 12:21:23 +02:00
# save max 26 entries: the one to be chosen and 25 more
candidates = if ( length ( candidates ) > 1 ) paste ( candidates [c ( 2 : min ( 26 , length ( candidates ) ) ) ] , collapse = " , " ) else " " ,
2020-09-12 08:49:01 +02:00
stringsAsFactors = FALSE )
2019-08-20 11:40:54 +02:00
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
pillar_shaft.mo <- function ( x , ... ) {
out <- format ( x )
# grey out the kingdom (part until first "_")
2020-09-25 14:44:50 +02:00
out [ ! is.na ( x ) ] <- gsub ( " ^([A-Z]+_)(.*)" , paste0 ( font_subtle ( " \\1" ) , " \\2" ) , out [ ! is.na ( x ) ] , perl = TRUE )
2020-08-26 11:33:54 +02:00
# and grey out every _
2020-08-28 21:55:47 +02:00
out [ ! is.na ( x ) ] <- gsub ( " _" , font_subtle ( " _" ) , out [ ! is.na ( x ) ] )
2021-04-07 08:37:42 +02:00
2020-08-26 11:33:54 +02:00
# markup NA and UNKNOWN
2020-08-28 21:55:47 +02:00
out [is.na ( x ) ] <- font_na ( " NA" )
out [x == " UNKNOWN" ] <- font_na ( " UNKNOWN" )
2021-04-07 08:37:42 +02:00
if ( ! all ( x [ ! is.na ( x ) ] %in% MO_lookup $ mo ) ) {
# markup old mo codes
out [ ! x %in% MO_lookup $ mo ] <- font_italic ( font_na ( x [ ! x %in% MO_lookup $ mo ] ,
collapse = NULL ) ,
collapse = NULL )
# throw a warning with the affected column name
mo <- tryCatch ( search_type_in_df ( get_current_data ( arg_name = " x" , call = 0 ) , type = " mo" , info = FALSE ) ,
error = function ( e ) NULL )
if ( ! is.null ( mo ) ) {
col <- paste0 ( " Column '" , mo , " '" )
} else {
col <- " The data"
}
2021-05-03 13:06:43 +02:00
warning_ ( col , " contains old MO codes (from a previous AMR package version). " ,
2021-04-07 08:37:42 +02:00
" Please update your MO codes with `as.mo()`." ,
call = FALSE )
}
2020-10-26 12:23:03 +01:00
2020-08-26 11:33:54 +02:00
# make it always fit exactly
2020-12-24 23:29:10 +01:00
max_char <- max ( nchar ( x ) )
if ( is.na ( max_char ) ) {
max_char <- 7
}
2020-08-28 21:55:47 +02:00
create_pillar_column ( out ,
2020-10-26 12:23:03 +01:00
align = " left" ,
2020-12-24 23:29:10 +01:00
width = max_char + ifelse ( any ( x %in% c ( NA , " UNKNOWN" ) ) , 2 , 0 ) )
2020-08-26 11:33:54 +02:00
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
2020-08-26 11:33:54 +02:00
type_sum.mo <- function ( x , ... ) {
" mo"
}
2020-08-28 21:55:47 +02:00
# will be exported using s3_register() in R/zzz.R
freq.mo <- function ( x , ... ) {
x_noNA <- as.mo ( x [ ! is.na ( x ) ] ) # as.mo() to get the newest mo codes
grams <- mo_gramstain ( x_noNA , language = NULL )
digits <- list ( ... ) $ digits
if ( is.null ( digits ) ) {
digits <- 2
}
2020-12-17 16:22:25 +01:00
cleaner :: freq.default (
x = x ,
... ,
.add_header = list (
`Gram-negative` = paste0 (
format ( sum ( grams == " Gram-negative" , na.rm = TRUE ) ,
big.mark = " ," ,
decimal.mark = " ." ) ,
" (" , percentage ( sum ( grams == " Gram-negative" , na.rm = TRUE ) / length ( grams ) ,
digits = digits ) ,
" )" ) ,
`Gram-positive` = paste0 (
format ( sum ( grams == " Gram-positive" , na.rm = TRUE ) ,
big.mark = " ," ,
decimal.mark = " ." ) ,
" (" , percentage ( sum ( grams == " Gram-positive" , na.rm = TRUE ) / length ( grams ) ,
digits = digits ) ,
" )" ) ,
`Nr. of genera` = pm_n_distinct ( mo_genus ( x_noNA , language = NULL ) ) ,
`Nr. of species` = pm_n_distinct ( paste ( mo_genus ( x_noNA , language = NULL ) ,
mo_species ( x_noNA , language = NULL ) ) ) ) )
2020-08-28 21:55:47 +02:00
}
2020-09-28 01:08:55 +02:00
# will be exported using s3_register() in R/zzz.R
get_skimmers.mo <- function ( column ) {
2020-12-17 16:22:25 +01:00
skimr :: sfl (
2020-09-28 01:08:55 +02:00
skim_type = " mo" ,
2021-05-06 15:17:11 +02:00
unique_total = ~ length ( unique ( stats :: na.omit ( .) ) ) ,
gram_negative = ~ sum ( mo_is_gram_negative ( .) , na.rm = TRUE ) ,
gram_positive = ~ sum ( mo_is_gram_positive ( .) , na.rm = TRUE ) ,
2020-09-28 01:08:55 +02:00
top_genus = ~ names ( sort ( - table ( mo_genus ( stats :: na.omit ( .) , language = NULL ) ) ) ) [1L ] ,
top_species = ~ names ( sort ( - table ( mo_name ( stats :: na.omit ( .) , language = NULL ) ) ) ) [1L ]
)
}
2020-05-28 16:48:55 +02:00
#' @method print mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2020-08-26 11:33:54 +02:00
print.mo <- function ( x , print.shortnames = FALSE , ... ) {
2020-05-27 16:37:49 +02:00
cat ( " Class <mo>\n" )
2018-10-12 16:35:18 +02:00
x_names <- names ( x )
2020-08-26 11:33:54 +02:00
if ( is.null ( x_names ) & print.shortnames == TRUE ) {
x_names <- tryCatch ( mo_shortname ( x , ... ) , error = function ( e ) NULL )
}
2018-10-12 16:35:18 +02:00
x <- as.character ( x )
names ( x ) <- x_names
2021-05-03 13:06:43 +02:00
if ( ! all ( x [ ! is.na ( x ) ] %in% MO_lookup $ mo ) ) {
warning_ ( " Some MO codes are from a previous AMR package version. " ,
" Please update these MO codes with `as.mo()`." ,
call = FALSE )
}
2018-10-12 16:35:18 +02:00
print.default ( x , quote = FALSE )
2018-08-31 13:36:19 +02:00
}
2018-07-23 14:14:03 +02:00
2020-05-28 16:48:55 +02:00
#' @method summary mo
2018-12-07 12:04:55 +01:00
#' @export
#' @noRd
summary.mo <- function ( object , ... ) {
# unique and top 1-3
2020-05-16 13:05:47 +02:00
x <- as.mo ( object ) # force again, could be mo from older pkg version
top <- as.data.frame ( table ( x ) , responseName = " n" , stringsAsFactors = FALSE )
top_3 <- top [order ( - top $ n ) , 1 ] [1 : 3 ]
2020-07-22 10:24:23 +02:00
value <- c ( " Class" = " mo" ,
2020-08-28 21:55:47 +02:00
" <NA>" = length ( x [is.na ( x ) ] ) ,
2020-09-18 16:05:53 +02:00
" Unique" = pm_n_distinct ( x [ ! is.na ( x ) ] ) ,
2020-08-28 21:55:47 +02:00
" #1" = top_3 [1 ] ,
" #2" = top_3 [2 ] ,
" #3" = top_3 [3 ] )
2020-07-22 10:24:23 +02:00
class ( value ) <- c ( " summaryDefault" , " table" )
value
2018-12-07 12:04:55 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method as.data.frame mo
2018-07-23 14:14:03 +02:00
#' @export
2018-08-31 13:36:19 +02:00
#' @noRd
2020-05-19 13:18:01 +02:00
as.data.frame.mo <- function ( x , ... ) {
2021-04-07 08:37:42 +02:00
if ( ! all ( x [ ! is.na ( x ) ] %in% MO_lookup $ mo ) ) {
2021-05-03 13:06:43 +02:00
warning_ ( " The data contains old MO codes (from a previous AMR package version). " ,
2021-04-07 08:37:42 +02:00
" Please update your MO codes with `as.mo()`." ,
call = FALSE )
}
2020-05-19 12:08:49 +02:00
nm <- deparse1 ( substitute ( x ) )
2018-08-31 13:36:19 +02:00
if ( ! " nm" %in% names ( list ( ... ) ) ) {
2021-04-07 08:37:42 +02:00
as.data.frame.vector ( x , ... , nm = nm )
2018-08-31 13:36:19 +02:00
} else {
2021-04-07 08:37:42 +02:00
as.data.frame.vector ( x , ... )
2018-08-31 13:36:19 +02:00
}
}
2020-05-28 16:48:55 +02:00
#' @method [ mo
2018-08-31 13:36:19 +02:00
#' @export
#' @noRd
2019-08-14 14:57:06 +02:00
" [.mo" <- function ( x , ... ) {
2019-08-12 14:48:09 +02:00
y <- NextMethod ( )
2019-08-14 14:57:06 +02:00
attributes ( y ) <- attributes ( x )
y
}
2020-05-28 16:48:55 +02:00
#' @method [[ mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [[.mo" <- function ( x , ... ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( x )
2019-08-14 14:57:06 +02:00
y
}
2020-05-28 16:48:55 +02:00
#' @method [<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [<-.mo" <- function ( i , j , ... , value ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-04-13 21:09:56 +02:00
# must only contain valid MOs
2021-05-03 13:06:43 +02:00
return_after_integrity_check ( y , " microorganism code" , c ( as.character ( microorganisms $ mo ) ,
as.character ( microorganisms.translation $ mo_old ) ) )
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method [[<- mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2019-08-26 16:02:03 +02:00
" [[<-.mo" <- function ( i , j , ... , value ) {
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
2019-08-26 16:02:03 +02:00
attributes ( y ) <- attributes ( i )
2020-04-13 21:09:56 +02:00
# must only contain valid MOs
2021-05-03 13:06:43 +02:00
return_after_integrity_check ( y , " microorganism code" , c ( as.character ( microorganisms $ mo ) ,
as.character ( microorganisms.translation $ mo_old ) ) )
2019-08-14 14:57:06 +02:00
}
2020-05-28 16:48:55 +02:00
#' @method c mo
2019-08-14 14:57:06 +02:00
#' @export
#' @noRd
2021-05-03 13:06:43 +02:00
c.mo <- function ( ... ) {
x <- list ( ... ) [ [1L ] ]
2019-08-14 14:57:06 +02:00
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
2021-05-03 13:06:43 +02:00
return_after_integrity_check ( y , " microorganism code" , c ( as.character ( microorganisms $ mo ) ,
as.character ( microorganisms.translation $ mo_old ) ) )
2018-07-23 14:14:03 +02:00
}
2018-12-06 14:36:39 +01:00
2020-09-25 14:44:50 +02:00
#' @method unique mo
#' @export
#' @noRd
unique.mo <- function ( x , incomparables = FALSE , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
y
}
2021-02-22 20:21:33 +01:00
#' @method rep mo
#' @export
#' @noRd
rep.mo <- function ( x , ... ) {
y <- NextMethod ( )
attributes ( y ) <- attributes ( x )
y
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
2018-12-06 14:36:39 +01:00
#' @export
mo_failures <- function ( ) {
2020-12-27 00:07:00 +01:00
pkg_env $ mo_failures
2018-12-06 14:36:39 +01:00
}
2019-02-08 16:06:54 +01:00
#' @rdname as.mo
#' @export
mo_uncertainties <- function ( ) {
2020-12-27 00:07:00 +01:00
if ( is.null ( pkg_env $ mo_uncertainties ) ) {
2019-07-01 14:03:15 +02:00
return ( NULL )
}
2020-12-27 00:07:00 +01:00
set_clean_class ( as.data.frame ( pkg_env $ mo_uncertainties ,
2020-11-16 16:57:55 +01:00
stringsAsFactors = FALSE ) ,
new_class = c ( " mo_uncertainties" , " data.frame" ) )
2019-02-28 13:56:28 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method print mo_uncertainties
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
print.mo_uncertainties <- function ( x , ... ) {
2019-03-12 12:19:27 +01:00
if ( NROW ( x ) == 0 ) {
return ( NULL )
}
2021-02-22 20:21:33 +01:00
message_ ( " Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. See `?mo_matching_score`." , as_note = FALSE )
2020-10-26 12:23:03 +01:00
2019-02-27 11:36:12 +01:00
msg <- " "
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( nrow ( x ) ) ) {
2020-09-14 12:21:23 +02:00
if ( x [i , ] $ candidates != " " ) {
candidates <- unlist ( strsplit ( x [i , ] $ candidates , " , " , fixed = TRUE ) )
2020-09-28 11:00:59 +02:00
scores <- mo_matching_score ( x = x [i , ] $ input , n = candidates )
2020-09-12 08:49:01 +02:00
# sort on descending scores
candidates <- candidates [order ( 1 - scores ) ]
2020-09-28 11:00:59 +02:00
scores_formatted <- trimws ( formatC ( round ( scores , 3 ) , format = " f" , digits = 3 ) )
2020-09-14 12:21:23 +02:00
n_candidates <- length ( candidates )
2021-02-17 10:58:13 +01:00
candidates <- vector_and ( paste0 ( candidates , " (" , scores_formatted [order ( 1 - scores ) ] , " )" ) ,
quotes = FALSE ,
sort = FALSE )
2020-09-12 08:49:01 +02:00
# align with input after arrow
2020-10-26 12:23:03 +01:00
candidates <- paste0 ( " \n" ,
2020-10-04 19:26:43 +02:00
strwrap ( paste0 ( " Also matched" ,
2020-10-26 12:23:03 +01:00
ifelse ( n_candidates >= 25 , " (max 25)" , " " ) , " : " ,
2020-10-04 19:26:43 +02:00
candidates ) , # this is already max 25 due to format_uncertainty_as_df()
indent = nchar ( x [i , ] $ input ) + 6 ,
2020-10-26 12:23:03 +01:00
exdent = nchar ( x [i , ] $ input ) + 6 ,
2020-10-04 19:26:43 +02:00
width = 0.98 * getOption ( " width" ) ) ,
collapse = " " )
# after strwrap, make taxonomic names italic
2021-02-21 23:19:40 +01:00
candidates <- gsub ( " ([A-Za-z]+)" , font_italic ( " \\1" ) , candidates , perl = TRUE )
2021-04-07 08:37:42 +02:00
candidates <- gsub ( font_italic ( " and" ) , " and" , candidates , fixed = TRUE )
2020-10-26 12:23:03 +01:00
candidates <- gsub ( paste ( font_italic ( c ( " Also" , " matched" ) , collapse = NULL ) , collapse = " " ) ,
2020-10-04 19:26:43 +02:00
" Also matched" ,
candidates , fixed = TRUE )
candidates <- gsub ( font_italic ( " max" ) , " max" , candidates , fixed = TRUE )
2020-09-12 08:49:01 +02:00
} else {
candidates <- " "
}
2020-09-28 11:00:59 +02:00
score <- trimws ( formatC ( round ( mo_matching_score ( x = x [i , ] $ input ,
n = x [i , ] $ fullname ) ,
3 ) ,
format = " f" , digits = 3 ) )
2019-02-27 11:36:12 +01:00
msg <- paste ( msg ,
2020-10-04 19:26:43 +02:00
paste0 (
strwrap (
paste0 ( ' "' , x [i , ] $ input , ' " -> ' ,
paste0 ( font_bold ( font_italic ( x [i , ] $ fullname ) ) ,
ifelse ( ! is.na ( x [i , ] $ renamed_to ) , paste ( " , renamed to" , font_italic ( x [i , ] $ renamed_to ) ) , " " ) ,
" (" , x [i , ] $ mo ,
" , matching score = " , score ,
" ) " ) ) ,
width = 0.98 * getOption ( " width" ) ,
2020-10-26 12:23:03 +01:00
exdent = nchar ( x [i , ] $ input ) + 6 ) ,
2020-10-04 19:26:43 +02:00
collapse = " \n" ) ,
candidates ,
2019-02-27 11:36:12 +01:00
sep = " \n" )
2020-10-04 19:26:43 +02:00
msg <- paste0 ( gsub ( " \n\n" , " \n" , msg ) , " \n\n" )
2019-02-27 11:36:12 +01:00
}
2019-02-28 13:56:28 +01:00
cat ( msg )
2019-02-08 16:06:54 +01:00
}
#' @rdname as.mo
2018-12-06 14:36:39 +01:00
#' @export
mo_renamed <- function ( ) {
2020-12-27 00:07:00 +01:00
items <- pkg_env $ mo_renamed
2019-07-01 14:03:15 +02:00
if ( is.null ( items ) ) {
2020-11-11 16:49:27 +01:00
items <- data.frame ( stringsAsFactors = FALSE )
2019-09-15 22:57:30 +02:00
} else {
2020-09-18 16:05:53 +02:00
items <- pm_distinct ( items , old_name , .keep_all = TRUE )
2019-07-01 14:03:15 +02:00
}
2020-11-16 16:57:55 +01:00
set_clean_class ( as.data.frame ( items ,
stringsAsFactors = FALSE ) ,
new_class = c ( " mo_renamed" , " data.frame" ) )
2019-02-28 13:56:28 +01:00
}
2020-05-28 16:48:55 +02:00
#' @method print mo_renamed
2019-02-28 13:56:28 +01:00
#' @export
#' @noRd
print.mo_renamed <- function ( x , ... ) {
2019-09-15 22:57:30 +02:00
if ( NROW ( x ) == 0 ) {
return ( invisible ( ) )
}
2019-10-11 17:21:02 +02:00
for ( i in seq_len ( nrow ( x ) ) ) {
2020-10-27 15:56:51 +01:00
message_ ( font_italic ( x $ old_name [i ] ) ,
ifelse ( x $ old_ref [i ] %in% c ( " " , NA ) ,
" " ,
paste0 ( " (" , gsub ( " et al." , font_italic ( " et al." ) , x $ old_ref [i ] ) , " )" ) ) ,
" was renamed " ,
2021-03-04 23:28:32 +01:00
ifelse ( ! x $ new_ref [i ] %in% c ( " " , NA ) && as.integer ( gsub ( " [^0-9]" , " " , x $ new_ref [i ] ) ) < as.integer ( gsub ( " [^0-9]" , " " , x $ old_ref [i ] ) ) ,
2020-10-27 15:56:51 +01:00
font_bold ( " back to " ) ,
" " ) ,
font_italic ( x $ new_name [i ] ) ,
ifelse ( x $ new_ref [i ] %in% c ( " " , NA ) ,
" " ,
paste0 ( " (" , gsub ( " et al." , font_italic ( " et al." ) , x $ new_ref [i ] ) , " )" ) ) ,
" [" , x $ mo [i ] , " ]" )
2019-09-15 22:57:30 +02:00
}
2019-02-27 11:36:12 +01:00
}
nr2char <- function ( x ) {
if ( x %in% c ( 1 : 10 ) ) {
v <- c ( " one" = 1 , " two" = 2 , " three" = 3 , " four" = 4 , " five" = 5 ,
" six" = 6 , " seven" = 7 , " eight" = 8 , " nine" = 9 , " ten" = 10 )
names ( v [x ] )
} else {
x
}
2018-12-06 14:36:39 +01:00
}
2019-03-15 13:57:25 +01:00
unregex <- function ( x ) {
gsub ( " [^a-zA-Z0-9 -]" , " " , x )
}
2019-03-18 14:29:41 +01:00
2019-03-26 14:24:03 +01:00
translate_allow_uncertain <- function ( allow_uncertain ) {
if ( isTRUE ( allow_uncertain ) ) {
# default to uncertainty level 2
allow_uncertain <- 2
} else {
2019-08-13 16:15:08 +02:00
allow_uncertain [tolower ( allow_uncertain ) == " none" ] <- 0
allow_uncertain [tolower ( allow_uncertain ) == " all" ] <- 3
2019-03-26 14:24:03 +01:00
allow_uncertain <- as.integer ( allow_uncertain )
2020-06-22 11:18:40 +02:00
stop_ifnot ( allow_uncertain %in% c ( 0 : 3 ) ,
' `allow_uncertain` must be a number between 0 (or "none") and 3 (or "all"), or TRUE (= 2) or FALSE (= 0)' , call = FALSE )
2019-03-26 14:24:03 +01:00
}
allow_uncertain
}
2019-07-01 14:03:15 +02:00
get_mo_failures_uncertainties_renamed <- function ( ) {
2020-12-27 00:07:00 +01:00
remember <- list ( failures = pkg_env $ mo_failures ,
uncertainties = pkg_env $ mo_uncertainties ,
renamed = pkg_env $ mo_renamed )
2020-05-27 16:37:49 +02:00
# empty them, otherwise mo_shortname("Chlamydophila psittaci") will give 3 notes
2020-12-27 00:07:00 +01:00
pkg_env $ mo_failures <- NULL
pkg_env $ mo_uncertainties <- NULL
pkg_env $ mo_renamed <- NULL
2020-05-27 16:37:49 +02:00
remember
2019-07-01 14:03:15 +02:00
}
load_mo_failures_uncertainties_renamed <- function ( metadata ) {
2020-12-27 00:07:00 +01:00
pkg_env $ mo_failures <- metadata $ failures
pkg_env $ mo_uncertainties <- metadata $ uncertainties
pkg_env $ mo_renamed <- metadata $ renamed
2019-07-01 14:03:15 +02:00
}
2019-11-15 15:25:03 +01:00
2020-04-13 21:09:56 +02:00
trimws2 <- function ( x ) {
trimws ( gsub ( " [\\s]+" , " " , x , perl = TRUE ) )
}
2020-04-14 15:10:09 +02:00
parse_and_convert <- function ( x ) {
2020-04-13 21:09:56 +02:00
tryCatch ( {
2020-04-14 15:10:09 +02:00
if ( ! is.null ( dim ( x ) ) ) {
if ( NCOL ( x ) > 2 ) {
2020-06-22 11:18:40 +02:00
stop ( " a maximum of two columns is allowed" , call. = FALSE )
2020-04-14 15:10:09 +02:00
} else if ( NCOL ( x ) == 2 ) {
2021-02-21 22:56:35 +01:00
# support Tidyverse selection like: df %>% select(colA, colB)
2020-04-14 15:10:09 +02:00
# paste these columns together
x <- as.data.frame ( x , stringsAsFactors = FALSE )
colnames ( x ) <- c ( " A" , " B" )
x <- paste ( x $ A , x $ B )
} else {
2021-02-21 22:56:35 +01:00
# support Tidyverse selection like: df %>% select(colA)
2020-04-14 15:10:09 +02:00
x <- as.data.frame ( x , stringsAsFactors = FALSE ) [ [1 ] ]
}
}
x [is.null ( x ) ] <- NA
2020-04-13 21:09:56 +02:00
parsed <- iconv ( x , to = " UTF-8" )
parsed [is.na ( parsed ) & ! is.na ( x ) ] <- iconv ( x [is.na ( parsed ) & ! is.na ( x ) ] , from = " Latin1" , to = " ASCII//TRANSLIT" )
2020-04-14 14:12:31 +02:00
parsed <- gsub ( ' "' , " " , parsed , fixed = TRUE )
2021-02-21 22:56:35 +01:00
parsed <- gsub ( " +" , " " , parsed , perl = TRUE )
parsed <- trimws ( parsed )
2020-04-14 14:12:31 +02:00
} , error = function ( e ) stop ( e $ message , call. = FALSE ) ) # this will also be thrown when running `as.mo(no_existing_object)`
parsed
2020-04-13 21:09:56 +02:00
}
2020-05-16 13:05:47 +02:00
2020-07-22 12:29:51 +02:00
replace_old_mo_codes <- function ( x , property ) {
2020-07-22 10:24:23 +02:00
if ( any ( toupper ( x ) %in% microorganisms.translation $ mo_old , na.rm = TRUE ) ) {
# get the ones that match
matched <- match ( toupper ( x ) , microorganisms.translation $ mo_old )
# and their new codes
mo_new <- microorganisms.translation $ mo_new [matched ]
# assign on places where a match was found
x [which ( ! is.na ( matched ) ) ] <- mo_new [which ( ! is.na ( matched ) ) ]
2020-11-10 16:35:56 +01:00
n_matched <- length ( matched [ ! is.na ( matched ) ] )
2020-07-22 12:29:51 +02:00
if ( property != " mo" ) {
2021-04-07 08:37:42 +02:00
message_ ( font_blue ( paste0 ( " The input contained " , n_matched ,
2021-05-03 13:06:43 +02:00
" old MO code" , ifelse ( n_matched == 1 , " " , " s" ) ,
2021-04-07 08:37:42 +02:00
" (from a previous AMR package version). Please update your MO codes with `as.mo()`." ) ) )
2020-10-26 12:23:03 +01:00
} else {
2021-05-03 13:06:43 +02:00
message_ ( font_blue ( paste0 ( n_matched , " old MO code" , ifelse ( n_matched == 1 , " " , " s" ) ,
2021-04-07 08:37:42 +02:00
" (from a previous AMR package version) " ,
ifelse ( n_matched == 1 , " was" , " were" ) ,
" updated to " , ifelse ( n_matched == 1 , " a " , " " ) ,
" currently used MO code" , ifelse ( n_matched == 1 , " " , " s" ) , " ." ) ) )
2020-07-22 12:29:51 +02:00
}
2020-07-22 10:24:23 +02:00
}
x
}
2020-09-03 12:31:48 +02:00
replace_ignore_pattern <- function ( x , ignore_pattern ) {
if ( ! is.null ( ignore_pattern ) && ! identical ( trimws2 ( ignore_pattern ) , " " ) ) {
ignore_cases <- x %like% ignore_pattern
if ( sum ( ignore_cases ) > 0 ) {
2020-10-27 15:56:51 +01:00
message_ ( " The following input was ignored by `ignore_pattern = \"" , ignore_pattern , " \"`: " ,
2021-02-04 16:48:16 +01:00
vector_and ( x [ignore_cases ] , quotes = TRUE ) )
x [ignore_cases ] <- NA_character_
2020-09-03 12:31:48 +02:00
}
}
x
}
2020-11-05 01:11:49 +01:00
repair_reference_df <- function ( reference_df ) {
# has valid own reference_df
reference_df <- reference_df %pm>%
pm_filter ( ! is.na ( mo ) )
# keep only first two columns, second must be mo
if ( colnames ( reference_df ) [1 ] == " mo" ) {
reference_df <- reference_df %pm>% pm_select ( 2 , " mo" )
} else {
reference_df <- reference_df %pm>% pm_select ( 1 , " mo" )
}
2020-11-10 16:35:56 +01:00
2020-11-05 01:11:49 +01:00
# remove factors, just keep characters
colnames ( reference_df ) [1 ] <- " x"
2020-11-10 16:35:56 +01:00
reference_df [ , " x" ] <- as.character ( reference_df [ , " x" , drop = TRUE ] )
reference_df [ , " mo" ] <- as.character ( reference_df [ , " mo" , drop = TRUE ] )
2021-05-03 13:06:43 +02:00
# some MO codes might be old
2020-11-10 16:35:56 +01:00
reference_df [ , " mo" ] <- as.mo ( reference_df [ , " mo" , drop = TRUE ] )
2020-11-05 01:11:49 +01:00
reference_df
}
2021-01-25 21:58:00 +01:00
strip_words <- function ( text , n , side = " right" ) {
2021-01-28 16:09:30 +01:00
out <- lapply ( strsplit ( text , " " ) , function ( x ) {
2021-01-25 21:58:00 +01:00
if ( side %like% " ^r" & length ( x ) > n ) {
x [seq_len ( length ( x ) - n ) ]
} else if ( side %like% " ^l" & length ( x ) > n ) {
x [2 : length ( x ) ]
}
} )
vapply ( FUN.VALUE = character ( 1 ) , out , paste , collapse = " " )
}