mirror of https://github.com/msberends/AMR.git
231 lines
19 KiB
R
231 lines
19 KiB
R
% Generated by roxygen2: do not edit by hand
|
|
% Please edit documentation in R/mo.R
|
|
\name{as.mo}
|
|
\alias{as.mo}
|
|
\alias{mo}
|
|
\alias{is.mo}
|
|
\alias{mo_uncertainties}
|
|
\alias{mo_reset_session}
|
|
\title{Transform Input to a Microorganism Code}
|
|
\usage{
|
|
as.mo(
|
|
x,
|
|
Becker = FALSE,
|
|
Lancefield = FALSE,
|
|
minimum_matching_score = NULL,
|
|
allow_uncertain = TRUE,
|
|
keep_synonyms = getOption("AMR_keep_synonyms", TRUE),
|
|
reference_df = get_mo_source(),
|
|
ignore_pattern = getOption("AMR_ignore_pattern", NULL),
|
|
language = get_AMR_locale(),
|
|
info = interactive(),
|
|
...
|
|
)
|
|
|
|
is.mo(x)
|
|
|
|
mo_uncertainties()
|
|
|
|
mo_reset_session()
|
|
}
|
|
\arguments{
|
|
\item{x}{a \link{character} vector or a \link{data.frame} with one or two columns}
|
|
|
|
\item{Becker}{a \link{logical} to indicate whether staphylococci should be categorised into coagulase-negative staphylococci ("CoNS") and coagulase-positive staphylococci ("CoPS") instead of their own species, according to Karsten Becker \emph{et al.} (1,2,3).
|
|
|
|
This excludes \emph{Staphylococcus aureus} at default, use \code{Becker = "all"} to also categorise \emph{S. aureus} as "CoPS".}
|
|
|
|
\item{Lancefield}{a \link{logical} to indicate whether a beta-haemolytic \emph{Streptococcus} should be categorised into Lancefield groups instead of their own species, according to Rebecca C. Lancefield (4). These streptococci will be categorised in their first group, e.g. \emph{Streptococcus dysgalactiae} will be group C, although officially it was also categorised into groups G and L.
|
|
|
|
This excludes enterococci at default (who are in group D), use \code{Lancefield = "all"} to also categorise all enterococci as group D.}
|
|
|
|
\item{minimum_matching_score}{a numeric value to set as the lower limit for the \link[=mo_matching_score]{MO matching score}. When left blank, this will be determined automatically based on the character length of \code{x}, its \link[=microorganisms]{taxonomic kingdom} and \link[=mo_matching_score]{human pathogenicity}.}
|
|
|
|
\item{allow_uncertain}{a number between \code{0} (or \code{"none"}) and \code{3} (or \code{"all"}), or \code{TRUE} (= \code{2}) or \code{FALSE} (= \code{0}) to indicate whether the input should be checked for less probable results, see \emph{Details}}
|
|
|
|
\item{keep_synonyms}{a \link{logical} to indicate if old, previously valid taxonomic names must be preserved and not be corrected to currently accepted names. The default is \code{TRUE}, which will return a note if old taxonomic names are returned. The default can be set with \code{options(AMR_keep_synonyms = ...)}.}
|
|
|
|
\item{reference_df}{a \link{data.frame} to be used for extra reference when translating \code{x} to a valid \code{\link{mo}}. See \code{\link[=set_mo_source]{set_mo_source()}} and \code{\link[=get_mo_source]{get_mo_source()}} to automate the usage of your own codes (e.g. used in your analysis or organisation).}
|
|
|
|
\item{ignore_pattern}{a regular expression (case-insensitive) of which all matches in \code{x} must return \code{NA}. This can be convenient to exclude known non-relevant input and can also be set with the option \code{AMR_ignore_pattern}, e.g. \code{options(AMR_ignore_pattern = "(not reported|contaminated flora)")}.}
|
|
|
|
\item{language}{language to translate text like "no growth", which defaults to the system language (see \code{\link[=get_AMR_locale]{get_AMR_locale()}})}
|
|
|
|
\item{info}{a \link{logical} to indicate if a progress bar should be printed if more than 25 items are to be coerced, defaults to \code{TRUE} only in interactive mode}
|
|
|
|
\item{...}{other arguments passed on to functions}
|
|
}
|
|
\value{
|
|
A \link{character} \link{vector} with additional class \code{\link{mo}}
|
|
}
|
|
\description{
|
|
Use this function to determine a valid microorganism code (\code{\link{mo}}). Determination is done using intelligent rules and the complete taxonomic kingdoms Animalia, Archaea, Bacteria and Protozoa, and most microbial species from the kingdom Fungi (see \emph{Source}). The input can be almost anything: a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (such as \code{"S. aureus"}), an abbreviation known in the field (such as \code{"MRSA"}), or just a genus. See \emph{Examples}.
|
|
}
|
|
\details{
|
|
\subsection{General Info}{
|
|
|
|
A microorganism (MO) code from this package (class: \code{\link{mo}}) is human readable and typically looks like these examples:
|
|
|
|
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Code Full name
|
|
--------------- --------------------------------------
|
|
B_KLBSL Klebsiella
|
|
B_KLBSL_PNMN Klebsiella pneumoniae
|
|
B_KLBSL_PNMN_RHNS Klebsiella pneumoniae rhinoscleromatis
|
|
| | | |
|
|
| | | |
|
|
| | | \\---> subspecies, a 4-5 letter acronym
|
|
| | \\----> species, a 4-5 letter acronym
|
|
| \\----> genus, a 5-7 letter acronym
|
|
\\----> taxonomic kingdom: A (Archaea), AN (Animalia), B (Bacteria),
|
|
F (Fungi), PL (Plantae), P (Protozoa)
|
|
}\if{html}{\out{</div>}}
|
|
|
|
Values that cannot be coerced will be considered 'unknown' and will get the MO code \code{UNKNOWN}.
|
|
|
|
Use the \code{\link[=mo_property]{mo_*}} functions to get properties based on the returned code, see \emph{Examples}.
|
|
|
|
The algorithm uses data from the List of Prokaryotic names with Standing in Nomenclature (LPSN) and the Global Biodiversity Information Facility (GBIF) (see \link{microorganisms}).
|
|
|
|
The \code{\link[=as.mo]{as.mo()}} function uses several coercion rules for fast and logical results. It assesses the input matching criteria in the following order:
|
|
\enumerate{
|
|
\item Human pathogenic prevalence: the function starts with more prevalent microorganisms, followed by less prevalent ones;
|
|
\item Taxonomic kingdom: the function starts with determining Bacteria, then Fungi, then Protozoa, then others;
|
|
\item Breakdown of input values to identify possible matches.
|
|
}
|
|
|
|
This will lead to the effect that e.g. \code{"E. coli"} (a microorganism highly prevalent in humans) will return the microbial ID of \emph{Escherichia coli} and not \emph{Entamoeba coli} (a microorganism less prevalent in humans), although the latter would alphabetically come first.
|
|
}
|
|
|
|
\subsection{Coping with Uncertain Results}{
|
|
|
|
In addition, the \code{\link[=as.mo]{as.mo()}} function can differentiate four levels of uncertainty to guess valid results:
|
|
\itemize{
|
|
\item Uncertainty level 0: no additional rules are applied;
|
|
\item Uncertainty level 1: allow previously accepted (but now invalid) taxonomic names and minor spelling errors;
|
|
\item Uncertainty level 2: allow all of level 1, strip values between brackets, inverse the words of the input, strip off text elements from the end keeping at least two elements;
|
|
\item Uncertainty level 3: allow all of level 1 and 2, strip off text elements from the end, allow any part of a taxonomic name.
|
|
}
|
|
|
|
The level of uncertainty can be set using the argument \code{allow_uncertain}. The default is \code{allow_uncertain = TRUE}, which is equal to uncertainty level 2. Using \code{allow_uncertain = FALSE} is equal to uncertainty level 0 and will skip all rules. You can also use e.g. \code{as.mo(..., allow_uncertain = 1)} to only allow up to level 1 uncertainty.
|
|
|
|
With the default setting (\code{allow_uncertain = TRUE}, level 2), below examples will lead to valid results:
|
|
\itemize{
|
|
\item \code{"Streptococcus group B (known as S. agalactiae)"}. The text between brackets will be removed and a warning will be thrown that the result \emph{Streptococcus group B} (\code{B_STRPT_GRPB}) needs review.
|
|
\item \code{"S. aureus - please mind: MRSA"}. The last word will be stripped, after which the function will try to find a match. If it does not, the second last word will be stripped, etc. Again, a warning will be thrown that the result \emph{Staphylococcus aureus} (\code{B_STPHY_AURS}) needs review.
|
|
\item \code{"Fluoroquinolone-resistant Neisseria gonorrhoeae"}. The first word will be stripped, after which the function will try to find a match. A warning will be thrown that the result \emph{Neisseria gonorrhoeae} (\code{B_NESSR_GNRR}) needs review.
|
|
}
|
|
|
|
There are three helper functions that can be run after using the \code{\link[=as.mo]{as.mo()}} function:
|
|
\itemize{
|
|
\item Use \code{\link[=mo_uncertainties]{mo_uncertainties()}} to get a \link{data.frame} that prints in a pretty format with all taxonomic names that were guessed. The output contains the matching score for all matches (see \emph{Matching Score for Microorganisms} below).
|
|
\item Use \code{\link[=mo_failures]{mo_failures()}} to get a \link{character} \link{vector} with all values that could not be coerced to a valid value.
|
|
\item Use \code{\link[=mo_renamed]{mo_renamed()}} to get a \link{data.frame} with all values that could be coerced based on old, previously accepted taxonomic names.
|
|
}
|
|
}
|
|
|
|
\subsection{Microbial Prevalence of Pathogens in Humans}{
|
|
|
|
The coercion rules consider the prevalence of microorganisms in humans grouped into three groups, which is available as the \code{prevalence} columns in the \link{microorganisms} data set. The grouping into human pathogenic prevalence is explained in the section \emph{Matching Score for Microorganisms} below.
|
|
}
|
|
}
|
|
\section{Source}{
|
|
|
|
\enumerate{
|
|
\item Becker K. \emph{et al.} (2014). \strong{Coagulase-Negative Staphylococci.} \emph{Clin Microbiol Rev.} 27(4): 870-926; \doi{10.1128/CMR.00109-13}
|
|
\item Becker K. \emph{et al.} (2019). \strong{Implications of identifying the recently defined members of the \emph{S. aureus} complex, \emph{S. argenteus} and \emph{S. schweitzeri}: A position paper of members of the ESCMID Study Group for staphylococci and Staphylococcal Diseases (ESGS).} \emph{Clin Microbiol Infect}; \doi{10.1016/j.cmi.2019.02.028}
|
|
\item Becker K. \emph{et al.} (2020). \strong{Emergence of coagulase-negative staphylococci} \emph{Expert Rev Anti Infect Ther.} 18(4):349-366; \doi{10.1080/14787210.2020.1730813}
|
|
\item Lancefield R.C. (1933). \strong{A serological differentiation of human and other groups of hemolytic streptococci}. \emph{J Exp Med.} 57(4): 571-95; \doi{10.1084/jem.57.4.571}
|
|
\item Berends M.S. \emph{et al.} (2022). \strong{Trends in Occurrence and Phenotypic Resistance of Coagulase-Negative Staphylococci (CoNS) Found in Human Blood in the Northern Netherlands between 2013 and 2019} \emph{Microorganisms} 10(9), 1801; \doi{10.3390/microorganisms10091801}
|
|
\item Parte, A.C. \emph{et al.} (2020). \strong{List of Prokaryotic names with Standing in Nomenclature (LPSN) moves to the DSMZ.} International Journal of Systematic and Evolutionary Microbiology, 70, 5607-5612; \doi{10.1099/ijsem.0.004332}. Accessed from \url{https://lpsn.dsmz.de} on 12 September, 2022.
|
|
\item GBIF Secretariat (November 26, 2021). GBIF Backbone Taxonomy. Checklist dataset \doi{10.15468/39omei}. Accessed from \url{https://www.gbif.org} on 12 September, 2022.
|
|
\item Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS). US Edition of SNOMED CT from 1 September 2020. Value Set Name 'Microoganism', OID 2.16.840.1.114222.4.11.1009 (v12). URL: \url{https://phinvads.cdc.gov}
|
|
}
|
|
}
|
|
|
|
\section{Matching Score for Microorganisms}{
|
|
|
|
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m}, is calculated as:
|
|
|
|
\ifelse{latex}{\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \textrm{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}}{\ifelse{html}{\figure{mo_matching_score.png}{options: width="300" alt="mo matching score"}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )}}
|
|
|
|
where:
|
|
\itemize{
|
|
\item \ifelse{html}{\out{<i>x</i> is the user input;}}{\eqn{x} is the user input;}
|
|
\item \ifelse{html}{\out{<i>n</i> is a taxonomic name (genus, species, and subspecies);}}{\eqn{n} is a taxonomic name (genus, species, and subspecies);}
|
|
\item \ifelse{html}{\out{<i>l<sub>n</sub></i> is the length of <i>n</i>;}}{l_n is the length of \eqn{n};}
|
|
\item \ifelse{html}{\out{<i>lev</i> is the <a href="https://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein distance function</a>, which counts any insertion, deletion and substitution as 1 that is needed to change <i>x</i> into <i>n</i>;}}{lev is the Levenshtein distance function, which counts any insertion, deletion and substitution as 1 that is needed to change \eqn{x} into \eqn{n};}
|
|
\item \ifelse{html}{\out{<i>p<sub>n</sub></i> is the human pathogenic prevalence group of <i>n</i>, as described below;}}{p_n is the human pathogenic prevalence group of \eqn{n}, as described below;}
|
|
\item \ifelse{html}{\out{<i>k<sub>n</sub></i> is the taxonomic kingdom of <i>n</i>, set as Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5.}}{l_n is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5.}
|
|
}
|
|
|
|
The grouping into human pathogenic prevalence (\eqn{p}) is based on experience from several microbiological laboratories in the Netherlands in conjunction with international reports on pathogen prevalence:
|
|
|
|
\strong{Group 1} (most prevalent microorganisms) consists of all microorganisms where the taxonomic class is Gammaproteobacteria or where the taxonomic genus is \emph{Enterococcus}, \emph{Staphylococcus} or \emph{Streptococcus}. This group consequently contains all common Gram-negative bacteria, such as \emph{Pseudomonas} and \emph{Legionella} and all species within the order Enterobacterales.
|
|
|
|
\strong{Group 2} consists of all microorganisms where the taxonomic phylum is Proteobacteria, Firmicutes, Actinobacteria or Sarcomastigophora, or where the taxonomic genus is \emph{Absidia}, \emph{Acanthamoeba}, \emph{Acholeplasma}, \emph{Acremonium}, \emph{Actinotignum}, \emph{Aedes}, \emph{Alistipes}, \emph{Alloprevotella}, \emph{Alternaria}, \emph{Amoeba}, \emph{Anaerosalibacter}, \emph{Ancylostoma}, \emph{Angiostrongylus}, \emph{Anisakis}, \emph{Anopheles}, \emph{Apophysomyces}, \emph{Arachnia}, \emph{Aspergillus}, \emph{Aureobasidium}, \emph{Bacteroides}, \emph{Basidiobolus}, \emph{Beauveria}, \emph{Bergeyella}, \emph{Blastocystis}, \emph{Blastomyces}, \emph{Borrelia}, \emph{Brachyspira}, \emph{Branhamella}, \emph{Butyricimonas}, \emph{Candida}, \emph{Capillaria}, \emph{Capnocytophaga}, \emph{Catabacter}, \emph{Cetobacterium}, \emph{Chaetomium}, \emph{Chlamydia}, \emph{Chlamydophila}, \emph{Chryseobacterium}, \emph{Chrysonilia}, \emph{Cladophialophora}, \emph{Cladosporium}, \emph{Conidiobolus}, \emph{Contracaecum}, \emph{Cordylobia}, \emph{Cryptococcus}, \emph{Curvularia}, \emph{Deinococcus}, \emph{Demodex}, \emph{Dermatobia}, \emph{Dientamoeba}, \emph{Diphyllobothrium}, \emph{Dirofilaria}, \emph{Dysgonomonas}, \emph{Echinostoma}, \emph{Elizabethkingia}, \emph{Empedobacter}, \emph{Entamoeba}, \emph{Enterobius}, \emph{Exophiala}, \emph{Exserohilum}, \emph{Fasciola}, \emph{Flavobacterium}, \emph{Fonsecaea}, \emph{Fusarium}, \emph{Fusobacterium}, \emph{Giardia}, \emph{Haloarcula}, \emph{Halobacterium}, \emph{Halococcus}, \emph{Hendersonula}, \emph{Heterophyes}, \emph{Histomonas}, \emph{Histoplasma}, \emph{Hymenolepis}, \emph{Hypomyces}, \emph{Hysterothylacium}, \emph{Leishmania}, \emph{Lelliottia}, \emph{Leptosphaeria}, \emph{Leptotrichia}, \emph{Lucilia}, \emph{Lumbricus}, \emph{Malassezia}, \emph{Malbranchea}, \emph{Metagonimus}, \emph{Microsporidium}, \emph{Microsporum}, \emph{Mortierella}, \emph{Mucor}, \emph{Mycocentrospora}, \emph{Mycoplasma}, \emph{Myroides}, \emph{Necator}, \emph{Nectria}, \emph{Ochroconis}, \emph{Odoribacter}, \emph{Oesophagostomum}, \emph{Oidiodendron}, \emph{Opisthorchis}, \emph{Ornithobacterium}, \emph{Parabacteroides}, \emph{Pediculus}, \emph{Pedobacter}, \emph{Phlebotomus}, \emph{Phocaeicola}, \emph{Phocanema}, \emph{Phoma}, \emph{Piedraia}, \emph{Pithomyces}, \emph{Pityrosporum}, \emph{Pneumocystis}, \emph{Porphyromonas}, \emph{Prevotella}, \emph{Pseudallescheria}, \emph{Pseudoterranova}, \emph{Pulex}, \emph{Rhizomucor}, \emph{Rhizopus}, \emph{Rhodotorula}, \emph{Riemerella}, \emph{Saccharomyces}, \emph{Sarcoptes}, \emph{Scolecobasidium}, \emph{Scopulariopsis}, \emph{Scytalidium}, \emph{Sphingobacterium}, \emph{Spirometra}, \emph{Spiroplasma}, \emph{Sporobolomyces}, \emph{Stachybotrys}, \emph{Streptobacillus}, \emph{Strongyloides}, \emph{Syngamus}, \emph{Taenia}, \emph{Tannerella}, \emph{Tenacibaculum}, \emph{Terrimonas}, \emph{Toxocara}, \emph{Treponema}, \emph{Trichinella}, \emph{Trichobilharzia}, \emph{Trichoderma}, \emph{Trichomonas}, \emph{Trichophyton}, \emph{Trichosporon}, \emph{Trichostrongylus}, \emph{Trichuris}, \emph{Tritirachium}, \emph{Trombicula}, \emph{Trypanosoma}, \emph{Tunga}, \emph{Ureaplasma}, \emph{Victivallis}, \emph{Wautersiella}, \emph{Weeksella} or \emph{Wuchereria}.
|
|
|
|
\strong{Group 3} consists of all other microorganisms.
|
|
|
|
All characters in \eqn{x} and \eqn{n} are ignored that are other than A-Z, a-z, 0-9, spaces and parentheses.
|
|
|
|
All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., \code{"E. coli"} will return the microbial ID of \emph{Escherichia coli} (\eqn{m = 0.688}, a highly prevalent microorganism found in humans) and not \emph{Entamoeba coli} (\eqn{m = 0.079}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
|
|
}
|
|
|
|
\section{Reference Data Publicly Available}{
|
|
|
|
All data sets in this \code{AMR} package (about microorganisms, antibiotics, R/SI interpretation, EUCAST rules, etc.) are publicly and freely available for download in the following formats: R, MS Excel, Apache Feather, Apache Parquet, SPSS, SAS, and Stata. We also provide tab-separated plain text files that are machine-readable and suitable for input in any software program, such as laboratory information systems. Please visit \href{https://msberends.github.io/AMR/articles/datasets.html}{our website for the download links}. The actual files are of course available on \href{https://github.com/msberends/AMR/tree/main/data-raw}{our GitHub repository}.
|
|
}
|
|
|
|
\examples{
|
|
\donttest{
|
|
# These examples all return "B_STPHY_AURS", the ID of S. aureus:
|
|
as.mo(c(
|
|
"sau", # WHONET code
|
|
"stau",
|
|
"STAU",
|
|
"staaur",
|
|
"S. aureus",
|
|
"S aureus",
|
|
"Staphylococcus aureus",
|
|
"Staphylococcus aureus (MRSA,",
|
|
"Zthafilokkoockus oureuz", # handles incorrect spelling
|
|
"MRSA", # Methicillin Resistant S. aureus
|
|
"VISA", # Vancomycin Intermediate S. aureus
|
|
"VRSA", # Vancomycin Resistant S. aureus
|
|
115329001 # SNOMED CT code
|
|
))
|
|
|
|
# Dyslexia is no problem - these all work:
|
|
as.mo(c(
|
|
"Ureaplasma urealyticum",
|
|
"Ureaplasma urealyticus",
|
|
"Ureaplasmium urealytica",
|
|
"Ureaplazma urealitycium"
|
|
))
|
|
|
|
as.mo("Streptococcus group A")
|
|
|
|
as.mo("S. epidermidis") # will remain species: B_STPHY_EPDR
|
|
as.mo("S. epidermidis", Becker = TRUE) # will not remain species: B_STPHY_CONS
|
|
|
|
as.mo("S. pyogenes") # will remain species: B_STRPT_PYGN
|
|
as.mo("S. pyogenes", Lancefield = TRUE) # will not remain species: B_STRPT_GRPA
|
|
|
|
# All mo_* functions use as.mo() internally too (see ?mo_property):
|
|
mo_genus("E. coli")
|
|
mo_gramstain("ESCO")
|
|
mo_is_intrinsic_resistant("ESCCOL", ab = "vanco")
|
|
}
|
|
}
|
|
\seealso{
|
|
\link{microorganisms} for the \link{data.frame} that is being used to determine ID's.
|
|
|
|
The \code{\link[=mo_property]{mo_*}} functions (such as \code{\link[=mo_genus]{mo_genus()}}, \code{\link[=mo_gramstain]{mo_gramstain()}}) to get properties based on the returned code.
|
|
}
|
|
\keyword{Becker}
|
|
\keyword{Lancefield}
|
|
\keyword{becker}
|
|
\keyword{guess}
|
|
\keyword{lancefield}
|
|
\keyword{mo}
|