1
0
mirror of https://github.com/msberends/AMR.git synced 2024-12-25 06:46:11 +01:00

documentation

This commit is contained in:
dr. M.S. (Matthijs) Berends 2023-01-06 19:21:04 +01:00
parent cf16bc7de1
commit 84ed8c32bb
12 changed files with 37 additions and 64 deletions

View File

@ -35,8 +35,8 @@ echo "Running pre-commit hook..."
if command -v Rscript > /dev/null; then
if [ "$(Rscript -e 'cat(all(c('"'pkgload'"', '"'devtools'"', '"'dplyr'"') %in% rownames(installed.packages())))')" = "TRUE" ]; then
Rscript -e "source('data-raw/_pre_commit_hook.R')"
currentpkg=`Rscript -e "cat(pkgload::pkg_name())"`
echo "-> Adding all files in 'data-raw' to this commit"
currentpkg=$(Rscript -e "cat(pkgload::pkg_name())")
echo "-> Adding files in 'data-raw' and 'man' to this commit"
git add data-raw/*
git add man/*
git add R/sysdata.rda
@ -57,18 +57,18 @@ echo "Updating semantic versioning and date..."
# get tags from remote, and remove tags not on remote:
git fetch origin --prune --prune-tags --quiet
currenttagfull=`git describe --tags --abbrev=0`
currenttag=`git describe --tags --abbrev=0 | sed 's/v//'`
currenttagfull=$(git describe --tags --abbrev=0)
currenttag=$(git describe --tags --abbrev=0 | sed 's/v//')
# assume main branch to be 'main' or 'master', pick the right name:
defaultbranch=`git branch | cut -c 3- | grep -E '^master$|^main$'`
defaultbranch=$(git branch | cut -c 3- | grep -E '^master$|^main$')
if [ "$currenttag" = "" ]; then
# there is no tag, so set tag to 0.0.1 and commit index to current count
currenttag="0.0.1"
currentcommit=`git rev-list --count ${defaultbranch}`
currentcommit=$(git rev-list --count ${defaultbranch})
echo "- no git tags found, create one in format 'v(x).(y).(z)' - curently ${currentcommit} previous commits in ${defaultbranch}"
else
# there is a tag, so base version number on that
currentcommit=`git rev-list --count ${currenttagfull}..${defaultbranch}`
currentcommit=$(git rev-list --count ${currenttagfull}..${defaultbranch})
if (( "$currentcommit" == 0 )); then
# tag is new, so this must become the version number
currentversion="$currenttag"

View File

@ -1,5 +1,5 @@
Package: AMR
Version: 1.8.2.9085
Version: 1.8.2.9086
Date: 2023-01-06
Title: Antimicrobial Resistance Data Analysis
Description: Functions to simplify and standardise antimicrobial resistance (AMR)

View File

@ -1,4 +1,4 @@
# AMR 1.8.2.9085
# AMR 1.8.2.9086
*(this beta version will eventually become v2.0! We're happy to reach a new major milestone soon!)*

View File

@ -229,7 +229,7 @@ search_type_in_df <- function(x, type, info = TRUE) {
# take first 'mo' column
found <- colnames(x)[vapply(FUN.VALUE = logical(1), x, is.mo)]
} else if ("mo" %in% colnames_formatted &&
suppressWarnings(all(x$mo %in% c(NA, AMR::microorganisms$mo)))) {
suppressWarnings(all(x$mo %in% c(NA, AMR_env$MO_lookup$mo)))) {
found <- "mo"
} else if (any(colnames_formatted %like_case% "^(mo|microorganism|organism|bacteria|ba[ck]terie)s?$")) {
found <- sort(colnames(x)[colnames_formatted %like_case% "^(mo|microorganism|organism|bacteria|ba[ck]terie)s?$"])

View File

@ -1116,7 +1116,7 @@ edit_rsi <- function(x,
error = function(e) {
txt_error()
stop(paste0(
"In row(s) ", paste(rows[1:min(length(rows), 10)], collapse = ","),
"In row(s) ", paste(rows[seq_len(min(length(rows), 10))], collapse = ","),
ifelse(length(rows) > 10, "...", ""),
" while writing value '", to,
"' to column(s) `", paste(cols, collapse = "`, `"),

View File

@ -165,10 +165,11 @@ join_microorganisms <- function(type, x, by, suffix, ...) {
# otherwise use poorman, see R/aa_helper_pm_functions.R
join_fn <- get(paste0("pm_", type), envir = asNamespace("AMR"))
}
MO_df <- AMR_env$MO_lookup[, colnames(AMR::microorganisms), drop = FALSE]
if (type %like% "full|left|right|inner") {
joined <- join_fn(x = x, y = AMR::microorganisms, by = by, suffix = suffix, ...)
joined <- join_fn(x = x, y = MO_df, by = by, suffix = suffix, ...)
} else {
joined <- join_fn(x = x, y = AMR::microorganisms, by = by, ...)
joined <- join_fn(x = x, y = MO_df, by = by, ...)
}
if ("join.mo" %in% colnames(joined)) {

View File

@ -30,7 +30,7 @@
#' Calculate the Matching Score for Microorganisms
#'
#' This algorithm is used by [as.mo()] and all the [`mo_*`][mo_property()] functions to determine the most probable match of taxonomic records based on user input.
#' @author Dr. Matthijs Berends
#' @author Dr. Matthijs Berends, 2018
#' @param x Any user input value(s)
#' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms]
#' @note This algorithm was originally described in: Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
@ -43,7 +43,7 @@
#'
#' where:
#'
#' * \ifelse{html}{\out{<i>x</i> is the user input;}}{\eqn{x} is the user input;}
#' * \eqn{x} is the user input;
#' * \ifelse{html}{\out{<i>n</i> is a taxonomic name (genus, species, and subspecies);}}{\eqn{n} is a taxonomic name (genus, species, and subspecies);}
#' * \ifelse{html}{\out{<i>l<sub>n</sub></i> is the length of <i>n</i>;}}{l_n is the length of \eqn{n};}
#' * \ifelse{html}{\out{<i>lev</i> is the <a href="https://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein distance function</a> (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change <i>x</i> into <i>n</i>;}}{lev is the Levenshtein distance function (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};}

View File

@ -36,12 +36,13 @@
#' @param ... other arguments passed on to [as.mo()], such as 'minimum_matching_score', 'ignore_pattern', and 'remove_from_input'
#' @param ab any (vector of) text that can be coerced to a valid antibiotic drug code with [as.ab()]
#' @param open browse the URL using [`browseURL()`][utils::browseURL()]
#' @details All functions will, at default, keep old taxonomic properties. Please refer to this example, knowing that *Escherichia blattae* was renamed to *Shimwellia blattae* in 2010:
#' - `mo_name("Escherichia blattae")` will return `"Shimwellia blattae"` (with a note about the renaming)
#' - `mo_ref("Escherichia blattae", keep_synonyms = TRUE)` will return `"Burgess et al., 1973"` (without a note)
#' - `mo_ref("Shimwellia blattae", keep_synonyms = FALSE)` will return `"Priest et al., 2010"` (without a note)
#' @details All functions will, at default, **not** keep old taxonomic properties, as synonyms are automatically replaced with the current taxonomy. Take for example *Escherichia blattae*, which was renamed to *Shimwellia blattae* in 2010:
#' - `mo_genus("Escherichia blattae")` will return `"Shemwellia"` (with a note about the renaming)
#' - `mo_genus("Escherichia blattae", keep_synonyms = TRUE)` will return `"Escherichia"` (with a warning that the name is outdated)
#' - `mo_ref("Escherichia blattae")` will return `"Priest et al., 2010"` (with a note)
#' - `mo_ref("Escherichia blattae", keep_synonyms = TRUE)` will return `"Burgess et al., 1973"` (with a warning)
#'
#' The short name - [mo_shortname()] - almost always returns the first character of the genus and the full species, like `"E. coli"`. Exceptions are abbreviations of staphylococci (such as *"CoNS"*, Coagulase-Negative Staphylococci) and beta-haemolytic streptococci (such as *"GBS"*, Group B Streptococci). Please bear in mind that e.g. *E. coli* could mean *Escherichia coli* (kingdom of Bacteria) as well as *Entamoeba coli* (kingdom of Protozoa). Returning to the full name will be done using [as.mo()] internally, giving priority to bacteria and human pathogens, i.e. `"E. coli"` will be considered *Escherichia coli*. In other words, `mo_fullname(mo_shortname("Entamoeba coli"))` returns `"Escherichia coli"`.
#' The short name ([mo_shortname()]) returns the first character of the genus and the full species, such as `"E. coli"`, for species and subspecies. Exceptions are abbreviations of staphylococci (such as *"CoNS"*, Coagulase-Negative Staphylococci) and beta-haemolytic streptococci (such as *"GBS"*, Group B Streptococci). Please bear in mind that e.g. *E. coli* could mean *Escherichia coli* (kingdom of Bacteria) as well as *Entamoeba coli* (kingdom of Protozoa). Returning to the full name will be done using [as.mo()] internally, giving priority to bacteria and human pathogens, i.e. `"E. coli"` will be considered *Escherichia coli*. In other words, `mo_fullname(mo_shortname("Entamoeba coli"))` returns `"Escherichia coli"`.
#'
#' Since the top-level of the taxonomy is sometimes referred to as 'kingdom' and sometimes as 'domain', the functions [mo_kingdom()] and [mo_domain()] return the exact same results.
#'
@ -60,7 +61,8 @@
#' SNOMED codes ([mo_snomed()]) are from the version of `r documentation_date(TAXONOMY_VERSION$SNOMED$accessed_date)`. See *Source* and the [microorganisms] data set for more info.
#'
#' Old taxonomic names (so-called 'synonyms') can be retrieved with [mo_synonyms()], the current taxonomic name can be retrieved with [mo_current()]. Both functions return full names.
#' @inheritSection mo_matching_score Matching Score for Microorganisms
#' @section Matching Score for Microorganisms:
#' This function uses [as.mo()] internally, which uses an advanced algorithm to translate arbitrary user input to valid taxonomy using a so-called matching score. You can read about this public algorithm on the [MO matching score page][mo_matching_score()].
#' @inheritSection as.mo Source
#' @rdname mo_property
#' @name mo_property

View File

@ -288,9 +288,9 @@ check_validity_mo_source <- function(x, refer_to_name = "`reference_df`", stop_o
return(FALSE)
}
}
if (!all(x$mo %in% c("", AMR::microorganisms$mo, AMR::microorganisms$fullname), na.rm = TRUE)) {
if (!all(x$mo %in% c("", AMR_env$MO_lookup$mo, AMR_env$MO_lookup$fullname), na.rm = TRUE)) {
if (stop_on_error == TRUE) {
invalid <- x[which(!x$mo %in% c("", AMR::microorganisms$mo, AMR::microorganisms$fullname)), , drop = FALSE]
invalid <- x[which(!x$mo %in% c("", AMR_env$MO_lookup$mo, AMR_env$MO_lookup$fullname)), , drop = FALSE]
if (nrow(invalid) > 1) {
plural <- "s"
} else {

View File

@ -135,7 +135,7 @@ With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\li
where:
\itemize{
\item \ifelse{html}{\out{<i>x</i> is the user input;}}{\eqn{x} is the user input;}
\item \eqn{x} is the user input;
\item \ifelse{html}{\out{<i>n</i> is a taxonomic name (genus, species, and subspecies);}}{\eqn{n} is a taxonomic name (genus, species, and subspecies);}
\item \ifelse{html}{\out{<i>l<sub>n</sub></i> is the length of <i>n</i>;}}{l_n is the length of \eqn{n};}
\item \ifelse{html}{\out{<i>lev</i> is the <a href="https://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein distance function</a> (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change <i>x</i> into <i>n</i>;}}{lev is the Levenshtein distance function (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};}

View File

@ -27,7 +27,7 @@ With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\li
where:
\itemize{
\item \ifelse{html}{\out{<i>x</i> is the user input;}}{\eqn{x} is the user input;}
\item \eqn{x} is the user input;
\item \ifelse{html}{\out{<i>n</i> is a taxonomic name (genus, species, and subspecies);}}{\eqn{n} is a taxonomic name (genus, species, and subspecies);}
\item \ifelse{html}{\out{<i>l<sub>n</sub></i> is the length of <i>n</i>;}}{l_n is the length of \eqn{n};}
\item \ifelse{html}{\out{<i>lev</i> is the <a href="https://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein distance function</a> (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change <i>x</i> into <i>n</i>;}}{lev is the Levenshtein distance function (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};}
@ -70,5 +70,5 @@ mo_matching_score(
)
}
\author{
Dr. Matthijs Berends
Dr. Matthijs Berends, 2018
}

View File

@ -294,14 +294,15 @@ mo_property(
Use these functions to return a specific property of a microorganism based on the latest accepted taxonomy. All input values will be evaluated internally with \code{\link[=as.mo]{as.mo()}}, which makes it possible to use microbial abbreviations, codes and names as input. See \emph{Examples}.
}
\details{
All functions will, at default, keep old taxonomic properties. Please refer to this example, knowing that \emph{Escherichia blattae} was renamed to \emph{Shimwellia blattae} in 2010:
All functions will, at default, \strong{not} keep old taxonomic properties, as synonyms are automatically replaced with the current taxonomy. Take for example \emph{Escherichia blattae}, which was renamed to \emph{Shimwellia blattae} in 2010:
\itemize{
\item \code{mo_name("Escherichia blattae")} will return \code{"Shimwellia blattae"} (with a note about the renaming)
\item \code{mo_ref("Escherichia blattae", keep_synonyms = TRUE)} will return \code{"Burgess et al., 1973"} (without a note)
\item \code{mo_ref("Shimwellia blattae", keep_synonyms = FALSE)} will return \code{"Priest et al., 2010"} (without a note)
\item \code{mo_genus("Escherichia blattae")} will return \code{"Shemwellia"} (with a note about the renaming)
\item \code{mo_genus("Escherichia blattae", keep_synonyms = TRUE)} will return \code{"Escherichia"} (with a warning that the name is outdated)
\item \code{mo_ref("Escherichia blattae")} will return \code{"Priest et al., 2010"} (with a note)
\item \code{mo_ref("Escherichia blattae", keep_synonyms = TRUE)} will return \code{"Burgess et al., 1973"} (with a warning)
}
The short name - \code{\link[=mo_shortname]{mo_shortname()}} - almost always returns the first character of the genus and the full species, like \code{"E. coli"}. Exceptions are abbreviations of staphylococci (such as \emph{"CoNS"}, Coagulase-Negative Staphylococci) and beta-haemolytic streptococci (such as \emph{"GBS"}, Group B Streptococci). Please bear in mind that e.g. \emph{E. coli} could mean \emph{Escherichia coli} (kingdom of Bacteria) as well as \emph{Entamoeba coli} (kingdom of Protozoa). Returning to the full name will be done using \code{\link[=as.mo]{as.mo()}} internally, giving priority to bacteria and human pathogens, i.e. \code{"E. coli"} will be considered \emph{Escherichia coli}. In other words, \code{mo_fullname(mo_shortname("Entamoeba coli"))} returns \code{"Escherichia coli"}.
The short name (\code{\link[=mo_shortname]{mo_shortname()}}) returns the first character of the genus and the full species, such as \code{"E. coli"}, for species and subspecies. Exceptions are abbreviations of staphylococci (such as \emph{"CoNS"}, Coagulase-Negative Staphylococci) and beta-haemolytic streptococci (such as \emph{"GBS"}, Group B Streptococci). Please bear in mind that e.g. \emph{E. coli} could mean \emph{Escherichia coli} (kingdom of Bacteria) as well as \emph{Entamoeba coli} (kingdom of Protozoa). Returning to the full name will be done using \code{\link[=as.mo]{as.mo()}} internally, giving priority to bacteria and human pathogens, i.e. \code{"E. coli"} will be considered \emph{Escherichia coli}. In other words, \code{mo_fullname(mo_shortname("Entamoeba coli"))} returns \code{"Escherichia coli"}.
Since the top-level of the taxonomy is sometimes referred to as 'kingdom' and sometimes as 'domain', the functions \code{\link[=mo_kingdom]{mo_kingdom()}} and \code{\link[=mo_domain]{mo_domain()}} return the exact same results.
@ -323,38 +324,7 @@ Old taxonomic names (so-called 'synonyms') can be retrieved with \code{\link[=mo
}
\section{Matching Score for Microorganisms}{
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m}, is calculated as:
\ifelse{latex}{\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \textrm{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}}{\ifelse{html}{\figure{mo_matching_score.png}{options: width="300" alt="mo matching score"}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )}}
where:
\itemize{
\item \ifelse{html}{\out{<i>x</i> is the user input;}}{\eqn{x} is the user input;}
\item \ifelse{html}{\out{<i>n</i> is a taxonomic name (genus, species, and subspecies);}}{\eqn{n} is a taxonomic name (genus, species, and subspecies);}
\item \ifelse{html}{\out{<i>l<sub>n</sub></i> is the length of <i>n</i>;}}{l_n is the length of \eqn{n};}
\item \ifelse{html}{\out{<i>lev</i> is the <a href="https://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein distance function</a> (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change <i>x</i> into <i>n</i>;}}{lev is the Levenshtein distance function (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};}
\item \ifelse{html}{\out{<i>p<sub>n</sub></i> is the human pathogenic prevalence group of <i>n</i>, as described below;}}{p_n is the human pathogenic prevalence group of \eqn{n}, as described below;}
\item \ifelse{html}{\out{<i>k<sub>n</sub></i> is the taxonomic kingdom of <i>n</i>, set as Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5.}}{l_n is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5.}
}
The grouping into human pathogenic prevalence (\eqn{p}) is based on recent work from Bartlett \emph{et al.} (2022, \doi{10.1099/mic.0.001269}) who extensively studied medical-scientific literature to categorise all bacterial species into these groups:
\itemize{
\item \strong{Established}, if a taxonomic species has infected at least three persons in three or more references. These records have \code{prevalence = 1.0} in the \link{microorganisms} data set;
\item \strong{Putative}, if a taxonomic species has fewer than three known cases. These records have \code{prevalence = 1.25} in the \link{microorganisms} data set.
}
Furthermore,
\itemize{
\item Any genus present in the \strong{established} list also has \code{prevalence = 1.0} in the \link{microorganisms} data set;
\item Any other genus present in the \strong{putative} list has \code{prevalence = 1.25} in the \link{microorganisms} data set;
\item Any other species or subspecies of which the genus is present in the two aforementioned groups, has \code{prevalence = 1.5} in the \link{microorganisms} data set;
\item Any \emph{non-bacterial} genus, species or subspecies of which the genus is present in the following list, has \code{prevalence = 1.5} in the \link{microorganisms} data set: \emph{Absidia}, \emph{Acanthamoeba}, \emph{Acremonium}, \emph{Aedes}, \emph{Alternaria}, \emph{Amoeba}, \emph{Ancylostoma}, \emph{Angiostrongylus}, \emph{Anisakis}, \emph{Anopheles}, \emph{Apophysomyces}, \emph{Aspergillus}, \emph{Aureobasidium}, \emph{Basidiobolus}, \emph{Beauveria}, \emph{Blastocystis}, \emph{Blastomyces}, \emph{Candida}, \emph{Capillaria}, \emph{Chaetomium}, \emph{Chrysonilia}, \emph{Cladophialophora}, \emph{Cladosporium}, \emph{Conidiobolus}, \emph{Contracaecum}, \emph{Cordylobia}, \emph{Cryptococcus}, \emph{Curvularia}, \emph{Demodex}, \emph{Dermatobia}, \emph{Dientamoeba}, \emph{Diphyllobothrium}, \emph{Dirofilaria}, \emph{Echinostoma}, \emph{Entamoeba}, \emph{Enterobius}, \emph{Exophiala}, \emph{Exserohilum}, \emph{Fasciola}, \emph{Fonsecaea}, \emph{Fusarium}, \emph{Giardia}, \emph{Haloarcula}, \emph{Halobacterium}, \emph{Halococcus}, \emph{Hendersonula}, \emph{Heterophyes}, \emph{Histomonas}, \emph{Histoplasma}, \emph{Hymenolepis}, \emph{Hypomyces}, \emph{Hysterothylacium}, \emph{Leishmania}, \emph{Malassezia}, \emph{Malbranchea}, \emph{Metagonimus}, \emph{Meyerozyma}, \emph{Microsporidium}, \emph{Microsporum}, \emph{Mortierella}, \emph{Mucor}, \emph{Mycocentrospora}, \emph{Necator}, \emph{Nectria}, \emph{Ochroconis}, \emph{Oesophagostomum}, \emph{Oidiodendron}, \emph{Opisthorchis}, \emph{Pediculus}, \emph{Phlebotomus}, \emph{Phoma}, \emph{Pichia}, \emph{Piedraia}, \emph{Pithomyces}, \emph{Pityrosporum}, \emph{Pneumocystis}, \emph{Pseudallescheria}, \emph{Pseudoterranova}, \emph{Pulex}, \emph{Rhizomucor}, \emph{Rhizopus}, \emph{Rhodotorula}, \emph{Saccharomyces}, \emph{Sarcoptes}, \emph{Scolecobasidium}, \emph{Scopulariopsis}, \emph{Scytalidium}, \emph{Spirometra}, \emph{Sporobolomyces}, \emph{Stachybotrys}, \emph{Strongyloides}, \emph{Syngamus}, \emph{Taenia}, \emph{Toxocara}, \emph{Trichinella}, \emph{Trichobilharzia}, \emph{Trichoderma}, \emph{Trichomonas}, \emph{Trichophyton}, \emph{Trichosporon}, \emph{Trichostrongylus}, \emph{Trichuris}, \emph{Tritirachium}, \emph{Trombicula}, \emph{Trypanosoma}, \emph{Tunga} or \emph{Wuchereria};
\item All other records have \code{prevalence = 2.0} in the \link{microorganisms} data set.
}
When calculating the matching score, all characters in \eqn{x} and \eqn{n} are ignored that are other than A-Z, a-z, 0-9, spaces and parentheses.
All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., \code{"E. coli"} will return the microbial ID of \emph{Escherichia coli} (\eqn{m = 0.688}, a highly prevalent microorganism found in humans) and not \emph{Entamoeba coli} (\eqn{m = 0.159}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
This function uses \code{\link[=as.mo]{as.mo()}} internally, which uses an advanced algorithm to translate arbitrary user input to valid taxonomy using a so-called matching score. You can read about this public algorithm on the \link[=mo_matching_score]{MO matching score page}.
}
\section{Source}{