From 22f6ceb3e41119d7b305d124fcbaf6eac3066b84 Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Sat, 26 Sep 2020 16:51:17 +0200 Subject: [PATCH] (v1.3.0.9031) matching score update --- DESCRIPTION | 2 +- NEWS.md | 2 +- R/mo.R | 16 ++++++++-------- R/mo_matching_score.R | 17 +++++++++-------- docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 8 ++++---- docs/pkgdown.yml | 2 +- docs/reference/as.mo.html | 17 +++++++++-------- docs/reference/index.html | 2 +- docs/reference/mo_matching_score.html | 21 +++++++++------------ docs/reference/mo_property.html | 17 +++++++++-------- docs/survey.html | 2 +- man/as.mo.Rd | 16 +++++++++------- man/mo_matching_score.Rd | 18 +++++++++--------- man/mo_property.Rd | 16 +++++++++------- 19 files changed, 85 insertions(+), 81 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f340b27f..c575e498 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: AMR -Version: 1.3.0.9030 +Version: 1.3.0.9031 Date: 2020-09-26 Title: Antimicrobial Resistance Analysis Authors@R: c( diff --git a/NEWS.md b/NEWS.md index ad4bf8e5..146e7098 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# AMR 1.3.0.9030 +# AMR 1.3.0.9031 ## Last updated: 26 September 2020 Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscipt to. We are those reviewers very grateful for going through our code so thoroughly! 
diff --git a/R/mo.R b/R/mo.R index ddc5e5a5..03487c8a 100755 --- a/R/mo.R +++ b/R/mo.R @@ -301,7 +301,7 @@ exec_as.mo <- function(x, initial = initial_search, uncertainty = actual_uncertainty, input_actual = actual_input) { - + if (!is.null(input_actual)) { input <- input_actual } else { @@ -318,7 +318,7 @@ exec_as.mo <- function(x, if (NROW(res_df) > 1 & uncertainty != -1) { # sort the findings on matching score scores <- mo_matching_score(x = input, - fullname = res_df[, "fullname", drop = TRUE]) + n = res_df[, "fullname", drop = TRUE]) res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE] } res <- as.character(res_df[, column, drop = TRUE]) @@ -442,7 +442,7 @@ exec_as.mo <- function(x, # we need special treatment for very prevalent full names, they are likely! # e.g. as.mo("Staphylococcus aureus") x <- MO_lookup[match(tolower(x), MO_lookup$fullname_lower), property, drop = TRUE] - + } else if (all(x %in% reference_data_to_use$fullname)) { # we need special treatment for very prevalent full names, they are likely! # e.g. as.mo("Staphylococcus aureus") @@ -1544,7 +1544,7 @@ exec_as.mo <- function(x, # this will save the uncertain items as attribute, so they can be bound to `uncertainties` in the uncertain_fn() function x <- structure(x, uncertainties = uncertainties) } - + if (old_mo_warning == TRUE & property != "mo") { warning("The input contained old microorganism IDs from previous versions of this package.\nPlease use `as.mo()` on these old IDs to transform them to the new format.\nSUPPORT FOR THIS WILL BE DROPPED IN A FUTURE VERSION.", call. = FALSE) @@ -1639,7 +1639,7 @@ freq.mo <- function(x, ...) { ")"), `No. of genera` = pm_n_distinct(mo_genus(x_noNA, language = NULL)), `No. of species` = pm_n_distinct(paste(mo_genus(x_noNA, language = NULL), - mo_species(x_noNA, language = NULL))))) + mo_species(x_noNA, language = NULL))))) } #' @method print mo @@ -1773,7 +1773,7 @@ print.mo_uncertainties <- function(x, ...) 
{ if (x[i, ]$candidates != "") { candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE)) scores <- mo_matching_score(x = x[i, ]$input, - fullname = candidates) + n = candidates) # sort on descending scores candidates <- candidates[order(1 - scores)] n_candidates <- length(candidates) @@ -1799,8 +1799,8 @@ print.mo_uncertainties <- function(x, ...) { ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""), " (", x[i, ]$mo, ", matching score = ", trimws(percentage(mo_matching_score(x = x[i, ]$input, - fullname = x[i, ]$fullname), - digits = 1)), + n = x[i, ]$fullname), + digits = 1)), ") "), uncertainty_interpretation, candidates), diff --git a/R/mo_matching_score.R b/R/mo_matching_score.R index 9c92a645..38d3bf23 100755 --- a/R/mo_matching_score.R +++ b/R/mo_matching_score.R @@ -24,20 +24,21 @@ #' This helper function is used by [as.mo()] to determine the most probable match of taxonomic records, based on user input. #' @param x Any user input value(s) #' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms] -#' @param uncertainty The level of uncertainty set in [as.mo()], see `allow_uncertain` in that function (here, it defaults to 1, but is automatically determined in [as.mo()] based on the number of transformations needed to get to a result) #' @section Matching score for microorganisms: -#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m} is calculated as: +#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. 
This matching score \eqn{m}, ranging from 0 to 100%, is calculated as: #' #' \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )} #' #' where: #' #' * \eqn{x} is the user input; -#' * \eqn{n} is a taxonomic name (genus, species and subspecies); -#' * \eqn{l_{n}}{l_n} is the length of the taxonomic name; -#' * \eqn{\operatorname{lev}}{lev} is the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) function; -#' * \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see *Details* in `?as.mo`), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; -#' * \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +#' * \eqn{n} is a taxonomic name (genus, species and subspecies) as found in [`microorganisms$fullname`][microorganisms]; +#' * \eqn{l_{n}}{l_n} is the length of \eqn{n}; +#' * \eqn{\operatorname{lev}}{lev} is the [Levenshtein distance function](https://en.wikipedia.org/wiki/Levenshtein_distance); +#' * \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see *Details* in `?as.mo`), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +#' * \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +#' +#' This means that the user input `x = "E. 
coli"` gets for *Escherichia coli* a matching score of `r percentage(mo_matching_score("E. coli", "Escherichia coli"), 1)` and for *Entamoeba coli* a matching score of `r percentage(mo_matching_score("E. coli", "Entamoeba coli"), 1)`. #' #' All matches are sorted descending on their matching score and for all user input values, the top match will be returned. #' @export diff --git a/docs/404.html b/docs/404.html index 027deb6b..a7621ac0 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 64e44ae9..aebcd21f 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 diff --git a/docs/articles/index.html b/docs/articles/index.html index 17031608..f3e318a5 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 diff --git a/docs/authors.html b/docs/authors.html index 7b238d04..364cd51f 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 diff --git a/docs/index.html b/docs/index.html index 8d3c70f1..88043e09 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 diff --git a/docs/news/index.html b/docs/news/index.html index 579687d7..3b15fcdc 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 @@ -236,9 +236,9 @@ Source: NEWS.md -
-

-AMR 1.3.0.9030 Unreleased +
+

+AMR 1.3.0.9031 Unreleased

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 0ae45387..6e64473f 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -2,7 +2,7 @@ pandoc: 2.7.3 pkgdown: 1.5.1.9000 pkgdown_sha: eae56f08694abebf93cdfc0dd8e9ede06d8c815f articles: [] -last_built: 2020-09-26T14:25Z +last_built: 2020-09-26T14:51Z urls: reference: https://msberends.github.io/AMR/reference article: https://msberends.github.io/AMR/articles diff --git a/docs/reference/as.mo.html b/docs/reference/as.mo.html index 022f6bcf..42086cb2 100644 --- a/docs/reference/as.mo.html +++ b/docs/reference/as.mo.html @@ -82,7 +82,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031

@@ -388,17 +388,18 @@ The lifecycle of this function is stableWith ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\) is calculated as:

-

$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$

+

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\), ranging from 0 to 100%, is calculated as:

+

$$m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}$$

where:

  • \(x\) is the user input;

  • -
  • \(n\) is a taxonomic name (genus, species and subspecies);

  • -
  • \(l_{n}\) is the length of the taxonomic name;

  • -
  • \(\operatorname{lev}\) is the Levenshtein distance function;

  • -
  • \(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see Details in ?as.mo), meaning that \(p = \{1, 2 , 3\}\);

  • -
  • \(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).

  • +
  • \(n\) is a taxonomic name (genus, species and subspecies) as found in microorganisms$fullname;

  • +
  • \(l_{n}\) is the length of \(n\);

  • +
  • \(\operatorname{lev}\) is the Levenshtein distance function;

  • +
  • \(p_{n}\) is the human pathogenic prevalence of \(n\), categorised into group \(1\), \(2\) and \(3\) (see Details in ?as.mo), meaning that \(p = \{1, 2 , 3\}\);

  • +
  • \(k_{n}\) is the kingdom index of \(n\), set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).

+

This means that the user input x = "E. coli" gets for Escherichia coli a matching score of 68.8% and for Entamoeba coli a matching score of 7.9%.

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

Catalogue of Life

diff --git a/docs/reference/index.html b/docs/reference/index.html index 4842f928..837384be 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031
diff --git a/docs/reference/mo_matching_score.html b/docs/reference/mo_matching_score.html index feabf445..93af2ea2 100644 --- a/docs/reference/mo_matching_score.html +++ b/docs/reference/mo_matching_score.html @@ -82,7 +82,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031

@@ -255,27 +255,24 @@ n

A full taxonomic name, that exists in microorganisms$fullname

- - uncertainty -

The level of uncertainty set in as.mo(), see allow_uncertain in that function (here, it defaults to 1, but is automatically determined in as.mo() based on the number of transformations needed to get to a result)

-

Matching score for microorganisms

-

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\) is calculated as:

-

$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$

+

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\), ranging from 0 to 100%, is calculated as:

+

$$m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}$$

where:

+

This means that the user input x = "E. coli" gets for Escherichia coli a matching score of 68.8% and for Entamoeba coli a matching score of 7.9%.

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

Examples

diff --git a/docs/reference/mo_property.html b/docs/reference/mo_property.html index b44a7ad4..127bf9ee 100644 --- a/docs/reference/mo_property.html +++ b/docs/reference/mo_property.html @@ -82,7 +82,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 @@ -350,17 +350,18 @@ The lifecycle of this function is stableWith ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\) is calculated as:

-

$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$

+

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\), ranging from 0 to 100%, is calculated as:

+

$$m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}$$

where:

  • \(x\) is the user input;

  • -
  • \(n\) is a taxonomic name (genus, species and subspecies);

  • -
  • \(l_{n}\) is the length of the taxonomic name;

  • -
  • \(\operatorname{lev}\) is the Levenshtein distance function;

  • -
  • \(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see Details in ?as.mo), meaning that \(p = \{1, 2 , 3\}\);

  • -
  • \(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).

  • +
  • \(n\) is a taxonomic name (genus, species and subspecies) as found in microorganisms$fullname;

  • +
  • \(l_{n}\) is the length of \(n\);

  • +
  • \(\operatorname{lev}\) is the Levenshtein distance function;

  • +
  • \(p_{n}\) is the human pathogenic prevalence of \(n\), categorised into group \(1\), \(2\) and \(3\) (see Details in ?as.mo), meaning that \(p = \{1, 2 , 3\}\);

  • +
  • \(k_{n}\) is the kingdom index of \(n\), set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).

+

This means that the user input x = "E. coli" gets for Escherichia coli a matching score of 68.8% and for Entamoeba coli a matching score of 7.9%.

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

Catalogue of Life

diff --git a/docs/survey.html b/docs/survey.html index efed9f11..dad18db7 100644 --- a/docs/survey.html +++ b/docs/survey.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9030 + 1.3.0.9031 diff --git a/man/as.mo.Rd b/man/as.mo.Rd index c06b73ea..f5a3dbd3 100644 --- a/man/as.mo.Rd +++ b/man/as.mo.Rd @@ -146,20 +146,22 @@ If the unlying code needs breaking changes, they will occur gradually. For examp \section{Matching score for microorganisms}{ -With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as: +With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m}, ranging from 0 to 100\%, is calculated as: -\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )} +\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )} where: \itemize{ \item \eqn{x} is the user input; -\item \eqn{n} is a taxonomic name (genus, species and subspecies); -\item \eqn{l_{n}}{l_n} is the length of the taxonomic name; -\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function; -\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; -\item \eqn{k} is the kingdom index, set as follows: Bacteria = 
\eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +\item \eqn{n} is a taxonomic name (genus, species and subspecies) as found in \code{\link[=microorganisms]{microorganisms$fullname}}; +\item \eqn{l_{n}}{l_n} is the length of \eqn{n}; +\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function}; +\item \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +\item \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. } +This means that the user input \code{x = "E. coli"} gets for \emph{Escherichia coli} a matching score of 68.8\% and for \emph{Entamoeba coli} a matching score of 7.9\%. + All matches are sorted descending on their matching score and for all user input values, the top match will be returned. 
} diff --git a/man/mo_matching_score.Rd b/man/mo_matching_score.Rd index 43fcd90f..975aa02e 100644 --- a/man/mo_matching_score.Rd +++ b/man/mo_matching_score.Rd @@ -10,28 +10,28 @@ mo_matching_score(x, n) \item{x}{Any user input value(s)} \item{n}{A full taxonomic name, that exists in \code{\link[=microorganisms]{microorganisms$fullname}}} - -\item{uncertainty}{The level of uncertainty set in \code{\link[=as.mo]{as.mo()}}, see \code{allow_uncertain} in that function (here, it defaults to 1, but is automatically determined in \code{\link[=as.mo]{as.mo()}} based on the number of transformations needed to get to a result)} } \description{ This helper function is used by \code{\link[=as.mo]{as.mo()}} to determine the most probable match of taxonomic records, based on user input. } \section{Matching score for microorganisms}{ -With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as: +With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. 
This matching score \eqn{m}, ranging from 0 to 100\%, is calculated as: -\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )} +\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )} where: \itemize{ \item \eqn{x} is the user input; -\item \eqn{n} is a taxonomic name (genus, species and subspecies); -\item \eqn{l_{n}}{l_n} is the length of the taxonomic name; -\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function; -\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; -\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +\item \eqn{n} is a taxonomic name (genus, species and subspecies) as found in \code{\link[=microorganisms]{microorganisms$fullname}}; +\item \eqn{l_{n}}{l_n} is the length of \eqn{n}; +\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function}; +\item \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +\item \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. 
} +This means that the user input \code{x = "E. coli"} gets for \emph{Escherichia coli} a matching score of 68.8\% and for \emph{Entamoeba coli} a matching score of 7.9\%. + All matches are sorted descending on their matching score and for all user input values, the top match will be returned. } diff --git a/man/mo_property.Rd b/man/mo_property.Rd index c8ca5e80..b75c0886 100644 --- a/man/mo_property.Rd +++ b/man/mo_property.Rd @@ -126,20 +126,22 @@ If the unlying code needs breaking changes, they will occur gradually. For examp \section{Matching score for microorganisms}{ -With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as: +With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. 
This matching score \eqn{m}, ranging from 0 to 100\%, is calculated as: -\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )} +\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )} where: \itemize{ \item \eqn{x} is the user input; -\item \eqn{n} is a taxonomic name (genus, species and subspecies); -\item \eqn{l_{n}}{l_n} is the length of the taxonomic name; -\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function; -\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; -\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +\item \eqn{n} is a taxonomic name (genus, species and subspecies) as found in \code{\link[=microorganisms]{microorganisms$fullname}}; +\item \eqn{l_{n}}{l_n} is the length of \eqn{n}; +\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function}; +\item \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +\item \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. 
} +This means that the user input \code{x = "E. coli"} gets for \emph{Escherichia coli} a matching score of 68.8\% and for \emph{Entamoeba coli} a matching score of 7.9\%. + All matches are sorted descending on their matching score and for all user input values, the top match will be returned. }