mirror of
https://github.com/msberends/AMR.git
synced 2025-01-24 11:44:35 +01:00
(v1.3.0.9031) matching score update
This commit is contained in:
parent
050a9a04fb
commit
22f6ceb3e4
@ -1,5 +1,5 @@
|
||||
Package: AMR
|
||||
Version: 1.3.0.9030
|
||||
Version: 1.3.0.9031
|
||||
Date: 2020-09-26
|
||||
Title: Antimicrobial Resistance Analysis
|
||||
Authors@R: c(
|
||||
|
2
NEWS.md
2
NEWS.md
@ -1,4 +1,4 @@
|
||||
# AMR 1.3.0.9030
|
||||
# AMR 1.3.0.9031
|
||||
## <small>Last updated: 26 September 2020</small>
|
||||
|
||||
Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscipt to. We are those reviewers very grateful for going through our code so thoroughly!
|
||||
|
16
R/mo.R
16
R/mo.R
@ -301,7 +301,7 @@ exec_as.mo <- function(x,
|
||||
initial = initial_search,
|
||||
uncertainty = actual_uncertainty,
|
||||
input_actual = actual_input) {
|
||||
|
||||
|
||||
if (!is.null(input_actual)) {
|
||||
input <- input_actual
|
||||
} else {
|
||||
@ -318,7 +318,7 @@ exec_as.mo <- function(x,
|
||||
if (NROW(res_df) > 1 & uncertainty != -1) {
|
||||
# sort the findings on matching score
|
||||
scores <- mo_matching_score(x = input,
|
||||
fullname = res_df[, "fullname", drop = TRUE])
|
||||
n = res_df[, "fullname", drop = TRUE])
|
||||
res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE]
|
||||
}
|
||||
res <- as.character(res_df[, column, drop = TRUE])
|
||||
@ -442,7 +442,7 @@ exec_as.mo <- function(x,
|
||||
# we need special treatment for very prevalent full names, they are likely!
|
||||
# e.g. as.mo("Staphylococcus aureus")
|
||||
x <- MO_lookup[match(tolower(x), MO_lookup$fullname_lower), property, drop = TRUE]
|
||||
|
||||
|
||||
} else if (all(x %in% reference_data_to_use$fullname)) {
|
||||
# we need special treatment for very prevalent full names, they are likely!
|
||||
# e.g. as.mo("Staphylococcus aureus")
|
||||
@ -1544,7 +1544,7 @@ exec_as.mo <- function(x,
|
||||
# this will save the uncertain items as attribute, so they can be bound to `uncertainties` in the uncertain_fn() function
|
||||
x <- structure(x, uncertainties = uncertainties)
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (old_mo_warning == TRUE & property != "mo") {
|
||||
warning("The input contained old microorganism IDs from previous versions of this package.\nPlease use `as.mo()` on these old IDs to transform them to the new format.\nSUPPORT FOR THIS WILL BE DROPPED IN A FUTURE VERSION.", call. = FALSE)
|
||||
@ -1639,7 +1639,7 @@ freq.mo <- function(x, ...) {
|
||||
")"),
|
||||
`No. of genera` = pm_n_distinct(mo_genus(x_noNA, language = NULL)),
|
||||
`No. of species` = pm_n_distinct(paste(mo_genus(x_noNA, language = NULL),
|
||||
mo_species(x_noNA, language = NULL)))))
|
||||
mo_species(x_noNA, language = NULL)))))
|
||||
}
|
||||
|
||||
#' @method print mo
|
||||
@ -1773,7 +1773,7 @@ print.mo_uncertainties <- function(x, ...) {
|
||||
if (x[i, ]$candidates != "") {
|
||||
candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE))
|
||||
scores <- mo_matching_score(x = x[i, ]$input,
|
||||
fullname = candidates)
|
||||
n = candidates)
|
||||
# sort on descending scores
|
||||
candidates <- candidates[order(1 - scores)]
|
||||
n_candidates <- length(candidates)
|
||||
@ -1799,8 +1799,8 @@ print.mo_uncertainties <- function(x, ...) {
|
||||
ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""),
|
||||
" (", x[i, ]$mo,
|
||||
", matching score = ", trimws(percentage(mo_matching_score(x = x[i, ]$input,
|
||||
fullname = x[i, ]$fullname),
|
||||
digits = 1)),
|
||||
n = x[i, ]$fullname),
|
||||
digits = 1)),
|
||||
") "),
|
||||
uncertainty_interpretation,
|
||||
candidates),
|
||||
|
@ -24,20 +24,21 @@
|
||||
#' This helper function is used by [as.mo()] to determine the most probable match of taxonomic records, based on user input.
|
||||
#' @param x Any user input value(s)
|
||||
#' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms]
|
||||
#' @param uncertainty The level of uncertainty set in [as.mo()], see `allow_uncertain` in that function (here, it defaults to 1, but is automatically determined in [as.mo()] based on the number of transformations needed to get to a result)
|
||||
#' @section Matching score for microorganisms:
|
||||
#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m} is calculated as:
|
||||
#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m}, ranging from 0 to 100%, is calculated as:
|
||||
#'
|
||||
#' \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
|
||||
#' \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )}
|
||||
#'
|
||||
#' where:
|
||||
#'
|
||||
#' * \eqn{x} is the user input;
|
||||
#' * \eqn{n} is a taxonomic name (genus, species and subspecies);
|
||||
#' * \eqn{l_{n}}{l_n} is the length of the taxonomic name;
|
||||
#' * \eqn{\operatorname{lev}}{lev} is the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) function;
|
||||
#' * \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see *Details* in `?as.mo`), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
#' * \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
#' * \eqn{n} is a taxonomic name (genus, species and subspecies) as found in [`microorganisms$fullname`][microorganisms];
|
||||
#' * \eqn{l_{n}}{l_n} is the length of \eqn{n};
|
||||
#' * \eqn{\operatorname{lev}}{lev} is the [Levenshtein distance function](https://en.wikipedia.org/wiki/Levenshtein_distance);
|
||||
#' * \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see *Details* in `?as.mo`), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
#' * \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
#'
|
||||
#' This means that the user input `x = "E. coli"` gets for *Escherichia coli* a matching score of `r percentage(mo_matching_score("E. coli", "Escherichia coli"), 1)` and for *Entamoeba coli* a matching score of `r percentage(mo_matching_score("E. coli", "Entamoeba coli"), 1)`.
|
||||
#'
|
||||
#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
|
||||
#' @export
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="https://msberends.github.io/AMR/index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="../index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
@ -43,7 +43,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="../index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@ -236,9 +236,9 @@
|
||||
<small>Source: <a href='https://github.com/msberends/AMR/blob/master/NEWS.md'><code>NEWS.md</code></a></small>
|
||||
</div>
|
||||
|
||||
<div id="amr-1309030" class="section level1">
|
||||
<h1 class="page-header" data-toc-text="1.3.0.9030">
|
||||
<a href="#amr-1309030" class="anchor"></a>AMR 1.3.0.9030<small> Unreleased </small>
|
||||
<div id="amr-1309031" class="section level1">
|
||||
<h1 class="page-header" data-toc-text="1.3.0.9031">
|
||||
<a href="#amr-1309031" class="anchor"></a>AMR 1.3.0.9031<small> Unreleased </small>
|
||||
</h1>
|
||||
<div id="last-updated-26-september-2020" class="section level2">
|
||||
<h2 class="hasAnchor">
|
||||
|
@ -2,7 +2,7 @@ pandoc: 2.7.3
|
||||
pkgdown: 1.5.1.9000
|
||||
pkgdown_sha: eae56f08694abebf93cdfc0dd8e9ede06d8c815f
|
||||
articles: []
|
||||
last_built: 2020-09-26T14:25Z
|
||||
last_built: 2020-09-26T14:51Z
|
||||
urls:
|
||||
reference: https://msberends.github.io/AMR/reference
|
||||
article: https://msberends.github.io/AMR/articles
|
||||
|
@ -82,7 +82,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="../index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@ -388,17 +388,18 @@ The <a href='lifecycle.html'>lifecycle</a> of this function is <strong>stable</s
|
||||
|
||||
|
||||
|
||||
<p>With ambiguous user input in <code>as.mo()</code> and all the <code><a href='mo_property.html'>mo_*</a></code> functions, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score \(m\) is calculated as:</p>
|
||||
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$</p>
|
||||
<p>With ambiguous user input in <code>as.mo()</code> and all the <code><a href='mo_property.html'>mo_*</a></code> functions, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score \(m\), ranging from 0 to 100%, is calculated as:</p>
|
||||
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}$$</p>
|
||||
<p>where:</p><ul>
|
||||
<li><p>\(x\) is the user input;</p></li>
|
||||
<li><p>\(n\) is a taxonomic name (genus, species and subspecies);</p></li>
|
||||
<li><p>\(l_{n}\) is the length of the taxonomic name;</p></li>
|
||||
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> function;</p></li>
|
||||
<li><p>\(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code>?as.mo</code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
|
||||
<li><p>\(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
|
||||
<li><p>\(n\) is a taxonomic name (genus, species and subspecies) as found in <code><a href='microorganisms.html'>microorganisms$fullname</a></code>;</p></li>
|
||||
<li><p>\(l_{n}\) is the length of \(n\);</p></li>
|
||||
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance function</a>;</p></li>
|
||||
<li><p>\(p_{n}\) is the human pathogenic prevalence of \(n\), categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code>?as.mo</code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
|
||||
<li><p>\(k_{n}\) is the kingdom index of \(n\), set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
|
||||
</ul>
|
||||
|
||||
<p>This means that the user input <code>x = "E. coli"</code> gets for <em>Escherichia coli</em> a matching score of 68.8% and for <em>Entamoeba coli</em> a matching score of 7.9%.</p>
|
||||
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
|
||||
<h2 class="hasAnchor" id="catalogue-of-life"><a class="anchor" href="#catalogue-of-life"></a>Catalogue of Life</h2>
|
||||
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="../index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
@ -82,7 +82,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="../index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@ -255,27 +255,24 @@
|
||||
<th>n</th>
|
||||
<td><p>A full taxonomic name, that exists in <code><a href='microorganisms.html'>microorganisms$fullname</a></code></p></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>uncertainty</th>
|
||||
<td><p>The level of uncertainty set in <code><a href='as.mo.html'>as.mo()</a></code>, see <code>allow_uncertain</code> in that function (here, it defaults to 1, but is automatically determined in <code><a href='as.mo.html'>as.mo()</a></code> based on the number of transformations needed to get to a result)</p></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h2 class="hasAnchor" id="matching-score-for-microorganisms"><a class="anchor" href="#matching-score-for-microorganisms"></a>Matching score for microorganisms</h2>
|
||||
|
||||
|
||||
|
||||
<p>With ambiguous user input in <code><a href='as.mo.html'>as.mo()</a></code> and all the <code><a href='mo_property.html'>mo_*</a></code> functions, the returned results are chosen based on their matching score using <code>mo_matching_score()</code>. This matching score \(m\) is calculated as:</p>
|
||||
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$</p>
|
||||
<p>With ambiguous user input in <code><a href='as.mo.html'>as.mo()</a></code> and all the <code><a href='mo_property.html'>mo_*</a></code> functions, the returned results are chosen based on their matching score using <code>mo_matching_score()</code>. This matching score \(m\), ranging from 0 to 100%, is calculated as:</p>
|
||||
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}$$</p>
|
||||
<p>where:</p><ul>
|
||||
<li><p>\(x\) is the user input;</p></li>
|
||||
<li><p>\(n\) is a taxonomic name (genus, species and subspecies);</p></li>
|
||||
<li><p>\(l_{n}\) is the length of the taxonomic name;</p></li>
|
||||
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> function;</p></li>
|
||||
<li><p>\(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code><a href='as.mo.html'>?as.mo</a></code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
|
||||
<li><p>\(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
|
||||
<li><p>\(n\) is a taxonomic name (genus, species and subspecies) as found in <code><a href='microorganisms.html'>microorganisms$fullname</a></code>;</p></li>
|
||||
<li><p>\(l_{n}\) is the length of \(n\);</p></li>
|
||||
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance function</a>;</p></li>
|
||||
<li><p>\(p_{n}\) is the human pathogenic prevalence of \(n\), categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code><a href='as.mo.html'>?as.mo</a></code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
|
||||
<li><p>\(k_{n}\) is the kingdom index of \(n\), set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
|
||||
</ul>
|
||||
|
||||
<p>This means that the user input <code>x = "E. coli"</code> gets for <em>Escherichia coli</em> a matching score of 68.8% and for <em>Entamoeba coli</em> a matching score of 7.9%.</p>
|
||||
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
|
||||
|
||||
<h2 class="hasAnchor" id="examples"><a class="anchor" href="#examples"></a>Examples</h2>
|
||||
|
@ -82,7 +82,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="../index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@ -350,17 +350,18 @@ The <a href='lifecycle.html'>lifecycle</a> of this function is <strong>stable</s
|
||||
|
||||
|
||||
|
||||
<p>With ambiguous user input in <code><a href='as.mo.html'>as.mo()</a></code> and all the <code>mo_*</code> functions, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score \(m\) is calculated as:</p>
|
||||
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$</p>
|
||||
<p>With ambiguous user input in <code><a href='as.mo.html'>as.mo()</a></code> and all the <code>mo_*</code> functions, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score \(m\), ranging from 0 to 100%, is calculated as:</p>
|
||||
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}$$</p>
|
||||
<p>where:</p><ul>
|
||||
<li><p>\(x\) is the user input;</p></li>
|
||||
<li><p>\(n\) is a taxonomic name (genus, species and subspecies);</p></li>
|
||||
<li><p>\(l_{n}\) is the length of the taxonomic name;</p></li>
|
||||
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> function;</p></li>
|
||||
<li><p>\(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code><a href='as.mo.html'>?as.mo</a></code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
|
||||
<li><p>\(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
|
||||
<li><p>\(n\) is a taxonomic name (genus, species and subspecies) as found in <code><a href='microorganisms.html'>microorganisms$fullname</a></code>;</p></li>
|
||||
<li><p>\(l_{n}\) is the length of \(n\);</p></li>
|
||||
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance function</a>;</p></li>
|
||||
<li><p>\(p_{n}\) is the human pathogenic prevalence of \(n\), categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code><a href='as.mo.html'>?as.mo</a></code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
|
||||
<li><p>\(k_{n}\) is the kingdom index of \(n\), set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
|
||||
</ul>
|
||||
|
||||
<p>This means that the user input <code>x = "E. coli"</code> gets for <em>Escherichia coli</em> a matching score of 68.8% and for <em>Entamoeba coli</em> a matching score of 7.9%.</p>
|
||||
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
|
||||
<h2 class="hasAnchor" id="catalogue-of-life"><a class="anchor" href="#catalogue-of-life"></a>Catalogue of Life</h2>
|
||||
|
||||
|
@ -81,7 +81,7 @@
|
||||
</button>
|
||||
<span class="navbar-brand">
|
||||
<a class="navbar-link" href="index.html">AMR (for R)</a>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
|
||||
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9031</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
16
man/as.mo.Rd
16
man/as.mo.Rd
@ -146,20 +146,22 @@ If the unlying code needs breaking changes, they will occur gradually. For examp
|
||||
|
||||
\section{Matching score for microorganisms}{
|
||||
|
||||
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as:
|
||||
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m}, ranging from 0 to 100\%, is calculated as:
|
||||
|
||||
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
|
||||
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )}
|
||||
|
||||
where:
|
||||
\itemize{
|
||||
\item \eqn{x} is the user input;
|
||||
\item \eqn{n} is a taxonomic name (genus, species and subspecies);
|
||||
\item \eqn{l_{n}}{l_n} is the length of the taxonomic name;
|
||||
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function;
|
||||
\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
\item \eqn{n} is a taxonomic name (genus, species and subspecies) as found in \code{\link[=microorganisms]{microorganisms$fullname}};
|
||||
\item \eqn{l_{n}}{l_n} is the length of \eqn{n};
|
||||
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function};
|
||||
\item \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
\item \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
}
|
||||
|
||||
This means that the user input \code{x = "E. coli"} gets for \emph{Escherichia coli} a matching score of 68.8\% and for \emph{Entamoeba coli} a matching score of 7.9\%.
|
||||
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
|
||||
}
|
||||
|
||||
|
@ -10,28 +10,28 @@ mo_matching_score(x, n)
|
||||
\item{x}{Any user input value(s)}
|
||||
|
||||
\item{n}{A full taxonomic name, that exists in \code{\link[=microorganisms]{microorganisms$fullname}}}
|
||||
|
||||
\item{uncertainty}{The level of uncertainty set in \code{\link[=as.mo]{as.mo()}}, see \code{allow_uncertain} in that function (here, it defaults to 1, but is automatically determined in \code{\link[=as.mo]{as.mo()}} based on the number of transformations needed to get to a result)}
|
||||
}
|
||||
\description{
|
||||
This helper function is used by \code{\link[=as.mo]{as.mo()}} to determine the most probable match of taxonomic records, based on user input.
|
||||
}
|
||||
\section{Matching score for microorganisms}{
|
||||
|
||||
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as:
|
||||
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m}, ranging from 0 to 100\%, is calculated as:
|
||||
|
||||
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
|
||||
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )}
|
||||
|
||||
where:
|
||||
\itemize{
|
||||
\item \eqn{x} is the user input;
|
||||
\item \eqn{n} is a taxonomic name (genus, species and subspecies);
|
||||
\item \eqn{l_{n}}{l_n} is the length of the taxonomic name;
|
||||
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function;
|
||||
\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
\item \eqn{n} is a taxonomic name (genus, species and subspecies) as found in \code{\link[=microorganisms]{microorganisms$fullname}};
|
||||
\item \eqn{l_{n}}{l_n} is the length of \eqn{n};
|
||||
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function};
|
||||
\item \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
\item \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
}
|
||||
|
||||
This means that the user input \code{x = "E. coli"} gets for \emph{Escherichia coli} a matching score of 68.8\% and for \emph{Entamoeba coli} a matching score of 7.9\%.
|
||||
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
|
||||
}
|
||||
|
||||
|
@ -126,20 +126,22 @@ If the unlying code needs breaking changes, they will occur gradually. For examp
|
||||
|
||||
\section{Matching score for microorganisms}{
|
||||
|
||||
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as:
|
||||
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m}, ranging from 0 to 100\%, is calculated as:
|
||||
|
||||
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
|
||||
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p_n * k_n )}
|
||||
|
||||
where:
|
||||
\itemize{
|
||||
\item \eqn{x} is the user input;
|
||||
\item \eqn{n} is a taxonomic name (genus, species and subspecies);
|
||||
\item \eqn{l_{n}}{l_n} is the length of the taxonomic name;
|
||||
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function;
|
||||
\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
\item \eqn{n} is a taxonomic name (genus, species and subspecies) as found in \code{\link[=microorganisms]{microorganisms$fullname}};
|
||||
\item \eqn{l_{n}}{l_n} is the length of \eqn{n};
|
||||
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function};
|
||||
\item \eqn{p_{n}}{p_n} is the human pathogenic prevalence of \eqn{n}, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
|
||||
\item \eqn{k_{n}}{k_n} is the kingdom index of \eqn{n}, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
|
||||
}
|
||||
|
||||
This means that the user input \code{x = "E. coli"} gets for \emph{Escherichia coli} a matching score of 68.8\% and for \emph{Entamoeba coli} a matching score of 7.9\%.
|
||||
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user