(v1.3.0.9030) matching score update

This commit is contained in:
dr. M.S. (Matthijs) Berends 2020-09-26 16:26:01 +02:00
parent 9667c2994f
commit 050a9a04fb
33 changed files with 249 additions and 175 deletions

View File

@ -1,6 +1,6 @@
Package: AMR Package: AMR
Version: 1.3.0.9029 Version: 1.3.0.9030
Date: 2020-09-25 Date: 2020-09-26
Title: Antimicrobial Resistance Analysis Title: Antimicrobial Resistance Analysis
Authors@R: c( Authors@R: c(
person(role = c("aut", "cre"), person(role = c("aut", "cre"),

View File

@ -1,5 +1,5 @@
# AMR 1.3.0.9029 # AMR 1.3.0.9030
## <small>Last updated: 25 September 2020</small> ## <small>Last updated: 26 September 2020</small>
Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscipt to. We are those reviewers very grateful for going through our code so thoroughly! Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscipt to. We are those reviewers very grateful for going through our code so thoroughly!

View File

@ -48,18 +48,6 @@ pm_left_join <- function(x, y, by = NULL, suffix = c(".x", ".y")) {
rownames(merged) <- NULL rownames(merged) <- NULL
merged merged
} }
# pm_filter_join_worker <- function(x, y, by = NULL, type = c("anti", "semi")) {
# type <- match.arg(type, choices = c("anti", "semi"), several.ok = FALSE)
# if (is.null(by)) {
# by <- intersect(names(x), names(y))
# join_message(by)
# }
# rows <- interaction(x[, by]) %in% interaction(y[, by])
# if (type == "anti") rows <- !rows
# res <- x[rows, , drop = FALSE]
# rownames(res) <- NULL
# res
# }
quick_case_when <- function(...) { quick_case_when <- function(...) {
vectors <- list(...) vectors <- list(...)

View File

@ -27,12 +27,12 @@
#' @name join #' @name join
#' @aliases join inner_join #' @aliases join inner_join
#' @param x existing table to join, or character vector #' @param x existing table to join, or character vector
#' @param by a variable to join by - if left empty will search for a column with class [`mo`] (created with [as.mo()]) or will be `"mo"` if that column name exists in `x`, could otherwise be a column name of `x` with values that exist in `microorganisms$mo` (like `by = "bacteria_id"`), or another column in [microorganisms] (but then it should be named, like `by = c("my_genus_species" = "fullname")`) #' @param by a variable to join by - if left empty will search for a column with class [`mo`] (created with [as.mo()]) or will be `"mo"` if that column name exists in `x`, could otherwise be a column name of `x` with values that exist in `microorganisms$mo` (like `by = "bacteria_id"`), or another column in [microorganisms] (but then it should be named, like `by = c("bacteria_id" = "fullname")`)
#' @param suffix if there are non-joined duplicate variables in `x` and `y`, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. #' @param suffix if there are non-joined duplicate variables in `x` and `y`, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.
#' @param ... ignored #' @param ... ignored
#' @details **Note:** As opposed to the `join()` functions of `dplyr`, [character] vectors are supported and at default existing columns will get a suffix `"2"` and the newly joined columns will not get a suffix. #' @details **Note:** As opposed to the `join()` functions of `dplyr`, [character] vectors are supported and at default existing columns will get a suffix `"2"` and the newly joined columns will not get a suffix.
#' #'
#' These functions rely on [merge()], a base R function to do joins. #' If the `dplyr` package is installed, their join functions will be used. Otherwise, the much slower [merge()] function from base R will be used.
#' @inheritSection AMR Read more on our website! #' @inheritSection AMR Read more on our website!
#' @export #' @export
#' @examples #' @examples
@ -60,9 +60,17 @@ inner_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
x_class <- get_prejoined_class(x) x_class <- get_prejoined_class(x)
x <- checked$x x <- checked$x
by <- checked$by by <- checked$by
# use dplyr if available - it's much faster
dplyr_inner <- import_fn("inner_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_inner)) {
join <- suppressWarnings(
dplyr_inner(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings( join <- suppressWarnings(
pm_inner_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) pm_inner_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
) )
}
if (NROW(join) > NROW(x)) { if (NROW(join) > NROW(x)) {
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
} }
@ -79,9 +87,17 @@ left_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
x_class <- get_prejoined_class(x) x_class <- get_prejoined_class(x)
x <- checked$x x <- checked$x
by <- checked$by by <- checked$by
# use dplyr if available - it's much faster
dplyr_left <- import_fn("left_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_left)) {
join <- suppressWarnings(
dplyr_left(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings( join <- suppressWarnings(
pm_left_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) pm_left_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
) )
}
if (NROW(join) > NROW(x)) { if (NROW(join) > NROW(x)) {
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
} }
@ -98,9 +114,17 @@ right_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
x_class <- get_prejoined_class(x) x_class <- get_prejoined_class(x)
x <- checked$x x <- checked$x
by <- checked$by by <- checked$by
# use dplyr if available - it's much faster
dplyr_right <- import_fn("right_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_right)) {
join <- suppressWarnings(
dplyr_right(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings( join <- suppressWarnings(
pm_right_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) pm_right_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
) )
}
if (NROW(join) > NROW(x)) { if (NROW(join) > NROW(x)) {
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
} }
@ -117,9 +141,17 @@ full_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
x_class <- get_prejoined_class(x) x_class <- get_prejoined_class(x)
x <- checked$x x <- checked$x
by <- checked$by by <- checked$by
# use dplyr if available - it's much faster
dplyr_full <- import_fn("full_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_full)) {
join <- suppressWarnings(
dplyr_full(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings( join <- suppressWarnings(
pm_full_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) pm_full_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
) )
}
if (NROW(join) > NROW(x)) { if (NROW(join) > NROW(x)) {
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
} }
@ -136,9 +168,17 @@ semi_join_microorganisms <- function(x, by = NULL, ...) {
checked <- joins_check_df(x, by) checked <- joins_check_df(x, by)
x <- checked$x x <- checked$x
by <- checked$by by <- checked$by
# use dplyr if available - it's much faster
dplyr_semi <- import_fn("semi_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_semi)) {
join <- suppressWarnings( join <- suppressWarnings(
pm_semi_join(x = x, y = microorganisms, by = by, ...) dplyr_semi(x = x, y = microorganisms, by = by,...)
) )
} else {
join <- suppressWarnings(
pm_semi_join(x = x, y = microorganisms, by = by,...)
)
}
class(join) <- x_class class(join) <- x_class
join join
} }
@ -152,9 +192,17 @@ anti_join_microorganisms <- function(x, by = NULL, ...) {
x_class <- get_prejoined_class(x) x_class <- get_prejoined_class(x)
x <- checked$x x <- checked$x
by <- checked$by by <- checked$by
# use dplyr if available - it's much faster
dplyr_anti <- import_fn("anti_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_anti)) {
join <- suppressWarnings( join <- suppressWarnings(
pm_anti_join(x = x, y = microorganisms, by = by, ...) dplyr_anti(x = x, y = microorganisms, by = by,...)
) )
} else {
join <- suppressWarnings(
pm_anti_join(x = x, y = microorganisms, by = by,...)
)
}
class(join) <- x_class class(join) <- x_class
join join
} }

32
R/mo.R
View File

@ -101,20 +101,7 @@
#' #'
#' Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans. #' Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans.
#' #'
#' ## Background on matching scores #' @inheritSection mo_matching_score Matching score for microorganisms
#' With ambiguous user input, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score is based on four parameters:
#'
#' 1. The prevalence \eqn{P} is categorised into group 1, 2 and 3 as stated above;
#' 2. A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;
#' 3. The level of uncertainty \eqn{U} needed to get to the result, as stated above (1 to 3);
#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
#'
#' \deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)}
#'
#' The final matching score \eqn{M} is calculated as:
#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
#'
#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
#' @inheritSection catalogue_of_life Catalogue of Life #' @inheritSection catalogue_of_life Catalogue of Life
# (source as a section here, so it can be inherited by other man pages:) # (source as a section here, so it can be inherited by other man pages:)
#' @section Source: #' @section Source:
@ -331,8 +318,7 @@ exec_as.mo <- function(x,
if (NROW(res_df) > 1 & uncertainty != -1) { if (NROW(res_df) > 1 & uncertainty != -1) {
# sort the findings on matching score # sort the findings on matching score
scores <- mo_matching_score(x = input, scores <- mo_matching_score(x = input,
fullname = res_df[, "fullname", drop = TRUE], fullname = res_df[, "fullname", drop = TRUE])
uncertainty = uncertainty)
res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE] res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE]
} }
res <- as.character(res_df[, column, drop = TRUE]) res <- as.character(res_df[, column, drop = TRUE])
@ -1779,7 +1765,7 @@ print.mo_uncertainties <- function(x, ...) {
if (NROW(x) == 0) { if (NROW(x) == 0) {
return(NULL) return(NULL)
} }
cat(font_blue(strwrap(c("Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. Furthermore, an indication is given about the probability of the match - the more transformations are needed for coercion, the more improbable the result.")), collapse = "\n")) cat(font_blue(strwrap(c("Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. Furthermore, an indication is given about the certainty of the match - the more transformations are needed for coercion, the less certain the result.")), collapse = "\n"))
cat("\n") cat("\n")
msg <- "" msg <- ""
@ -1787,8 +1773,7 @@ print.mo_uncertainties <- function(x, ...) {
if (x[i, ]$candidates != "") { if (x[i, ]$candidates != "") {
candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE)) candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE))
scores <- mo_matching_score(x = x[i, ]$input, scores <- mo_matching_score(x = x[i, ]$input,
fullname = candidates, fullname = candidates)
uncertainty = x[i, ]$uncertainty)
# sort on descending scores # sort on descending scores
candidates <- candidates[order(1 - scores)] candidates <- candidates[order(1 - scores)]
n_candidates <- length(candidates) n_candidates <- length(candidates)
@ -1802,11 +1787,11 @@ print.mo_uncertainties <- function(x, ...) {
candidates <- "" candidates <- ""
} }
if (x[i, ]$uncertainty == 1) { if (x[i, ]$uncertainty == 1) {
uncertainty_interpretation <- font_green("* MOST PROBABLE *") uncertainty_interpretation <- font_green("* very certain *")
} else if (x[i, ]$uncertainty == 1) { } else if (x[i, ]$uncertainty == 1) {
uncertainty_interpretation <- font_yellow("* PROBABLE *") uncertainty_interpretation <- font_yellow("* certain *")
} else { } else {
uncertainty_interpretation <- font_red("* IMPROBABLE *") uncertainty_interpretation <- font_red("* not certain *")
} }
msg <- paste(msg, msg <- paste(msg,
paste0('"', x[i, ]$input, '" -> ', paste0('"', x[i, ]$input, '" -> ',
@ -1814,8 +1799,7 @@ print.mo_uncertainties <- function(x, ...) {
ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""), ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""),
" (", x[i, ]$mo, " (", x[i, ]$mo,
", matching score = ", trimws(percentage(mo_matching_score(x = x[i, ]$input, ", matching score = ", trimws(percentage(mo_matching_score(x = x[i, ]$input,
fullname = x[i, ]$fullname, fullname = x[i, ]$fullname),
uncertainty = x[i, ]$uncertainty),
digits = 1)), digits = 1)),
") "), ") "),
uncertainty_interpretation, uncertainty_interpretation,

View File

@ -23,50 +23,53 @@
#' #'
#' This helper function is used by [as.mo()] to determine the most probable match of taxonomic records, based on user input. #' This helper function is used by [as.mo()] to determine the most probable match of taxonomic records, based on user input.
#' @param x Any user input value(s) #' @param x Any user input value(s)
#' @param fullname A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms] #' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms]
#' @param uncertainty The level of uncertainty set in [as.mo()], see `allow_uncertain` in that function (here, it defaults to 1, but is automatically determined in [as.mo()] based on the number of transformations needed to get to a result) #' @param uncertainty The level of uncertainty set in [as.mo()], see `allow_uncertain` in that function (here, it defaults to 1, but is automatically determined in [as.mo()] based on the number of transformations needed to get to a result)
#' @details The matching score is based on four parameters: #' @section Matching score for microorganisms:
#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m} is calculated as:
#' #'
#' 1. A human pathogenic prevalence \eqn{P}, that is categorised into group 1, 2 and 3 (see [as.mo()]); #' \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
#' 2. A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;
#' 3. The level of uncertainty \eqn{U} that is needed to get to a result (1 to 3, see [as.mo()]);
#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
#' #'
#' \deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)} #' where:
#' #'
#' The final matching score \eqn{M} is calculated as: #' * \eqn{x} is the user input;
#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)} #' * \eqn{n} is a taxonomic name (genus, species and subspecies);
#' * \eqn{l_{n}}{l_n} is the length of the taxonomic name;
#' * \eqn{\operatorname{lev}}{lev} is the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) function;
#' * \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see *Details* in `?as.mo`), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
#' * \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
#' #'
#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
#' @export #' @export
#' @examples #' @examples
#' as.mo("E. coli") #' as.mo("E. coli")
#' mo_uncertainties() #' mo_uncertainties()
mo_matching_score <- function(x, fullname, uncertainty = 1) { #'
# fullname is always a taxonomically valid full name #' mo_matching_score("E. coli", "Escherichia coli")
mo_matching_score <- function(x, n) {
# n is always a taxonomically valid full name
levenshtein <- double(length = length(x)) levenshtein <- double(length = length(x))
if (length(fullname) == 1) { if (length(n) == 1) {
fullname <- rep(fullname, length(x)) n <- rep(n, length(x))
} }
if (length(x) == 1) { if (length(x) == 1) {
x <- rep(x, length(fullname)) x <- rep(x, length(n))
} }
for (i in seq_len(length(x))) { for (i in seq_len(length(x))) {
# determine Levenshtein distance, but maximise to nchar of fullname # determine Levenshtein distance, but maximise to nchar of n
levenshtein[i] <- min(as.double(utils::adist(x[i], fullname[i], ignore.case = FALSE)), levenshtein[i] <- min(as.double(utils::adist(x[i], n[i], ignore.case = FALSE)),
nchar(fullname[i])) nchar(n[i]))
} }
# F = length of fullname # F = length of fullname
var_F <- nchar(fullname) var_F <- nchar(n)
# L = modified Levenshtein distance # L = modified Levenshtein distance
var_L <- levenshtein var_L <- levenshtein
# P = Prevalence (1 to 3) # P = Prevalence (1 to 3)
var_P <- MO_lookup[match(fullname, MO_lookup$fullname), "prevalence", drop = TRUE] var_P <- MO_lookup[match(n, MO_lookup$fullname), "prevalence", drop = TRUE]
# K = kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5) # K = kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
var_K <- MO_lookup[match(fullname, MO_lookup$fullname), "kingdom_index", drop = TRUE] var_K <- MO_lookup[match(n, MO_lookup$fullname), "kingdom_index", drop = TRUE]
# U = uncertainty level (1 to 3), as per as.mo()
var_U <- uncertainty
# matching score: # matching score:
(var_F - 0.5 * var_L) / (var_F * var_P * var_K * var_U) (var_F - 0.5 * var_L) / (var_F * var_P * var_K)
} }

View File

@ -42,6 +42,7 @@
#' All output will be [translate]d where possible. #' All output will be [translate]d where possible.
#' #'
#' The function [mo_url()] will return the direct URL to the online database entry, which also shows the scientific reference of the concerned species. #' The function [mo_url()] will return the direct URL to the online database entry, which also shows the scientific reference of the concerned species.
#' @inheritSection mo_matching_score Matching score for microorganisms
#' @inheritSection catalogue_of_life Catalogue of Life #' @inheritSection catalogue_of_life Catalogue of Life
#' @inheritSection as.mo Source #' @inheritSection as.mo Source
#' @rdname mo_property #' @rdname mo_property

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="https://msberends.github.io/AMR/index.html">AMR (for R)</a> <a class="navbar-link" href="https://msberends.github.io/AMR/index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -43,7 +43,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>
@ -236,13 +236,13 @@
<small>Source: <a href='https://github.com/msberends/AMR/blob/master/NEWS.md'><code>NEWS.md</code></a></small> <small>Source: <a href='https://github.com/msberends/AMR/blob/master/NEWS.md'><code>NEWS.md</code></a></small>
</div> </div>
<div id="amr-1309029" class="section level1"> <div id="amr-1309030" class="section level1">
<h1 class="page-header" data-toc-text="1.3.0.9029"> <h1 class="page-header" data-toc-text="1.3.0.9030">
<a href="#amr-1309029" class="anchor"></a>AMR 1.3.0.9029<small> Unreleased </small> <a href="#amr-1309030" class="anchor"></a>AMR 1.3.0.9030<small> Unreleased </small>
</h1> </h1>
<div id="last-updated-25-september-2020" class="section level2"> <div id="last-updated-26-september-2020" class="section level2">
<h2 class="hasAnchor"> <h2 class="hasAnchor">
<a href="#last-updated-25-september-2020" class="anchor"></a><small>Last updated: 25 September 2020</small> <a href="#last-updated-26-september-2020" class="anchor"></a><small>Last updated: 26 September 2020</small>
</h2> </h2>
<p>Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscipt to. We are those reviewers very grateful for going through our code so thoroughly!</p> <p>Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscipt to. We are those reviewers very grateful for going through our code so thoroughly!</p>
<div id="new" class="section level3"> <div id="new" class="section level3">

View File

@ -2,7 +2,7 @@ pandoc: 2.7.3
pkgdown: 1.5.1.9000 pkgdown: 1.5.1.9000
pkgdown_sha: eae56f08694abebf93cdfc0dd8e9ede06d8c815f pkgdown_sha: eae56f08694abebf93cdfc0dd8e9ede06d8c815f
articles: [] articles: []
last_built: 2020-09-25T12:44Z last_built: 2020-09-26T14:25Z
urls: urls:
reference: https://msberends.github.io/AMR/reference reference: https://msberends.github.io/AMR/reference
article: https://msberends.github.io/AMR/articles article: https://msberends.github.io/AMR/articles

View File

@ -82,7 +82,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9028</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>
@ -366,21 +366,6 @@
<p>Group 2 consists of all microorganisms where the taxonomic phylum is Proteobacteria, Firmicutes, Actinobacteria or Sarcomastigophora, or where the taxonomic genus is <em>Aspergillus</em>, <em>Bacteroides</em>, <em>Candida</em>, <em>Capnocytophaga</em>, <em>Chryseobacterium</em>, <em>Cryptococcus</em>, <em>Elisabethkingia</em>, <em>Flavobacterium</em>, <em>Fusobacterium</em>, <em>Giardia</em>, <em>Leptotrichia</em>, <em>Mycoplasma</em>, <em>Prevotella</em>, <em>Rhodotorula</em>, <em>Treponema</em>, <em>Trichophyton</em> or <em>Ureaplasma</em>. This group consequently contains all less common and rare human pathogens.</p> <p>Group 2 consists of all microorganisms where the taxonomic phylum is Proteobacteria, Firmicutes, Actinobacteria or Sarcomastigophora, or where the taxonomic genus is <em>Aspergillus</em>, <em>Bacteroides</em>, <em>Candida</em>, <em>Capnocytophaga</em>, <em>Chryseobacterium</em>, <em>Cryptococcus</em>, <em>Elisabethkingia</em>, <em>Flavobacterium</em>, <em>Fusobacterium</em>, <em>Giardia</em>, <em>Leptotrichia</em>, <em>Mycoplasma</em>, <em>Prevotella</em>, <em>Rhodotorula</em>, <em>Treponema</em>, <em>Trichophyton</em> or <em>Ureaplasma</em>. This group consequently contains all less common and rare human pathogens.</p>
<p>Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans.</p> <p>Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans.</p>
<h3>Background on matching scores</h3>
<p>With ambiguous user input, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score is based on four parameters:</p><ol>
<li><p>The prevalence \(P\) is categorised into group 1, 2 and 3 as stated above;</p></li>
<li><p>A kingdom index \(K\) is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;</p></li>
<li><p>The level of uncertainty \(U\) needed to get to the result, as stated above (1 to 3);</p></li>
<li><p>The <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> \(L\) is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \(L'\) based on the text length of the full name \(F\) is calculated as:</p></li>
</ol>
<p>$$L' = 1 - \frac{0.5L}{F}$$</p>
<p>The final matching score \(M\) is calculated as:
$$M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}$$</p>
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
<h2 class="hasAnchor" id="source"><a class="anchor" href="#source"></a>Source</h2> <h2 class="hasAnchor" id="source"><a class="anchor" href="#source"></a>Source</h2>
@ -399,6 +384,22 @@ $$M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}$$</p>
<p><img src='figures/lifecycle_stable.svg' style=margin-bottom:5px /> <br /> <p><img src='figures/lifecycle_stable.svg' style=margin-bottom:5px /> <br />
The <a href='lifecycle.html'>lifecycle</a> of this function is <strong>stable</strong>. In a stable function, major changes are unlikely. This means that the unlying code will generally evolve by adding new arguments; removing arguments or changing the meaning of existing arguments will be avoided.</p> The <a href='lifecycle.html'>lifecycle</a> of this function is <strong>stable</strong>. In a stable function, major changes are unlikely. This means that the unlying code will generally evolve by adding new arguments; removing arguments or changing the meaning of existing arguments will be avoided.</p>
<p>If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.</p> <p>If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.</p>
<h2 class="hasAnchor" id="matching-score-for-microorganisms"><a class="anchor" href="#matching-score-for-microorganisms"></a>Matching score for microorganisms</h2>
<p>With ambiguous user input in <code>as.mo()</code> and all the <code><a href='mo_property.html'>mo_*</a></code> functions, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score \(m\) is calculated as:</p>
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$</p>
<p>where:</p><ul>
<li><p>\(x\) is the user input;</p></li>
<li><p>\(n\) is a taxonomic name (genus, species and subspecies);</p></li>
<li><p>\(l_{n}\) is the length of the taxonomic name;</p></li>
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> function;</p></li>
<li><p>\(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code>?as.mo</code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
<li><p>\(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
</ul>
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
<h2 class="hasAnchor" id="catalogue-of-life"><a class="anchor" href="#catalogue-of-life"></a>Catalogue of Life</h2> <h2 class="hasAnchor" id="catalogue-of-life"><a class="anchor" href="#catalogue-of-life"></a>Catalogue of Life</h2>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -82,7 +82,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9026</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span>
</span> </span>
</div> </div>
@ -263,7 +263,7 @@
</tr> </tr>
<tr> <tr>
<th>by</th> <th>by</th>
<td><p>a variable to join by - if left empty will search for a column with class <code><a href='as.mo.html'>mo</a></code> (created with <code><a href='as.mo.html'>as.mo()</a></code>) or will be <code>"mo"</code> if that column name exists in <code>x</code>, could otherwise be a column name of <code>x</code> with values that exist in <code>microorganisms$mo</code> (like <code>by = "bacteria_id"</code>), or another column in <a href='microorganisms.html'>microorganisms</a> (but then it should be named, like <code>by = c("my_genus_species" = "fullname")</code>)</p></td> <td><p>a variable to join by - if left empty will search for a column with class <code><a href='as.mo.html'>mo</a></code> (created with <code><a href='as.mo.html'>as.mo()</a></code>) or will be <code>"mo"</code> if that column name exists in <code>x</code>, could otherwise be a column name of <code>x</code> with values that exist in <code>microorganisms$mo</code> (like <code>by = "bacteria_id"</code>), or another column in <a href='microorganisms.html'>microorganisms</a> (but then it should be named, like <code>by = c("bacteria_id" = "fullname")</code>)</p></td>
</tr> </tr>
<tr> <tr>
<th>suffix</th> <th>suffix</th>
@ -278,7 +278,7 @@
<h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2> <h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2>
<p><strong>Note:</strong> As opposed to the <code>join()</code> functions of <code>dplyr</code>, <a href='https://rdrr.io/r/base/character.html'>character</a> vectors are supported and at default existing columns will get a suffix <code>"2"</code> and the newly joined columns will not get a suffix.</p> <p><strong>Note:</strong> As opposed to the <code>join()</code> functions of <code>dplyr</code>, <a href='https://rdrr.io/r/base/character.html'>character</a> vectors are supported and at default existing columns will get a suffix <code>"2"</code> and the newly joined columns will not get a suffix.</p>
<p>These functions rely on <code><a href='https://rdrr.io/r/base/merge.html'>merge()</a></code>, a base R function to do joins.</p> <p>If the <code>dplyr</code> package is installed, their join functions will be used. Otherwise, the much slower <code><a href='https://rdrr.io/r/base/merge.html'>merge()</a></code> function from base R will be used.</p>
<h2 class="hasAnchor" id="stable-lifecycle"><a class="anchor" href="#stable-lifecycle"></a>Stable lifecycle</h2> <h2 class="hasAnchor" id="stable-lifecycle"><a class="anchor" href="#stable-lifecycle"></a>Stable lifecycle</h2>

View File

@ -82,7 +82,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9028</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>
@ -242,7 +242,7 @@
<p>This helper function is used by <code><a href='as.mo.html'>as.mo()</a></code> to determine the most probable match of taxonomic records, based on user input.</p> <p>This helper function is used by <code><a href='as.mo.html'>as.mo()</a></code> to determine the most probable match of taxonomic records, based on user input.</p>
</div> </div>
<pre class="usage"><span class='fu'>mo_matching_score</span>(<span class='kw'>x</span>, <span class='kw'>fullname</span>, uncertainty = <span class='fl'>1</span>)</pre> <pre class="usage"><span class='fu'>mo_matching_score</span>(<span class='kw'>x</span>, <span class='kw'>n</span>)</pre>
<h2 class="hasAnchor" id="arguments"><a class="anchor" href="#arguments"></a>Arguments</h2> <h2 class="hasAnchor" id="arguments"><a class="anchor" href="#arguments"></a>Arguments</h2>
<table class="ref-arguments"> <table class="ref-arguments">
@ -252,7 +252,7 @@
<td><p>Any user input value(s)</p></td> <td><p>Any user input value(s)</p></td>
</tr> </tr>
<tr> <tr>
<th>fullname</th> <th>n</th>
<td><p>A full taxonomic name, that exists in <code><a href='microorganisms.html'>microorganisms$fullname</a></code></p></td> <td><p>A full taxonomic name, that exists in <code><a href='microorganisms.html'>microorganisms$fullname</a></code></p></td>
</tr> </tr>
<tr> <tr>
@ -261,22 +261,28 @@
</tr> </tr>
</table> </table>
<h2 class="hasAnchor" id="details"><a class="anchor" href="#details"></a>Details</h2> <h2 class="hasAnchor" id="matching-score-for-microorganisms"><a class="anchor" href="#matching-score-for-microorganisms"></a>Matching score for microorganisms</h2>
<p>The matching score is based on four parameters:</p><ol>
<li><p>A human pathogenic prevalence \(P\), that is categorised into group 1, 2 and 3 (see <code><a href='as.mo.html'>as.mo()</a></code>);</p></li>
<li><p>A kingdom index \(K\) is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;</p></li>
<li><p>The level of uncertainty \(U\) that is needed to get to a result (1 to 3, see <code><a href='as.mo.html'>as.mo()</a></code>);</p></li>
<li><p>The <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> \(L\) is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \(L'\) based on the text length of the full name \(F\) is calculated as:</p></li>
</ol>
<p>$$L' = 1 - \frac{0.5L}{F}$$</p>
<p>The final matching score \(M\) is calculated as: <p>With ambiguous user input in <code><a href='as.mo.html'>as.mo()</a></code> and all the <code><a href='mo_property.html'>mo_*</a></code> functions, the returned results are chosen based on their matching score using <code>mo_matching_score()</code>. This matching score \(m\) is calculated as:</p>
$$M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}$$</p> <p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$</p>
<p>where:</p><ul>
<li><p>\(x\) is the user input;</p></li>
<li><p>\(n\) is a taxonomic name (genus, species and subspecies);</p></li>
<li><p>\(l_{n}\) is the length of the taxonomic name;</p></li>
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> function;</p></li>
<li><p>\(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code><a href='as.mo.html'>?as.mo</a></code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
<li><p>\(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
</ul>
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
<h2 class="hasAnchor" id="examples"><a class="anchor" href="#examples"></a>Examples</h2> <h2 class="hasAnchor" id="examples"><a class="anchor" href="#examples"></a>Examples</h2>
<pre class="examples"><span class='fu'><a href='as.mo.html'>as.mo</a></span>(<span class='st'>"E. coli"</span>) <pre class="examples"><span class='fu'><a href='as.mo.html'>as.mo</a></span>(<span class='st'>"E. coli"</span>)
<span class='fu'><a href='as.mo.html'>mo_uncertainties</a></span>() <span class='fu'><a href='as.mo.html'>mo_uncertainties</a></span>()
<span class='fu'>mo_matching_score</span>(<span class='st'>"E. coli"</span>, <span class='st'>"Escherichia coli"</span>)
</pre> </pre>
</div> </div>
<div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar"> <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">

View File

@ -82,7 +82,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a> <a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9026</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>
@ -346,6 +346,22 @@
<p><img src='figures/lifecycle_stable.svg' style=margin-bottom:5px /> <br /> <p><img src='figures/lifecycle_stable.svg' style=margin-bottom:5px /> <br />
The <a href='lifecycle.html'>lifecycle</a> of this function is <strong>stable</strong>. In a stable function, major changes are unlikely. This means that the unlying code will generally evolve by adding new arguments; removing arguments or changing the meaning of existing arguments will be avoided.</p> The <a href='lifecycle.html'>lifecycle</a> of this function is <strong>stable</strong>. In a stable function, major changes are unlikely. This means that the unlying code will generally evolve by adding new arguments; removing arguments or changing the meaning of existing arguments will be avoided.</p>
<p>If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.</p> <p>If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.</p>
<h2 class="hasAnchor" id="matching-score-for-microorganisms"><a class="anchor" href="#matching-score-for-microorganisms"></a>Matching score for microorganisms</h2>
<p>With ambiguous user input in <code><a href='as.mo.html'>as.mo()</a></code> and all the <code>mo_*</code> functions, the returned results are chosen based on their matching score using <code><a href='mo_matching_score.html'>mo_matching_score()</a></code>. This matching score \(m\) is calculated as:</p>
<p>$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$</p>
<p>where:</p><ul>
<li><p>\(x\) is the user input;</p></li>
<li><p>\(n\) is a taxonomic name (genus, species and subspecies);</p></li>
<li><p>\(l_{n}\) is the length of the taxonomic name;</p></li>
<li><p>\(\operatorname{lev}\) is the <a href='https://en.wikipedia.org/wiki/Levenshtein_distance'>Levenshtein distance</a> function;</p></li>
<li><p>\(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see <em>Details</em> in <code><a href='as.mo.html'>?as.mo</a></code>), meaning that \(p = \{1, 2 , 3\}\);</p></li>
<li><p>\(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).</p></li>
</ul>
<p>All matches are sorted descending on their matching score and for all user input values, the top match will be returned.</p>
<h2 class="hasAnchor" id="catalogue-of-life"><a class="anchor" href="#catalogue-of-life"></a>Catalogue of Life</h2> <h2 class="hasAnchor" id="catalogue-of-life"><a class="anchor" href="#catalogue-of-life"></a>Catalogue of Life</h2>

View File

@ -81,7 +81,7 @@
</button> </button>
<span class="navbar-brand"> <span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a> <a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9029</span> <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">1.3.0.9030</span>
</span> </span>
</div> </div>

View File

@ -125,24 +125,6 @@ Group 2 consists of all microorganisms where the taxonomic phylum is Proteobacte
Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans. Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans.
} }
\subsection{Background on matching scores}{
With ambiguous user input, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score is based on four parameters:
\enumerate{
\item The prevalence \eqn{P} is categorised into group 1, 2 and 3 as stated above;
\item A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;
\item The level of uncertainty \eqn{U} needed to get to the result, as stated above (1 to 3);
\item The \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
}
\deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)}
The final matching score \eqn{M} is calculated as:
\deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
}
} }
\section{Source}{ \section{Source}{
@ -162,6 +144,25 @@ The \link[=lifecycle]{lifecycle} of this function is \strong{stable}. In a stabl
If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error. If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.
} }
\section{Matching score for microorganisms}{
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as:
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
where:
\itemize{
\item \eqn{x} is the user input;
\item \eqn{n} is a taxonomic name (genus, species and subspecies);
\item \eqn{l_{n}}{l_n} is the length of the taxonomic name;
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function;
\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
}
All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
}
\section{Catalogue of Life}{ \section{Catalogue of Life}{
\if{html}{\figure{logo_col.png}{options: height=40px style=margin-bottom:5px} \cr} \if{html}{\figure{logo_col.png}{options: height=40px style=margin-bottom:5px} \cr}

View File

@ -26,7 +26,7 @@ anti_join_microorganisms(x, by = NULL, ...)
\arguments{ \arguments{
\item{x}{existing table to join, or character vector} \item{x}{existing table to join, or character vector}
\item{by}{a variable to join by - if left empty will search for a column with class \code{\link{mo}} (created with \code{\link[=as.mo]{as.mo()}}) or will be \code{"mo"} if that column name exists in \code{x}, could otherwise be a column name of \code{x} with values that exist in \code{microorganisms$mo} (like \code{by = "bacteria_id"}), or another column in \link{microorganisms} (but then it should be named, like \code{by = c("my_genus_species" = "fullname")})} \item{by}{a variable to join by - if left empty will search for a column with class \code{\link{mo}} (created with \code{\link[=as.mo]{as.mo()}}) or will be \code{"mo"} if that column name exists in \code{x}, could otherwise be a column name of \code{x} with values that exist in \code{microorganisms$mo} (like \code{by = "bacteria_id"}), or another column in \link{microorganisms} (but then it should be named, like \code{by = c("bacteria_id" = "fullname")})}
\item{suffix}{if there are non-joined duplicate variables in \code{x} and \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.} \item{suffix}{if there are non-joined duplicate variables in \code{x} and \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.}
@ -38,7 +38,7 @@ Join the data set \link{microorganisms} easily to an existing table or character
\details{ \details{
\strong{Note:} As opposed to the \code{join()} functions of \code{dplyr}, \link{character} vectors are supported and at default existing columns will get a suffix \code{"2"} and the newly joined columns will not get a suffix. \strong{Note:} As opposed to the \code{join()} functions of \code{dplyr}, \link{character} vectors are supported and at default existing columns will get a suffix \code{"2"} and the newly joined columns will not get a suffix.
These functions rely on \code{\link[=merge]{merge()}}, a base R function to do joins. If the \code{dplyr} package is installed, their join functions will be used. Otherwise, the much slower \code{\link[=merge]{merge()}} function from base R will be used.
} }
\section{Stable lifecycle}{ \section{Stable lifecycle}{

View File

@ -4,33 +4,40 @@
\alias{mo_matching_score} \alias{mo_matching_score}
\title{Calculate the matching score for microorganisms} \title{Calculate the matching score for microorganisms}
\usage{ \usage{
mo_matching_score(x, fullname, uncertainty = 1) mo_matching_score(x, n)
} }
\arguments{ \arguments{
\item{x}{Any user input value(s)} \item{x}{Any user input value(s)}
\item{fullname}{A full taxonomic name, that exists in \code{\link[=microorganisms]{microorganisms$fullname}}} \item{n}{A full taxonomic name, that exists in \code{\link[=microorganisms]{microorganisms$fullname}}}
\item{uncertainty}{The level of uncertainty set in \code{\link[=as.mo]{as.mo()}}, see \code{allow_uncertain} in that function (here, it defaults to 1, but is automatically determined in \code{\link[=as.mo]{as.mo()}} based on the number of transformations needed to get to a result)} \item{uncertainty}{The level of uncertainty set in \code{\link[=as.mo]{as.mo()}}, see \code{allow_uncertain} in that function (here, it defaults to 1, but is automatically determined in \code{\link[=as.mo]{as.mo()}} based on the number of transformations needed to get to a result)}
} }
\description{ \description{
This helper function is used by \code{\link[=as.mo]{as.mo()}} to determine the most probable match of taxonomic records, based on user input. This helper function is used by \code{\link[=as.mo]{as.mo()}} to determine the most probable match of taxonomic records, based on user input.
} }
\details{ \section{Matching score for microorganisms}{
The matching score is based on four parameters:
\enumerate{ With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as:
\item A human pathogenic prevalence \eqn{P}, that is categorised into group 1, 2 and 3 (see \code{\link[=as.mo]{as.mo()}});
\item A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5; \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
\item The level of uncertainty \eqn{U} that is needed to get to a result (1 to 3, see \code{\link[=as.mo]{as.mo()}});
\item The \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as: where:
\itemize{
\item \eqn{x} is the user input;
\item \eqn{n} is a taxonomic name (genus, species and subspecies);
\item \eqn{l_{n}}{l_n} is the length of the taxonomic name;
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function;
\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
} }
\deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)} All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
The final matching score \eqn{M} is calculated as:
\deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
} }
\examples{ \examples{
as.mo("E. coli") as.mo("E. coli")
mo_uncertainties() mo_uncertainties()
mo_matching_score("E. coli", "Escherichia coli")
} }

View File

@ -124,6 +124,25 @@ The \link[=lifecycle]{lifecycle} of this function is \strong{stable}. In a stabl
If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error. If the unlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit an message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.
} }
\section{Matching score for microorganisms}{
With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as:
\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n * min(l_n, lev(x, n) ) ) / ( l_n * p * k )}
where:
\itemize{
\item \eqn{x} is the user input;
\item \eqn{n} is a taxonomic name (genus, species and subspecies);
\item \eqn{l_{n}}{l_n} is the length of the taxonomic name;
\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function;
\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}};
\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}.
}
All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
}
\section{Catalogue of Life}{ \section{Catalogue of Life}{
\if{html}{\figure{logo_col.png}{options: height=40px style=margin-bottom:5px} \cr} \if{html}{\figure{logo_col.png}{options: height=40px style=margin-bottom:5px} \cr}

View File

@ -37,7 +37,7 @@ test_that("counts work", {
expect_equal(suppressWarnings(count_S(example_isolates$AMX)) + count_I(example_isolates$AMX), expect_equal(suppressWarnings(count_S(example_isolates$AMX)) + count_I(example_isolates$AMX),
count_SI(example_isolates$AMX)) count_SI(example_isolates$AMX))
library(dplyr) library(dplyr, warn.conflicts = FALSE)
expect_equal(example_isolates %>% count_susceptible(AMC), 1433) expect_equal(example_isolates %>% count_susceptible(AMC), 1433)
expect_equal(example_isolates %>% count_susceptible(AMC, GEN, only_all_tested = TRUE), 1687) expect_equal(example_isolates %>% count_susceptible(AMC, GEN, only_all_tested = TRUE), 1687)
expect_equal(example_isolates %>% count_susceptible(AMC, GEN, only_all_tested = FALSE), 1764) expect_equal(example_isolates %>% count_susceptible(AMC, GEN, only_all_tested = FALSE), 1764)

View File

@ -37,7 +37,7 @@ test_that("disk works", {
expect_warning(as.disk("INVALID VALUE")) expect_warning(as.disk("INVALID VALUE"))
expect_output(print(as.disk(12))) expect_output(print(as.disk(12)))
library(dplyr) library(dplyr, warn.conflicts = FALSE)
expect_output(print(tibble(d = as.disk(12)))) expect_output(print(tibble(d = as.disk(12))))
}) })

View File

@ -72,7 +72,7 @@ test_that("EUCAST rules work", {
expect_equal(suppressWarnings(eucast_rules(a, "mo", info = FALSE)), b) expect_equal(suppressWarnings(eucast_rules(a, "mo", info = FALSE)), b)
# piperacillin must be R in Enterobacteriaceae when tica is R # piperacillin must be R in Enterobacteriaceae when tica is R
library(dplyr) library(dplyr, warn.conflicts = FALSE)
expect_equal(suppressWarnings( expect_equal(suppressWarnings(
example_isolates %>% example_isolates %>%
mutate(TIC = as.rsi("R"), mutate(TIC = as.rsi("R"),
@ -85,7 +85,7 @@ test_that("EUCAST rules work", {
as.character()), as.character()),
"R") "R")
# Azithromicin and Clarythromycin must be equal to Erythromycin # Azithromycin and Clarythromycin must be equal to Erythromycin
a <- eucast_rules(data.frame(mo = example_isolates$mo, a <- eucast_rules(data.frame(mo = example_isolates$mo,
ERY = example_isolates$ERY, ERY = example_isolates$ERY,
AZM = as.rsi("R"), AZM = as.rsi("R"),

View File

@ -27,7 +27,7 @@ test_that("ggplot_rsi works", {
skip_if_not_installed("ggplot2") skip_if_not_installed("ggplot2")
library(dplyr) library(dplyr, warn.conflicts = FALSE)
library(ggplot2) library(ggplot2)
pdf(NULL) # prevent Rplots.pdf being created pdf(NULL) # prevent Rplots.pdf being created

View File

@ -57,7 +57,7 @@ test_that("joins work", {
expect_warning(right_join_microorganisms("B_ESCHR_COLI")) expect_warning(right_join_microorganisms("B_ESCHR_COLI"))
expect_warning(full_join_microorganisms("B_ESCHR_COLI")) expect_warning(full_join_microorganisms("B_ESCHR_COLI"))
library(dplyr) library(dplyr, warn.conflicts = FALSE)
x <- tibble(bact = as.mo("E.coli")) x <- tibble(bact = as.mo("E.coli"))
expect_warning(left_join_microorganisms(x %>% group_by(bact), "bact")) expect_warning(left_join_microorganisms(x %>% group_by(bact), "bact"))

View File

@ -32,6 +32,6 @@ test_that("keyantibiotics work", {
expect_true(key_antibiotics_equal(".SS", "SI.", ignore_I = TRUE)) expect_true(key_antibiotics_equal(".SS", "SI.", ignore_I = TRUE))
expect_false(key_antibiotics_equal(".SS", "SI.", ignore_I = FALSE)) expect_false(key_antibiotics_equal(".SS", "SI.", ignore_I = FALSE))
library(dplyr) library(dplyr, warn.conflicts = FALSE)
expect_warning(key_antibiotics(example_isolates %>% slice(rep(1, 10)))) expect_warning(key_antibiotics(example_isolates %>% slice(rep(1, 10))))
}) })

View File

@ -25,7 +25,7 @@ test_that("as.mo works", {
skip_on_cran() skip_on_cran()
library(dplyr) library(dplyr, warn.conflicts = FALSE)
MOs <- microorganisms %>% filter(!is.na(mo), nchar(mo) > 3) MOs <- microorganisms %>% filter(!is.na(mo), nchar(mo) > 3)
expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo))) expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo)))

View File

@ -40,7 +40,7 @@ test_that("prediction of rsi works", {
expect_silent(ggplot_rsi_predict(x)) expect_silent(ggplot_rsi_predict(x))
expect_error(ggplot_rsi_predict(example_isolates)) expect_error(ggplot_rsi_predict(example_isolates))
library(dplyr) library(dplyr, warn.conflicts = FALSE)
expect_output(rsi_predict(x = filter(example_isolates, mo == "B_ESCHR_COLI"), expect_output(rsi_predict(x = filter(example_isolates, mo == "B_ESCHR_COLI"),
model = "binomial", model = "binomial",

View File

@ -54,7 +54,7 @@ test_that("rsi works", {
expect_identical(as.logical(lapply(example_isolates, is.rsi.eligible)), expect_identical(as.logical(lapply(example_isolates, is.rsi.eligible)),
rep(FALSE, length(example_isolates))) rep(FALSE, length(example_isolates)))
library(dplyr) library(dplyr, warn.conflicts = FALSE)
# 40 rsi columns # 40 rsi columns
expect_equal(example_isolates %>% expect_equal(example_isolates %>%
mutate_at(vars(PEN:RIF), as.character) %>% mutate_at(vars(PEN:RIF), as.character) %>%