1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-10 19:41:55 +02:00

(v1.3.0.9030) matching score update

This commit is contained in:
2020-09-26 16:26:01 +02:00
parent 9667c2994f
commit 050a9a04fb
33 changed files with 249 additions and 175 deletions

32
R/mo.R
View File

@ -101,20 +101,7 @@
#'
#' Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans.
#'
#' ## Background on matching scores
#' With ambiguous user input, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score is based on four parameters:
#'
#' 1. The prevalence \eqn{P} is categorised into group 1, 2 and 3 as stated above;
#' 2. A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;
#' 3. The level of uncertainty \eqn{U} needed to get to the result, as stated above (1 to 3);
#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
#'
#' \deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)}
#'
#' The final matching score \eqn{M} is calculated as:
#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
#'
#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
#' @inheritSection mo_matching_score Matching score for microorganisms
#' @inheritSection catalogue_of_life Catalogue of Life
# (source as a section here, so it can be inherited by other man pages:)
#' @section Source:
@ -331,8 +318,7 @@ exec_as.mo <- function(x,
if (NROW(res_df) > 1 & uncertainty != -1) {
# sort the findings on matching score
scores <- mo_matching_score(x = input,
fullname = res_df[, "fullname", drop = TRUE],
uncertainty = uncertainty)
fullname = res_df[, "fullname", drop = TRUE])
res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE]
}
res <- as.character(res_df[, column, drop = TRUE])
@ -1779,7 +1765,7 @@ print.mo_uncertainties <- function(x, ...) {
if (NROW(x) == 0) {
return(NULL)
}
cat(font_blue(strwrap(c("Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. Furthermore, an indication is given about the probability of the match - the more transformations are needed for coercion, the more improbable the result.")), collapse = "\n"))
cat(font_blue(strwrap(c("Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. Furthermore, an indication is given about the certainty of the match - the more transformations are needed for coercion, the less certain the result.")), collapse = "\n"))
cat("\n")
msg <- ""
@ -1787,8 +1773,7 @@ print.mo_uncertainties <- function(x, ...) {
if (x[i, ]$candidates != "") {
candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE))
scores <- mo_matching_score(x = x[i, ]$input,
fullname = candidates,
uncertainty = x[i, ]$uncertainty)
fullname = candidates)
# sort on descending scores
candidates <- candidates[order(1 - scores)]
n_candidates <- length(candidates)
@ -1802,11 +1787,11 @@ print.mo_uncertainties <- function(x, ...) {
candidates <- ""
}
if (x[i, ]$uncertainty == 1) {
uncertainty_interpretation <- font_green("* MOST PROBABLE *")
uncertainty_interpretation <- font_green("* very certain *")
} else if (x[i, ]$uncertainty == 1) {
uncertainty_interpretation <- font_yellow("* PROBABLE *")
uncertainty_interpretation <- font_yellow("* certain *")
} else {
uncertainty_interpretation <- font_red("* IMPROBABLE *")
uncertainty_interpretation <- font_red("* not certain *")
}
msg <- paste(msg,
paste0('"', x[i, ]$input, '" -> ',
@ -1814,8 +1799,7 @@ print.mo_uncertainties <- function(x, ...) {
ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""),
" (", x[i, ]$mo,
", matching score = ", trimws(percentage(mo_matching_score(x = x[i, ]$input,
fullname = x[i, ]$fullname,
uncertainty = x[i, ]$uncertainty),
fullname = x[i, ]$fullname),
digits = 1)),
") "),
uncertainty_interpretation,