1
0
mirror of https://github.com/msberends/AMR.git synced 2025-08-28 06:22:12 +02:00

(v1.3.0.9023) optimisation

This commit is contained in:
2020-09-19 11:54:01 +02:00
parent 4e40e42011
commit d049cce69b
30 changed files with 104 additions and 578 deletions

View File

@@ -32,10 +32,10 @@
#' 3. The level of uncertainty \eqn{U} that is needed to get to a result (1 to 3, see [as.mo()]);
#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the full name being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
#'
#' \deqn{L' = \frac{F - 0.5 \times L}{F}}{L' = (F - 0.5 * L) / F}
#' \deqn{L' = \frac{F - 0.5L}{F}}{L' = (F - 0.5L) / F}
#'
#' The final matching score \eqn{M} is calculated as:
#' \deqn{M = L' \times \frac{1}{P \times K} \times \frac{1}{U}}{M = L' * (1 / (P * K)) * (1 / U)}
#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
#'
#' @export
#' @examples
@@ -55,9 +55,18 @@ mo_matching_score <- function(x, fullname, uncertainty = 1) {
levenshtein[i] <- min(as.double(utils::adist(x[i], fullname[i], ignore.case = FALSE)),
nchar(fullname[i]))
}
# self-made score between 0 and 1 (for % certainty, so 0 means huge distance, 1 means no distance)
dist <- (nchar(fullname) - 0.5 * levenshtein) / nchar(fullname)
prevalence_kingdom_index <- tryCatch(MO_lookup[match(fullname, MO_lookup$fullname), "prevalence_kingdom_index", drop = TRUE],
error = function(e) rep(1, length(fullname)))
dist * (1 / prevalence_kingdom_index) * (1 / uncertainty)
# F = length of fullname
var_F <- nchar(fullname)
# L = modified Levenshtein distance
var_L <- levenshtein
# P = Prevalence (1 to 3)
var_P <- MO_lookup[match(fullname, MO_lookup$fullname), "prevalence", drop = TRUE]
# K = kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
var_K <- MO_lookup[match(fullname, MO_lookup$fullname), "kingdom_index", drop = TRUE]
# U = uncertainty level (1 to 3), as per as.mo()
var_U <- uncertainty
# matching score:
(var_F - 0.5 * L) / (var_F * var_P * var_K * var_U)
}