1
0
mirror of https://github.com/msberends/AMR.git synced 2026-05-14 01:50:51 +02:00
Files
AMR/R/mo_matching_score.R
Claude b3b8d301ff Fix #287 (complex fallback) and #288 (species epithet scoring bias)
#287: as.mo() now strips " complex" from input when that exact complex
is not in the taxonomy and retries with the bare name, so inputs like
"Proteus vulgaris complex" no longer return NA.

#288: mo_matching_score() applies a ×2 bonus when the input has an
abbreviated genus (≤3 chars) and the candidate's species epithet exactly
matches the input species epithet. This ensures "S. apiospermum" resolves
to Scedosporium apiospermum rather than Staphylococcus aureus, overcoming
the kingdom/prevalence denominator bias in favour of common bacteria.

https://claude.ai/code/session_01VH4Ju4Xq9aW1AHuoVbjGEo
2026-05-06 15:11:31 +00:00

151 lines
8.7 KiB
R
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ==================================================================== #
# TITLE: #
# AMR: An R Package for Working with Antimicrobial Resistance Data #
# #
# SOURCE CODE: #
# https://github.com/msberends/AMR #
# #
# PLEASE CITE THIS SOFTWARE AS: #
# Berends MS, Luz CF, Friedrich AW, et al. (2022). #
# AMR: An R Package for Working with Antimicrobial Resistance Data. #
# Journal of Statistical Software, 104(3), 1-31. #
# https://doi.org/10.18637/jss.v104.i03 #
# #
# Developed at the University of Groningen and the University Medical #
# Center Groningen in The Netherlands, in collaboration with many #
# colleagues from around the world, see our website. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://amr-for-r.org #
# ==================================================================== #
#' Calculate the Matching Score for Microorganisms
#'
#' This algorithm is used by [as.mo()] and all the [`mo_*`][mo_property()] functions to determine the most probable match of taxonomic records based on user input.
#' @param x Any user input value(s).
#' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms].
#' @note This algorithm was originally developed in 2018 and subsequently described in: Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
#'
#' Later, the work of Bartlett A *et al.* about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated, and optimalisations to the algorithm were made.
#' @section Matching Score for Microorganisms:
#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m}, is calculated as:
#'
#' \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \cdot \min \begin{cases}l_{n} \\ \textrm{lev}(x, n)\end{cases}}{l_{n} \cdot p_{n} \cdot k_{n}}}
#'
#' where:
#'
#' * \eqn{x} is the user input;
#' * \eqn{n} is a taxonomic name (genus, species, and subspecies);
#' * \eqn{l_n} is the length of \eqn{n};
#' * \eqn{lev} is the [Levenshtein distance function](https://en.wikipedia.org/wiki/Levenshtein_distance) (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};
#' * \eqn{p_n} is the human pathogenic prevalence group of \eqn{n}, as described below;
#' * \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Chromista = 1.75, Archaea = 2, others = 3.
#'
#' The grouping into human pathogenic prevalence \eqn{p} is based on recent work from Bartlett *et al.* (2022, \doi{10.1099/mic.0.001269}) who extensively studied medical-scientific literature to categorise all bacterial species into these groups:
#'
#' - **Established**, if a taxonomic species has infected at least three persons in three or more references. These records have `prevalence = 1.15` in the [microorganisms] data set;
#' - **Putative**, if a taxonomic species has fewer than three known cases. These records have `prevalence = 1.25` in the [microorganisms] data set.
#'
#' Furthermore,
#'
#' - Genera from the World Health Organization's (WHO) Priority Pathogen List have `prevalence = 1.0` in the [microorganisms] data set;
#' - Any genus present in the **established** list also has `prevalence = 1.15` in the [microorganisms] data set;
#' - Any other genus present in the **putative** list has `prevalence = 1.25` in the [microorganisms] data set;
#' - Any other species or subspecies of which the genus is present in the two aforementioned groups, has `prevalence = 1.5` in the [microorganisms] data set;
#' - Any *non-bacterial* genus, species or subspecies of which the genus is present in the following list, has `prevalence = 1.25` in the [microorganisms] data set: `r vector_or(MO_RELEVANT_GENERA, quotes = "*")`;
#' - All other records have `prevalence = 2.0` in the [microorganisms] data set.
#'
#' When calculating the matching score, all characters in \eqn{x} and \eqn{n} are ignored that are other than A-Z, a-z, 0-9, spaces and parentheses.
#'
#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., `"E. coli"` will return the microbial ID of *Escherichia coli* (\eqn{m = `r round(mo_matching_score("E. coli", "Escherichia coli"), 3)`}, a highly prevalent microorganism found in humans) and not *Entamoeba coli* (\eqn{m = `r round(mo_matching_score("E. coli", "Entamoeba coli"), 3)`}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
#' @export
#' @inheritSection AMR Download Our Reference Data
#' @examples
#' mo_reset_session()
#'
#' as.mo("E. coli")
#' mo_uncertainties()
#'
#' mo_matching_score(
#' x = "E. coli",
#' n = c("Escherichia coli", "Entamoeba coli")
#' )
mo_matching_score <- function(x, n) {
meet_criteria(x, allow_class = c("character", "data.frame", "list"))
meet_criteria(n, allow_class = "character")
add_MO_lookup_to_AMR_env()
x <- parse_and_convert(x)
# no dots and other non-whitespace characters
x <- gsub("[^a-zA-Z0-9 \\(\\)]+", "", x)
# only keep one space
x <- gsub(" +", " ", x)
# force a capital letter, so this conversion will not count as a substitution
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
# n is always a taxonomically valid full name
if (length(n) == 1) {
n <- rep(n, length(x))
}
if (length(x) == 1) {
x <- rep(x, length(n))
}
# length of fullname
l_n <- nchar(n)
lev <- double(length = length(x))
l_n.lev <- double(length = length(x))
# get Levenshtein distance
lev <- unlist(Map(f = function(a, b) {
as.double(utils::adist(a, b,
ignore.case = FALSE,
fixed = TRUE,
costs = c(insertions = 1, deletions = 2, substitutions = 2),
counts = FALSE
))
}, x, n, USE.NAMES = FALSE))
l_n.lev[l_n < lev] <- l_n[l_n < lev]
l_n.lev[lev < l_n] <- lev[lev < l_n]
l_n.lev[lev == l_n] <- lev[lev == l_n]
# human pathogenic prevalence (1 to 3), see ?as.mo
p_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "prevalence", drop = TRUE]
# kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE]
# base matching score
score <- (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
# Issue #288: when the genus is abbreviated (≤3 chars) and the species epithet of the
# candidate exactly matches the species epithet of the input, boost the score ×2.
# This prevents a prevalent bacterium (low p_n/k_n) from outranking a rarer organism
# whose species epithet is the only exact match, e.g. "S. apiospermum" → Scedosporium.
x_parts_list <- strsplit(x, " ", fixed = TRUE)
n_parts_list <- strsplit(n, " ", fixed = TRUE)
x_genus <- vapply(x_parts_list, function(w) if (length(w) >= 1) w[1L] else "", character(1L))
x_sp <- vapply(x_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
n_g1 <- vapply(n_parts_list, function(w) if (length(w) >= 1L) tolower(substr(w[1L], 1L, 1L)) else "", character(1L))
n_sp <- vapply(n_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
exact_sp <- nchar(x_genus) <= 3L &
x_sp != "" &
n_sp != "" &
tolower(substr(x_genus, 1L, 1L)) == n_g1 &
x_sp == n_sp
score[exact_sp] <- score[exact_sp] * 2
score
}