1
0
mirror of https://github.com/msberends/AMR.git synced 2026-06-29 15:36:21 +02:00

(v3.0.1.9080) fix(as.mo): resolve abbreviated genus when species has subspecies (#288 follow-up) (#301)

When a genus+species abbreviation like "P. ovale" was used, the previous
bypass (Issue #288) checked sum(sp_exact) == 1, which failed if the species
also had subspecies sharing the epithet (ovale curtisi, ovale wallikeri).
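The fix extends the bypass to fire whenever all exact species matches belong
to one genus. For genus+species queries it collapses to the species-rank
record (subspecies == ""), falling back to the first exact match if no such
record exists; for explicit subspecies queries it keeps the first exact
match. In both cases the minimum matching score is set to 0.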

Also extends the data-invariant test to cover all taxonomic rank columns
from domain to subspecies, not just the terminal three.


Claude-Session: https://claude.ai/code/session_01M4fqQYQYJ3drdudkDYNqAY

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Matthijs Berends
2026-06-27 15:20:38 +02:00
committed by GitHub
parent 03be4b87fc
commit 518425311e
5 changed files with 60 additions and 8 deletions

30
R/mo.R
View File

@@ -352,16 +352,34 @@ as.mo <- function(x,
(MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) |
MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1)))
# Issue #288: if the species (and subspecies) word(s) in the input are an exact
# match for only one candidate, use only that candidate and bypass the 0.55 cutoff.
# This prevents prevalent bacteria from outranking a rarer organism whose species
# epithet is an unambiguous exact match, e.g. "S. apiospermum" → Scedosporium.
# Issue #288 (extended): if the species (and subspecies) word(s) in the input
# exactly match candidates that all belong to one and the same genus, bypass the
# 0.55 cutoff. A species together with its subspecies/autonyms (e.g. Plasmodium
# ovale + curtisi + wallikeri) is the same taxon, so for a genus+species input we
# collapse to the species-rank record (subspecies == ""). This prevents prevalent
# bacteria from outranking a rarer organism whose species epithet is an
# unambiguous exact match, e.g. "S. apiospermum" -> Scedosporium, "P. ovale" ->
# Plasmodium ovale. If two different genera share the epithet, the genus check
# stays FALSE and the normal matching score arbitrates.
sp_exact <- tolower(MO_lookup_current$species[filtr]) == x_parts[2]
if (length(x_parts) == 3) {
sp_exact <- sp_exact & tolower(MO_lookup_current$subspecies[filtr]) == x_parts[3]
}
if (sum(sp_exact) == 1) {
filtr <- filtr[sp_exact]
exact_idx <- filtr[sp_exact]
if (length(exact_idx) >= 1 &&
length(unique(MO_lookup_current$genus_lower[exact_idx])) == 1) {
if (length(x_parts) == 2) {
# genus + species only: collapse to the species-rank record (subspecies == "")
is_species_rank <- MO_lookup_current$subspecies[exact_idx] == ""
if (any(is_species_rank)) {
filtr <- exact_idx[is_species_rank][1]
} else {
filtr <- exact_idx[1]
}
} else {
# explicit subspecies given, unambiguous within the genus
filtr <- exact_idx[1]
}
minimum_matching_score <- 0
}
} else {