Revert mo_matching_score.R; fix #288 in mo.R pre-filter instead

Issue #288: when the abbreviated-genus pre-filter (≤3 char genus) yields exactly one candidate whose species (and subspecies) word(s) exactly match the input, narrow filtr to that single candidate and set minimum_matching_score = 0. This bypasses the automatic 0.55 cutoff that only runs in the is.null() branch, so "S. apiospermum" resolves to Scedosporium apiospermum without touching the validated scoring formula. https://claude.ai/code/session_01VH4Ju4Xq9aW1AHuoVbjGEo
2026-07-17 04:30:56 +02:00 · 2026-05-06 18:07:37 +00:00
parent b3b8d301ff
commit d23004641f
2 changed files with 14 additions and 22 deletions
--- a/R/mo.R
+++ b/R/mo.R
@@ -352,6 +352,18 @@ as.mo <- function(x,
            (MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
              MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) |
              MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1)))
+          # Issue #288: if the species (and subspecies) word(s) in the input exactly match
+          # exactly one candidate, use only that candidate and bypass the 0.55 cutoff.
+          # This prevents prevalent bacteria from outranking a rarer organism whose species
+          # epithet is an unambiguous exact match, e.g. "S. apiospermum" → Scedosporium.
+          sp_exact <- tolower(MO_lookup_current$species[filtr]) == x_parts[2]
+          if (length(x_parts) == 3) {
+            sp_exact <- sp_exact & tolower(MO_lookup_current$subspecies[filtr]) == x_parts[3]
+          }
+          if (sum(sp_exact) == 1) {
+            filtr <- filtr[sp_exact]
+            minimum_matching_score <- 0
+          }
        } else {
          filtr <- which(MO_lookup_current$full_first == substr(x_parts[1], 1, 1) |
            MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
--- a/R/mo_matching_score.R
+++ b/R/mo_matching_score.R
@@ -125,26 +125,6 @@ mo_matching_score <- function(x, n) {
  # kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
  k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE]

-  # base matching score
-  score <- (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
-
-  # Issue #288: when the genus is abbreviated (≤3 chars) and the species epithet of the
-  # candidate exactly matches the species epithet of the input, boost the score ×2.
-  # This prevents a prevalent bacterium (low p_n/k_n) from outranking a rarer organism
-  # whose species epithet is the only exact match, e.g. "S. apiospermum" → Scedosporium.
-  x_parts_list <- strsplit(x, " ", fixed = TRUE)
-  n_parts_list <- strsplit(n, " ", fixed = TRUE)
-  x_genus <- vapply(x_parts_list, function(w) if (length(w) >= 1) w[1L] else "", character(1L))
-  x_sp    <- vapply(x_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
-  n_g1    <- vapply(n_parts_list, function(w) if (length(w) >= 1L) tolower(substr(w[1L], 1L, 1L)) else "", character(1L))
-  n_sp    <- vapply(n_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
-
-  exact_sp <- nchar(x_genus) <= 3L &
-    x_sp != "" &
-    n_sp != "" &
-    tolower(substr(x_genus, 1L, 1L)) == n_g1 &
-    x_sp == n_sp
-  score[exact_sp] <- score[exact_sp] * 2
-
-  score
+  # matching score:
+  (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
 }