From b3b8d301ffcff73bd819572e9ce7e995727fdb23 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 6 May 2026 15:11:31 +0000
Subject: [PATCH] Fix #287 (complex fallback) and #288 (species epithet scoring
 bias)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#287: as.mo() now strips " complex" from input when that exact complex
is not in the taxonomy and retries with the bare name, so inputs like
"Proteus vulgaris complex" no longer return NA.

#288: mo_matching_score() applies a ×2 bonus when the first word of the
input is an abbreviated genus (≤3 chars), the candidate's genus starts
with the same letter, and the candidate's species epithet exactly matches
the input species epithet. This ensures "S. apiospermum" resolves to
Scedosporium apiospermum rather than Staphylococcus aureus, overcoming
the kingdom/prevalence denominator bias in favour of common bacteria.

https://claude.ai/code/session_01VH4Ju4Xq9aW1AHuoVbjGEo
---
 DESCRIPTION              |  4 ++--
 NEWS.md                  |  4 +++-
 R/mo.R                   |  9 +++++++++
 R/mo_matching_score.R    | 24 ++++++++++++++++++++++--
 tests/testthat/test-mo.R | 10 ++++++++++
 5 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8fa2c1be0..c0cd36711 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: AMR
-Version: 3.0.1.9058
-Date: 2026-05-02
+Version: 3.0.1.9059
+Date: 2026-05-06
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
   data analysis and to work with microbial and antimicrobial properties by
diff --git a/NEWS.md b/NEWS.md
index 39a28b7ee..5d51f4178 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# AMR 3.0.1.9058
+# AMR 3.0.1.9059
 
 Planned as v3.1.0, May 2026.
 
@@ -18,6 +18,8 @@ Planned as v3.1.0, May 2026.
 * `as.mic()`: values in scientific notation (e.g. `1e-3`) now handled correctly
 * `as.ab()`: codes containing "PH" or "TH" (e.g. `ETH`, `PHE`) no longer return `NA` when mixed with unrecognised input (#245)
 * Combined MIC/SIR input values (e.g. `"<= 0.002; S"` or `"S; 0.002"`) now parsed correctly (#252)
+* `as.mo()`: input of the form `"X complex"` now falls back to `"X"` when the complex is not a distinct taxon in the database, preventing `NA` results for valid clinical descriptions such as `"Proteus vulgaris complex"` (#287)
+* `mo_matching_score()`: for abbreviated-genus input (e.g. `"S. apiospermum"`), candidates whose species epithet exactly matches the input epithet now rank above more prevalent organisms whose epithet does not match; fixes `"S. apiospermum"` resolving to *Staphylococcus* instead of *Scedosporium apiospermum* (#288)
 * `get_author_year()` in the microorganism reproduction script now strips `emend.` and everything after it, so `ref` reflects the combination authority rather than the emendation author (e.g. *Rhodococcus equi* now returns "Goodfellow et al., 1977" instead of "Nouioui et al., 2018")
 * BRMO classification now includes bacterial complexes (#275)
 * Translation fixes for Italian CoNS/CoPS names (#256), Dutch antimicrobials, and `sir_df()` foreign-language output (#272)
diff --git a/R/mo.R b/R/mo.R
index da6fa4484..7bafbb7c1 100755
--- a/R/mo.R
+++ b/R/mo.R
@@ -322,6 +322,15 @@ as.mo <- function(x,
         return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
       }
 
+      # Issue #287: when "X complex" is not itself a known full name, strip the trailing " complex" and continue with "X"
+      if (grepl(" complex$", x_out, ignore.case = FALSE)) {
+        x_out <- sub(" complex$", "", x_out)
+        x_search_cleaned <- sub(" [Cc]omplex$", "", x_search_cleaned)
+        if (x_out %in% MO_lookup_current$fullname_lower) {
+          return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
+        }
+      }
+
       # input must not be too short
       if (nchar(x_out) < 3) {
         return("UNKNOWN")
diff --git a/R/mo_matching_score.R b/R/mo_matching_score.R
index 78cd1f663..13e1c5435 100755
--- a/R/mo_matching_score.R
+++ b/R/mo_matching_score.R
@@ -125,6 +125,26 @@ mo_matching_score <- function(x, n) {
   # kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
   k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE]
 
-  # matching score:
-  (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
+  # base matching score
+  score <- (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
+
+  # Issue #288: when the input genus is abbreviated (<= 3 chars), shares its first letter
+  # with the candidate's genus, and both species epithets are identical, double the score.
+  # This prevents a prevalent bacterium (small p_n * k_n denominator) from outranking a
+  # rarer organism with the only exact epithet match, e.g. "S. apiospermum" -> Scedosporium.
+  x_parts_list <- strsplit(x, " ", fixed = TRUE)
+  n_parts_list <- strsplit(n, " ", fixed = TRUE)
+  x_genus <- vapply(x_parts_list, function(w) if (length(w) >= 1) w[1L] else "", character(1L))
+  x_sp    <- vapply(x_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
+  n_g1    <- vapply(n_parts_list, function(w) if (length(w) >= 1L) tolower(substr(w[1L], 1L, 1L)) else "", character(1L))
+  n_sp    <- vapply(n_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
+
+  exact_sp <- nchar(x_genus) <= 3L &
+    x_sp != "" &
+    n_sp != "" &
+    tolower(substr(x_genus, 1L, 1L)) == n_g1 &
+    x_sp == n_sp
+  score[exact_sp] <- score[exact_sp] * 2
+
+  score
 }
diff --git a/tests/testthat/test-mo.R b/tests/testthat/test-mo.R
index 78653ce2e..3f98ec6b5 100644
--- a/tests/testthat/test-mo.R
+++ b/tests/testthat/test-mo.R
@@ -84,6 +84,16 @@ test_that("test-mo.R", {
 
   # expect_warning(as.mo("Acinetobacter calcoaceticus/baumannii complex"))
 
+  # Issue #287: "X complex" falls back to "X" unless the complex is itself a distinct taxon, in which case it must not collapse to the species
+  expect_identical(as.character(suppressWarnings(as.mo("Proteus vulgaris complex"))), as.character(suppressWarnings(as.mo("Proteus vulgaris"))))
+  expect_true(as.character(suppressWarnings(as.mo("Enterobacter cloacae complex"))) != as.character(suppressWarnings(as.mo("Enterobacter cloacae"))))
+
+  # Issue #288: abbreviated genus with exact species epithet match should win
+  expect_identical(
+    as.character(suppressWarnings(as.mo("S. apiospermum"))),
+    as.character(suppressWarnings(as.mo("Scedosporium apiospermum")))
+  )
+
   # prevalent MO
   expect_identical(
     suppressWarnings(as.character(