(v3.0.1.9080) fix(as.mo): resolve abbreviated genus when species has subspecies (#288 follow-up) (#301)

When a genus+species abbreviation like "P. ovale" was used, the previous bypass (Issue #288) checked sum(sp_exact) == 1, which failed if the species also had subspecies sharing the epithet (ovale curtisi, ovale wallikeri). The fix extends the bypass to fire whenever all exact species matches belong to one genus: genus+species queries collapse to the species-rank record (subspecies == ""), while explicit subspecies queries keep the row whose subspecies matches exactly.

Also extends the data-invariant test to cover all taxonomic rank columns from domain to subspecies, not just the terminal three.

Claude-Session: https://claude.ai/code/session_01M4fqQYQYJ3drdudkDYNqAY
Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 06:56:20 +02:00 · 2026-06-27 15:20:38 +02:00
parent 03be4b87fc
commit 518425311e
5 changed files with 60 additions and 8 deletions
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: AMR
-Version: 3.0.1.9079
+Version: 3.0.1.9080
 Date: 2026-06-27
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# AMR 3.0.1.9079
+# AMR 3.0.1.9080

 Planned as v3.1.0, end of June 2026.

@@ -32,6 +32,7 @@ Planned as v3.1.0, end of June 2026.
 * `as.mo()`: 
  * Input of the form `"X complex"` now falls back to `"X"` when the complex is not a distinct taxon in the database, preventing `NA` results for valid clinical descriptions such as `"Proteus vulgaris complex"` (#287)
  * Abbreviated-genus input (e.g. `"S. apiospermum"`) now correctly ranks candidates whose species epithet exactly matches the input above more-prevalent organisms whose species does not match; fixes `"S. apiospermum"` resolving to *Staphylococcus* instead of *Scedosporium apiospermum* (#288)
+  * Abbreviated-genus input for species that have subspecies (e.g. `"P. ovale"`) now collapses to the species-rank record instead of incorrectly matching a more-prevalent organism; explicit subspecies queries (e.g. `"P. ovale curtisi"`) are preserved (#288)
 * `get_author_year()` in the microorganism reproduction script now strips `emend.` and everything after it, so `ref` reflects the combination authority rather than the emendation author (e.g. *Rhodococcus equi* now returns "Goodfellow et al., 1977" instead of "Nouioui et al., 2018")
 * BRMO classification now includes bacterial complexes (#275)
 * Translation fixes for Italian CoNS/CoPS names (#256), Dutch antimicrobials, and `sir_df()` foreign-language output (#272)
--- a/R/mo.R
+++ b/R/mo.R
@@ -352,16 +352,34 @@ as.mo <- function(x,
            (MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
              MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) |
              MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1)))
-          # Issue #288: if the species (and subspecies) word(s) in the input exactly match
-          # exactly one candidate, use only that candidate and bypass the 0.55 cutoff.
-          # This prevents prevalent bacteria from outranking a rarer organism whose species
-          # epithet is an unambiguous exact match, e.g. "S. apiospermum" → Scedosporium.
+          # Issue #288 (extended): if the species (and subspecies) word(s) in the input
+          # exactly match candidates that all belong to one and the same genus, bypass the
+          # 0.55 cutoff. A species together with its subspecies/autonyms (e.g. Plasmodium
+          # ovale + curtisi + wallikeri) is the same taxon, so for a genus+species input we
+          # collapse to the species-rank record (subspecies == ""). This prevents prevalent
+          # bacteria from outranking a rarer organism whose species epithet is an
+          # unambiguous exact match, e.g. "S. apiospermum" -> Scedosporium, "P. ovale" ->
+          # Plasmodium ovale. If two different genera share the epithet, the genus check
+          # stays FALSE and the normal matching score arbitrates.
          sp_exact <- tolower(MO_lookup_current$species[filtr]) == x_parts[2]
          if (length(x_parts) == 3) {
            sp_exact <- sp_exact & tolower(MO_lookup_current$subspecies[filtr]) == x_parts[3]
          }
-          if (sum(sp_exact) == 1) {
-            filtr <- filtr[sp_exact]
+          exact_idx <- filtr[sp_exact]
+          if (length(exact_idx) >= 1 &&
+            length(unique(MO_lookup_current$genus_lower[exact_idx])) == 1) {
+            if (length(x_parts) == 2) {
+              # genus + species only: collapse to the species-rank record (subspecies == "")
+              is_species_rank <- MO_lookup_current$subspecies[exact_idx] == ""
+              if (any(is_species_rank)) {
+                filtr <- exact_idx[is_species_rank][1]
+              } else {
+                filtr <- exact_idx[1]
+              }
+            } else {
+              # explicit subspecies given, unambiguous within the genus
+              filtr <- exact_idx[1]
+            }
            minimum_matching_score <- 0
          }
        } else {
--- a/tests/testthat/test-data.R
+++ b/tests/testthat/test-data.R
@@ -142,3 +142,9 @@ test_that("test-data.R", {
  # x <- check_non_ascii() %>%
  #   filter(file %unlike% "^(data-raw|docs|git_)")
 })
+
+test_that("taxonomic name columns contain no NA (empty string is used instead)", {
+  # a rank that is absent must be stored as "" (never NA), at every taxonomic level
+  rank_cols <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species", "subspecies")
+  for (rank_col in rank_cols) expect_false(anyNA(microorganisms[[rank_col]]), info = rank_col)
+})
--- a/tests/testthat/test-mo.R
+++ b/tests/testthat/test-mo.R
@@ -338,3 +338,30 @@ test_that("test-mo.R", {
    )
  }
 })
+
+test_that("as.mo() resolves abbreviated genus when species carries subspecies (#288 follow-up)", {
+  # "P. ovale" must resolve to Plasmodium ovale, not a Pseudomonas species,
+  # even though P. ovale has subspecies (curtisi, wallikeri) sharing the epithet.
+  expect_identical(
+    as.mo("P. ovale", keep_synonyms = TRUE, info = FALSE),
+    as.mo("Plasmodium ovale", keep_synonyms = TRUE, info = FALSE)
+  )
+  expect_identical(
+    mo_name("P. ovale", keep_synonyms = TRUE, language = NULL),
+    "Plasmodium ovale"
+  )
+
+  # Non-regression: the original #288 example must still work.
+  expect_identical(
+    mo_genus("S. apiospermum", keep_synonyms = TRUE, language = NULL),
+    "Scedosporium"
+  )
+
+  # Explicit subspecies must not be collapsed to species rank. Report a skip
+  # (rather than passing silently) when the subspecies is absent from the data.
+  skip_if_not("Plasmodium ovale curtisi" %in% microorganisms$fullname)
+  expect_identical(
+    mo_name("P. ovale curtisi", keep_synonyms = TRUE, language = NULL),
+    "Plasmodium ovale curtisi"
+  )
+})