1
0
mirror of https://github.com/msberends/AMR.git synced 2026-06-29 10:16:20 +02:00

(v3.0.1.9080) fix(as.mo): resolve abbreviated genus when species has subspecies (#288 follow-up) (#301)

When a genus+species abbreviation like "P. ovale" was used, the previous
bypass (Issue #288) checked sum(sp_exact) == 1, which failed if the species
also had subspecies sharing the epithet (ovale curtisi, ovale wallikeri).
The fix extends the bypass to fire whenever all exact species matches belong
to one genus, collapsing to the species-rank record (subspecies == "") for
genus+species queries and preserving the chosen row for explicit subspecies
queries.

Also extends the data-invariant test to cover all taxonomic rank columns
from domain to subspecies, not just the terminal three.


Claude-Session: https://claude.ai/code/session_01M4fqQYQYJ3drdudkDYNqAY

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Matthijs Berends
2026-06-27 15:20:38 +02:00
committed by GitHub
parent 03be4b87fc
commit 518425311e
5 changed files with 60 additions and 8 deletions

View File

@@ -1,5 +1,5 @@
Package: AMR Package: AMR
Version: 3.0.1.9079 Version: 3.0.1.9080
Date: 2026-06-27 Date: 2026-06-27
Title: Antimicrobial Resistance Data Analysis Title: Antimicrobial Resistance Data Analysis
Description: Functions to simplify and standardise antimicrobial resistance (AMR) Description: Functions to simplify and standardise antimicrobial resistance (AMR)

View File

@@ -1,4 +1,4 @@
# AMR 3.0.1.9079 # AMR 3.0.1.9080
Planned as v3.1.0, end of June 2026. Planned as v3.1.0, end of June 2026.
@@ -32,6 +32,7 @@ Planned as v3.1.0, end of June 2026.
* `as.mo()`: * `as.mo()`:
* Input of the form `"X complex"` now falls back to `"X"` when the complex is not a distinct taxon in the database, preventing `NA` results for valid clinical descriptions such as `"Proteus vulgaris complex"` (#287) * Input of the form `"X complex"` now falls back to `"X"` when the complex is not a distinct taxon in the database, preventing `NA` results for valid clinical descriptions such as `"Proteus vulgaris complex"` (#287)
* Abbreviated-genus input (e.g. `"S. apiospermum"`) now correctly ranks candidates whose species epithet exactly matches the input above more-prevalent organisms whose species does not match; fixes `"S. apiospermum"` resolving to *Staphylococcus* instead of *Scedosporium apiospermum* (#288) * Abbreviated-genus input (e.g. `"S. apiospermum"`) now correctly ranks candidates whose species epithet exactly matches the input above more-prevalent organisms whose species does not match; fixes `"S. apiospermum"` resolving to *Staphylococcus* instead of *Scedosporium apiospermum* (#288)
* Abbreviated-genus input for species that have subspecies (e.g. `"P. ovale"`) now collapses to the species-rank record instead of incorrectly matching a more-prevalent organism; explicit subspecies queries (e.g. `"P. ovale curtisi"`) are preserved (#288)
* `get_author_year()` in the microorganism reproduction script now strips `emend.` and everything after it, so `ref` reflects the combination authority rather than the emendation author (e.g. *Rhodococcus equi* now returns "Goodfellow et al., 1977" instead of "Nouioui et al., 2018") * `get_author_year()` in the microorganism reproduction script now strips `emend.` and everything after it, so `ref` reflects the combination authority rather than the emendation author (e.g. *Rhodococcus equi* now returns "Goodfellow et al., 1977" instead of "Nouioui et al., 2018")
* BRMO classification now includes bacterial complexes (#275) * BRMO classification now includes bacterial complexes (#275)
* Translation fixes for Italian CoNS/CoPS names (#256), Dutch antimicrobials, and `sir_df()` foreign-language output (#272) * Translation fixes for Italian CoNS/CoPS names (#256), Dutch antimicrobials, and `sir_df()` foreign-language output (#272)

30
R/mo.R
View File

@@ -352,16 +352,34 @@ as.mo <- function(x,
(MO_lookup_current$species_first == substr(x_parts[2], 1, 1) | (MO_lookup_current$species_first == substr(x_parts[2], 1, 1) |
MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) | MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) |
MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1))) MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1)))
# Issue #288: if the species (and subspecies) word(s) in the input exactly match # Issue #288 (extended): if the species (and subspecies) word(s) in the input
# exactly one candidate, use only that candidate and bypass the 0.55 cutoff. # exactly match candidates that all belong to one and the same genus, bypass the
# This prevents prevalent bacteria from outranking a rarer organism whose species # 0.55 cutoff. A species together with its subspecies/autonyms (e.g. Plasmodium
# epithet is an unambiguous exact match, e.g. "S. apiospermum" → Scedosporium. # ovale + curtisi + wallikeri) is the same taxon, so for a genus+species input we
# collapse to the species-rank record (subspecies == ""). This prevents prevalent
# bacteria from outranking a rarer organism whose species epithet is an
# unambiguous exact match, e.g. "S. apiospermum" -> Scedosporium, "P. ovale" ->
# Plasmodium ovale. If two different genera share the epithet, the genus check
# stays FALSE and the normal matching score arbitrates.
sp_exact <- tolower(MO_lookup_current$species[filtr]) == x_parts[2] sp_exact <- tolower(MO_lookup_current$species[filtr]) == x_parts[2]
if (length(x_parts) == 3) { if (length(x_parts) == 3) {
sp_exact <- sp_exact & tolower(MO_lookup_current$subspecies[filtr]) == x_parts[3] sp_exact <- sp_exact & tolower(MO_lookup_current$subspecies[filtr]) == x_parts[3]
} }
if (sum(sp_exact) == 1) { exact_idx <- filtr[sp_exact]
filtr <- filtr[sp_exact] if (length(exact_idx) >= 1 &&
length(unique(MO_lookup_current$genus_lower[exact_idx])) == 1) {
if (length(x_parts) == 2) {
# genus + species only: collapse to the species-rank record (subspecies == "")
is_species_rank <- MO_lookup_current$subspecies[exact_idx] == ""
if (any(is_species_rank)) {
filtr <- exact_idx[is_species_rank][1]
} else {
filtr <- exact_idx[1]
}
} else {
# explicit subspecies given, unambiguous within the genus
filtr <- exact_idx[1]
}
minimum_matching_score <- 0 minimum_matching_score <- 0
} }
} else { } else {

View File

@@ -142,3 +142,9 @@ test_that("test-data.R", {
# x <- check_non_ascii() %>% # x <- check_non_ascii() %>%
# filter(file %unlike% "^(data-raw|docs|git_)") # filter(file %unlike% "^(data-raw|docs|git_)")
}) })
test_that("taxonomic name columns contain no NA (empty string is used instead)", {
  # Data invariant: none of the taxonomic rank columns of `microorganisms`
  # may contain NA; per the test description an empty string is used instead.
  rank_columns <- c(
    "domain", "kingdom", "phylum", "class", "order",
    "family", "genus", "species", "subspecies"
  )
  # NOTE(review): `[[` yields NULL for a name that is not present and
  # anyNA(NULL) is FALSE, so a missing or misspelled column would pass
  # silently -- confirm that every name above exists in `microorganisms`.
  for (rank_col in rank_columns) {
    rank_values <- microorganisms[[rank_col]]
    # `info` carries the column name so a failure identifies the column
    expect_false(anyNA(rank_values), info = rank_col)
  }
})

View File

@@ -338,3 +338,30 @@ test_that("test-mo.R", {
) )
} }
}) })
test_that("as.mo() resolves abbreviated genus when species carries subspecies (#288 follow-up)", {
  # The abbreviated input "P. ovale" has to give the same result as the fully
  # written "Plasmodium ovale" (and not a Pseudomonas species), even though the
  # species has subspecies (curtisi, wallikeri) that share the epithet.
  mo_abbreviated <- as.mo("P. ovale", keep_synonyms = TRUE, info = FALSE)
  mo_written_out <- as.mo("Plasmodium ovale", keep_synonyms = TRUE, info = FALSE)
  expect_identical(mo_abbreviated, mo_written_out)

  name_abbreviated <- mo_name("P. ovale", keep_synonyms = TRUE, language = NULL)
  expect_identical(name_abbreviated, "Plasmodium ovale")

  # Non-regression check for the original #288 example.
  genus_apiospermum <- mo_genus(
    "S. apiospermum",
    keep_synonyms = TRUE,
    language = NULL
  )
  expect_identical(genus_apiospermum, "Scedosporium")

  # An explicitly given subspecies must stay at subspecies rank. This part only
  # runs when the subspecies record is present in the `microorganisms` data.
  curtisi_fullname <- "Plasmodium ovale curtisi"
  curtisi_present <- any(microorganisms$fullname == curtisi_fullname)
  if (curtisi_present) {
    name_subspecies <- mo_name(
      "P. ovale curtisi",
      keep_synonyms = TRUE,
      language = NULL
    )
    expect_identical(name_subspecies, curtisi_fullname)
  }
})