1
0
mirror of https://github.com/msberends/AMR.git synced 2026-05-14 01:50:51 +02:00

Fix #287 (complex fallback) and #288 (species epithet scoring bias)

#287: as.mo() now strips " complex" from input when that exact complex
is not in the taxonomy and retries with the bare name, so inputs like
"Proteus vulgaris complex" no longer return NA.

#288: mo_matching_score() applies a ×2 bonus when the input has an
abbreviated genus (≤3 chars), the candidate's genus starts with the same
letter, and the candidate's species epithet exactly matches the input
species epithet. This ensures "S. apiospermum" resolves to Scedosporium
apiospermum rather than Staphylococcus aureus, overcoming the
kingdom/prevalence denominator bias in favour of common bacteria.

https://claude.ai/code/session_01VH4Ju4Xq9aW1AHuoVbjGEo
This commit is contained in:
Claude
2026-05-06 15:11:31 +00:00
parent 155c2707ce
commit b3b8d301ff
5 changed files with 46 additions and 5 deletions

View File

@@ -1,6 +1,6 @@
Package: AMR Package: AMR
Version: 3.0.1.9058 Version: 3.0.1.9059
Date: 2026-05-02 Date: 2026-05-06
Title: Antimicrobial Resistance Data Analysis Title: Antimicrobial Resistance Data Analysis
Description: Functions to simplify and standardise antimicrobial resistance (AMR) Description: Functions to simplify and standardise antimicrobial resistance (AMR)
data analysis and to work with microbial and antimicrobial properties by data analysis and to work with microbial and antimicrobial properties by

View File

@@ -1,4 +1,4 @@
# AMR 3.0.1.9058 # AMR 3.0.1.9059
Planned as v3.1.0, May 2026. Planned as v3.1.0, May 2026.
@@ -18,6 +18,8 @@ Planned as v3.1.0, May 2026.
* `as.mic()`: values in scientific notation (e.g. `1e-3`) now handled correctly * `as.mic()`: values in scientific notation (e.g. `1e-3`) now handled correctly
* `as.ab()`: codes containing "PH" or "TH" (e.g. `ETH`, `PHE`) no longer return `NA` when mixed with unrecognised input (#245) * `as.ab()`: codes containing "PH" or "TH" (e.g. `ETH`, `PHE`) no longer return `NA` when mixed with unrecognised input (#245)
* Combined MIC/SIR input values (e.g. `"<= 0.002; S"` or `"S; 0.002"`) now parsed correctly (#252) * Combined MIC/SIR input values (e.g. `"<= 0.002; S"` or `"S; 0.002"`) now parsed correctly (#252)
* `as.mo()`: input of the form `"X complex"` now falls back to `"X"` when the complex is not a distinct taxon in the database, preventing `NA` results for valid clinical descriptions such as `"Proteus vulgaris complex"` (#287)
* `mo_matching_score()`: for abbreviated-genus input (e.g. `"S. apiospermum"`), candidates whose species epithet exactly matches the input now rank above more prevalent organisms whose species epithet does not match; fixes `"S. apiospermum"` resolving to *Staphylococcus* instead of *Scedosporium apiospermum* (#288)
* `get_author_year()` in the microorganism reproduction script now strips `emend.` and everything after it, so `ref` reflects the combination authority rather than the emendation author (e.g. *Rhodococcus equi* now returns "Goodfellow et al., 1977" instead of "Nouioui et al., 2018") * `get_author_year()` in the microorganism reproduction script now strips `emend.` and everything after it, so `ref` reflects the combination authority rather than the emendation author (e.g. *Rhodococcus equi* now returns "Goodfellow et al., 1977" instead of "Nouioui et al., 2018")
* BRMO classification now includes bacterial complexes (#275) * BRMO classification now includes bacterial complexes (#275)
* Translation fixes for Italian CoNS/CoPS names (#256), Dutch antimicrobials, and `sir_df()` foreign-language output (#272) * Translation fixes for Italian CoNS/CoPS names (#256), Dutch antimicrobials, and `sir_df()` foreign-language output (#272)

9
R/mo.R
View File

@@ -322,6 +322,15 @@ as.mo <- function(x,
return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)])) return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
} }
# Issue #287: "X complex" is not a distinct taxon - strip " complex" and try "X"
if (grepl(" complex$", x_out, ignore.case = FALSE)) {
x_out <- sub(" complex$", "", x_out)
x_search_cleaned <- sub(" [Cc]omplex$", "", x_search_cleaned)
if (x_out %in% MO_lookup_current$fullname_lower) {
return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
}
}
# input must not be too short # input must not be too short
if (nchar(x_out) < 3) { if (nchar(x_out) < 3) {
return("UNKNOWN") return("UNKNOWN")

View File

@@ -125,6 +125,26 @@ mo_matching_score <- function(x, n) {
# kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5) # kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE] k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE]
# matching score: # base matching score
(l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n) score <- (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
# Issue #288: when the input genus is abbreviated (≤3 chars), the candidate's genus
# starts with the same letter (case-insensitive), and the candidate's species epithet
# exactly matches the input's species epithet, boost the score ×2.
# This prevents a prevalent bacterium (low p_n and k_n, hence a small denominator) from
# outranking a rarer organism whose species epithet is the only exact match,
# e.g. "S. apiospermum" → Scedosporium apiospermum.
x_parts_list <- strsplit(x, " ", fixed = TRUE)
n_parts_list <- strsplit(n, " ", fixed = TRUE)
x_genus <- vapply(x_parts_list, function(w) if (length(w) >= 1) w[1L] else "", character(1L))
x_sp <- vapply(x_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
n_g1 <- vapply(n_parts_list, function(w) if (length(w) >= 1L) tolower(substr(w[1L], 1L, 1L)) else "", character(1L))
n_sp <- vapply(n_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
exact_sp <- nchar(x_genus) <= 3L &
x_sp != "" &
n_sp != "" &
tolower(substr(x_genus, 1L, 1L)) == n_g1 &
x_sp == n_sp
score[exact_sp] <- score[exact_sp] * 2
score
} }

View File

@@ -84,6 +84,16 @@ test_that("test-mo.R", {
# expect_warning(as.mo("Acinetobacter calcoaceticus/baumannii complex")) # expect_warning(as.mo("Acinetobacter calcoaceticus/baumannii complex"))
# Issue #287: "X complex" fallback to "X" when complex is not a distinct taxon
expect_identical(as.character(suppressWarnings(as.mo("Proteus vulgaris complex"))), as.character(suppressWarnings(as.mo("Proteus vulgaris"))))
expect_identical(as.character(suppressWarnings(as.mo("Enterobacter cloacae complex"))), as.character(as.mo("Enterobacter cloacae complex")))
# Issue #288: abbreviated genus with exact species epithet match should win
expect_identical(
as.character(suppressWarnings(as.mo("S. apiospermum"))),
as.character(suppressWarnings(as.mo("Scedosporium apiospermum")))
)
# prevalent MO # prevalent MO
expect_identical( expect_identical(
suppressWarnings(as.character( suppressWarnings(as.character(