mirror of
https://github.com/msberends/AMR.git
synced 2026-05-14 03:50:49 +02:00
#287: as.mo() now strips a trailing " complex" from the input when that exact complex is not in the taxonomy and retries with the bare name, so inputs like "Proteus vulgaris complex" no longer return NA. #288: mo_matching_score() applies a ×2 bonus when the input has an abbreviated genus (≤3 chars), the candidate's genus starts with the same letter, and the candidate's species epithet exactly matches the input species epithet. This ensures "S. apiospermum" resolves to Scedosporium apiospermum rather than Staphylococcus aureus, overcoming the bias of the kingdom/prevalence denominator in favour of common bacteria. https://claude.ai/code/session_01VH4Ju4Xq9aW1AHuoVbjGEo
This commit is contained in:
9
R/mo.R
9
R/mo.R
@@ -322,6 +322,15 @@ as.mo <- function(x,
|
||||
return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
|
||||
}
|
||||
|
||||
# Issue #287: "X complex" is not a distinct taxon - strip " complex" and try "X"
|
||||
if (grepl(" complex$", x_out, ignore.case = FALSE)) {
|
||||
x_out <- sub(" complex$", "", x_out)
|
||||
x_search_cleaned <- sub(" [Cc]omplex$", "", x_search_cleaned)
|
||||
if (x_out %in% MO_lookup_current$fullname_lower) {
|
||||
return(as.character(MO_lookup_current$mo[match(x_out, MO_lookup_current$fullname_lower)]))
|
||||
}
|
||||
}
|
||||
|
||||
# input must not be too short
|
||||
if (nchar(x_out) < 3) {
|
||||
return("UNKNOWN")
|
||||
|
||||
@@ -125,6 +125,26 @@ mo_matching_score <- function(x, n) {
|
||||
# kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
|
||||
k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE]
|
||||
|
||||
# matching score:
|
||||
(l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
|
||||
# base matching score
|
||||
score <- (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n)
|
||||
|
||||
# Issue #288: when the genus is abbreviated (≤3 chars) and the species epithet of the
|
||||
# candidate exactly matches the species epithet of the input, boost the score ×2.
|
||||
# This prevents a prevalent bacterium (low p_n/k_n) from outranking a rarer organism
|
||||
# whose species epithet is the only exact match, e.g. "S. apiospermum" → Scedosporium.
|
||||
x_parts_list <- strsplit(x, " ", fixed = TRUE)
|
||||
n_parts_list <- strsplit(n, " ", fixed = TRUE)
|
||||
x_genus <- vapply(x_parts_list, function(w) if (length(w) >= 1) w[1L] else "", character(1L))
|
||||
x_sp <- vapply(x_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
|
||||
n_g1 <- vapply(n_parts_list, function(w) if (length(w) >= 1L) tolower(substr(w[1L], 1L, 1L)) else "", character(1L))
|
||||
n_sp <- vapply(n_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L))
|
||||
|
||||
exact_sp <- nchar(x_genus) <= 3L &
|
||||
x_sp != "" &
|
||||
n_sp != "" &
|
||||
tolower(substr(x_genus, 1L, 1L)) == n_g1 &
|
||||
x_sp == n_sp
|
||||
score[exact_sp] <- score[exact_sp] * 2
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user