diff --git a/R/mo.R b/R/mo.R index 7bafbb7c1..d2f79afd5 100755 --- a/R/mo.R +++ b/R/mo.R @@ -352,6 +352,18 @@ as.mo <- function(x, (MO_lookup_current$species_first == substr(x_parts[2], 1, 1) | MO_lookup_current$subspecies_first == substr(x_parts[2], 1, 1) | MO_lookup_current$subspecies_first == substr(x_parts[3], 1, 1))) + # Issue #288: if the species (and subspecies) word(s) in the input exactly match + # exactly one candidate, use only that candidate and bypass the 0.55 cutoff. + # This prevents prevalent bacteria from outranking a rarer organism whose species + # epithet is an unambiguous exact match, e.g. "S. apiospermum" → Scedosporium. + sp_exact <- tolower(MO_lookup_current$species[filtr]) == x_parts[2] + if (length(x_parts) == 3) { + sp_exact <- sp_exact & tolower(MO_lookup_current$subspecies[filtr]) == x_parts[3] + } + if (sum(sp_exact) == 1) { + filtr <- filtr[sp_exact] + minimum_matching_score <- 0 + } } else { filtr <- which(MO_lookup_current$full_first == substr(x_parts[1], 1, 1) | MO_lookup_current$species_first == substr(x_parts[2], 1, 1) | diff --git a/R/mo_matching_score.R b/R/mo_matching_score.R index 13e1c5435..78cd1f663 100755 --- a/R/mo_matching_score.R +++ b/R/mo_matching_score.R @@ -125,26 +125,6 @@ mo_matching_score <- function(x, n) { # kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5) k_n <- AMR_env$MO_lookup[match(n, AMR_env$MO_lookup$fullname), "kingdom_index", drop = TRUE] - # base matching score - score <- (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n) - - # Issue #288: when the genus is abbreviated (≤3 chars) and the species epithet of the - # candidate exactly matches the species epithet of the input, boost the score ×2. - # This prevents a prevalent bacterium (low p_n/k_n) from outranking a rarer organism - # whose species epithet is the only exact match, e.g. "S. apiospermum" → Scedosporium. - x_parts_list <- strsplit(x, " ", fixed = TRUE) - n_parts_list <- strsplit(n, " ", fixed = TRUE) - x_genus <- vapply(x_parts_list, function(w) if (length(w) >= 1) w[1L] else "", character(1L)) - x_sp <- vapply(x_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L)) - n_g1 <- vapply(n_parts_list, function(w) if (length(w) >= 1L) tolower(substr(w[1L], 1L, 1L)) else "", character(1L)) - n_sp <- vapply(n_parts_list, function(w) if (length(w) >= 2L) tolower(w[2L]) else "", character(1L)) - - exact_sp <- nchar(x_genus) <= 3L & - x_sp != "" & - n_sp != "" & - tolower(substr(x_genus, 1L, 1L)) == n_g1 & - x_sp == n_sp - score[exact_sp] <- score[exact_sp] * 2 - - score + # matching score: + (l_n - 0.5 * l_n.lev) / (l_n * p_n * k_n) }