(v2.1.1.9082) algorithm updates

2025-12-05 04:09:16 +01:00 · 2024-09-30 18:46:55 +02:00
parent ac1c40d8bb
commit 681fe011fe
25 changed files with 15593 additions and 15505 deletions
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -1547,6 +1547,7 @@ add_MO_lookup_to_AMR_env <- function() {
    MO_lookup[which(MO_lookup$kingdom == "Bacteria" | MO_lookup$mo == "UNKNOWN"), "kingdom_index"] <- 1
    MO_lookup[which(MO_lookup$kingdom == "Fungi"), "kingdom_index"] <- 1.25
    MO_lookup[which(MO_lookup$kingdom == "Protozoa"), "kingdom_index"] <- 1.5
+    MO_lookup[which(MO_lookup$kingdom == "Chromista"), "kingdom_index"] <- 1.75
    MO_lookup[which(MO_lookup$kingdom == "Archaea"), "kingdom_index"] <- 2
    # all the rest
    MO_lookup[which(is.na(MO_lookup$kingdom_index)), "kingdom_index"] <- 3
--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -670,6 +670,16 @@ duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type
    # fast return, only 1 isolate
    return(FALSE)
  }
+  stop("Check R/first_isolate.R -> duplicated_antibiogram()")
+  # first sort on data availability - count the dots and order that ascending
+  number_dots <- vapply(FUN.VALUE = integer(1),
+                        antibiogram,
+                        function(x) sum(strsplit(x, "", fixed = TRUE)[[1]] == "."),
+                        USE.NAMES = FALSE)
+  new_order <- order(number_dots, antibiogram)
+  antibiogram.bak <- antibiogram
+  antibiogram <- antibiogram[new_order]
+  
  out <- rep(NA, length(antibiogram))
  out[1] <- FALSE
  out[2] <- antimicrobials_equal(antibiogram[1], antibiogram[2],
@@ -680,11 +690,6 @@ duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type
    return(out)
  }
  
-  # sort after the second one (since we already determined AB equality of the first two)
-  original_sort <- c(1, 2, rank(antibiogram[3:length(antibiogram)]) + 2)
-  antibiogram.bak <- antibiogram
-  antibiogram <- c(antibiogram[1:2], sort(antibiogram[3:length(antibiogram)]))
-  
  # we can skip the duplicates - they are never unique antibiograms of course
  duplicates <- duplicated(antibiogram)
  out[3:length(out)][duplicates[3:length(out)] == TRUE] <- TRUE
@@ -703,7 +708,7 @@ duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type
                                              type = type)))
  }
  
-  out <- out[original_sort]
+  out <- out[order(new_order)]
  # rerun duplicated again
  duplicates <- duplicated(antibiogram.bak)
  out[duplicates == TRUE] <- TRUE
--- a/R/mo.R
+++ b/R/mo.R
@@ -1081,6 +1081,9 @@ convert_colloquial_input <- function(x) {
  out[x %like_case% "(^| )yeast?"] <- "F_YEAST"
  out[x %like_case% "(^| )fung(us|i)"] <- "F_FUNGUS"
  
+  # protozoa
+  out[x %like_case% "protozo"] <- "P_PROTOZOAN" # to hit it with most languages, and "protozo" does not occur in the microorganisms data set for anything else
+  
  # trivial names known to the field
  out[x %like_case% "meningo[ck]o[ck]"] <- "B_NESSR_MNNG"
  out[x %like_case% "gono[ck]o[ck]"] <- "B_NESSR_GNRR"
--- a/R/mo_matching_score.R
+++ b/R/mo_matching_score.R
@@ -30,12 +30,11 @@
 #' Calculate the Matching Score for Microorganisms
 #'
 #' This algorithm is used by [as.mo()] and all the [`mo_*`][mo_property()] functions to determine the most probable match of taxonomic records based on user input.
-#' @author Dr. Matthijs Berends, 2018
 #' @param x Any user input value(s)
 #' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms]
-#' @note This algorithm was originally described in: Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
+#' @note This algorithm was originally developed in 2018 and subsequently described in: Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
 #'
-#' Later, the work of Bartlett A *et al.* about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated.
+#' Later, the work of Bartlett A *et al.* about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated, and optimalisations to the algorithm were made.
 #' @section Matching Score for Microorganisms:
 #' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m}, is calculated as:
 #'
@@ -50,16 +49,17 @@
 #' * \eqn{l_n} is the length of \eqn{n};
 #' * \eqn{lev} is the [Levenshtein distance function](https://en.wikipedia.org/wiki/Levenshtein_distance) (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};
 #' * \eqn{p_n} is the human pathogenic prevalence group of \eqn{n}, as described below;
-#' * \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Archaea = 2, others = 3.
+#' * \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Chromista = 1.75, Archaea = 2, others = 3.
 #'
 #' The grouping into human pathogenic prevalence \eqn{p} is based on recent work from Bartlett *et al.* (2022, \doi{10.1099/mic.0.001269}) who extensively studied medical-scientific literature to categorise all bacterial species into these groups:
 #'
-#' - **Established**, if a taxonomic species has infected at least three persons in three or more references. These records have `prevalence = 1.0` in the [microorganisms] data set;
+#' - **Established**, if a taxonomic species has infected at least three persons in three or more references. These records have `prevalence = 1.15` in the [microorganisms] data set;
 #' - **Putative**, if a taxonomic species has fewer than three known cases. These records have `prevalence = 1.25` in the [microorganisms] data set.
 #'
 #' Furthermore,
 #'
-#' - Any genus present in the **established** list also has `prevalence = 1.0` in the [microorganisms] data set;
+#' - Genera from the World Health Organization's (WHO) Priority Pathogen List have `prevalence = 1.0` in the [microorganisms] data set;
+#' - Any genus present in the **established** list also has `prevalence = 1.15` in the [microorganisms] data set;
 #' - Any other genus present in the **putative** list has `prevalence = 1.25` in the [microorganisms] data set;
 #' - Any other species or subspecies of which the genus is present in the two aforementioned groups, has `prevalence = 1.5` in the [microorganisms] data set;
 #' - Any *non-bacterial* genus, species or subspecies of which the genus is present in the following list, has `prevalence = 1.25` in the [microorganisms] data set: `r vector_or(MO_RELEVANT_GENERA, quotes = "*")`;
--- a/R/mo_property.R
+++ b/R/mo_property.R
@@ -442,8 +442,8 @@ mo_pathogenicity <- function(x, language = get_AMR_locale(), keep_synonyms = get
  kngd <- AMR_env$MO_lookup$kingdom[match(x.mo, AMR_env$MO_lookup$mo)]
  rank <- AMR_env$MO_lookup$rank[match(x.mo, AMR_env$MO_lookup$mo)]

-  out <- factor(case_when_AMR(prev == 1 & kngd == "Bacteria" & rank != "genus" ~ "Pathogenic",
-                              (prev < 2 & kngd == "Fungi") ~ "Potentially pathogenic",
+  out <- factor(case_when_AMR(prev <= 1.15 & kngd == "Bacteria" & rank != "genus" ~ "Pathogenic",
+                              prev < 2 & kngd == "Fungi" ~ "Potentially pathogenic",
                              prev == 2 & kngd == "Bacteria" ~ "Non-pathogenic",
                              kngd == "Bacteria" ~ "Potentially pathogenic",
                              TRUE ~ "Unknown"),
--- a/R/sysdata.rda
+++ b/R/sysdata.rda