mirror of
https://github.com/msberends/AMR.git
synced 2024-12-25 20:06:12 +01:00
(v2.1.1.9082) algorithm updates
This commit is contained in:
parent
ac1c40d8bb
commit
681fe011fe
@ -1,6 +1,6 @@
|
||||
Package: AMR
|
||||
Version: 2.1.1.9081
|
||||
Date: 2024-09-29
|
||||
Version: 2.1.1.9082
|
||||
Date: 2024-09-30
|
||||
Title: Antimicrobial Resistance Data Analysis
|
||||
Description: Functions to simplify and standardise antimicrobial resistance (AMR)
|
||||
data analysis and to work with microbial and antimicrobial properties by
|
||||
|
7
NEWS.md
7
NEWS.md
@ -1,4 +1,4 @@
|
||||
# AMR 2.1.1.9081
|
||||
# AMR 2.1.1.9082
|
||||
|
||||
*(this beta version will eventually become v3.0. We're happy to reach a new major milestone soon, which will be all about the new One Health support! Install this beta using [the instructions here](https://msberends.github.io/AMR/#latest-development-version).)*
|
||||
|
||||
@ -20,7 +20,7 @@ This package now supports not only tools for AMR data analysis in clinical setti
|
||||
* MycoBank has now been integrated as the primary taxonomic source for fungi. The `microorganisms` data set has been enriched with new columns (`mycobank`, `mycobank_parent`, and `mycobank_renamed_to`) that provide detailed information for fungal species.
|
||||
* A remarkable addition of over 20,000 new fungal records
|
||||
* New function `mo_mycobank()` to retrieve the MycoBank record number, analogous to existing functions such as `mo_lpsn()` and `mo_gbif()`.
|
||||
* The `as.mo()` function and all `mo_*()` functions now includes an `only_fungi` argument, allowing users to restrict results solely to fungal species. This ensures fungi are prioritised over bacteria during microorganism identification. This can also be set globally with the new `AMR_only_fungi` option.
|
||||
* The `as.mo()` function and all `mo_*()` functions now include an `only_fungi` argument, allowing users to restrict results solely to fungal species. This ensures fungi are prioritised over bacteria during microorganism identification. This can also be set globally with the new `AMR_only_fungi` option.
|
||||
* Also updated other kingdoms, welcoming a total of 2,149 new records from 2023 and 927 from 2024.
|
||||
* **Updated clinical breakpoints**
|
||||
* EUCAST 2024 and CLSI 2024 are now supported, by adding all of their over 4,000 new clinical breakpoints to the `clinical_breakpoints` data set for usage in `as.sir()`. EUCAST 2024 is now the new default guideline for all MIC and disk diffusion interpretations.
|
||||
@ -63,10 +63,11 @@ This package now supports not only tools for AMR data analysis in clinical setti
|
||||
* Improved overall algorithm of `as.mo()` for better performance and accuracy. Specifically, more weight is given to genus and species combinations in cases where the subspecies is miswritten, so that the result will be the correct genus and species.
|
||||
* Intermediate log2 levels used for MIC plotting are now more common values instead of following a strict dilution range
|
||||
* Fixed a bug for when `antibiogram()` returns an empty data set
|
||||
* Updated the prevalence calculation to include genera from the World Health Organization's (WHO) Priority Pathogen List
|
||||
|
||||
## Other
|
||||
* Greatly updated and expanded documentation
|
||||
* Added Jordan Stull, Matthew Saab, and Javier Sanchez as contributors, to thank them for their valuable input
|
||||
* Added Larisse Bolton, Jordan Stull, Matthew Saab, and Javier Sanchez as contributors, to thank them for their valuable input
|
||||
* Stopped support for SAS (`.xpt`) files, since their file structure is extremely inefficient and requires more disk space than GitHub allows in a single commit.
|
||||
|
||||
## Older Versions
|
||||
|
@ -1547,6 +1547,7 @@ add_MO_lookup_to_AMR_env <- function() {
|
||||
MO_lookup[which(MO_lookup$kingdom == "Bacteria" | MO_lookup$mo == "UNKNOWN"), "kingdom_index"] <- 1
|
||||
MO_lookup[which(MO_lookup$kingdom == "Fungi"), "kingdom_index"] <- 1.25
|
||||
MO_lookup[which(MO_lookup$kingdom == "Protozoa"), "kingdom_index"] <- 1.5
|
||||
MO_lookup[which(MO_lookup$kingdom == "Chromista"), "kingdom_index"] <- 1.75
|
||||
MO_lookup[which(MO_lookup$kingdom == "Archaea"), "kingdom_index"] <- 2
|
||||
# all the rest
|
||||
MO_lookup[which(is.na(MO_lookup$kingdom_index)), "kingdom_index"] <- 3
|
||||
|
@ -670,6 +670,16 @@ duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type
|
||||
# fast return, only 1 isolate
|
||||
return(FALSE)
|
||||
}
|
||||
stop("Check R/first_isolate.R -> duplicated_antibiogram()")
|
||||
# first sort on data availability - count the dots and order that ascending
|
||||
number_dots <- vapply(FUN.VALUE = integer(1),
|
||||
antibiogram,
|
||||
function(x) sum(strsplit(x, "", fixed = TRUE)[[1]] == "."),
|
||||
USE.NAMES = FALSE)
|
||||
new_order <- order(number_dots, antibiogram)
|
||||
antibiogram.bak <- antibiogram
|
||||
antibiogram <- antibiogram[new_order]
|
||||
|
||||
out <- rep(NA, length(antibiogram))
|
||||
out[1] <- FALSE
|
||||
out[2] <- antimicrobials_equal(antibiogram[1], antibiogram[2],
|
||||
@ -680,11 +690,6 @@ duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type
|
||||
return(out)
|
||||
}
|
||||
|
||||
# sort after the second one (since we already determined AB equality of the first two)
|
||||
original_sort <- c(1, 2, rank(antibiogram[3:length(antibiogram)]) + 2)
|
||||
antibiogram.bak <- antibiogram
|
||||
antibiogram <- c(antibiogram[1:2], sort(antibiogram[3:length(antibiogram)]))
|
||||
|
||||
# we can skip the duplicates - they are never unique antibiograms of course
|
||||
duplicates <- duplicated(antibiogram)
|
||||
out[3:length(out)][duplicates[3:length(out)] == TRUE] <- TRUE
|
||||
@ -703,7 +708,7 @@ duplicated_antibiogram <- function(antibiogram, points_threshold, ignore_I, type
|
||||
type = type)))
|
||||
}
|
||||
|
||||
out <- out[original_sort]
|
||||
out <- out[order(new_order)]
|
||||
# rerun duplicated again
|
||||
duplicates <- duplicated(antibiogram.bak)
|
||||
out[duplicates == TRUE] <- TRUE
|
||||
|
3
R/mo.R
3
R/mo.R
@ -1081,6 +1081,9 @@ convert_colloquial_input <- function(x) {
|
||||
out[x %like_case% "(^| )yeast?"] <- "F_YEAST"
|
||||
out[x %like_case% "(^| )fung(us|i)"] <- "F_FUNGUS"
|
||||
|
||||
# protozoa
|
||||
out[x %like_case% "protozo"] <- "P_PROTOZOAN" # to hit it with most languages, and "protozo" does not occur in the microorganisms data set for anything else
|
||||
|
||||
# trivial names known to the field
|
||||
out[x %like_case% "meningo[ck]o[ck]"] <- "B_NESSR_MNNG"
|
||||
out[x %like_case% "gono[ck]o[ck]"] <- "B_NESSR_GNRR"
|
||||
|
@ -30,12 +30,11 @@
|
||||
#' Calculate the Matching Score for Microorganisms
|
||||
#'
|
||||
#' This algorithm is used by [as.mo()] and all the [`mo_*`][mo_property()] functions to determine the most probable match of taxonomic records based on user input.
|
||||
#' @author Dr. Matthijs Berends, 2018
|
||||
#' @param x Any user input value(s)
|
||||
#' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms]
|
||||
#' @note This algorithm was originally described in: Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
|
||||
#' @note This algorithm was originally developed in 2018 and subsequently described in: Berends MS *et al.* (2022). **AMR: An R Package for Working with Antimicrobial Resistance Data**. *Journal of Statistical Software*, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
|
||||
#'
|
||||
#' Later, the work of Bartlett A *et al.* about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated.
|
||||
#' Later, the work of Bartlett A *et al.* about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated, and optimisations to the algorithm were made.
|
||||
#' @section Matching Score for Microorganisms:
|
||||
#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score \eqn{m}, is calculated as:
|
||||
#'
|
||||
@ -50,16 +49,17 @@
|
||||
#' * \eqn{l_n} is the length of \eqn{n};
|
||||
#' * \eqn{lev} is the [Levenshtein distance function](https://en.wikipedia.org/wiki/Levenshtein_distance) (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};
|
||||
#' * \eqn{p_n} is the human pathogenic prevalence group of \eqn{n}, as described below;
|
||||
#' * \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Archaea = 2, others = 3.
|
||||
#' * \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Chromista = 1.75, Archaea = 2, others = 3.
|
||||
#'
|
||||
#' The grouping into human pathogenic prevalence \eqn{p} is based on recent work from Bartlett *et al.* (2022, \doi{10.1099/mic.0.001269}) who extensively studied medical-scientific literature to categorise all bacterial species into these groups:
|
||||
#'
|
||||
#' - **Established**, if a taxonomic species has infected at least three persons in three or more references. These records have `prevalence = 1.0` in the [microorganisms] data set;
|
||||
#' - **Established**, if a taxonomic species has infected at least three persons in three or more references. These records have `prevalence = 1.15` in the [microorganisms] data set;
|
||||
#' - **Putative**, if a taxonomic species has fewer than three known cases. These records have `prevalence = 1.25` in the [microorganisms] data set.
|
||||
#'
|
||||
#' Furthermore,
|
||||
#'
|
||||
#' - Any genus present in the **established** list also has `prevalence = 1.0` in the [microorganisms] data set;
|
||||
#' - Genera from the World Health Organization's (WHO) Priority Pathogen List have `prevalence = 1.0` in the [microorganisms] data set;
|
||||
#' - Any genus present in the **established** list also has `prevalence = 1.15` in the [microorganisms] data set;
|
||||
#' - Any other genus present in the **putative** list has `prevalence = 1.25` in the [microorganisms] data set;
|
||||
#' - Any other species or subspecies of which the genus is present in the two aforementioned groups, has `prevalence = 1.5` in the [microorganisms] data set;
|
||||
#' - Any *non-bacterial* genus, species or subspecies of which the genus is present in the following list, has `prevalence = 1.25` in the [microorganisms] data set: `r vector_or(MO_RELEVANT_GENERA, quotes = "*")`;
|
||||
|
@ -442,8 +442,8 @@ mo_pathogenicity <- function(x, language = get_AMR_locale(), keep_synonyms = get
|
||||
kngd <- AMR_env$MO_lookup$kingdom[match(x.mo, AMR_env$MO_lookup$mo)]
|
||||
rank <- AMR_env$MO_lookup$rank[match(x.mo, AMR_env$MO_lookup$mo)]
|
||||
|
||||
out <- factor(case_when_AMR(prev == 1 & kngd == "Bacteria" & rank != "genus" ~ "Pathogenic",
|
||||
(prev < 2 & kngd == "Fungi") ~ "Potentially pathogenic",
|
||||
out <- factor(case_when_AMR(prev <= 1.15 & kngd == "Bacteria" & rank != "genus" ~ "Pathogenic",
|
||||
prev < 2 & kngd == "Fungi" ~ "Potentially pathogenic",
|
||||
prev == 2 & kngd == "Bacteria" ~ "Non-pathogenic",
|
||||
kngd == "Bacteria" ~ "Potentially pathogenic",
|
||||
TRUE ~ "Unknown"),
|
||||
|
BIN
R/sysdata.rda
BIN
R/sysdata.rda
Binary file not shown.
@ -158,6 +158,32 @@ pre_commit_lst$MO_STREP_ABCG <- AMR::microorganisms$mo[which(AMR::microorganisms
|
||||
"group a", "group b", "group c", "group g"
|
||||
))]
|
||||
pre_commit_lst$MO_LANCEFIELD <- AMR::microorganisms$mo[which(AMR::microorganisms$mo %like% "^(B_STRPT_PYGN(_|$)|B_STRPT_AGLC(_|$)|B_STRPT_(DYSG|EQUI)(_|$)|B_STRPT_ANGN(_|$)|B_STRPT_(DYSG|CANS)(_|$)|B_STRPT_SNGN(_|$)|B_STRPT_SLVR(_|$))")]
|
||||
pre_commit_lst$MO_WHO_PRIORITY_GENERA <- c(
|
||||
# World Health Organization's (WHO) Priority Pathogen List
|
||||
"Acinetobacter",
|
||||
"Aspergillus",
|
||||
"Blastomyces",
|
||||
"Campylobacter",
|
||||
"Candida",
|
||||
"Clostridioides",
|
||||
"Coccidioides",
|
||||
"Cryptococcus",
|
||||
"Enterococcus",
|
||||
"Fusarium",
|
||||
"Haemophilus",
|
||||
"Helicobacter",
|
||||
"Histoplasma",
|
||||
"Klebsiella",
|
||||
"Mycobacterium",
|
||||
"Neisseria",
|
||||
"Paracoccidioides",
|
||||
"Pneumocystis",
|
||||
"Pseudomonas",
|
||||
"Salmonella",
|
||||
"Shigella",
|
||||
"Staphylococcus",
|
||||
"Streptococcus"
|
||||
)
|
||||
pre_commit_lst$MO_RELEVANT_GENERA <- c(
|
||||
"Absidia",
|
||||
"Acanthamoeba",
|
||||
|
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 171 KiB After Width: | Height: | Size: 178 KiB |
Binary file not shown.
Binary file not shown.
@ -1 +1 @@
|
||||
f1cbe414851c1eee08ff5440a47af76c
|
||||
5b5544d28deade33092925a6758277c4
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -455,7 +455,7 @@ sort(table(taxonomy_lpsn$status))
|
||||
|
||||
# Read MycoBank data ------------------------------------------------------------------------------
|
||||
|
||||
taxonomy_mycobank <- readxl::read_excel(file_mycobank, guess_max = 1e5)
|
||||
taxonomy_mycobank <- read_excel(file_mycobank, guess_max = 1e5)
|
||||
taxonomy_mycobank.bak <- taxonomy_mycobank
|
||||
|
||||
taxonomy_mycobank <- taxonomy_mycobank %>%
|
||||
@ -1224,10 +1224,12 @@ nonbacterial_genera <- nonbacterial_genera[nonbacterial_genera %unlike% "unknown
|
||||
# see https://doi.org/10.1099/mic.0.001269
|
||||
taxonomy <- taxonomy %>%
|
||||
mutate(prevalence = case_when(
|
||||
# genera of the pathogens mentioned in the World Health Organization's (WHO) Priority Pathogen List
|
||||
genus %in% MO_WHO_PRIORITY_GENERA ~ 1.0,
|
||||
# 'established' means 'have infected at least three persons in three or more references'
|
||||
paste(genus, species) %in% established & rank %in% c("species", "subspecies") ~ 1.0,
|
||||
paste(genus, species) %in% established & rank %in% c("species", "subspecies") ~ 1.15,
|
||||
# other genera in the 'established' group
|
||||
genus %in% established_genera & rank == "genus" ~ 1.0,
|
||||
genus %in% established_genera & rank == "genus" ~ 1.15,
|
||||
|
||||
# 'putative' means 'fewer than three known cases'
|
||||
paste(genus, species) %in% putative & rank %in% c("species", "subspecies") ~ 1.25,
|
||||
@ -2148,7 +2150,7 @@ microorganisms <- taxonomy
|
||||
|
||||
# set class <mo>
|
||||
class(microorganisms$mo) <- c("mo", "character")
|
||||
microorganisms <- microorganisms %>% arrange(fullname)
|
||||
microorganisms <- microorganisms %>% arrange(fullname) %>% df_remove_nonASCII()
|
||||
usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")
|
||||
rm(microorganisms)
|
||||
|
||||
|
@ -8,6 +8,7 @@ unknown Gram-negatives TRUE TRUE FALSE TRUE 不明革兰氏阴性菌 neznámé g
|
||||
unknown Gram-positives TRUE TRUE FALSE TRUE 不明革兰氏阳性菌 neznámé grampozitivní ukendte Gram-positive onbekende Gram-positieven tuntemattomat grampositiiviset Gram positifs inconnus unbekannte Grampositiven άγνωστοι θετικοί κατά Gram Gram positivi sconosciuti 未知のグラム陽性菌 ukjent Gram-positive Nieznane bakterie Gram-dodatnie Gram positivos desconhecidos Gram-pozitive necunoscute неизвестные грамположительные Gram positivos desconocidos okända Gram-positiva bilinmeyen Gram-pozitifler невідомі грампозитивні
|
||||
unknown anaerobic Gram-negatives TRUE TRUE FALSE TRUE ukendte anaerobe Gram-negative onbekende anaerobe Gram-negatieven unbekannte anaerobe Gramnegativen
|
||||
unknown anaerobic Gram-positives TRUE TRUE FALSE TRUE ukendte anaerobe Gram-positive onbekende anaerobe Gram-positieven unbekannte anaerobe Grampositiven
|
||||
unknown protozoan TRUE TRUE FALSE TRUE 未知原生动物 neznámý prvok ukendt protozo onbekend protozoön tuntematon alkueläin protozoaire inconnu unbekanntes Protozoon άγνωστο πρωτόζωο protozoo sconosciuto 未知の原生動物 ukjent protozo nieznany pierwotniak protozoário desconhecido protozoar necunoscut неизвестное простейшее protozoo desconocido okänd protozo bilinmeyen protozoa невідоме найпростіше
|
||||
unknown fungus TRUE TRUE FALSE TRUE 未知真菌 neznámé houby ukendt svamp onbekende schimmel tuntematon sieni champignon inconnu unbekannter Pilze άγνωστος μύκητας fungo sconosciuto 未知真菌 ukjent sopp Nieznany grzyb fungo desconhecido ciuperci necunoscute неизвестный грибок hongo desconocido Okänd svamp bilinmeyen mantar невідомий гриб
|
||||
unknown yeast TRUE TRUE FALSE TRUE 未知酵母菌 neznámé kvasinky ukendt gær onbekende gist tuntematon hiiva levure inconnue unbekannte Hefe άγνωστος ζυμομύκητας lievito sconosciuto 未知酵母 ukjent gjær Nieznany drożdżak levedura desconhecida drojdie necunoscută неизвестные дрожжи levadura desconocida Okänd jäst bilinmeyen maya невідомі дріжджі
|
||||
unknown name TRUE TRUE FALSE TRUE 不明名称 neznámý název ukendt navn onbekende naam tuntematon nimi nom inconnu unbekannte Name άγνωστο όνομα nome sconosciuto 名称未知 ukjent navn nieznana nazwa nome desconhecido nume necunoscut неизвестное название nombre desconocido okänt namn bilinmeyen isim невідома назва
|
||||
|
|
Binary file not shown.
@ -94,7 +94,7 @@ expect_equal(mo_synonyms("Escherichia coli"), NULL)
|
||||
expect_true(length(mo_synonyms("Candida albicans")) > 1)
|
||||
expect_inherits(mo_synonyms(c("Candida albicans", "Escherichia coli")), "list")
|
||||
expect_equal(names(mo_info("Escherichia coli")), c(
|
||||
"mo",
|
||||
"mo", "rank",
|
||||
"kingdom", "phylum", "class", "order", "family", "genus", "species", "subspecies",
|
||||
"status", "synonyms", "gramstain", "oxygen_tolerance",
|
||||
"url", "ref", "snomed", "lpsn", "mycobank", "gbif", "group_members"
|
||||
|
141
logo.svg
141
logo.svg
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 171 KiB After Width: | Height: | Size: 178 KiB |
@ -186,18 +186,19 @@ where:
|
||||
\item \eqn{l_n} is the length of \eqn{n};
|
||||
\item \eqn{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function} (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};
|
||||
\item \eqn{p_n} is the human pathogenic prevalence group of \eqn{n}, as described below;
|
||||
\item \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Archaea = 2, others = 3.
|
||||
\item \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Chromista = 1.75, Archaea = 2, others = 3.
|
||||
}
|
||||
|
||||
The grouping into human pathogenic prevalence \eqn{p} is based on recent work from Bartlett \emph{et al.} (2022, \doi{10.1099/mic.0.001269}) who extensively studied medical-scientific literature to categorise all bacterial species into these groups:
|
||||
\itemize{
|
||||
\item \strong{Established}, if a taxonomic species has infected at least three persons in three or more references. These records have \code{prevalence = 1.0} in the \link{microorganisms} data set;
|
||||
\item \strong{Established}, if a taxonomic species has infected at least three persons in three or more references. These records have \code{prevalence = 1.15} in the \link{microorganisms} data set;
|
||||
\item \strong{Putative}, if a taxonomic species has fewer than three known cases. These records have \code{prevalence = 1.25} in the \link{microorganisms} data set.
|
||||
}
|
||||
|
||||
Furthermore,
|
||||
\itemize{
|
||||
\item Any genus present in the \strong{established} list also has \code{prevalence = 1.0} in the \link{microorganisms} data set;
|
||||
\item Genera from the World Health Organization's (WHO) Priority Pathogen List have \code{prevalence = 1.0} in the \link{microorganisms} data set;
|
||||
\item Any genus present in the \strong{established} list also has \code{prevalence = 1.15} in the \link{microorganisms} data set;
|
||||
\item Any other genus present in the \strong{putative} list has \code{prevalence = 1.25} in the \link{microorganisms} data set;
|
||||
\item Any other species or subspecies of which the genus is present in the two aforementioned groups, has \code{prevalence = 1.5} in the \link{microorganisms} data set;
|
||||
\item Any \emph{non-bacterial} genus, species or subspecies of which the genus is present in the following list, has \code{prevalence = 1.25} in the \link{microorganisms} data set: \emph{Absidia}, \emph{Acanthamoeba}, \emph{Acremonium}, \emph{Actinomucor}, \emph{Aedes}, \emph{Alternaria}, \emph{Amoeba}, \emph{Ancylostoma}, \emph{Angiostrongylus}, \emph{Anisakis}, \emph{Anopheles}, \emph{Apophysomyces}, \emph{Arthroderma}, \emph{Aspergillus}, \emph{Aureobasidium}, \emph{Basidiobolus}, \emph{Beauveria}, \emph{Bipolaris}, \emph{Blastobotrys}, \emph{Blastocystis}, \emph{Blastomyces}, \emph{Candida}, \emph{Capillaria}, \emph{Chaetomium}, \emph{Chilomastix}, \emph{Chrysonilia}, \emph{Chrysosporium}, \emph{Cladophialophora}, \emph{Cladosporium}, \emph{Clavispora}, \emph{Coccidioides}, \emph{Cokeromyces}, \emph{Conidiobolus}, \emph{Coniochaeta}, \emph{Contracaecum}, \emph{Cordylobia}, \emph{Cryptococcus}, \emph{Cryptosporidium}, \emph{Cunninghamella}, \emph{Curvularia}, \emph{Cyberlindnera}, \emph{Debaryozyma}, \emph{Demodex}, \emph{Dermatobia}, \emph{Dientamoeba}, \emph{Diphyllobothrium}, \emph{Dirofilaria}, \emph{Echinostoma}, \emph{Entamoeba}, \emph{Enterobius}, \emph{Epidermophyton}, \emph{Exidia}, \emph{Exophiala}, \emph{Exserohilum}, \emph{Fasciola}, \emph{Fonsecaea}, \emph{Fusarium}, \emph{Geotrichum}, \emph{Giardia}, \emph{Graphium}, \emph{Haloarcula}, \emph{Halobacterium}, \emph{Halococcus}, \emph{Hansenula}, \emph{Hendersonula}, \emph{Heterophyes}, \emph{Histomonas}, \emph{Histoplasma}, \emph{Hortaea}, \emph{Hymenolepis}, \emph{Hypomyces}, \emph{Hysterothylacium}, \emph{Kloeckera}, \emph{Kluyveromyces}, \emph{Kodamaea}, \emph{Lacazia}, \emph{Leishmania}, \emph{Lichtheimia}, \emph{Lodderomyces}, \emph{Lomentospora}, \emph{Madurella}, \emph{Malassezia}, \emph{Malbranchea}, \emph{Metagonimus}, \emph{Meyerozyma}, \emph{Microsporidium}, \emph{Microsporum}, \emph{Millerozyma}, \emph{Mortierella}, \emph{Mucor}, \emph{Mycocentrospora}, \emph{Nannizzia}, \emph{Necator}, 
\emph{Nectria}, \emph{Ochroconis}, \emph{Oesophagostomum}, \emph{Oidiodendron}, \emph{Opisthorchis}, \emph{Paecilomyces}, \emph{Paracoccidioides}, \emph{Pediculus}, \emph{Penicillium}, \emph{Phaeoacremonium}, \emph{Phaeomoniella}, \emph{Phialophora}, \emph{Phlebotomus}, \emph{Phoma}, \emph{Pichia}, \emph{Piedraia}, \emph{Pithomyces}, \emph{Pityrosporum}, \emph{Pneumocystis}, \emph{Pseudallescheria}, \emph{Pseudoscopulariopsis}, \emph{Pseudoterranova}, \emph{Pulex}, \emph{Purpureocillium}, \emph{Quambalaria}, \emph{Rhinocladiella}, \emph{Rhizomucor}, \emph{Rhizopus}, \emph{Rhodotorula}, \emph{Saccharomyces}, \emph{Saksenaea}, \emph{Saprochaete}, \emph{Sarcoptes}, \emph{Scedosporium}, \emph{Schistosoma}, \emph{Schizosaccharomyces}, \emph{Scolecobasidium}, \emph{Scopulariopsis}, \emph{Scytalidium}, \emph{Spirometra}, \emph{Sporobolomyces}, \emph{Sporopachydermia}, \emph{Sporothrix}, \emph{Sporotrichum}, \emph{Stachybotrys}, \emph{Strongyloides}, \emph{Syncephalastrum}, \emph{Syngamus}, \emph{Taenia}, \emph{Talaromyces}, \emph{Teleomorph}, \emph{Toxocara}, \emph{Trichinella}, \emph{Trichobilharzia}, \emph{Trichoderma}, \emph{Trichomonas}, \emph{Trichophyton}, \emph{Trichosporon}, \emph{Trichostrongylus}, \emph{Trichuris}, \emph{Tritirachium}, \emph{Trombicula}, \emph{Trypanosoma}, \emph{Tunga}, \emph{Ulocladium}, \emph{Ustilago}, \emph{Verticillium}, \emph{Wallemia}, \emph{Wangiella}, \emph{Wickerhamomyces}, \emph{Wuchereria}, \emph{Yarrowia}, or \emph{Zygosaccharomyces};
|
||||
@ -206,7 +207,7 @@ Furthermore,
|
||||
|
||||
When calculating the matching score, all characters in \eqn{x} and \eqn{n} are ignored that are other than A-Z, a-z, 0-9, spaces and parentheses.
|
||||
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., \code{"E. coli"} will return the microbial ID of \emph{Escherichia coli} (\eqn{m = 0.688}, a highly prevalent microorganism found in humans) and not \emph{Entamoeba coli} (\eqn{m = 0.381}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., \code{"E. coli"} will return the microbial ID of \emph{Escherichia coli} (\eqn{m = 0.598}, a highly prevalent microorganism found in humans) and not \emph{Entamoeba coli} (\eqn{m = 0.381}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
|
||||
}
|
||||
|
||||
\section{Reference Data Publicly Available}{
|
||||
|
@ -15,9 +15,9 @@ mo_matching_score(x, n)
|
||||
This algorithm is used by \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions to determine the most probable match of taxonomic records based on user input.
|
||||
}
|
||||
\note{
|
||||
This algorithm was originally described in: Berends MS \emph{et al.} (2022). \strong{AMR: An R Package for Working with Antimicrobial Resistance Data}. \emph{Journal of Statistical Software}, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
|
||||
This algorithm was originally developed in 2018 and subsequently described in: Berends MS \emph{et al.} (2022). \strong{AMR: An R Package for Working with Antimicrobial Resistance Data}. \emph{Journal of Statistical Software}, 104(3), 1-31; \doi{10.18637/jss.v104.i03}.
|
||||
|
||||
Later, the work of Bartlett A \emph{et al.} about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated.
|
||||
Later, the work of Bartlett A \emph{et al.} about bacterial pathogens infecting humans (2022, \doi{10.1099/mic.0.001269}) was incorporated, and optimisations to the algorithm were made.
|
||||
}
|
||||
\section{Matching Score for Microorganisms}{
|
||||
|
||||
@ -34,18 +34,19 @@ where:
|
||||
\item \eqn{l_n} is the length of \eqn{n};
|
||||
\item \eqn{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance function} (counting any insertion as 1, and any deletion or substitution as 2) that is needed to change \eqn{x} into \eqn{n};
|
||||
\item \eqn{p_n} is the human pathogenic prevalence group of \eqn{n}, as described below;
|
||||
\item \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Archaea = 2, others = 3.
|
||||
\item \eqn{k_n} is the taxonomic kingdom of \eqn{n}, set as Bacteria = 1, Fungi = 1.25, Protozoa = 1.5, Chromista = 1.75, Archaea = 2, others = 3.
|
||||
}
|
||||
|
||||
The grouping into human pathogenic prevalence \eqn{p} is based on recent work from Bartlett \emph{et al.} (2022, \doi{10.1099/mic.0.001269}) who extensively studied medical-scientific literature to categorise all bacterial species into these groups:
|
||||
\itemize{
|
||||
\item \strong{Established}, if a taxonomic species has infected at least three persons in three or more references. These records have \code{prevalence = 1.0} in the \link{microorganisms} data set;
|
||||
\item \strong{Established}, if a taxonomic species has infected at least three persons in three or more references. These records have \code{prevalence = 1.15} in the \link{microorganisms} data set;
|
||||
\item \strong{Putative}, if a taxonomic species has fewer than three known cases. These records have \code{prevalence = 1.25} in the \link{microorganisms} data set.
|
||||
}
|
||||
|
||||
Furthermore,
|
||||
\itemize{
|
||||
\item Any genus present in the \strong{established} list also has \code{prevalence = 1.0} in the \link{microorganisms} data set;
|
||||
\item Genera from the World Health Organization's (WHO) Priority Pathogen List have \code{prevalence = 1.0} in the \link{microorganisms} data set;
|
||||
\item Any genus present in the \strong{established} list also has \code{prevalence = 1.15} in the \link{microorganisms} data set;
|
||||
\item Any other genus present in the \strong{putative} list has \code{prevalence = 1.25} in the \link{microorganisms} data set;
|
||||
\item Any other species or subspecies of which the genus is present in the two aforementioned groups, has \code{prevalence = 1.5} in the \link{microorganisms} data set;
|
||||
\item Any \emph{non-bacterial} genus, species or subspecies of which the genus is present in the following list, has \code{prevalence = 1.25} in the \link{microorganisms} data set: \emph{Absidia}, \emph{Acanthamoeba}, \emph{Acremonium}, \emph{Actinomucor}, \emph{Aedes}, \emph{Alternaria}, \emph{Amoeba}, \emph{Ancylostoma}, \emph{Angiostrongylus}, \emph{Anisakis}, \emph{Anopheles}, \emph{Apophysomyces}, \emph{Arthroderma}, \emph{Aspergillus}, \emph{Aureobasidium}, \emph{Basidiobolus}, \emph{Beauveria}, \emph{Bipolaris}, \emph{Blastobotrys}, \emph{Blastocystis}, \emph{Blastomyces}, \emph{Candida}, \emph{Capillaria}, \emph{Chaetomium}, \emph{Chilomastix}, \emph{Chrysonilia}, \emph{Chrysosporium}, \emph{Cladophialophora}, \emph{Cladosporium}, \emph{Clavispora}, \emph{Coccidioides}, \emph{Cokeromyces}, \emph{Conidiobolus}, \emph{Coniochaeta}, \emph{Contracaecum}, \emph{Cordylobia}, \emph{Cryptococcus}, \emph{Cryptosporidium}, \emph{Cunninghamella}, \emph{Curvularia}, \emph{Cyberlindnera}, \emph{Debaryozyma}, \emph{Demodex}, \emph{Dermatobia}, \emph{Dientamoeba}, \emph{Diphyllobothrium}, \emph{Dirofilaria}, \emph{Echinostoma}, \emph{Entamoeba}, \emph{Enterobius}, \emph{Epidermophyton}, \emph{Exidia}, \emph{Exophiala}, \emph{Exserohilum}, \emph{Fasciola}, \emph{Fonsecaea}, \emph{Fusarium}, \emph{Geotrichum}, \emph{Giardia}, \emph{Graphium}, \emph{Haloarcula}, \emph{Halobacterium}, \emph{Halococcus}, \emph{Hansenula}, \emph{Hendersonula}, \emph{Heterophyes}, \emph{Histomonas}, \emph{Histoplasma}, \emph{Hortaea}, \emph{Hymenolepis}, \emph{Hypomyces}, \emph{Hysterothylacium}, \emph{Kloeckera}, \emph{Kluyveromyces}, \emph{Kodamaea}, \emph{Lacazia}, \emph{Leishmania}, \emph{Lichtheimia}, \emph{Lodderomyces}, \emph{Lomentospora}, \emph{Madurella}, \emph{Malassezia}, \emph{Malbranchea}, \emph{Metagonimus}, \emph{Meyerozyma}, \emph{Microsporidium}, \emph{Microsporum}, \emph{Millerozyma}, \emph{Mortierella}, \emph{Mucor}, \emph{Mycocentrospora}, \emph{Nannizzia}, \emph{Necator}, 
\emph{Nectria}, \emph{Ochroconis}, \emph{Oesophagostomum}, \emph{Oidiodendron}, \emph{Opisthorchis}, \emph{Paecilomyces}, \emph{Paracoccidioides}, \emph{Pediculus}, \emph{Penicillium}, \emph{Phaeoacremonium}, \emph{Phaeomoniella}, \emph{Phialophora}, \emph{Phlebotomus}, \emph{Phoma}, \emph{Pichia}, \emph{Piedraia}, \emph{Pithomyces}, \emph{Pityrosporum}, \emph{Pneumocystis}, \emph{Pseudallescheria}, \emph{Pseudoscopulariopsis}, \emph{Pseudoterranova}, \emph{Pulex}, \emph{Purpureocillium}, \emph{Quambalaria}, \emph{Rhinocladiella}, \emph{Rhizomucor}, \emph{Rhizopus}, \emph{Rhodotorula}, \emph{Saccharomyces}, \emph{Saksenaea}, \emph{Saprochaete}, \emph{Sarcoptes}, \emph{Scedosporium}, \emph{Schistosoma}, \emph{Schizosaccharomyces}, \emph{Scolecobasidium}, \emph{Scopulariopsis}, \emph{Scytalidium}, \emph{Spirometra}, \emph{Sporobolomyces}, \emph{Sporopachydermia}, \emph{Sporothrix}, \emph{Sporotrichum}, \emph{Stachybotrys}, \emph{Strongyloides}, \emph{Syncephalastrum}, \emph{Syngamus}, \emph{Taenia}, \emph{Talaromyces}, \emph{Teleomorph}, \emph{Toxocara}, \emph{Trichinella}, \emph{Trichobilharzia}, \emph{Trichoderma}, \emph{Trichomonas}, \emph{Trichophyton}, \emph{Trichosporon}, \emph{Trichostrongylus}, \emph{Trichuris}, \emph{Tritirachium}, \emph{Trombicula}, \emph{Trypanosoma}, \emph{Tunga}, \emph{Ulocladium}, \emph{Ustilago}, \emph{Verticillium}, \emph{Wallemia}, \emph{Wangiella}, \emph{Wickerhamomyces}, \emph{Wuchereria}, \emph{Yarrowia}, or \emph{Zygosaccharomyces};
|
||||
@ -54,7 +55,7 @@ Furthermore,
|
||||
|
||||
When calculating the matching score, all characters in \eqn{x} and \eqn{n} are ignored that are other than A-Z, a-z, 0-9, spaces and parentheses.
|
||||
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., \code{"E. coli"} will return the microbial ID of \emph{Escherichia coli} (\eqn{m = 0.688}, a highly prevalent microorganism found in humans) and not \emph{Entamoeba coli} (\eqn{m = 0.381}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
|
||||
All matches are sorted descending on their matching score and for all user input values, the top match will be returned. This will lead to the effect that e.g., \code{"E. coli"} will return the microbial ID of \emph{Escherichia coli} (\eqn{m = 0.598}, a highly prevalent microorganism found in humans) and not \emph{Entamoeba coli} (\eqn{m = 0.381}, a less prevalent microorganism in humans), although the latter would alphabetically come first.
|
||||
}
|
||||
|
||||
\section{Reference Data Publicly Available}{
|
||||
@ -73,6 +74,3 @@ mo_matching_score(
|
||||
n = c("Escherichia coli", "Entamoeba coli")
|
||||
)
|
||||
}
|
||||
\author{
|
||||
Dr. Matthijs Berends, 2018
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user