diff --git a/DESCRIPTION b/DESCRIPTION index 37d7b636b..f340b27f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 1.3.0.9029 -Date: 2020-09-25 +Version: 1.3.0.9030 +Date: 2020-09-26 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index ffbfb4ae4..ad4bf8e5d 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# AMR 1.3.0.9029 -## Last updated: 25 September 2020 +# AMR 1.3.0.9030 +## Last updated: 26 September 2020 Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscript to. We are very grateful to those reviewers for going through our code so thoroughly! diff --git a/R/aa_helper_functions.R b/R/aa_helper_functions.R index 283b2cdef..e64d959e1 100755 --- a/R/aa_helper_functions.R +++ b/R/aa_helper_functions.R @@ -48,18 +48,6 @@ pm_left_join <- function(x, y, by = NULL, suffix = c(".x", ".y")) { rownames(merged) <- NULL merged } -# pm_filter_join_worker <- function(x, y, by = NULL, type = c("anti", "semi")) { -# type <- match.arg(type, choices = c("anti", "semi"), several.ok = FALSE) -# if (is.null(by)) { -# by <- intersect(names(x), names(y)) -# join_message(by) -# } -# rows <- interaction(x[, by]) %in% interaction(y[, by]) -# if (type == "anti") rows <- !rows -# res <- x[rows, , drop = FALSE] -# rownames(res) <- NULL -# res -# } quick_case_when <- function(...) { vectors <- list(...) 
diff --git a/R/join_microorganisms.R b/R/join_microorganisms.R index 8224a4670..2dd64a586 100755 --- a/R/join_microorganisms.R +++ b/R/join_microorganisms.R @@ -27,12 +27,12 @@ #' @name join #' @aliases join inner_join #' @param x existing table to join, or character vector -#' @param by a variable to join by - if left empty will search for a column with class [`mo`] (created with [as.mo()]) or will be `"mo"` if that column name exists in `x`, could otherwise be a column name of `x` with values that exist in `microorganisms$mo` (like `by = "bacteria_id"`), or another column in [microorganisms] (but then it should be named, like `by = c("my_genus_species" = "fullname")`) +#' @param by a variable to join by - if left empty will search for a column with class [`mo`] (created with [as.mo()]) or will be `"mo"` if that column name exists in `x`, could otherwise be a column name of `x` with values that exist in `microorganisms$mo` (like `by = "bacteria_id"`), or another column in [microorganisms] (but then it should be named, like `by = c("bacteria_id" = "fullname")`) #' @param suffix if there are non-joined duplicate variables in `x` and `y`, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. #' @param ... ignored #' @details **Note:** As opposed to the `join()` functions of `dplyr`, [character] vectors are supported and at default existing columns will get a suffix `"2"` and the newly joined columns will not get a suffix. #' -#' These functions rely on [merge()], a base R function to do joins. +#' If the `dplyr` package is installed, their join functions will be used. Otherwise, the much slower [merge()] function from base R will be used. #' @inheritSection AMR Read more on our website! #' @export #' @examples @@ -60,9 +60,17 @@ inner_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) 
{ x_class <- get_prejoined_class(x) x <- checked$x by <- checked$by - join <- suppressWarnings( - pm_inner_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) - ) + # use dplyr if available - it's much faster + dplyr_inner <- import_fn("inner_join", "dplyr", error_on_fail = FALSE) + if (!is.null(dplyr_inner)) { + join <- suppressWarnings( + dplyr_inner(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } else { + join <- suppressWarnings( + pm_inner_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } if (NROW(join) > NROW(x)) { warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") } @@ -79,9 +87,17 @@ left_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) { x_class <- get_prejoined_class(x) x <- checked$x by <- checked$by - join <- suppressWarnings( - pm_left_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) - ) + # use dplyr if available - it's much faster + dplyr_left <- import_fn("left_join", "dplyr", error_on_fail = FALSE) + if (!is.null(dplyr_left)) { + join <- suppressWarnings( + dplyr_left(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } else { + join <- suppressWarnings( + pm_left_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } if (NROW(join) > NROW(x)) { warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") } @@ -98,9 +114,17 @@ right_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) { x_class <- get_prejoined_class(x) x <- checked$x by <- checked$by - join <- suppressWarnings( - pm_right_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) - ) + # use dplyr if available - it's much faster + dplyr_right <- import_fn("right_join", "dplyr", error_on_fail = FALSE) + if (!is.null(dplyr_right)) { + join <- suppressWarnings( + dplyr_right(x = x, y = microorganisms, by = by, suffix = suffix, ...) 
+ ) + } else { + join <- suppressWarnings( + pm_right_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } if (NROW(join) > NROW(x)) { warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") } @@ -117,9 +141,17 @@ full_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) { x_class <- get_prejoined_class(x) x <- checked$x by <- checked$by - join <- suppressWarnings( - pm_full_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) - ) + # use dplyr if available - it's much faster + dplyr_full <- import_fn("full_join", "dplyr", error_on_fail = FALSE) + if (!is.null(dplyr_full)) { + join <- suppressWarnings( + dplyr_full(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } else { + join <- suppressWarnings( + pm_full_join(x = x, y = microorganisms, by = by, suffix = suffix, ...) + ) + } if (NROW(join) > NROW(x)) { warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.") } @@ -136,9 +168,17 @@ semi_join_microorganisms <- function(x, by = NULL, ...) { checked <- joins_check_df(x, by) x <- checked$x by <- checked$by - join <- suppressWarnings( - pm_semi_join(x = x, y = microorganisms, by = by, ...) - ) + # use dplyr if available - it's much faster + dplyr_semi <- import_fn("semi_join", "dplyr", error_on_fail = FALSE) + if (!is.null(dplyr_semi)) { + join <- suppressWarnings( + dplyr_semi(x = x, y = microorganisms, by = by,...) + ) + } else { + join <- suppressWarnings( + pm_semi_join(x = x, y = microorganisms, by = by,...) + ) + } class(join) <- x_class join } @@ -152,9 +192,17 @@ anti_join_microorganisms <- function(x, by = NULL, ...) { x_class <- get_prejoined_class(x) x <- checked$x by <- checked$by - join <- suppressWarnings( - pm_anti_join(x = x, y = microorganisms, by = by, ...) 
- ) + # use dplyr if available - it's much faster + dplyr_anti <- import_fn("anti_join", "dplyr", error_on_fail = FALSE) + if (!is.null(dplyr_anti)) { + join <- suppressWarnings( + dplyr_anti(x = x, y = microorganisms, by = by,...) + ) + } else { + join <- suppressWarnings( + pm_anti_join(x = x, y = microorganisms, by = by,...) + ) + } class(join) <- x_class join } diff --git a/R/mo.R b/R/mo.R index 8a0221105..ddc5e5a5f 100755 --- a/R/mo.R +++ b/R/mo.R @@ -101,20 +101,7 @@ #' #' Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans. #' -#' ## Background on matching scores -#' With ambiguous user input, the returned results are chosen based on their matching score using [mo_matching_score()]. This matching score is based on four parameters: -#' -#' 1. The prevalence \eqn{P} is categorised into group 1, 2 and 3 as stated above; -#' 2. A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5; -#' 3. The level of uncertainty \eqn{U} needed to get to the result, as stated above (1 to 3); -#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as: -#' -#' \deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)} -#' -#' The final matching score \eqn{M} is calculated as: -#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)} -#' -#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned. 
+#' @inheritSection mo_matching_score Matching score for microorganisms #' @inheritSection catalogue_of_life Catalogue of Life # (source as a section here, so it can be inherited by other man pages:) #' @section Source: @@ -331,8 +318,7 @@ exec_as.mo <- function(x, if (NROW(res_df) > 1 & uncertainty != -1) { # sort the findings on matching score scores <- mo_matching_score(x = input, - fullname = res_df[, "fullname", drop = TRUE], - uncertainty = uncertainty) + fullname = res_df[, "fullname", drop = TRUE]) res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE] } res <- as.character(res_df[, column, drop = TRUE]) @@ -1779,7 +1765,7 @@ print.mo_uncertainties <- function(x, ...) { if (NROW(x) == 0) { return(NULL) } - cat(font_blue(strwrap(c("Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. Furthermore, an indication is given about the probability of the match - the more transformations are needed for coercion, the more improbable the result.")), collapse = "\n")) + cat(font_blue(strwrap(c("Matching scores are based on human pathogenic prevalence and the resemblance between the input and the full taxonomic name. Furthermore, an indication is given about the certainty of the match - the more transformations are needed for coercion, the less certain the result.")), collapse = "\n")) cat("\n") msg <- "" @@ -1787,8 +1773,7 @@ print.mo_uncertainties <- function(x, ...) { if (x[i, ]$candidates != "") { candidates <- unlist(strsplit(x[i, ]$candidates, ", ", fixed = TRUE)) scores <- mo_matching_score(x = x[i, ]$input, - fullname = candidates, - uncertainty = x[i, ]$uncertainty) + fullname = candidates) # sort on descending scores candidates <- candidates[order(1 - scores)] n_candidates <- length(candidates) @@ -1802,11 +1787,11 @@ print.mo_uncertainties <- function(x, ...) 
{ candidates <- "" } if (x[i, ]$uncertainty == 1) { - uncertainty_interpretation <- font_green("* MOST PROBABLE *") + uncertainty_interpretation <- font_green("* very certain *") } else if (x[i, ]$uncertainty == 1) { - uncertainty_interpretation <- font_yellow("* PROBABLE *") + uncertainty_interpretation <- font_yellow("* certain *") } else { - uncertainty_interpretation <- font_red("* IMPROBABLE *") + uncertainty_interpretation <- font_red("* not certain *") } msg <- paste(msg, paste0('"', x[i, ]$input, '" -> ', @@ -1814,8 +1799,7 @@ print.mo_uncertainties <- function(x, ...) { ifelse(!is.na(x[i, ]$renamed_to), paste(", renamed to", font_italic(x[i, ]$renamed_to)), ""), " (", x[i, ]$mo, ", matching score = ", trimws(percentage(mo_matching_score(x = x[i, ]$input, - fullname = x[i, ]$fullname, - uncertainty = x[i, ]$uncertainty), + fullname = x[i, ]$fullname), digits = 1)), ") "), uncertainty_interpretation, diff --git a/R/mo_matching_score.R b/R/mo_matching_score.R index c44daef7e..9c92a645f 100755 --- a/R/mo_matching_score.R +++ b/R/mo_matching_score.R @@ -23,50 +23,53 @@ #' #' This helper function is used by [as.mo()] to determine the most probable match of taxonomic records, based on user input. #' @param x Any user input value(s) -#' @param fullname A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms] +#' @param n A full taxonomic name, that exists in [`microorganisms$fullname`][microorganisms] #' @param uncertainty The level of uncertainty set in [as.mo()], see `allow_uncertain` in that function (here, it defaults to 1, but is automatically determined in [as.mo()] based on the number of transformations needed to get to a result) -#' @details The matching score is based on four parameters: +#' @section Matching score for microorganisms: +#' With ambiguous user input in [as.mo()] and all the [`mo_*`][mo_property()] functions, the returned results are chosen based on their matching score using [mo_matching_score()]. 
This matching score \eqn{m} is calculated as: #' -#' 1. A human pathogenic prevalence \eqn{P}, that is categorised into group 1, 2 and 3 (see [as.mo()]); -#' 2. A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5; -#' 3. The level of uncertainty \eqn{U} that is needed to get to a result (1 to 3, see [as.mo()]); -#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as: -#' -#' \deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)} -#' -#' The final matching score \eqn{M} is calculated as: -#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)} +#' \deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n)) ) / ( l_n * p * k )} #' +#' where: +#' +#' * \eqn{x} is the user input; +#' * \eqn{n} is a taxonomic name (genus, species and subspecies); +#' * \eqn{l_{n}}{l_n} is the length of the taxonomic name; +#' * \eqn{\operatorname{lev}}{lev} is the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) function; +#' * \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see *Details* in `?as.mo`), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +#' * \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +#' +#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
#' @export #' @examples #' as.mo("E. coli") #' mo_uncertainties() -mo_matching_score <- function(x, fullname, uncertainty = 1) { - # fullname is always a taxonomically valid full name +#' +#' mo_matching_score("E. coli", "Escherichia coli") +mo_matching_score <- function(x, n) { + # n is always a taxonomically valid full name levenshtein <- double(length = length(x)) - if (length(fullname) == 1) { - fullname <- rep(fullname, length(x)) + if (length(n) == 1) { + n <- rep(n, length(x)) } if (length(x) == 1) { - x <- rep(x, length(fullname)) + x <- rep(x, length(n)) } for (i in seq_len(length(x))) { - # determine Levenshtein distance, but maximise to nchar of fullname - levenshtein[i] <- min(as.double(utils::adist(x[i], fullname[i], ignore.case = FALSE)), - nchar(fullname[i])) + # determine Levenshtein distance, but maximise to nchar of n + levenshtein[i] <- min(as.double(utils::adist(x[i], n[i], ignore.case = FALSE)), + nchar(n[i])) } # F = length of fullname - var_F <- nchar(fullname) + var_F <- nchar(n) # L = modified Levenshtein distance var_L <- levenshtein # P = Prevalence (1 to 3) - var_P <- MO_lookup[match(fullname, MO_lookup$fullname), "prevalence", drop = TRUE] + var_P <- MO_lookup[match(n, MO_lookup$fullname), "prevalence", drop = TRUE] # K = kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5) - var_K <- MO_lookup[match(fullname, MO_lookup$fullname), "kingdom_index", drop = TRUE] - # U = uncertainty level (1 to 3), as per as.mo() - var_U <- uncertainty + var_K <- MO_lookup[match(n, MO_lookup$fullname), "kingdom_index", drop = TRUE] # matching score: - (var_F - 0.5 * var_L) / (var_F * var_P * var_K * var_U) + (var_F - 0.5 * var_L) / (var_F * var_P * var_K) } diff --git a/R/mo_property.R b/R/mo_property.R index c96027f61..e429d9412 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -42,6 +42,7 @@ #' All output will be [translate]d where possible. 
#' #' The function [mo_url()] will return the direct URL to the online database entry, which also shows the scientific reference of the concerned species. +#' @inheritSection mo_matching_score Matching score for microorganisms #' @inheritSection catalogue_of_life Catalogue of Life #' @inheritSection as.mo Source #' @rdname mo_property diff --git a/docs/404.html b/docs/404.html index 8c45c889f..027deb6bd 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 36af7ed34..64e44ae94 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 diff --git a/docs/articles/index.html b/docs/articles/index.html index 6dac5bb36..170316086 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 diff --git a/docs/authors.html b/docs/authors.html index 31538a702..7b238d043 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 diff --git a/docs/index.html b/docs/index.html index 93ac465b8..8d3c70f1e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 diff --git a/docs/news/index.html b/docs/news/index.html index d6d9958ec..579687d7b 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 @@ -236,13 +236,13 @@ Source: NEWS.md -
-

-AMR 1.3.0.9029 Unreleased +
+

+AMR 1.3.0.9030 Unreleased

-
+

-Last updated: 25 September 2020 +Last updated: 26 September 2020

Note: some changes in this version were suggested by anonymous reviewers from the journal we submitted our manuscript to. We are very grateful to those reviewers for going through our code so thoroughly!

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index fd39eafb6..0ae45387a 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -2,7 +2,7 @@ pandoc: 2.7.3 pkgdown: 1.5.1.9000 pkgdown_sha: eae56f08694abebf93cdfc0dd8e9ede06d8c815f articles: [] -last_built: 2020-09-25T12:44Z +last_built: 2020-09-26T14:25Z urls: reference: https://msberends.github.io/AMR/reference article: https://msberends.github.io/AMR/articles diff --git a/docs/reference/as.mo.html b/docs/reference/as.mo.html index f59567c2f..022f6bcfc 100644 --- a/docs/reference/as.mo.html +++ b/docs/reference/as.mo.html @@ -82,7 +82,7 @@ AMR (for R) - 1.3.0.9028 + 1.3.0.9030
@@ -366,21 +366,6 @@

Group 2 consists of all microorganisms where the taxonomic phylum is Proteobacteria, Firmicutes, Actinobacteria or Sarcomastigophora, or where the taxonomic genus is Aspergillus, Bacteroides, Candida, Capnocytophaga, Chryseobacterium, Cryptococcus, Elisabethkingia, Flavobacterium, Fusobacterium, Giardia, Leptotrichia, Mycoplasma, Prevotella, Rhodotorula, Treponema, Trichophyton or Ureaplasma. This group consequently contains all less common and rare human pathogens.

Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans.

-

Background on matching scores

- - -

With ambiguous user input, the returned results are chosen based on their matching score using mo_matching_score(). This matching score is based on four parameters:

    -
  1. The prevalence \(P\) is categorised into group 1, 2 and 3 as stated above;

  2. -
  3. A kingdom index \(K\) is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;

  4. -
  5. The level of uncertainty \(U\) needed to get to the result, as stated above (1 to 3);

  6. -
  7. The Levenshtein distance \(L\) is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \(L'\) based on the text length of the full name \(F\) is calculated as:

  8. -
- -

$$L' = 1 - \frac{0.5L}{F}$$

-

The final matching score \(M\) is calculated as: -$$M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}$$

-

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

-

Source

@@ -399,6 +384,22 @@ $$M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}$$


The lifecycle of this function is stable. In a stable function, major changes are unlikely. This means that the underlying code will generally evolve by adding new arguments; removing arguments or changing the meaning of existing arguments will be avoided.

If the underlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit a message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.

+

Matching score for microorganisms

+ + + +

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\) is calculated as:

+

$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$

+

where:

    +
  • \(x\) is the user input;

  • +
  • \(n\) is a taxonomic name (genus, species and subspecies);

  • +
  • \(l_{n}\) is the length of the taxonomic name;

  • +
  • \(\operatorname{lev}\) is the Levenshtein distance function;

  • +
  • \(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see Details in ?as.mo), meaning that \(p = \{1, 2 , 3\}\);

  • +
  • \(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).

  • +
+ +

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

Catalogue of Life

diff --git a/docs/reference/index.html b/docs/reference/index.html index 406eb19be..4842f9283 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030
diff --git a/docs/reference/join.html b/docs/reference/join.html index 0d6053b32..349a4833c 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -82,7 +82,7 @@ AMR (for R) - 1.3.0.9026 + 1.3.0.9029
@@ -263,7 +263,7 @@ by -

a variable to join by - if left empty will search for a column with class mo (created with as.mo()) or will be "mo" if that column name exists in x, could otherwise be a column name of x with values that exist in microorganisms$mo (like by = "bacteria_id"), or another column in microorganisms (but then it should be named, like by = c("my_genus_species" = "fullname"))

+

a variable to join by - if left empty will search for a column with class mo (created with as.mo()) or will be "mo" if that column name exists in x, could otherwise be a column name of x with values that exist in microorganisms$mo (like by = "bacteria_id"), or another column in microorganisms (but then it should be named, like by = c("bacteria_id" = "fullname"))

suffix @@ -278,7 +278,7 @@

Details

Note: As opposed to the join() functions of dplyr, character vectors are supported and at default existing columns will get a suffix "2" and the newly joined columns will not get a suffix.

-

These functions rely on merge(), a base R function to do joins.

+

If the dplyr package is installed, its join functions will be used. Otherwise, the much slower merge() function from base R will be used.

Stable lifecycle

diff --git a/docs/reference/mo_matching_score.html b/docs/reference/mo_matching_score.html index 5e6654eb1..feabf4454 100644 --- a/docs/reference/mo_matching_score.html +++ b/docs/reference/mo_matching_score.html @@ -82,7 +82,7 @@ AMR (for R) - 1.3.0.9028 + 1.3.0.9030
@@ -242,7 +242,7 @@

This helper function is used by as.mo() to determine the most probable match of taxonomic records, based on user input.

-
mo_matching_score(x, fullname, uncertainty = 1)
+
mo_matching_score(x, n)

Arguments

@@ -252,7 +252,7 @@ - + @@ -261,22 +261,28 @@

Any user input value(s)

fullnamen

A full taxonomic name, that exists in microorganisms$fullname

-

Details

+

Matching score for microorganisms

-

The matching score is based on four parameters:

    -
  1. A human pathogenic prevalence \(P\), that is categorised into group 1, 2 and 3 (see as.mo());

  2. -
  3. A kingdom index \(K\) is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5;

  4. -
  5. The level of uncertainty \(U\) that is needed to get to a result (1 to 3, see as.mo());

  6. -
  7. The Levenshtein distance \(L\) is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \(L'\) based on the text length of the full name \(F\) is calculated as:

  8. -
+ -

$$L' = 1 - \frac{0.5L}{F}$$

-

The final matching score \(M\) is calculated as: -$$M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}$$

+

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\) is calculated as:

+

$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$

+

where:

+ +

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

Examples

as.mo("E. coli")
 mo_uncertainties()
+
+mo_matching_score("E. coli", "Escherichia coli")
 
@@ -346,6 +346,22 @@


The lifecycle of this function is stable. In a stable function, major changes are unlikely. This means that the underlying code will generally evolve by adding new arguments; removing arguments or changing the meaning of existing arguments will be avoided.

If the underlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit a message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error.

+

Matching score for microorganisms

+ + + +

With ambiguous user input in as.mo() and all the mo_* functions, the returned results are chosen based on their matching score using mo_matching_score(). This matching score \(m\) is calculated as:

+

$$m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}$$

+

where:

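+ \(x\) is the user input;
+ \(n\) is a taxonomic name (genus, species and subspecies);
+ \(l_{n}\) is the length of the taxonomic name;
+ \(\operatorname{lev}\) is the Levenshtein distance function;
+ \(p\) is the human pathogenic prevalence, categorised into group \(1\), \(2\) and \(3\) (see Details in ?as.mo), meaning that \(p = \{1, 2 , 3\}\);
+ \(k\) is the kingdom index, set as follows: Bacteria = \(1\), Fungi = \(2\), Protozoa = \(3\), Archaea = \(4\), and all others = \(5\), meaning that \(k = \{1, 2 , 3, 4, 5\}\).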

All matches are sorted descending on their matching score and for all user input values, the top match will be returned.

Catalogue of Life

diff --git a/docs/survey.html b/docs/survey.html index 315ad96f9..efed9f117 100644 --- a/docs/survey.html +++ b/docs/survey.html @@ -81,7 +81,7 @@ AMR (for R) - 1.3.0.9029 + 1.3.0.9030 diff --git a/man/as.mo.Rd b/man/as.mo.Rd index 1965bfea6..c06b73eaa 100644 --- a/man/as.mo.Rd +++ b/man/as.mo.Rd @@ -125,24 +125,6 @@ Group 2 consists of all microorganisms where the taxonomic phylum is Proteobacte Group 3 (least prevalent microorganisms) consists of all other microorganisms. This group contains microorganisms most probably not found in humans. } - -\subsection{Background on matching scores}{ - -With ambiguous user input, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score is based on four parameters: -\enumerate{ -\item The prevalence \eqn{P} is categorised into group 1, 2 and 3 as stated above; -\item A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5; -\item The level of uncertainty \eqn{U} needed to get to the result, as stated above (1 to 3); -\item The \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as: -} - -\deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)} - -The final matching score \eqn{M} is calculated as: -\deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)} - -All matches are sorted descending on their matching score and for all user input values, the top match will be returned. -} } \section{Source}{ @@ -162,6 +144,25 @@ The \link[=lifecycle]{lifecycle} of this function is \strong{stable}. 
In a stabl If the underlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit a message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error. } +\section{Matching score for microorganisms}{ + +With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as: + +\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n)) ) / ( l_n * p * k )} + +where: +\itemize{ +\item \eqn{x} is the user input; +\item \eqn{n} is a taxonomic name (genus, species and subspecies); +\item \eqn{l_{n}}{l_n} is the length of the taxonomic name; +\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function; +\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +} + +All matches are sorted descending on their matching score and for all user input values, the top match will be returned. +} + \section{Catalogue of Life}{ \if{html}{\figure{logo_col.png}{options: height=40px style=margin-bottom:5px} \cr} diff --git a/man/join.Rd b/man/join.Rd index c642c3d48..bea60bda0 100755 --- a/man/join.Rd +++ b/man/join.Rd @@ -26,7 +26,7 @@ anti_join_microorganisms(x, by = NULL, ...) 
\arguments{ \item{x}{existing table to join, or character vector} -\item{by}{a variable to join by - if left empty will search for a column with class \code{\link{mo}} (created with \code{\link[=as.mo]{as.mo()}}) or will be \code{"mo"} if that column name exists in \code{x}, could otherwise be a column name of \code{x} with values that exist in \code{microorganisms$mo} (like \code{by = "bacteria_id"}), or another column in \link{microorganisms} (but then it should be named, like \code{by = c("my_genus_species" = "fullname")})} +\item{by}{a variable to join by - if left empty will search for a column with class \code{\link{mo}} (created with \code{\link[=as.mo]{as.mo()}}) or will be \code{"mo"} if that column name exists in \code{x}, could otherwise be a column name of \code{x} with values that exist in \code{microorganisms$mo} (like \code{by = "bacteria_id"}), or another column in \link{microorganisms} (but then it should be named, like \code{by = c("bacteria_id" = "fullname")})} \item{suffix}{if there are non-joined duplicate variables in \code{x} and \code{y}, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.} @@ -38,7 +38,7 @@ Join the data set \link{microorganisms} easily to an existing table or character \details{ \strong{Note:} As opposed to the \code{join()} functions of \code{dplyr}, \link{character} vectors are supported and at default existing columns will get a suffix \code{"2"} and the newly joined columns will not get a suffix. -These functions rely on \code{\link[=merge]{merge()}}, a base R function to do joins. +If the \code{dplyr} package is installed, their join functions will be used. Otherwise, the much slower \code{\link[=merge]{merge()}} function from base R will be used. 
} \section{Stable lifecycle}{ diff --git a/man/mo_matching_score.Rd b/man/mo_matching_score.Rd index 3a71e5a6e..43fcd90f6 100644 --- a/man/mo_matching_score.Rd +++ b/man/mo_matching_score.Rd @@ -4,33 +4,40 @@ \alias{mo_matching_score} \title{Calculate the matching score for microorganisms} \usage{ -mo_matching_score(x, fullname, uncertainty = 1) +mo_matching_score(x, n) } \arguments{ \item{x}{Any user input value(s)} -\item{fullname}{A full taxonomic name, that exists in \code{\link[=microorganisms]{microorganisms$fullname}}} +\item{n}{A full taxonomic name, that exists in \code{\link[=microorganisms]{microorganisms$fullname}}} \item{uncertainty}{The level of uncertainty set in \code{\link[=as.mo]{as.mo()}}, see \code{allow_uncertain} in that function (here, it defaults to 1, but is automatically determined in \code{\link[=as.mo]{as.mo()}} based on the number of transformations needed to get to a result)} } \description{ This helper function is used by \code{\link[=as.mo]{as.mo()}} to determine the most probable match of taxonomic records, based on user input. } -\details{ -The matching score is based on four parameters: -\enumerate{ -\item A human pathogenic prevalence \eqn{P}, that is categorised into group 1, 2 and 3 (see \code{\link[=as.mo]{as.mo()}}); -\item A kingdom index \eqn{K} is set as follows: Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, and all others = 5; -\item The level of uncertainty \eqn{U} that is needed to get to a result (1 to 3, see \code{\link[=as.mo]{as.mo()}}); -\item The \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. 
A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as: +\section{Matching score for microorganisms}{ + +With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as: + +\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n)) ) / ( l_n * p * k )} + +where: +\itemize{ +\item \eqn{x} is the user input; +\item \eqn{n} is a taxonomic name (genus, species and subspecies); +\item \eqn{l_{n}}{l_n} is the length of the taxonomic name; +\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function; +\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. } -\deqn{L' = 1 - \frac{0.5L}{F}}{L' = 1 - ((0.5 * L) / F)} - -The final matching score \eqn{M} is calculated as: -\deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)} +All matches are sorted descending on their matching score and for all user input values, the top match will be returned. } + \examples{ as.mo("E. coli") mo_uncertainties() + +mo_matching_score("E. 
coli", "Escherichia coli") } diff --git a/man/mo_property.Rd b/man/mo_property.Rd index cfa7443c8..c8ca5e805 100644 --- a/man/mo_property.Rd +++ b/man/mo_property.Rd @@ -124,6 +124,25 @@ The \link[=lifecycle]{lifecycle} of this function is \strong{stable}. In a stabl If the underlying code needs breaking changes, they will occur gradually. For example, a parameter will be deprecated and first continue to work, but will emit a message informing you of the change. Next, typically after at least one newly released version on CRAN, the message will be transformed to an error. } +\section{Matching score for microorganisms}{ + +With ambiguous user input in \code{\link[=as.mo]{as.mo()}} and all the \code{\link[=mo_property]{mo_*}} functions, the returned results are chosen based on their matching score using \code{\link[=mo_matching_score]{mo_matching_score()}}. This matching score \eqn{m} is calculated as: + +\deqn{m_{(x, n)} = \frac{l_{n} - 0.5 \times \min \begin{cases}l_{n} \\ \operatorname{lev}(x, n)\end{cases}}{l_{n} p k}}{m(x, n) = ( l_n - 0.5 * min(l_n, lev(x, n)) ) / ( l_n * p * k )} + +where: +\itemize{ +\item \eqn{x} is the user input; +\item \eqn{n} is a taxonomic name (genus, species and subspecies); +\item \eqn{l_{n}}{l_n} is the length of the taxonomic name; +\item \eqn{\operatorname{lev}}{lev} is the \href{https://en.wikipedia.org/wiki/Levenshtein_distance}{Levenshtein distance} function; +\item \eqn{p} is the human pathogenic prevalence, categorised into group \eqn{1}, \eqn{2} and \eqn{3} (see \emph{Details} in \code{?as.mo}), meaning that \eqn{p = \{1, 2 , 3\}}{p = {1, 2, 3}}; +\item \eqn{k} is the kingdom index, set as follows: Bacteria = \eqn{1}, Fungi = \eqn{2}, Protozoa = \eqn{3}, Archaea = \eqn{4}, and all others = \eqn{5}, meaning that \eqn{k = \{1, 2 , 3, 4, 5\}}{k = {1, 2, 3, 4, 5}}. +} + +All matches are sorted descending on their matching score and for all user input values, the top match will be returned. 
+} + \section{Catalogue of Life}{ \if{html}{\figure{logo_col.png}{options: height=40px style=margin-bottom:5px} \cr} diff --git a/tests/testthat/test-count.R b/tests/testthat/test-count.R index 159a3059d..a6cdc04e4 100644 --- a/tests/testthat/test-count.R +++ b/tests/testthat/test-count.R @@ -37,7 +37,7 @@ test_that("counts work", { expect_equal(suppressWarnings(count_S(example_isolates$AMX)) + count_I(example_isolates$AMX), count_SI(example_isolates$AMX)) - library(dplyr) + library(dplyr, warn.conflicts = FALSE) expect_equal(example_isolates %>% count_susceptible(AMC), 1433) expect_equal(example_isolates %>% count_susceptible(AMC, GEN, only_all_tested = TRUE), 1687) expect_equal(example_isolates %>% count_susceptible(AMC, GEN, only_all_tested = FALSE), 1764) diff --git a/tests/testthat/test-disk.R b/tests/testthat/test-disk.R index 218b383ce..ecbb289ba 100755 --- a/tests/testthat/test-disk.R +++ b/tests/testthat/test-disk.R @@ -37,7 +37,7 @@ test_that("disk works", { expect_warning(as.disk("INVALID VALUE")) expect_output(print(as.disk(12))) - library(dplyr) + library(dplyr, warn.conflicts = FALSE) expect_output(print(tibble(d = as.disk(12)))) }) diff --git a/tests/testthat/test-eucast_rules.R b/tests/testthat/test-eucast_rules.R index 04312b59b..1d9e986ec 100755 --- a/tests/testthat/test-eucast_rules.R +++ b/tests/testthat/test-eucast_rules.R @@ -72,7 +72,7 @@ test_that("EUCAST rules work", { expect_equal(suppressWarnings(eucast_rules(a, "mo", info = FALSE)), b) # piperacillin must be R in Enterobacteriaceae when tica is R - library(dplyr) + library(dplyr, warn.conflicts = FALSE) expect_equal(suppressWarnings( example_isolates %>% mutate(TIC = as.rsi("R"), @@ -85,7 +85,7 @@ test_that("EUCAST rules work", { as.character()), "R") - # Azithromicin and Clarythromycin must be equal to Erythromycin + # Azithromycin and Clarithromycin must be equal to Erythromycin a <- eucast_rules(data.frame(mo = example_isolates$mo, ERY = example_isolates$ERY, AZM = as.rsi("R"), diff 
--git a/tests/testthat/test-ggplot_rsi.R b/tests/testthat/test-ggplot_rsi.R index 4468bf61b..80d67f318 100644 --- a/tests/testthat/test-ggplot_rsi.R +++ b/tests/testthat/test-ggplot_rsi.R @@ -27,7 +27,7 @@ test_that("ggplot_rsi works", { skip_if_not_installed("ggplot2") - library(dplyr) + library(dplyr, warn.conflicts = FALSE) library(ggplot2) pdf(NULL) # prevent Rplots.pdf being created diff --git a/tests/testthat/test-join_microorganisms.R b/tests/testthat/test-join_microorganisms.R index bdda1c627..25a1f8310 100755 --- a/tests/testthat/test-join_microorganisms.R +++ b/tests/testthat/test-join_microorganisms.R @@ -57,7 +57,7 @@ test_that("joins work", { expect_warning(right_join_microorganisms("B_ESCHR_COLI")) expect_warning(full_join_microorganisms("B_ESCHR_COLI")) - library(dplyr) + library(dplyr, warn.conflicts = FALSE) x <- tibble(bact = as.mo("E.coli")) expect_warning(left_join_microorganisms(x %>% group_by(bact), "bact")) diff --git a/tests/testthat/test-key_antibiotics.R b/tests/testthat/test-key_antibiotics.R index 578772e71..90af2e146 100644 --- a/tests/testthat/test-key_antibiotics.R +++ b/tests/testthat/test-key_antibiotics.R @@ -32,6 +32,6 @@ test_that("keyantibiotics work", { expect_true(key_antibiotics_equal(".SS", "SI.", ignore_I = TRUE)) expect_false(key_antibiotics_equal(".SS", "SI.", ignore_I = FALSE)) - library(dplyr) + library(dplyr, warn.conflicts = FALSE) expect_warning(key_antibiotics(example_isolates %>% slice(rep(1, 10)))) }) diff --git a/tests/testthat/test-mo.R b/tests/testthat/test-mo.R index cf652da2a..217facd98 100644 --- a/tests/testthat/test-mo.R +++ b/tests/testthat/test-mo.R @@ -25,7 +25,7 @@ test_that("as.mo works", { skip_on_cran() - library(dplyr) + library(dplyr, warn.conflicts = FALSE) MOs <- microorganisms %>% filter(!is.na(mo), nchar(mo) > 3) expect_identical(as.character(MOs$mo), as.character(as.mo(MOs$mo))) diff --git a/tests/testthat/test-resistance_predict.R b/tests/testthat/test-resistance_predict.R index 
3aafb98ba..3f2101f33 100644 --- a/tests/testthat/test-resistance_predict.R +++ b/tests/testthat/test-resistance_predict.R @@ -40,7 +40,7 @@ test_that("prediction of rsi works", { expect_silent(ggplot_rsi_predict(x)) expect_error(ggplot_rsi_predict(example_isolates)) - library(dplyr) + library(dplyr, warn.conflicts = FALSE) expect_output(rsi_predict(x = filter(example_isolates, mo == "B_ESCHR_COLI"), model = "binomial", diff --git a/tests/testthat/test-rsi.R b/tests/testthat/test-rsi.R index 4f3676f8d..3637a8019 100644 --- a/tests/testthat/test-rsi.R +++ b/tests/testthat/test-rsi.R @@ -54,7 +54,7 @@ test_that("rsi works", { expect_identical(as.logical(lapply(example_isolates, is.rsi.eligible)), rep(FALSE, length(example_isolates))) - library(dplyr) + library(dplyr, warn.conflicts = FALSE) # 40 rsi columns expect_equal(example_isolates %>% mutate_at(vars(PEN:RIF), as.character) %>%