diff --git a/DESCRIPTION b/DESCRIPTION index d54190df..a7b6196f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 1.0.1 -Date: 2020-02-22 +Version: 1.0.1.9000 +Date: 2020-03-07 Title: Antimicrobial Resistance Analysis Authors@R: c( person(role = c("aut", "cre"), diff --git a/NAMESPACE b/NAMESPACE index 42a93202..a4ae25e8 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ S3method(pillar_shaft,rsi) S3method(plot,mic) S3method(plot,resistance_predict) S3method(plot,rsi) +S3method(prcomp,data.frame) S3method(print,ab) S3method(print,bug_drug_combinations) S3method(print,catalogue_of_life_version) @@ -120,6 +121,7 @@ export(g.test) export(geom_rsi) export(get_locale) export(get_mo_source) +export(ggplot_pca) export(ggplot_rsi) export(ggplot_rsi_predict) export(guess_ab_col) @@ -169,6 +171,7 @@ export(mrgn) export(n_rsi) export(p.symbol) export(p_symbol) +export(pca) export(portion_I) export(portion_IR) export(portion_R) @@ -224,6 +227,7 @@ exportMethods(kurtosis.default) exportMethods(kurtosis.matrix) exportMethods(plot.mic) exportMethods(plot.rsi) +exportMethods(prcomp.data.frame) exportMethods(print.ab) exportMethods(print.bug_drug_combinations) exportMethods(print.catalogue_of_life_version) @@ -318,6 +322,7 @@ importFrom(pillar,pillar_shaft) importFrom(pillar,type_sum) importFrom(rlang,as_label) importFrom(rlang,enquos) +importFrom(rlang,eval_tidy) importFrom(stats,complete.cases) importFrom(stats,glm) importFrom(stats,lm) diff --git a/NEWS.md b/NEWS.md index bfd37ce9..349a057a 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# AMR 1.0.1.9000 + +### New +* Support for easy principal component analysis for AMR, using the new `pca()` function +* Plotting biplots for principal component analysis using the new `ggplot_pca()` function + # AMR 1.0.1 ### Changed diff --git a/R/age.R b/R/age.R index a74aef91..bc62f4e8 100755 --- a/R/age.R +++ b/R/age.R @@ -27,7 +27,7 @@ #' @param reference reference date(s) (defaults to today), will be coerced with [as.POSIXlt()] and cannot be lower than `x` #' @param exact a logical to indicate whether age calculation should be exact, i.e. with decimals. It divides the number of days of [year-to-date](https://en.wikipedia.org/wiki/Year-to-date) (YTD) of `x` by the number of days in the year of `reference` (either 365 or 366). #' @param na.rm a logical to indicate whether missing values should be removed -#' @return An integer (no decimals) if `exact = FALSE`, a double (with decimals) otherwise +#' @return An [integer] (no decimals) if `exact = FALSE`, a [double] (with decimals) otherwise #' @seealso To split ages into groups, use the [age_groups()] function. #' @importFrom dplyr if_else #' @inheritSection AMR Read more on our website! @@ -95,8 +95,8 @@ age <- function(x, reference = Sys.Date(), exact = FALSE, na.rm = FALSE) { #' @inheritSection lifecycle Stable lifecycle #' @param x age, e.g. calculated with [age()] #' @param split_at values to split `x` at, defaults to age groups 0-11, 12-24, 25-54, 55-74 and 75+. See Details. -#' @param na.rm a logical to indicate whether missing values should be removed -#' @details To split ages, the input can be: +#' @param na.rm a [logical] to indicate whether missing values should be removed +#' @details To split ages, the input for the `split_at` parameter can be: #' #' * A numeric vector. A vector of e.g. `c(10, 20)` will split on 0-9, 10-19 and 20+. A value of only `50` will split on 0-49 and 50+. #' The default is to split on young children (0-11), youth (12-24), young adults (25-54), middle-aged adults (55-74) and elderly (75+). @@ -104,8 +104,8 @@ age <- function(x, reference = Sys.Date(), exact = FALSE, na.rm = FALSE) { #' - `"children"` or `"kids"`, equivalent of: `c(0, 1, 2, 4, 6, 13, 18)`. This will split on 0, 1, 2-3, 4-5, 6-12, 13-17 and 18+. #' - `"elderly"` or `"seniors"`, equivalent of: `c(65, 75, 85)`. This will split on 0-64, 65-74, 75-84, 85+. #' - `"fives"`, equivalent of: `1:20 * 5`. This will split on 0-4, 5-9, 10-14, ..., 90-94, 95-99, 100+. -#' - `"tens"`, equivalent of: `1:10 * 10`. This will split on 0-9, 10-19, 20-29, ... 80-89, 90-99, 100+. -#' @return Ordered [`factor`] +#' - `"tens"`, equivalent of: `1:10 * 10`. This will split on 0-9, 10-19, 20-29, ..., 80-89, 90-99, 100+. +#' @return Ordered [factor] #' @seealso To determine ages, based on one or more reference dates, use the [age()] function. #' @export #' @inheritSection AMR Read more on our website! diff --git a/R/ggplot_pca.R b/R/ggplot_pca.R new file mode 100755 index 00000000..b03ae9aa --- /dev/null +++ b/R/ggplot_pca.R @@ -0,0 +1,349 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# SOURCE # +# https://gitlab.com/msberends/AMR # +# # +# LICENCE # +# (c) 2018-2020 Berends MS, Luz CF et al. # +# # +# This R package is free software; you can freely use and distribute # +# it for both personal and commercial purposes under the terms of the # +# GNU General Public License version 2.0 (GNU GPL-2), as published by # +# the Free Software Foundation. # +# # +# We created this package for both routine data analysis and academic # +# research and it was publicly released in the hope that it will be # +# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # +# Visit our website for more info: https://msberends.gitlab.io/AMR. # +# ==================================================================== # + +#' PCA biplot with `ggplot2` +#' +#' This function is to produce a `ggplot2` variant of a so-called [biplot](https://en.wikipedia.org/wiki/Biplot) for PCA (principal component analysis), but is more flexible and more appealing than the base \R [biplot()] function. +#' @inheritSection lifecycle Maturing lifecycle +#' @param x an object returned by [pca()], [prcomp()] or [princomp()] +#' @inheritParams stats::biplot.prcomp +#' @param labels an optional vector of labels for the observations. If set, the labels will be placed below their respective points. When using the [pca()] function as input for `x`, this will be determined automatically based on the attribute `non_numeric_cols`, see [pca()]. +#' @param labels_textsize the size of the text used for the labels +#' @param labels_text_placement adjustment factor the placement of the variable names (`>=1` means further away from the arrow head) +#' @param groups an optional vector of groups for the labels, with the same length as `labels`. If set, the points and labels will be coloured according to these groups. When using the [pca()] function as input for `x`, this will be determined automatically based on the attribute `non_numeric_cols`, see [pca()]. +#' @param ellipse a logical to indicate whether a normal data ellipse should be drawn for each group (set with `groups`) +#' @param ellipse_prob statistical size of the ellipse in normal probability +#' @param ellipse_size the size of the ellipse line +#' @param ellipse_alpha the alpha (transparency) of the ellipse line +#' @param points_alpha the alpha (transparency) of the points +#' @param arrows a logical to indicate whether arrows should be drawn +#' @param arrows_textsize the size of the text for variable names +#' @param arrows_colour the colour of the arrow and their text +#' @param arrows_size the size (thickness) of the arrow lines +#' @param arrows_textsize the size of the text at the end of the arrows +#' @param arrows_alpha the alpha (transparency) of the arrows and their text +#' @param base_textsize the text size for all plot elements except the labels and arrows +#' @param ... Parameters passed on to functions +#' @source The [ggplot_pca()] function is based on the [ggbiplot()] function from the `ggbiplot` package by Vince Vu, as found on GitHub: (retrieved: 2 March 2020, their latest commit: [`7325e88`](https://github.com/vqv/ggbiplot/commit/7325e880485bea4c07465a0304c470608fffb5d9); 12 February 2015). +#' +#' As per their GPL-2 licence that demands documentation of code changes, the changes made based on the source code were: +#' 1. Rewritten code to remove the dependency on packages `plyr`, `scales` and `grid` +#' 2. Parametrised more options, like arrow and ellipse settings +#' 3. Added total amount of explained variance as a caption in the plot +#' 4. Cleaned all syntax based on the `lintr` package +#' 5. Updated documentation +#' @details The default colours for labels and points is set with [scale_colour_viridis_d()], but these can be changed by adding another scale for colour, like [scale_colour_brewer()]. +#' @rdname ggplot_pca +#' @export +#' @examples +#' # `example_isolates` is a dataset available in the AMR package. +#' # See ?example_isolates. +#' +#' # See ?pca for more info about Principal Component Analysis (PCA). +#' library(dplyr) +#' pca_model <- example_isolates %>% +#' filter(mo_genus(mo) == "Staphylococcus") %>% +#' group_by(species = mo_shortname(mo)) %>% +#' summarise_if (is.rsi, resistance) %>% +#' pca(FLC, AMC, CXM, GEN, TOB, TMP, SXT, CIP, TEC, TCY, ERY) +#' +#' # old +#' biplot(pca_model) +#' +#' # new +#' ggplot_pca(pca_model) +ggplot_pca <- function(x, + choices = 1:2, + scale = TRUE, + labels = NULL, + labels_textsize = 3, + labels_text_placement = 1.5, + groups = NULL, + ellipse = FALSE, + ellipse_prob = 0.68, + ellipse_size = 0.5, + ellipse_alpha = 0.25, + points_size = 2, + points_alpha = 0.25, + arrows = TRUE, + arrows_colour = "darkblue", + arrows_size = 0.5, + arrows_textsize = 3, + arrows_alpha = 0.75, + base_textsize = 10, + ...) { + + stopifnot_installed_package("ggplot2") + + calculations <- pca_calculations(pca_model = x, + groups = groups, + groups_missing = missing(groups), + labels = labels, + labels_missing = missing(labels), + choices = choices, + scale = scale, + ellipse_prob = ellipse_prob, + labels_text_placement = labels_text_placement) + nobs.factor <- calculations$nobs.factor + d <- calculations$d + u <- calculations$u + v <- calculations$v + choices <- calculations$choices + df.u <- calculations$df.u + df.v <- calculations$df.v + r <- calculations$r + ell <- calculations$ell + groups <- calculations$groups + group_name <- calculations$group_name + labels <- calculations$labels + + stopifnot(length(choices) == 2) + + # Append the proportion of explained variance to the axis labels + if ((1 - as.integer(scale)) == 0) { + u.axis.labs <- paste("Standardised PC", choices, sep = "") + } else { + u.axis.labs <- paste("PC", choices, sep = "") + } + u.axis.labs <- paste(u.axis.labs, + paste0("\n(explained var: ", + percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)), ")")) + + # Score Labels + if (!is.null(labels)) { + df.u$labels <- labels + } + + # Grouping variable + if (!is.null(groups)) { + df.u$groups <- groups + } + + + # Base plot + g <- ggplot2::ggplot(data = df.u, + ggplot2::aes(x = xvar, y = yvar)) + + ggplot2::xlab(u.axis.labs[1]) + + ggplot2::ylab(u.axis.labs[2]) + + ggplot2::expand_limits(x = c(-1.15, 1.15), + y = c(-1.15, 1.15)) + + # Draw either labels or points + if (!is.null(df.u$labels)) { + if (!is.null(df.u$groups)) { + g <- g + + ggplot2::geom_point(ggplot2::aes(colour = groups), + alpha = points_alpha, + size = points_size) + + ggplot2::geom_text(ggplot2::aes(label = labels, colour = groups), + nudge_y = -0.05, + size = labels_textsize) + + ggplot2::scale_colour_viridis_d() + + ggplot2::labs(colour = group_name) + } else { + g <- g + + ggplot2::geom_point(alpha = points_alpha, + size = points_size) + + ggplot2::geom_text(ggplot2::aes(label = labels), + nudge_y = -0.05, + size = labels_textsize) + } + } else { + if (!is.null(df.u$groups)) { + g <- g + + ggplot2::geom_point(ggplot2::aes(colour = groups), + alpha = points_alpha, + size = points_size) + + ggplot2::scale_colour_viridis_d() + + ggplot2::labs(colour = group_name) + } else { + g <- g + ggplot2::geom_point(alpha = points_alpha, + size = points_size) + } + } + + # Overlay a concentration ellipse if there are groups + if (!is.null(df.u$groups) & isTRUE(ellipse)) { + g <- g + + ggplot2::geom_path(data = ell, + ggplot2::aes(colour = groups, group = groups), + size = ellipse_size, + alpha = points_alpha) + } + + # Label the variable axes + if (arrows == TRUE) { + g <- g + + ggplot2::geom_segment(data = df.v, + ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar), + arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"), + angle = 20, + ends = "last", + type = "open"), + colour = arrows_colour, + size = arrows_size, + alpha = arrows_alpha) + + ggplot2::geom_text(data = df.v, + ggplot2::aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust), + colour = arrows_colour, + size = arrows_textsize, + alpha = arrows_alpha) + } + + # Add caption label about total explained variance + g <- g + ggplot2::labs(caption = paste0("Total explained variance: ", + percentage(sum(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2))))) + + # mark-up nicely + g <- g + ggplot2::theme_minimal(base_size = base_textsize) + + ggplot2::theme(panel.grid.major = ggplot2::element_line(colour = "grey85"), + panel.grid.minor = ggplot2::element_blank(), + # centre title and subtitle + plot.title = ggplot2::element_text(hjust = 0.5), + plot.subtitle = ggplot2::element_text(hjust = 0.5)) + + g +} + +#' @importFrom dplyr bind_rows +pca_calculations <- function(pca_model, + groups = NULL, + groups_missing = TRUE, + labels = NULL, + labels_missing = TRUE, + choices = 1:2, + scale = 1, + ellipse_prob = 0.68, + labels_text_placement = 1.5) { + + non_numeric_cols <- attributes(pca_model)$non_numeric_cols + if (groups_missing) { + groups <- tryCatch(non_numeric_cols[[1]], + error = function(e) NULL) + group_name <- tryCatch(colnames(non_numeric_cols[1]), + error = function(e) NULL) + } + if (labels_missing) { + labels <- tryCatch(non_numeric_cols[[2]], + error = function(e) NULL) + } + if (!is.null(groups) & is.null(labels)) { + # turn them around + labels <- groups + groups <- NULL + group_name <- NULL + } + + # Recover the SVD + if (inherits(pca_model, "prcomp")) { + nobs.factor <- sqrt(nrow(pca_model$x) - 1) + d <- pca_model$sdev + u <- sweep(pca_model$x, 2, 1 / (d * nobs.factor), FUN = "*") + v <- pca_model$rotation + } else if (inherits(pca_model, "princomp")) { + nobs.factor <- sqrt(pca_model$n.obs) + d <- pca_model$sdev + u <- sweep(pca_model$scores, 2, 1 / (d * nobs.factor), FUN = "*") + v <- pca_model$loadings + } else if (inherits(pca_model, "PCA")) { + nobs.factor <- sqrt(nrow(pca_model$call$X)) + d <- unlist(sqrt(pca_model$eig)[1]) + u <- sweep(pca_model$ind$coord, 2, 1 / (d * nobs.factor), FUN = "*") + v <- sweep(pca_model$var$coord, 2, sqrt(pca_model$eig[seq_len(ncol(pca_model$var$coord)), 1]), FUN = "/") + } else if (inherits(pca_model, "lda")) { + nobs.factor <- sqrt(pca_model$N) + d <- pca_model$svd + u <- predict(pca_model)$x / nobs.factor + v <- pca_model$scaling + d.total <- sum(d ^ 2) + } else { + stop("Expected a object of class prcomp, princomp, PCA, or lda") + } + + # Scores + choices <- pmin(choices, ncol(u)) + obs.scale <- 1 - as.integer(scale) + df.u <- as.data.frame(sweep(u[, choices], 2, d[choices] ^ obs.scale, FUN = "*")) + + # Directions + v <- sweep(v, 2, d ^ as.integer(scale), FUN = "*") + df.v <- as.data.frame(v[, choices]) + + names(df.u) <- c("xvar", "yvar") + names(df.v) <- names(df.u) + + df.u <- df.u * nobs.factor + + # Scale the radius of the correlation circle so that it corresponds to + # a data ellipse for the standardized PC scores + circle_prob <- 0.69 + r <- sqrt(qchisq(circle_prob, df = 2)) * prod(colMeans(df.u ^ 2)) ^ (0.25) + + # Scale directions + v.scale <- rowSums(v ^ 2) + df.v <- r * df.v / sqrt(max(v.scale)) + + # Grouping variable + if (!is.null(groups)) { + df.u$groups <- groups + } + + df.v$varname <- rownames(v) + + # Variables for text label placement + df.v$angle <- with(df.v, (180 / pi) * atan(yvar / xvar)) + df.v$hjust <- with(df.v, (1 - labels_text_placement * sign(xvar)) / 2) + + if (!is.null(df.u$groups)) { + theta <<- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) + circle <<- cbind(cos(theta), sin(theta)) + ell <- bind_rows( + sapply(unique(df.u$groups), function(g, df = df.u) { + x <- df[which(df$groups == g), , drop = FALSE] + if (nrow(x) <= 2) { + return(NULL) + } + sigma <- var(cbind(x$xvar, x$yvar)) + mu <- c(mean(x$xvar), mean(x$yvar)) + ed <- sqrt(qchisq(ellipse_prob, df = 2)) + data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"), + groups = x$groups[1]) + })) + names(ell)[1:2] <- c("xvar", "yvar") + } else { + ell <- NULL + } + + list(nobs.factor = nobs.factor, + d = d, + u = u, + v = v, + choices = choices, + df.u = df.u, + df.v = df.v, + r = r, + ell = ell, + groups = groups, + group_name = group_name, + labels = labels + ) + +} diff --git a/R/ggplot_rsi.R b/R/ggplot_rsi.R index 3f8e86f5..fd6142c7 100755 --- a/R/ggplot_rsi.R +++ b/R/ggplot_rsi.R @@ -186,12 +186,12 @@ ggplot_rsi <- function(data, x.title = "Antimicrobial", y.title = "Proportion", ...) { - + stopifnot_installed_package("ggplot2") - + x <- x[1] facet <- facet[1] - + # we work with aes_string later on x_deparse <- deparse(substitute(x)) if (x_deparse != "x") { @@ -210,16 +210,16 @@ ggplot_rsi <- function(data, if (facet %in% c("NULL", "")) { facet <- NULL } - + if (is.null(position)) { position <- "fill" } - + p <- ggplot2::ggplot(data = data) + geom_rsi(position = position, x = x, fill = fill, translate_ab = translate_ab, combine_SI = combine_SI, combine_IR = combine_IR, ...) + theme_rsi() - + if (fill == "interpretation") { # set RSI colours if (isFALSE(colours) & missing(datalabels.colour)) { @@ -228,12 +228,12 @@ ggplot_rsi <- function(data, } p <- p + scale_rsi_colours(colours = colours) } - + if (identical(position, "fill")) { # proportions, so use y scale with percentage p <- p + scale_y_percent(breaks = breaks, limits = limits) } - + if (datalabels == TRUE) { p <- p + labels_rsi_count(position = position, x = x, @@ -243,17 +243,17 @@ ggplot_rsi <- function(data, datalabels.size = datalabels.size, datalabels.colour = datalabels.colour) } - + if (!is.null(facet)) { p <- p + facet_rsi(facet = facet, nrow = nrow) } - + p <- p + ggplot2::labs(title = title, subtitle = subtitle, caption = caption, x = x.title, y = y.title) - + p } @@ -267,24 +267,24 @@ geom_rsi <- function(position = NULL, combine_SI = TRUE, combine_IR = FALSE, ...) { - + stopifnot_installed_package("ggplot2") - + if (is.data.frame(position)) { stop("`position` is invalid. Did you accidentally use '%>%' instead of '+'?", call. = FALSE) } - + y <- "value" if (missing(position) | is.null(position)) { position <- "fill" } - + if (identical(position, "fill")) { position <- ggplot2::position_fill(vjust = 0.5, reverse = TRUE) } - + x <- x[1] - + # we work with aes_string later on x_deparse <- deparse(substitute(x)) if (x_deparse != "x") { @@ -293,33 +293,33 @@ geom_rsi <- function(position = NULL, if (x %like% '".*"') { x <- substr(x, 2, nchar(x) - 1) } - + if (tolower(x) %in% tolower(c("ab", "abx", "antibiotics"))) { x <- "antibiotic" } else if (tolower(x) %in% tolower(c("SIR", "RSI", "interpretations", "result"))) { x <- "interpretation" } - + ggplot2::layer(geom = "bar", stat = "identity", position = position, mapping = ggplot2::aes_string(x = x, y = y, fill = fill), params = list(...), data = function(x) { rsi_df(data = x, - translate_ab = translate_ab, - language = language, - combine_SI = combine_SI, - combine_IR = combine_IR) + translate_ab = translate_ab, + language = language, + combine_SI = combine_SI, + combine_IR = combine_IR) }) - + } #' @rdname ggplot_rsi #' @export facet_rsi <- function(facet = c("interpretation", "antibiotic"), nrow = NULL) { - + stopifnot_installed_package("ggplot2") - + facet <- facet[1] - + # we work with aes_string later on facet_deparse <- deparse(substitute(facet)) if (facet_deparse != "facet") { @@ -328,13 +328,13 @@ facet_rsi <- function(facet = c("interpretation", "antibiotic"), nrow = NULL) { if (facet %like% '".*"') { facet <- substr(facet, 2, nchar(facet) - 1) } - + if (tolower(facet) %in% tolower(c("SIR", "RSI", "interpretations", "result"))) { facet <- "interpretation" } else if (tolower(facet) %in% tolower(c("ab", "abx", "antibiotics"))) { facet <- "antibiotic" } - + ggplot2::facet_wrap(facets = facet, scales = "free_x", nrow = nrow) } @@ -343,7 +343,7 @@ facet_rsi <- function(facet = c("interpretation", "antibiotic"), nrow = NULL) { #' @export scale_y_percent <- function(breaks = seq(0, 1, 0.1), limits = NULL) { stopifnot_installed_package("ggplot2") - + if (all(breaks[breaks != 0] > 1)) { breaks <- breaks / 100 } @@ -362,7 +362,7 @@ scale_rsi_colours <- function(colours = c(S = "#61a8ff", stopifnot_installed_package("ggplot2") # previous colour: palette = "RdYlGn" # previous colours: values = c("#b22222", "#ae9c20", "#7cfc00") - + if (!identical(colours, FALSE)) { original_cols <- c(S = "#61a8ff", SI = "#61a8ff", diff --git a/R/lifecycle.R b/R/lifecycle.R index 0a805964..ed8f123e 100644 --- a/R/lifecycle.R +++ b/R/lifecycle.R @@ -32,7 +32,7 @@ #' This page contains a section for every lifecycle (with text borrowed from the aforementioned `tidyverse` website), so they can be used in the manual pages of our functions. #' @section Experimental lifecycle: #' \if{html}{\figure{lifecycle_experimental.svg}{options: style=margin-bottom:5px} \cr} -#' The [lifecycle][AMR::lifecycle] of this function is **experimental**. An experimental function is in the very early stages of development. The unlying code might be changing frequently as we rapidly iterate and explore variations in search of the best fit. Experimental functions might be removed without deprecation, so you are generally best off waiting until a function is more mature before you use it in production code. Experimental functions will not be included in releases we submit to CRAN. +#' The [lifecycle][AMR::lifecycle] of this function is **experimental**. An experimental function is in the very early stages of development. The unlying code might be changing frequently as we rapidly iterate and explore variations in search of the best fit. Experimental functions might be removed without deprecation, so you are generally best off waiting until a function is more mature before you use it in production code. Experimental functions will not be included in releases we submit to CRAN, since they have not yet matured enough. #' @section Maturing lifecycle: #' \if{html}{\figure{lifecycle_maturing.svg}{options: style=margin-bottom:5px} \cr} #' The [lifecycle][AMR::lifecycle] of this function is **maturing**. The unlying code of a maturing function has been roughed out, but finer details might still change. We will strive to maintain backward compatibility, but the function needs wider usage and more extensive testing in order to optimise the unlying code. diff --git a/R/pca.R b/R/pca.R new file mode 100755 index 00000000..65bd1d3a --- /dev/null +++ b/R/pca.R @@ -0,0 +1,128 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# SOURCE # +# https://gitlab.com/msberends/AMR # +# # +# LICENCE # +# (c) 2018-2020 Berends MS, Luz CF et al. # +# # +# This R package is free software; you can freely use and distribute # +# it for both personal and commercial purposes under the terms of the # +# GNU General Public License version 2.0 (GNU GPL-2), as published by # +# the Free Software Foundation. # +# # +# We created this package for both routine data analysis and academic # +# research and it was publicly released in the hope that it will be # +# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # +# Visit our website for more info: https://msberends.gitlab.io/AMR. # +# ==================================================================== # + +#' Principal Component Analysis (for AMR) +#' +#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels. +#' @inheritSection lifecycle Experimental lifecycle +#' @param x a [data.frame] containing numeric columns +#' @param ... columns of `x` to be selected for PCA +#' @inheritParams stats::prcomp +#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the R function [prcomp()]. +#' +#' The result of the [pca()] function is a [`prcomp`] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()]. +#' @rdname pca +#' @exportMethod prcomp.data.frame +#' @export +#' @examples +#' # `example_isolates` is a dataset available in the AMR package. +#' # See ?example_isolates. +#' +#' # calculate the resistance per group first +#' library(dplyr) +#' resistance_data <- example_isolates %>% +#' group_by(order = mo_order(mo), # group on anything, like order +#' genus = mo_genus(mo)) %>% # and genus as we do here +#' summarise_if(is.rsi, resistance) # then get resistance of all drugs +#' +#' # now conduct PCA for certain antimicrobial agents +#' pca_result <- resistance_data %>% +#' pca(AMC, CXM, CTX, CAZ, GEN, TOB, TMP, SXT) +#' +#' pca_result +#' summary(pca_result) +#' biplot(pca_result) +#' ggplot_pca(pca_result) # a new and convenient plot function +prcomp.data.frame <- function(x, + ..., + retx = TRUE, + center = TRUE, + scale. = TRUE, + tol = NULL, + rank. = NULL) { + + x <- pca_transform_x(x = x, ... = ...) + pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))] + + message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"), + ".\n Total observations available: ", nrow(pca_data), "."))) + + stats:::prcomp.default(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.) +} + +#' @rdname pca +#' @export +pca <- function(x, ...) { + if (!is.data.frame(x)) { + stop("this function only takes a data.frame as input") + } + pca_model <- prcomp(x, ...) + + x <- pca_transform_x(x = x, ... = ...) + attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE] + pca_model +} + +#' @importFrom dplyr ungroup %>% filter_all all_vars +#' @importFrom rlang enquos eval_tidy +pca_transform_x <- function(x, ...) { + # unset data.table, tbl_df, etc. + # also removes groups made by dplyr::group_by + x <- as.data.frame(x, stringsAsFactors = FALSE) + x.bak <- x + + user_exprs <- enquos(...) + + if (length(user_exprs) > 0) { + new_list <- list(0) + for (i in seq_len(length(user_exprs))) { + new_list[[i]] <- tryCatch(eval_tidy(user_exprs[[i]], data = x), + error = function(e) stop(e$message, call. = FALSE)) + if (length(new_list[[i]]) == 1) { + if (i == 1) { + # only for first item: + if (is.character(new_list[[i]]) & new_list[[i]] %in% colnames(x)) { + # this is to support: df %>% pca("mycol") + new_list[[i]] <- x[, new_list[[i]]] + } + } else { + # remove item - it's a parameter like `center` + new_list[[i]] <- NULL + } + } + } + x <- as.data.frame(new_list, stringsAsFactors = FALSE) + if (any(sapply(x, function(y) !is.numeric(y)))) { + warning("Be sure to first calculate the resistance (or susceptibility) of variables with antimicrobial test results, since PCA works with numeric variables only. Please see Examples in ?pca.") + } + # set column names + tryCatch(colnames(x) <- sapply(user_exprs, function(y) as_label(y)), + error = function(e) warning("column names could not be set")) + # keep only numeric columns + x <- x[, sapply(x, function(y) is.numeric(y))] + # bind the data set with the non-numeric columns + x <- cbind(x.bak[, sapply(x.bak, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE], x) + } + + x %>% + ungroup() %>% # would otherwise select the grouping vars + filter_all(all_vars(!is.na(.))) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index c6b8a71c..ceb72343 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -44,6 +44,9 @@ navbar: - text: "Predict antimicrobial resistance" icon: "fa-dice" href: "articles/resistance_predict.html" + - text: "Conduct principal component analysis for AMR" + icon: "fa-compress" + href: "articles/PCA.html" - text: "Determine multi-drug resistance (MDR)" icon: "fa-skull-crossbones" href: "articles/MDR.html" @@ -94,7 +97,6 @@ reference: - "`guess_ab_col`" - "`mo_source`" - "`read.4D`" - - "`rsi_translation`" - title: "Enhancing your data" desc: > Functions to add new data to your existing data, such as the determination @@ -117,28 +119,31 @@ reference: Functions for conducting AMR analysis, like counting isolates, calculating resistance or susceptibility, or make plots. contents: + - "`proportion`" + - "`count`" - "`availability`" - "`bug_drug_combinations`" - - "`count`" + - "`resistance_predict`" + - "`pca`" - "`filter_ab_class`" - "`g.test`" - "`ggplot_rsi`" + - "`ggplot_pca`" - "`kurtosis`" - - "`portion`" - - "`resistance_predict`" - "`skewness`" - title: "Included data sets" desc: > Scientifically reliable references for microorganisms and antibiotics, and example data sets to use for practise. contents: + - "`microorganisms`" - "`antibiotics`" - "`antivirals`" - "`example_isolates`" - "`example_isolates_unclean`" + - "`rsi_translation`" - "`microorganisms.codes`" - "`microorganisms.old`" - - "`microorganisms`" - "`WHONET`" - title: "Background information" desc: > diff --git a/data-raw/country_analysis.R b/data-raw/country_analysis.R index 59a9097a..395963fd 100644 --- a/data-raw/country_analysis.R +++ b/data-raw/country_analysis.R @@ -154,7 +154,11 @@ data %>% origin = 'iso2c', destination = 'country.name')) %>% summarise(first = min(timestamp_server)) %>% - arrange(desc(first)) + arrange(desc(first)) %>% + mutate(frame = case_when(first <= as.POSIXct("2019-06-30") ~ "Q1-Q2 2019", + first <= as.POSIXct("2019-12-31") ~ "Q3-Q4 2019", + TRUE ~ "Q1-Q2 2020")) %>% + View() # # p1 <- data %>% # group_by(country) %>% diff --git a/docs/404.html b/docs/404.html index abfa2ad2..ee5c562c 100644 --- a/docs/404.html +++ b/docs/404.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -114,6 +114,13 @@ Predict antimicrobial resistance +
  • + + + + Conduct principal component analysis for AMR + +
  • diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 503dd3c6..fd38e4e8 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -114,6 +114,13 @@ Predict antimicrobial resistance
  • +
  • + + + + Conduct principal component analysis for AMR + +
  • diff --git a/docs/articles/EUCAST.html b/docs/articles/EUCAST.html index aa4260f4..922cbf0f 100644 --- a/docs/articles/EUCAST.html +++ b/docs/articles/EUCAST.html @@ -39,7 +39,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -75,6 +75,13 @@ Predict antimicrobial resistance
  • +
  • + + + + Conduct Principal Component Analysis for AMR + +
  • @@ -179,7 +186,7 @@

    How to apply EUCAST rules

    Matthijs S. Berends

    -

    23 February 2020

    +

    07 March 2020

    @@ -293,7 +300,7 @@ -
    eucast_rules(data, info = FALSE)
    + diff --git a/docs/articles/PCA.html b/docs/articles/PCA.html new file mode 100644 index 00000000..ced3dfc4 --- /dev/null +++ b/docs/articles/PCA.html @@ -0,0 +1,345 @@ + + + + + + + +How to conduct principal component analysis (PCA) for AMR • AMR (for R) + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +

    NOTE: This page will be updated soon, as the pca() function is currently being developed.

    +
    +

    +Introduction

    +
    +
    +

    +Transforming

    +

    For PCA, we need to transform our AMR data first. This is what the example_isolates data set in this package looks like:

    +
    library(AMR)
    +library(dplyr)
    +glimpse(example_isolates)
    +# Observations: 2,000
    +# Variables: 49
    +# $ date            <date> 2002-01-02, 2002-01-03, 2002-01-07, 2002-01-07, 2002…
    +# $ hospital_id     <fct> D, D, B, B, B, B, D, D, B, B, D, D, D, D, D, B, B, B,…
    +# $ ward_icu        <lgl> FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, T…
    +# $ ward_clinical   <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, F…
    +# $ ward_outpatient <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
    +# $ age             <dbl> 65, 65, 45, 45, 45, 45, 78, 78, 45, 79, 67, 67, 71, 7…
    +# $ gender          <chr> "F", "F", "F", "F", "F", "F", "M", "M", "F", "F", "M"…
    +# $ patient_id      <chr> "A77334", "A77334", "067927", "067927", "067927", "06…
    +# $ mo              <mo> B_ESCHR_COLI, B_ESCHR_COLI, B_STPHY_EPDR, B_STPHY_EPDR…
    +# $ PEN             <rsi> R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, R,…
    +# $ OXA             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ FLC             <rsi> NA, NA, R, R, R, R, S, S, R, S, S, S, NA, NA, NA, NA,…
    +# $ AMX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ AMC             <rsi> I, I, NA, NA, NA, NA, S, S, NA, NA, S, S, I, I, R, I,…
    +# $ AMP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ TZP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ CZO             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ FEP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ CXM             <rsi> I, I, R, R, R, R, S, S, R, S, S, S, S, S, NA, S, S, R…
    +# $ FOX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ CTX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, S, S,…
    +# $ CAZ             <rsi> NA, NA, R, R, R, R, R, R, R, R, R, R, NA, NA, NA, S, …
    +# $ CRO             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, S, S,…
    +# $ GEN             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ TOB             <rsi> NA, NA, NA, NA, NA, NA, S, S, NA, NA, NA, NA, S, S, N…
    +# $ AMK             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ KAN             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ TMP             <rsi> R, R, S, S, R, R, R, R, S, S, NA, NA, S, S, S, S, S, …
    +# $ SXT             <rsi> R, R, S, S, NA, NA, NA, NA, S, S, NA, NA, S, S, S, S,…
    +# $ NIT             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ FOS             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ LNZ             <rsi> R, R, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, R, R, R…
    +# $ CIP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, S, S, NA, NA, NA, NA,…
    +# $ MFX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ VAN             <rsi> R, R, S, S, S, S, S, S, S, S, NA, NA, R, R, R, R, R, …
    +# $ TEC             <rsi> R, R, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, R, R, R…
    +# $ TCY             <rsi> R, R, S, S, S, S, S, S, S, I, S, S, NA, NA, I, R, R, …
    +# $ TGC             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ DOX             <rsi> NA, NA, S, S, S, S, S, S, S, NA, S, S, NA, NA, NA, R,…
    +# $ ERY             <rsi> R, R, R, R, R, R, S, S, R, S, S, S, R, R, R, R, R, R,…
    +# $ CLI             <rsi> NA, NA, NA, NA, NA, R, NA, NA, NA, NA, NA, NA, NA, NA…
    +# $ AZM             <rsi> R, R, R, R, R, R, S, S, R, S, S, S, R, R, R, R, R, R,…
    +# $ IPM             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, S, S,…
    +# $ MEM             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ MTR             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ CHL             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ COL             <rsi> NA, NA, R, R, R, R, R, R, R, R, R, R, NA, NA, NA, R, …
    +# $ MUP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
    +# $ RIF             <rsi> R, R, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, R, R, R…
    +

    Now to transform this to a data set with only resistance percentages per taxonomic order and genus:

    +
    resistance_data <- example_isolates %>% 
    +  group_by(order = mo_order(mo),       # group on anything, like order
    +           genus = mo_genus(mo)) %>%   #  and genus as we do here
    +  summarise_if(is.rsi, resistance) %>% # then get resistance of all drugs
    +  select(order, genus, AMC, CXM, CTX, 
    +         CAZ, GEN, TOB, TMP, SXT)      # and select only relevant columns
    +
    +head(resistance_data)
    +# # A tibble: 6 x 10
    +# # Groups:   order [2]
    +#   order          genus             AMC   CXM   CTX   CAZ   GEN   TOB   TMP   SXT
    +#   <chr>          <chr>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    +# 1 (unknown orde… Micrococcoides     NA    NA    NA    NA    NA    NA    NA    NA
    +# 2 Actinomycetal… Actinomyces        NA    NA    NA    NA    NA    NA    NA    NA
    +# 3 Actinomycetal… Corynebacterium    NA    NA    NA    NA    NA    NA    NA    NA
    +# 4 Actinomycetal… Dermabacter        NA    NA    NA    NA    NA    NA    NA    NA
    +# 5 Actinomycetal… Micrococcus        NA    NA    NA    NA    NA    NA    NA    NA
    +# 6 Actinomycetal… Propionibacter…    NA    NA    NA    NA    NA    NA    NA    NA
    +
    +
    +

    +Perform principal component analysis

    +

    The new pca() function will automatically filter on rows that contain numeric values in all selected variables, so we now only need to do:

    +
    pca_result <- pca(resistance_data)
    +# NOTE: Columns selected for PCA: AMC/CXM/CTX/CAZ/GEN/TOB/TMP/SXT.
    +#       Total observations available: 7.
    +

    The result can be reviewed with the good old summary() function:

    +
    summary(pca_result)
    +# Importance of components:
    +#                           PC1    PC2     PC3     PC4     PC5     PC6       PC7
    +# Standard deviation     2.1580 1.6783 0.61282 0.33017 0.20150 0.03190 2.123e-16
    +# Proportion of Variance 0.5821 0.3521 0.04694 0.01363 0.00508 0.00013 0.000e+00
    +# Cumulative Proportion  0.5821 0.9342 0.98117 0.99480 0.99987 1.00000 1.000e+00
    +

    Good news. The first two components explain a total of 93.4% of the variance (see the PC1 and PC2 values of the Proportion of Variance. We can create a so-called biplot with the base R biplot() function, to see which antimicrobial resistance per drug explain the difference per microorganism.

    +
    +
    +

    +Plotting the results

    +
    biplot(pca_result)
    +

    +

    But we can’t see the explanation of the points. Perhaps this works better with the new ggplot_pca() function, that automatically adds the right labels and even groups:

    +
    ggplot_pca(pca_result)
    +

    +

    You can also print an ellipse per group, and edit the appearance:

    +
    ggplot_pca(pca_result, ellipse = TRUE) +
    +  ggplot2::labs(title = "An AMR/PCA biplot!")
    +

    +
    +
    + + + +
    + + + + +
    + + + + + + diff --git a/docs/articles/PCA_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/PCA_files/figure-html/unnamed-chunk-5-1.png new file mode 100644 index 00000000..760b3b65 Binary files /dev/null and b/docs/articles/PCA_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/PCA_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/PCA_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 00000000..1f655d7f Binary files /dev/null and b/docs/articles/PCA_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/PCA_files/figure-html/unnamed-chunk-7-1.png b/docs/articles/PCA_files/figure-html/unnamed-chunk-7-1.png new file mode 100644 index 00000000..9d52bc29 Binary files /dev/null and b/docs/articles/PCA_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index f3b67680..db2efa6e 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -39,7 +39,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -75,6 +75,13 @@ Predict antimicrobial resistance +
  • + + + + Conduct principal component analysis for AMR + +
  • @@ -179,7 +186,7 @@

    Benchmarks

    Matthijs S. Berends

    -

    23 February 2020

    +

    07 March 2020

    @@ -213,21 +220,36 @@
    times = 10) print(S.aureus, unit = "ms", signif = 2) # Unit: milliseconds -# expr min lq mean median uq max neval -# as.mo("sau") 8.2 8.5 12.0 9.1 9.5 34 10 -# as.mo("stau") 36.0 37.0 50.0 40.0 65.0 82 10 -# as.mo("STAU") 38.0 41.0 49.0 41.0 64.0 72 10 -# as.mo("staaur") 8.4 8.9 9.2 9.2 9.3 10 10 -# as.mo("STAAUR") 8.4 8.7 13.0 9.4 9.5 43 10 -# as.mo("S. aureus") 14.0 15.0 30.0 37.0 40.0 44 10 -# as.mo("S aureus") 14.0 15.0 43.0 15.0 37.0 250 10 -# as.mo("Staphylococcus aureus") 4.7 5.0 7.4 5.1 5.5 28 10 -# as.mo("Staphylococcus aureus (MRSA)") 660.0 690.0 720.0 730.0 730.0 790 10 -# as.mo("Sthafilokkockus aaureuz") 350.0 380.0 420.0 400.0 440.0 520 10 -# as.mo("MRSA") 8.1 8.5 12.0 9.1 9.3 38 10 -# as.mo("VISA") 24.0 26.0 35.0 28.0 46.0 52 10 -# as.mo("VRSA") 24.0 26.0 35.0 29.0 47.0 57 10 -# as.mo(22242419) 130.0 130.0 150.0 140.0 150.0 240 10 +# expr min lq mean median uq max +# as.mo("sau") 8.0 8.2 9.1 8.4 8.5 16 +# as.mo("stau") 37.0 40.0 51.0 52.0 60.0 76 +# as.mo("STAU") 36.0 38.0 58.0 60.0 68.0 100 +# as.mo("staaur") 8.2 8.4 9.5 8.6 8.9 14 +# as.mo("STAAUR") 8.2 8.3 15.0 9.2 14.0 53 +# as.mo("S. aureus") 13.0 21.0 64.0 21.0 45.0 260 +# as.mo("S aureus") 13.0 14.0 33.0 24.0 44.0 76 +# as.mo("Staphylococcus aureus") 4.7 4.8 9.9 6.8 7.9 42 +# as.mo("Staphylococcus aureus (MRSA)") 620.0 640.0 770.0 700.0 860.0 1100 +# as.mo("Sthafilokkockus aaureuz") 330.0 350.0 460.0 490.0 560.0 570 +# as.mo("MRSA") 8.1 8.3 14.0 12.0 13.0 48 +# as.mo("VISA") 24.0 25.0 34.0 26.0 38.0 59 +# as.mo("VRSA") 23.0 24.0 37.0 27.0 39.0 78 +# as.mo(22242419) 120.0 130.0 150.0 140.0 160.0 240 +# neval +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10 +# 10

    In the table above, all measurements are shown in milliseconds (thousands of seconds). A value of 5 milliseconds means it can determine 200 input values per second. It case of 100 milliseconds, this is only 10 input values per second.

    To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside of this is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Methanosarcina semesiae (B_MTHNSR_SEMS), a bug probably never found before in humans:

    @@ -239,19 +261,19 @@ times = 10) print(M.semesiae, unit = "ms", signif = 4) # Unit: milliseconds -# expr min lq mean median uq -# as.mo("metsem") 1497.000 1536.000 1604.00 1575.00 1693.000 -# as.mo("METSEM") 1472.000 1510.000 1563.00 1563.00 1615.000 -# as.mo("M. semesiae") 14.520 14.760 22.18 15.39 36.430 -# as.mo("M. semesiae") 14.310 14.630 19.94 15.18 16.080 -# as.mo("Methanosarcina semesiae") 5.376 5.482 8.41 5.81 5.911 -# max neval -# 1709.00 10 -# 1641.00 10 -# 40.57 10 -# 40.27 10 -# 32.27 10 -

    That takes 5.7 times as much time on average. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Methanosarcina semesiae) are always very fast and only take some thousands of seconds to coerce - they are the most probable input from most data sets.

    +# expr min lq mean median uq +# as.mo("metsem") 1349.000 1352.000 1597.000 1411.000 1983.000 +# as.mo("METSEM") 1316.000 2146.000 2069.000 2226.000 2245.000 +# as.mo("M. semesiae") 13.330 14.110 32.960 21.840 53.090 +# as.mo("M. semesiae") 13.730 20.960 29.720 21.430 40.000 +# as.mo("Methanosarcina semesiae") 4.802 5.171 6.667 6.551 8.036 +# max neval +# 2184.000 10 +# 2337.000 10 +# 62.780 10 +# 64.510 10 +# 8.735 10 +

    That takes 6.1 times as much time on average. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Methanosarcina semesiae) are always very fast and only take some thousands of seconds to coerce - they are the most probable input from most data sets.

    In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Methanosarcina semesiae (which is uncommon):

    Uncommon microorganisms take a lot more time than common microorganisms. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.

    @@ -285,8 +307,8 @@ print(run_it, unit = "ms", signif = 3) # Unit: milliseconds # expr min lq mean median uq max neval -# mo_name(x) 568 614 651 634 657 1170 100 -

    So transforming 500,000 values (!!) of 50 unique values only takes 0.63 seconds (634 ms). You only lose time on your unique input values.

    +# mo_name(x) 564 605 673 630 657 1100 100 +

    So transforming 500,000 values (!!) of 50 unique values only takes 0.63 seconds (630 ms). You only lose time on your unique input values.

    @@ -298,10 +320,10 @@ times = 10) print(run_it, unit = "ms", signif = 3) # Unit: milliseconds -# expr min lq mean median uq max neval -# A 6.630 6.790 7.330 7.230 7.690 8.39 10 -# B 13.900 14.300 19.000 14.700 17.000 53.00 10 -# C 0.847 0.875 0.947 0.901 0.977 1.16 10

    +# expr min lq mean median uq max neval +# A 6.58 6.590 7.340 6.630 6.780 13.00 10 +# B 13.50 13.700 18.700 13.900 14.600 60.80 10 +# C 0.72 0.863 0.917 0.898 0.935 1.26 10

    So going from mo_name("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0009 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

    run_it <- microbenchmark(A = mo_species("aureus"),
                              B = mo_genus("Staphylococcus"),
    @@ -315,14 +337,14 @@
     print(run_it, unit = "ms", signif = 3)
     # Unit: milliseconds
     #  expr   min    lq  mean median    uq   max neval
    -#     A 0.476 0.485 0.501  0.498 0.504 0.554    10
    -#     B 0.515 0.521 0.548  0.545 0.553 0.614    10
    -#     C 0.710 0.791 0.870  0.842 0.855 1.330    10
    -#     D 0.491 0.524 0.539  0.535 0.546 0.613    10
    -#     E 0.488 0.500 0.583  0.541 0.635 0.830    10
    -#     F 0.477 0.488 0.509  0.495 0.519 0.569    10
    -#     G 0.473 0.490 0.507  0.498 0.534 0.547    10
    -#     H 0.477 0.486 0.500  0.494 0.509 0.561    10
    +# A 0.499 0.511 0.516 0.517 0.522 0.544 10 +# B 0.532 0.539 0.550 0.542 0.563 0.592 10 +# C 0.718 0.787 0.832 0.843 0.889 0.904 10 +# D 0.538 0.548 0.566 0.567 0.571 0.607 10 +# E 0.503 0.509 0.515 0.513 0.516 0.549 10 +# F 0.502 0.504 0.514 0.511 0.519 0.539 10 +# G 0.493 0.513 0.538 0.514 0.536 0.684 10 +# H 0.499 0.501 0.509 0.505 0.516 0.531 10

    Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" anyway, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.

    @@ -349,13 +371,13 @@ print(run_it, unit = "ms", signif = 4) # Unit: milliseconds # expr min lq mean median uq max neval -# en 24.69 25.88 32.93 26.49 28.24 165.60 100 -# de 26.07 27.22 32.82 28.08 29.90 75.78 100 -# nl 32.01 33.49 39.63 34.78 36.68 78.48 100 -# es 25.66 27.31 32.25 28.04 29.53 68.15 100 -# it 25.69 27.13 32.40 28.22 30.01 62.64 100 -# fr 25.77 27.22 33.72 28.03 30.46 71.35 100 -# pt 25.65 27.12 31.94 27.78 29.08 60.06 100
    +# en 23.72 25.30 30.59 25.77 26.99 76.03 100 +# de 24.88 26.81 31.11 27.47 28.93 69.86 100 +# nl 30.65 32.77 38.07 33.70 35.23 74.79 100 +# es 24.89 26.33 32.10 27.13 28.87 68.79 100 +# it 24.78 26.72 33.51 27.53 28.91 166.60 100 +# fr 24.84 26.58 31.50 27.13 28.29 67.38 100 +# pt 24.88 26.58 32.38 27.50 29.20 79.30 100

    Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

    diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png index be6dde2f..f86f4ef2 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png index 907f48db..3eec84dd 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index 61da5cc2..ed5d18ed 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -114,6 +114,13 @@ Predict antimicrobial resistance
  • +
  • + + + + Conduct principal component analysis for AMR + +
  • @@ -226,6 +233,7 @@
  • How to conduct AMR analysis
  • How to apply EUCAST rules
  • How to determine multi-drug resistance (MDR)
  • +
  • How to conduct principal component analysis (PCA) for AMR
  • How to import data from SPSS / SAS / Stata
  • How to work with WHONET data
  • Benchmarks
  • diff --git a/docs/authors.html b/docs/authors.html index d541892f..c40eb5aa 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -114,6 +114,13 @@ Predict antimicrobial resistance +
  • + + + + Conduct principal component analysis for AMR + +
  • diff --git a/docs/index.html b/docs/index.html index c36273cd..bbd5b8b7 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -79,6 +79,13 @@ Predict antimicrobial resistance
  • +
  • + + + + Conduct principal component analysis for AMR + +
  • diff --git a/docs/news/index.html b/docs/news/index.html index c4ae39a2..c60b1aaf 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000 @@ -114,6 +114,13 @@ Predict antimicrobial resistance
  • +
  • + + + + Conduct principal component analysis for AMR + +
  • @@ -219,6 +226,19 @@ +
    +

    +AMR 1.0.1.9000 Unreleased +

    +
    +

    +New

    +
      +
    • Support for easy principal component analysis for AMR, using the new pca() function
    • +
    • Plotting biplots for principal component analysis using the new ggplot_pca() function
    • +
    +
    +

    AMR 1.0.1 2020-02-23 @@ -247,9 +267,9 @@ AMR 1.0.0 2020-02-17

    This software is now out of beta and considered stable. Nonetheless, this package will be developed continually.

    -
    +

    -New

    +New
    -
    +

    -New

    +New
    • Functions susceptibility() and resistance() as aliases of proportion_SI() and proportion_R(), respectively. These functions were added to make it more clear that “I” should be considered susceptible and not resistant.

      @@ -449,9 +469,9 @@
    • Renamed data set septic_patients to example_isolates

    -
    +

    -New

    +New
    • Function bug_drug_combinations() to quickly get a data.frame with the results of all bug-drug combinations in a data set. The column containing microorganism codes is guessed automatically and its input is transformed with mo_shortname() at default:

      @@ -575,9 +595,9 @@

      AMR 0.7.1 2019-06-23

      -
      +

      -New

      +New
      • Function rsi_df() to transform a data.frame to a data set containing only the microbial interpretation (S, I, R), the antibiotic, the percentage of S/I/R and the number of available isolates. This is a convenient combination of the existing functions count_df() and portion_df() to immediately show resistance percentages and number of available isolates:

        @@ -656,9 +676,9 @@

        AMR 0.7.0 2019-06-03

        -
        +

        -New

        +New
        • Support for translation of disk diffusion and MIC values to RSI values (i.e. antimicrobial interpretations). Supported guidelines are EUCAST (2011 to 2019) and CLSI (2011 to 2019). Use as.rsi() on an MIC value (created with as.mic()), a disk diffusion value (created with the new as.disk()) or on a complete date set containing columns with MIC or disk diffusion values.
        • Function mo_name() as alias of mo_fullname() @@ -773,9 +793,9 @@
        • Contains the complete manual of this package and all of its functions with an explanation of their parameters
        • Contains a comprehensive tutorial about how to conduct antimicrobial resistance analysis, import data from WHONET or SPSS and many more.
        -
        +

        -New

        +New
        • BREAKING: removed deprecated functions, parameters and references to ‘bactid’. Use as.mo() to identify an MO code.

        • @@ -999,9 +1019,9 @@

          AMR 0.5.0 2018-11-30

          -
          +

          -New

          +New
          • Repository moved to GitLab: https://gitlab.com/msberends/AMR
          • @@ -1118,9 +1138,9 @@

            AMR 0.4.0 2018-10-01

            -
            +

            -New

            +New
            • The data set microorganisms now contains all microbial taxonomic data from ITIS (kingdoms Bacteria, Fungi and Protozoa), the Integrated Taxonomy Information System, available via https://itis.gov. The data set now contains more than 18,000 microorganisms with all known bacteria, fungi and protozoa according ITIS with genus, species, subspecies, family, order, class, phylum and subkingdom. The new data set microorganisms.old contains all previously known taxonomic names from those kingdoms.

            • @@ -1251,9 +1271,9 @@

              AMR 0.3.0 2018-08-14

              -
              +

              -New

              +New
              • BREAKING: rsi_df was removed in favour of new functions portion_R, portion_IR, portion_I, portion_SI and portion_S to selectively calculate resistance or susceptibility. These functions are 20 to 30 times faster than the old rsi function. The old function still works, but is deprecated. @@ -1388,9 +1408,9 @@

                AMR 0.2.0 2018-05-03

                -
                +

                -New

                +New
                • Full support for Windows, Linux and macOS
                • Full support for old R versions, only R-3.0.0 (April 2013) or later is needed (needed packages may have other dependencies)
                • @@ -1469,6 +1489,7 @@

                  Contents

                  @@ -115,6 +115,13 @@ Predict antimicrobial resistance +
                • + + + + Conduct Principal Component Analysis for AMR + +
                • @@ -250,7 +257,7 @@

                  Value

                  -

                  An integer (no decimals) if exact = FALSE, a double (with decimals) otherwise

                  +

                  An integer (no decimals) if exact = FALSE, a double (with decimals) otherwise

                  Stable lifecycle

                  diff --git a/docs/reference/age_groups.html b/docs/reference/age_groups.html index 7a5ce44c..1d52932e 100644 --- a/docs/reference/age_groups.html +++ b/docs/reference/age_groups.html @@ -79,7 +79,7 @@ AMR (for R) - 1.0.1 + 1.0.1.9000
                @@ -115,6 +115,13 @@ Predict antimicrobial resistance
              • +
              • + + + + Conduct Principal Component Analysis for AMR + +
              • @@ -240,23 +247,23 @@
  • - +
    mo
    na.rm

    a logical to indicate whether missing values should be removed

    a logical to indicate whether missing values should be removed

    Value

    -

    Ordered factor

    +

    Ordered factor

    Details

    -

    To split ages, the input can be: