diff --git a/DESCRIPTION b/DESCRIPTION index 8f626de4..ae0e5c7b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: AMR -Version: 1.0.1.9001 +Version: 1.0.1.9002 Date: 2020-03-08 Title: Antimicrobial Resistance Analysis Authors@R: c( diff --git a/NAMESPACE b/NAMESPACE index 9567031b..6c6487ca 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,7 +37,6 @@ S3method(pillar_shaft,rsi) S3method(plot,mic) S3method(plot,resistance_predict) S3method(plot,rsi) -S3method(prcomp,data.frame) S3method(print,ab) S3method(print,bug_drug_combinations) S3method(print,catalogue_of_life_version) @@ -227,7 +226,6 @@ exportMethods(kurtosis.default) exportMethods(kurtosis.matrix) exportMethods(plot.mic) exportMethods(plot.rsi) -exportMethods(prcomp.data.frame) exportMethods(print.ab) exportMethods(print.bug_drug_combinations) exportMethods(print.catalogue_of_life_version) @@ -329,6 +327,8 @@ importFrom(stats,lm) importFrom(stats,pchisq) importFrom(stats,prcomp) importFrom(stats,predict) +importFrom(stats,qchisq) +importFrom(stats,var) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) importFrom(utils,adist) diff --git a/NEWS.md b/NEWS.md index 6119f607..4560f753 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,5 @@ -# AMR 1.0.1.9001 +# AMR 1.0.1.9002 +## Last updated: 08-Mar-2020 ### New * Support for easy principal component analysis for AMR, using the new `pca()` function diff --git a/R/misc.R b/R/aa_helper_functions.R similarity index 97% rename from R/misc.R rename to R/aa_helper_functions.R index d007bf7a..ca879602 100755 --- a/R/misc.R +++ b/R/aa_helper_functions.R @@ -43,7 +43,7 @@ check_dataset_integrity <- function() { "iv_ddd", "iv_units", "loinc") %in% colnames(antibiotics), na.rm = TRUE) }, error = function(e) - stop('Please use the command \'library("AMR")\' before using this function, to load the needed reference data.', call. = FALSE) + stop('Please use the command \'library("AMR")\' before using this function, to load the required reference data.', call. = FALSE) ) if (!check_microorganisms | !check_antibiotics) { stop("Data set `microorganisms` or data set `antibiotics` is overwritten by your global environment and prevents the AMR package from working correctly. Please rename your object before using this function.", call. = FALSE) @@ -154,6 +154,13 @@ stopifnot_installed_package <- function(package) { return(invisible()) } +stopifnot_msg <- function(expr, msg) { + if (!isTRUE(expr)) { + stop(msg, call. = FALSE) + } +} + + "%or%" <- function(x, y) { if (is.null(x) | is.null(y)) { if (is.null(x)) { diff --git a/R/ggplot_pca.R b/R/ggplot_pca.R index d512609a..e7764cae 100755 --- a/R/ggplot_pca.R +++ b/R/ggplot_pca.R @@ -49,9 +49,9 @@ #' 1. Rewritten code to remove the dependency on packages `plyr`, `scales` and `grid` #' 2. Parametrised more options, like arrow and ellipse settings #' 3. Added total amount of explained variance as a caption in the plot -#' 4. Cleaned all syntax based on the `lintr` package +#' 4. Cleaned all syntax based on the `lintr` package and added integrity checks #' 5. Updated documentation -#' @details The default colours for labels and points is set with [scale_colour_viridis_d()], but these can be changed by adding another scale for colour, like [scale_colour_brewer()]. +#' @details The colours for labels and points can be changed by adding another scale layer for colour, like [scale_colour_viridis_d()] or [scale_colour_brewer()]. #' @rdname ggplot_pca #' @export #' @examples @@ -74,14 +74,15 @@ ggplot_pca <- function(x, choices = 1:2, scale = TRUE, + pc.biplot = TRUE, labels = NULL, labels_textsize = 3, labels_text_placement = 1.5, groups = NULL, - ellipse = FALSE, + ellipse = TRUE, ellipse_prob = 0.68, ellipse_size = 0.5, - ellipse_alpha = 0.25, + ellipse_alpha = 0.5, points_size = 2, points_alpha = 0.25, arrows = TRUE, @@ -93,6 +94,21 @@ ggplot_pca <- function(x, ...) { stopifnot_installed_package("ggplot2") + stopifnot_msg(length(choices) == 2, "`choices` must be of length 2") + stopifnot_msg(is.logical(scale), "`scale` must be TRUE or FALSE") + stopifnot_msg(is.logical(pc.biplot), "`pc.biplot` must be TRUE or FALSE") + stopifnot_msg(is.numeric(choices), "`choices` must be numeric") + stopifnot_msg(is.numeric(labels_textsize), "`labels_textsize` must be numeric") + stopifnot_msg(is.numeric(labels_text_placement), "`labels_text_placement` must be numeric") + stopifnot_msg(is.logical(ellipse), "`ellipse` must be TRUE or FALSE") + stopifnot_msg(is.numeric(ellipse_prob), "`ellipse_prob` must be numeric") + stopifnot_msg(is.numeric(ellipse_size), "`ellipse_size` must be numeric") + stopifnot_msg(is.numeric(ellipse_alpha), "`ellipse_alpha` must be numeric") + stopifnot_msg(is.logical(arrows), "`arrows` must be TRUE or FALSE") + stopifnot_msg(is.numeric(arrows_size), "`arrows_size` must be numeric") + stopifnot_msg(is.numeric(arrows_textsize), "`arrows_textsize` must be numeric") + stopifnot_msg(is.numeric(arrows_alpha), "`arrows_alpha` must be numeric") + stopifnot_msg(is.numeric(base_textsize), "`base_textsize` must be numeric") calculations <- pca_calculations(pca_model = x, groups = groups, @@ -101,6 +117,7 @@ ggplot_pca <- function(x, labels_missing = missing(labels), choices = choices, scale = scale, + pc.biplot = pc.biplot, ellipse_prob = ellipse_prob, labels_text_placement = labels_text_placement) nobs.factor <- calculations$nobs.factor @@ -116,17 +133,16 @@ ggplot_pca <- function(x, group_name <- calculations$group_name labels <- calculations$labels - stopifnot(length(choices) == 2) - # Append the proportion of explained variance to the axis labels if ((1 - as.integer(scale)) == 0) { - u.axis.labs <- paste("Standardised PC", choices, sep = "") + u.axis.labs <- paste0("Standardised PC", choices) } else { - u.axis.labs <- paste("PC", choices, sep = "") + u.axis.labs <- paste0("PC", choices) } - u.axis.labs <- paste(u.axis.labs, - paste0("\n(explained var: ", - percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)), ")")) + u.axis.labs <- paste0(u.axis.labs, + paste0("\n(explained var: ", + percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)), + ")")) # Score Labels if (!is.null(labels)) { @@ -138,7 +154,6 @@ ggplot_pca <- function(x, df.u$groups <- groups } - # Base plot g <- ggplot2::ggplot(data = df.u, ggplot2::aes(x = xvar, y = yvar)) + @@ -150,30 +165,25 @@ ggplot_pca <- function(x, # Draw either labels or points if (!is.null(df.u$labels)) { if (!is.null(df.u$groups)) { - g <- g + - ggplot2::geom_point(ggplot2::aes(colour = groups), - alpha = points_alpha, - size = points_size) + + g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups), + alpha = points_alpha, + size = points_size) + ggplot2::geom_text(ggplot2::aes(label = labels, colour = groups), nudge_y = -0.05, size = labels_textsize) + - ggplot2::scale_colour_viridis_d() + ggplot2::labs(colour = group_name) } else { - g <- g + - ggplot2::geom_point(alpha = points_alpha, - size = points_size) + + g <- g + ggplot2::geom_point(alpha = points_alpha, + size = points_size) + ggplot2::geom_text(ggplot2::aes(label = labels), nudge_y = -0.05, size = labels_textsize) } } else { if (!is.null(df.u$groups)) { - g <- g + - ggplot2::geom_point(ggplot2::aes(colour = groups), - alpha = points_alpha, - size = points_size) + - ggplot2::scale_colour_viridis_d() + + g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups), + alpha = points_alpha, + size = points_size) + ggplot2::labs(colour = group_name) } else { g <- g + ggplot2::geom_point(alpha = points_alpha, @@ -182,26 +192,24 @@ ggplot_pca <- function(x, } # Overlay a concentration ellipse if there are groups - if (!is.null(df.u$groups) & isTRUE(ellipse)) { - g <- g + - ggplot2::geom_path(data = ell, - ggplot2::aes(colour = groups, group = groups), - size = ellipse_size, - alpha = points_alpha) + if (!is.null(df.u$groups) & !is.null(ell) & isTRUE(ellipse)) { + g <- g + ggplot2::geom_path(data = ell, + ggplot2::aes(colour = groups, group = groups), + size = ellipse_size, + alpha = points_alpha) } # Label the variable axes if (arrows == TRUE) { - g <- g + - ggplot2::geom_segment(data = df.v, - ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar), - arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"), - angle = 20, - ends = "last", - type = "open"), - colour = arrows_colour, - size = arrows_size, - alpha = arrows_alpha) + + g <- g + ggplot2::geom_segment(data = df.v, + ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar), + arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"), + angle = 20, + ends = "last", + type = "open"), + colour = arrows_colour, + size = arrows_size, + alpha = arrows_alpha) + ggplot2::geom_text(data = df.v, ggplot2::aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust), colour = arrows_colour, @@ -225,6 +233,7 @@ ggplot_pca <- function(x, } #' @importFrom dplyr bind_rows +#' @importFrom stats qchisq var pca_calculations <- function(pca_model, groups = NULL, groups_missing = TRUE, @@ -232,6 +241,7 @@ pca_calculations <- function(pca_model, labels_missing = TRUE, choices = 1:2, scale = 1, + pc.biplot = TRUE, ellipse_prob = 0.68, labels_text_placement = 1.5) { @@ -291,7 +301,9 @@ pca_calculations <- function(pca_model, names(df.u) <- c("xvar", "yvar") names(df.v) <- names(df.u) - df.u <- df.u * nobs.factor + if (isTRUE(pc.biplot)) { + df.u <- df.u * nobs.factor + } # Scale the radius of the correlation circle so that it corresponds to # a data ellipse for the standardized PC scores @@ -314,8 +326,8 @@ pca_calculations <- function(pca_model, df.v$hjust <- with(df.v, (1 - labels_text_placement * sign(xvar)) / 2) if (!is.null(df.u$groups)) { - theta <<- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) - circle <<- cbind(cos(theta), sin(theta)) + theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) + circle <- cbind(cos(theta), sin(theta)) ell <- bind_rows( sapply(unique(df.u$groups), function(g, df = df.u) { x <- df[which(df$groups == g), , drop = FALSE] @@ -325,18 +337,18 @@ pca_calculations <- function(pca_model, sigma <- var(cbind(x$xvar, x$yvar)) mu <- c(mean(x$xvar), mean(x$yvar)) ed <- sqrt(qchisq(ellipse_prob, df = 2)) - el <- data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"), - groups = x$groups[1]) - names(el)[1:2] <- c("xvar", "yvar") - el + data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"), + groups = x$groups[1]) })) if (NROW(ell) == 0) { ell <- NULL + } else { + names(ell)[1:2] <- c("xvar", "yvar") } } else { ell <- NULL } - + list(nobs.factor = nobs.factor, d = d, u = u, @@ -350,5 +362,4 @@ pca_calculations <- function(pca_model, group_name = group_name, labels = labels ) - } diff --git a/R/globals.R b/R/globals.R index 3121afed..a4915079 100755 --- a/R/globals.R +++ b/R/globals.R @@ -24,6 +24,7 @@ globalVariables(c(".", "ab", "ab_txt", "abbreviations", + "angle", "antibiotic", "antibiotics", "CNS_CPS", @@ -40,6 +41,7 @@ globalVariables(c(".", "genus", "gramstain", "group", + "hjust", "index", "input", "interpretation", @@ -100,7 +102,10 @@ globalVariables(c(".", "txt", "uncertainty_level", "value", + "varname", "x", "xdr", + "xvar", "y", - "year")) + "year", + "yvar")) diff --git a/R/pca.R b/R/pca.R index f6a9a2e1..bef7abd3 100755 --- a/R/pca.R +++ b/R/pca.R @@ -21,17 +21,18 @@ #' Principal Component Analysis (for AMR) #' -#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels. +#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels, and automatic filtering on only suitable (i.e. non-empty and numeric) variables. #' @inheritSection lifecycle Experimental lifecycle #' @param x a [data.frame] containing numeric columns #' @param ... columns of `x` to be selected for PCA #' @inheritParams stats::prcomp -#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the R function [prcomp()]. +#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the \R function [prcomp()]. #' -#' The result of the [pca()] function is a [`prcomp`] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()]. -#' @rdname pca -#' @exportMethod prcomp.data.frame +#' The result of the [pca()] function is a [prcomp] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()]. +#' @return An object of classes [pca] and [prcomp] #' @importFrom stats prcomp +#' @importFrom dplyr ungroup %>% filter_all all_vars +#' @importFrom rlang enquos eval_tidy #' @export #' @examples #' # `example_isolates` is a dataset available in the AMR package. @@ -52,40 +53,19 @@ #' summary(pca_result) #' biplot(pca_result) #' ggplot_pca(pca_result) # a new and convenient plot function -prcomp.data.frame <- function(x, - ..., - retx = TRUE, - center = TRUE, - scale. = TRUE, - tol = NULL, - rank. = NULL) { +pca <- function(x, + ..., + retx = TRUE, + center = TRUE, + scale. = TRUE, + tol = NULL, + rank. = NULL) { - x <- pca_transform_x(x = x, ... = ...) - pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))] - - message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"), - ".\n Total observations available: ", nrow(pca_data), "."))) - - stats:::prcomp.default(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.) -} - -#' @rdname pca -#' @export -pca <- function(x, ...) { if (!is.data.frame(x)) { stop("this function only takes a data.frame as input") } - pca_model <- prcomp(x, ...) - x <- pca_transform_x(x = x, ... = ...) - attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE] - pca_model -} - -#' @importFrom dplyr ungroup %>% filter_all all_vars -#' @importFrom rlang enquos eval_tidy -pca_transform_x <- function(x, ...) { - # unset data.table, tbl_df, etc. + # unset data.table, tibble, etc. # also removes groups made by dplyr::group_by x <- as.data.frame(x, stringsAsFactors = FALSE) x.bak <- x @@ -123,7 +103,17 @@ pca_transform_x <- function(x, ...) { x <- cbind(x.bak[, sapply(x.bak, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE], x) } - x %>% + x <- x %>% ungroup() %>% # would otherwise select the grouping vars filter_all(all_vars(!is.na(.))) + + pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))] + + message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"), + ".\n Total observations available: ", nrow(pca_data), "."))) + + pca_model <- prcomp(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.) + attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE] + class(pca_model) <- c("pca", class(pca_model)) + pca_model } diff --git a/docs/404.html b/docs/404.html index 2da52a13..79c6d533 100644 --- a/docs/404.html +++ b/docs/404.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 83237154..155d95ca 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002 diff --git a/docs/articles/index.html b/docs/articles/index.html index 81472786..28f4f619 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002 diff --git a/docs/authors.html b/docs/authors.html index 4ec83601..beceee87 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002 diff --git a/docs/index.html b/docs/index.html index 616492cf..b33d8bda 100644 --- a/docs/index.html +++ b/docs/index.html @@ -43,7 +43,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002 diff --git a/docs/news/index.html b/docs/news/index.html index 2f69e007..c6d9537c 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002 @@ -226,10 +226,14 @@ -
+

-AMR 1.0.1.9001 Unreleased +AMR 1.0.1.9002 Unreleased

+
+

+Last updated: 08-Mar-2020 +

New

@@ -238,6 +242,7 @@
  • Plotting biplots for principal component analysis using the new ggplot_pca() function
  • +

    @@ -1489,7 +1494,7 @@

    Contents

    @@ -236,14 +236,15 @@ x, choices = 1:2, scale = TRUE, + pc.biplot = TRUE, labels = NULL, labels_textsize = 3, labels_text_placement = 1.5, groups = NULL, - ellipse = FALSE, + ellipse = TRUE, ellipse_prob = 0.68, ellipse_size = 0.5, - ellipse_alpha = 0.25, + ellipse_alpha = 0.5, points_size = 2, points_alpha = 0.25, arrows = TRUE, @@ -275,6 +276,14 @@ princomp. Normally 0 <= scale <= 1, and a warning will be issued if the specified scale is outside this range.

    + + pc.biplot +

    If true, use what Gabriel (1971) refers to as a "principal component + biplot", with lambda = 1 and observations scaled up by sqrt(n) and + variables scaled down by sqrt(n). Then inner products between + variables approximate covariances and distances between observations + approximate Mahalanobis distance.

    + labels

    an optional vector of labels for the observations. If set, the labels will be placed below their respective points. When using the pca() function as input for x, this will be determined automatically based on the attribute non_numeric_cols, see pca().

    @@ -352,13 +361,13 @@
  • Rewritten code to remove the dependency on packages plyr, scales and grid

  • Parametrised more options, like arrow and ellipse settings

  • Added total amount of explained variance as a caption in the plot

  • -
  • Cleaned all syntax based on the lintr package

  • +
  • Cleaned all syntax based on the lintr package and added integrity checks

  • Updated documentation

  • Details

    -

    The default colours for labels and points is set with scale_colour_viridis_d(), but these can be changed by adding another scale for colour, like scale_colour_brewer().

    +

    The colours for labels and points can be changed by adding another scale layer for colour, like scale_colour_viridis_d() or scale_colour_brewer().

    Maturing lifecycle

    diff --git a/docs/reference/index.html b/docs/reference/index.html index 47037b36..d8e3cb48 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -78,7 +78,7 @@ AMR (for R) - 1.0.1.9001 + 1.0.1.9002
    @@ -403,7 +403,7 @@ -

    prcomp(<data.frame>) pca()

    +

    pca()

    Principal Component Analysis (for AMR)

    diff --git a/docs/reference/pca.html b/docs/reference/pca.html index c131227b..ca321567 100644 --- a/docs/reference/pca.html +++ b/docs/reference/pca.html @@ -6,7 +6,7 @@ -Principal Component Analysis (for AMR) — prcomp.data.frame • AMR (for R) +Principal Component Analysis (for AMR) — pca • AMR (for R) @@ -44,8 +44,8 @@ - - + + @@ -79,7 +79,7 @@ AMR (for R) - 1.0.1.9000 + 1.0.1.9002
    @@ -229,11 +229,10 @@
    -

    Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels.

    +

    Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels, and automatic filtering on only suitable (i.e. non-empty and numeric) variables.

    -
    # S3 method for data.frame
    -prcomp(
    +    
    pca(
       x,
       ...,
       retx = TRUE,
    @@ -241,9 +240,7 @@
       scale. = TRUE,
       tol = NULL,
       rank. = NULL
    -)
    -
    -pca(x, ...)
    +)

    Arguments

    @@ -297,10 +294,13 @@
    +

    Value

    + +

    An object of classes pca and prcomp

    Details

    -

    The pca() function takes a data.frame as input and performs the actual PCA with the R function prcomp().

    -

    The result of the pca() function is a prcomp object, with an additional attribute non_numeric_cols which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by ggplot_pca().

    +

    The pca() function takes a data.frame as input and performs the actual PCA with the R function prcomp().

    +

    The result of the pca() function is a prcomp object, with an additional attribute non_numeric_cols which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by ggplot_pca().

    Experimental lifecycle

    @@ -332,6 +332,7 @@ The lifecycle of this function is experimen

    Contents