(v1.0.1.9002) PCA unit tests

2025-08-25 13:42:10 +02:00 · 2020-03-08 11:18:59 +01:00
parent 9fc858f208
commit 77656a676c
20 changed files with 182 additions and 135 deletions
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -43,7 +43,7 @@ check_dataset_integrity <- function() {
                               "iv_ddd", "iv_units", "loinc") %in% colnames(antibiotics),
                             na.rm = TRUE)
  }, error = function(e)
-    stop('Please use the command \'library("AMR")\' before using this function, to load the needed reference data.', call. = FALSE)
+    stop('Please use the command \'library("AMR")\' before using this function, to load the required reference data.', call. = FALSE)
  )
  if (!check_microorganisms | !check_antibiotics) {
    stop("Data set `microorganisms` or data set `antibiotics` is overwritten by your global environment and prevents the AMR package from working correctly. Please rename your object before using this function.", call. = FALSE)
@@ -154,6 +154,13 @@ stopifnot_installed_package <- function(package) {
  return(invisible())
 }

+stopifnot_msg <- function(expr, msg) {
+  if (!isTRUE(expr)) {
+    stop(msg, call. = FALSE)
+  }
+}
+
+
 "%or%" <- function(x, y) {
  if (is.null(x) | is.null(y)) {
    if (is.null(x)) {
--- a/R/ggplot_pca.R
+++ b/R/ggplot_pca.R
@@ -49,9 +49,9 @@
 #' 1. Rewritten code to remove the dependency on packages `plyr`, `scales` and `grid`
 #' 2. Parametrised more options, like arrow and ellipse settings
 #' 3. Added total amount of explained variance as a caption in the plot
-#' 4. Cleaned all syntax based on the `lintr` package
+#' 4. Cleaned all syntax based on the `lintr` package and added integrity checks
 #' 5. Updated documentation
-#' @details The default colours for labels and points is set with [scale_colour_viridis_d()], but these can be changed by adding another scale for colour, like [scale_colour_brewer()].
+#' @details The colours for labels and points can be changed by adding another scale layer for colour, like [scale_colour_viridis_d()] or [scale_colour_brewer()].
 #' @rdname ggplot_pca
 #' @export
 #' @examples 
@@ -74,14 +74,15 @@
 ggplot_pca <- function(x,
                       choices = 1:2,
                       scale = TRUE,
+                       pc.biplot = TRUE,
                       labels = NULL,
                       labels_textsize = 3,
                       labels_text_placement = 1.5,
                       groups = NULL,
-                       ellipse = FALSE,
+                       ellipse = TRUE,
                       ellipse_prob = 0.68,
                       ellipse_size = 0.5,
-                       ellipse_alpha = 0.25,
+                       ellipse_alpha = 0.5,
                       points_size = 2,
                       points_alpha = 0.25,
                       arrows = TRUE,
@@ -93,6 +94,21 @@ ggplot_pca <- function(x,
                       ...) {
  
  stopifnot_installed_package("ggplot2")
+  stopifnot_msg(length(choices) == 2, "`choices` must be of length 2")
+  stopifnot_msg(is.logical(scale), "`scale` must be TRUE or FALSE")
+  stopifnot_msg(is.logical(pc.biplot), "`pc.biplot` must be TRUE or FALSE")
+  stopifnot_msg(is.numeric(choices), "`choices` must be numeric")
+  stopifnot_msg(is.numeric(labels_textsize), "`labels_textsize` must be numeric")
+  stopifnot_msg(is.numeric(labels_text_placement), "`labels_text_placement` must be numeric")
+  stopifnot_msg(is.logical(ellipse), "`ellipse` must be TRUE or FALSE")
+  stopifnot_msg(is.numeric(ellipse_prob), "`ellipse_prob` must be numeric")
+  stopifnot_msg(is.numeric(ellipse_size), "`ellipse_size` must be numeric")
+  stopifnot_msg(is.numeric(ellipse_alpha), "`ellipse_alpha` must be numeric")
+  stopifnot_msg(is.logical(arrows), "`arrows` must be TRUE or FALSE")
+  stopifnot_msg(is.numeric(arrows_size), "`arrows_size` must be numeric")
+  stopifnot_msg(is.numeric(arrows_textsize), "`arrows_textsize` must be numeric")
+  stopifnot_msg(is.numeric(arrows_alpha), "`arrows_alpha` must be numeric")
+  stopifnot_msg(is.numeric(base_textsize), "`base_textsize` must be numeric")
  
  calculations <- pca_calculations(pca_model = x,
                                   groups = groups, 
@@ -101,6 +117,7 @@ ggplot_pca <- function(x,
                                   labels_missing = missing(labels),
                                   choices = choices,
                                   scale = scale,
+                                   pc.biplot = pc.biplot,
                                   ellipse_prob = ellipse_prob,
                                   labels_text_placement = labels_text_placement)
  nobs.factor <- calculations$nobs.factor
@@ -116,17 +133,16 @@ ggplot_pca <- function(x,
  group_name <- calculations$group_name
  labels <- calculations$labels
  
-  stopifnot(length(choices) == 2)
-  
  # Append the proportion of explained variance to the axis labels
  if ((1 - as.integer(scale)) == 0) {
-    u.axis.labs <- paste("Standardised PC", choices, sep = "")
+    u.axis.labs <- paste0("Standardised PC", choices)
  } else {
-    u.axis.labs <- paste("PC", choices, sep = "")
+    u.axis.labs <- paste0("PC", choices)
  }
-  u.axis.labs <- paste(u.axis.labs,
-                       paste0("\n(explained var: ", 
-                              percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)), ")"))
+  u.axis.labs <- paste0(u.axis.labs,
+                        paste0("\n(explained var: ", 
+                               percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)),
+                               ")"))
  
  # Score Labels
  if (!is.null(labels)) {
@@ -138,7 +154,6 @@ ggplot_pca <- function(x,
    df.u$groups <- groups
  }
  
-  
  # Base plot
  g <- ggplot2::ggplot(data = df.u, 
                       ggplot2::aes(x = xvar, y = yvar)) + 
@@ -150,30 +165,25 @@ ggplot_pca <- function(x,
  # Draw either labels or points
  if (!is.null(df.u$labels)) {
    if (!is.null(df.u$groups)) {
-      g <- g + 
-        ggplot2::geom_point(ggplot2::aes(colour = groups),
-                            alpha = points_alpha, 
-                            size = points_size) +
+      g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
+                                   alpha = points_alpha, 
+                                   size = points_size) +
        ggplot2::geom_text(ggplot2::aes(label = labels, colour = groups),
                           nudge_y = -0.05,
                           size = labels_textsize) +
-        ggplot2::scale_colour_viridis_d() +
        ggplot2::labs(colour = group_name)
    } else {
-      g <- g + 
-        ggplot2::geom_point(alpha = points_alpha, 
-                            size = points_size) +
+      g <- g + ggplot2::geom_point(alpha = points_alpha, 
+                                   size = points_size) +
        ggplot2::geom_text(ggplot2::aes(label = labels),
                           nudge_y = -0.05,
                           size = labels_textsize)      
    }
  } else {
    if (!is.null(df.u$groups)) {
-      g <- g + 
-        ggplot2::geom_point(ggplot2::aes(colour = groups),
-                            alpha = points_alpha,
-                            size = points_size) +
-        ggplot2::scale_colour_viridis_d() +
+      g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
+                                   alpha = points_alpha,
+                                   size = points_size) +
        ggplot2::labs(colour = group_name)
    } else {
      g <- g + ggplot2::geom_point(alpha = points_alpha,
@@ -182,26 +192,24 @@ ggplot_pca <- function(x,
  }
  
  # Overlay a concentration ellipse if there are groups
-  if (!is.null(df.u$groups) & isTRUE(ellipse)) {
-    g <- g + 
-      ggplot2::geom_path(data = ell, 
-                         ggplot2::aes(colour = groups, group = groups),
-                         size = ellipse_size,
-                         alpha = points_alpha)
+  if (!is.null(df.u$groups) & !is.null(ell) & isTRUE(ellipse)) {
+    g <- g + ggplot2::geom_path(data = ell, 
+                                ggplot2::aes(colour = groups, group = groups),
+                                size = ellipse_size,
+                                alpha = points_alpha)
  }
  
  # Label the variable axes
  if (arrows == TRUE) {
-    g <- g + 
-      ggplot2::geom_segment(data = df.v,
-                            ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
-                            arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
-                                                   angle = 20,
-                                                   ends = "last",
-                                                   type = "open"), 
-                            colour = arrows_colour, 
-                            size = arrows_size,
-                            alpha = arrows_alpha) +
+    g <- g + ggplot2::geom_segment(data = df.v,
+                                   ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
+                                   arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
+                                                          angle = 20,
+                                                          ends = "last",
+                                                          type = "open"), 
+                                   colour = arrows_colour, 
+                                   size = arrows_size,
+                                   alpha = arrows_alpha) +
      ggplot2::geom_text(data = df.v, 
                         ggplot2::aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust), 
                         colour = arrows_colour,
@@ -225,6 +233,7 @@ ggplot_pca <- function(x,
 }

 #' @importFrom dplyr bind_rows
+#' @importFrom stats qchisq var
 pca_calculations <- function(pca_model,
                             groups = NULL,
                             groups_missing = TRUE,
@@ -232,6 +241,7 @@ pca_calculations <- function(pca_model,
                             labels_missing = TRUE,
                             choices = 1:2,
                             scale = 1,
+                             pc.biplot = TRUE,
                             ellipse_prob = 0.68,
                             labels_text_placement = 1.5) {
  
@@ -291,7 +301,9 @@ pca_calculations <- function(pca_model,
  names(df.u) <- c("xvar", "yvar")
  names(df.v) <- names(df.u)
  
-  df.u <- df.u * nobs.factor
+  if (isTRUE(pc.biplot)) {
+    df.u <- df.u * nobs.factor
+  }
  
  # Scale the radius of the correlation circle so that it corresponds to 
  # a data ellipse for the standardized PC scores
@@ -314,8 +326,8 @@ pca_calculations <- function(pca_model,
  df.v$hjust <- with(df.v, (1 - labels_text_placement * sign(xvar)) / 2)
  
  if (!is.null(df.u$groups)) {
-    theta <<- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
-    circle <<- cbind(cos(theta), sin(theta))
+    theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
+    circle <- cbind(cos(theta), sin(theta))
    ell <- bind_rows(
      sapply(unique(df.u$groups), function(g, df = df.u) {
        x <- df[which(df$groups == g), , drop = FALSE]
@@ -325,18 +337,18 @@ pca_calculations <- function(pca_model,
        sigma <- var(cbind(x$xvar, x$yvar))
        mu <- c(mean(x$xvar), mean(x$yvar))
        ed <- sqrt(qchisq(ellipse_prob, df = 2))
-        el <- data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"), 
-                   groups = x$groups[1])
-        names(el)[1:2] <- c("xvar", "yvar")
-        el
+        data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"), 
+                         groups = x$groups[1])
      }))
    if (NROW(ell) == 0) {
      ell <- NULL
+    } else {
+      names(ell)[1:2] <- c("xvar", "yvar")
    }
  } else {
    ell <- NULL
  }
-
+  
  list(nobs.factor = nobs.factor,
       d = d,
       u = u,
@@ -350,5 +362,4 @@ pca_calculations <- function(pca_model,
       group_name = group_name,
       labels = labels
  )
-  
 }
--- a/R/globals.R
+++ b/R/globals.R
@@ -24,6 +24,7 @@ globalVariables(c(".",
                  "ab",
                  "ab_txt",
                  "abbreviations",
+                  "angle",
                  "antibiotic",
                  "antibiotics",
                  "CNS_CPS",
@@ -40,6 +41,7 @@ globalVariables(c(".",
                  "genus",
                  "gramstain",
                  "group",
+                  "hjust",
                  "index",
                  "input",
                  "interpretation",
@@ -100,7 +102,10 @@ globalVariables(c(".",
                  "txt",
                  "uncertainty_level",
                  "value",
+                  "varname",
                  "x",
                  "xdr",
+                  "xvar",
                  "y",
-                  "year"))
+                  "year",
+                  "yvar"))
--- a/R/pca.R
+++ b/R/pca.R
@@ -21,17 +21,18 @@

 #' Principal Component Analysis (for AMR)
 #' 
-#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels.
+#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels, and automatic filtering on only suitable (i.e. non-empty and numeric) variables.
 #' @inheritSection lifecycle Experimental lifecycle
 #' @param x a [data.frame] containing numeric columns
 #' @param ... columns of `x` to be selected for PCA
 #' @inheritParams stats::prcomp
-#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the R function [prcomp()].
+#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the \R function [prcomp()].
 #' 
-#' The result of the [pca()] function is a [`prcomp`] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()].
-#' @rdname pca
-#' @exportMethod prcomp.data.frame
+#' The result of the [pca()] function is a [prcomp] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()].
+#' @return An object of classes [pca] and [prcomp]
 #' @importFrom stats prcomp
+#' @importFrom dplyr ungroup %>% filter_all all_vars
+#' @importFrom rlang enquos eval_tidy
 #' @export
 #' @examples 
 #' # `example_isolates` is a dataset available in the AMR package.
@@ -52,40 +53,19 @@
 #' summary(pca_result)
 #' biplot(pca_result)
 #' ggplot_pca(pca_result) # a new and convenient plot function
-prcomp.data.frame <- function(x,
-                              ...,
-                              retx = TRUE,
-                              center = TRUE, 
-                              scale. = TRUE,
-                              tol = NULL,
-                              rank. = NULL) {
+pca <- function(x,
+                ...,
+                retx = TRUE,
+                center = TRUE, 
+                scale. = TRUE,
+                tol = NULL,
+                rank. = NULL) {
  
-  x <- pca_transform_x(x = x, ... = ...)
-  pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))]
-
-  message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"),
-                      ".\n      Total observations available: ", nrow(pca_data), ".")))
-  
-  stats:::prcomp.default(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.)
-}
-
-#' @rdname pca
-#' @export
-pca <- function(x, ...) {
  if (!is.data.frame(x)) {
    stop("this function only takes a data.frame as input")
  }
-  pca_model <- prcomp(x, ...)
  
-  x <- pca_transform_x(x = x, ... = ...)
-  attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE]
-  pca_model
-}
-
-#' @importFrom dplyr ungroup %>% filter_all all_vars
-#' @importFrom rlang enquos eval_tidy
-pca_transform_x <- function(x, ...) {
-  # unset data.table, tbl_df, etc.
+  # unset data.table, tibble, etc.
  # also removes groups made by dplyr::group_by
  x <- as.data.frame(x, stringsAsFactors = FALSE)
  x.bak <- x
@@ -123,7 +103,17 @@ pca_transform_x <- function(x, ...) {
    x <- cbind(x.bak[, sapply(x.bak, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE], x)
  }
  
-  x %>%
+  x <- x %>%
    ungroup() %>% # would otherwise select the grouping vars
    filter_all(all_vars(!is.na(.)))
+  
+  pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))]
+  
+  message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"),
+                      ".\n      Total observations available: ", nrow(pca_data), ".")))
+  
+  pca_model <- prcomp(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.)
+  attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE]
+  class(pca_model) <- c("pca", class(pca_model))
+  pca_model
 }