AMR/R/ggplot_pca.R

366 lines
16 KiB
R
Raw Normal View History

2020-03-07 21:48:21 +01:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# SOURCE #
# https://gitlab.com/msberends/AMR #
# #
# LICENCE #
# (c) 2018-2020 Berends MS, Luz CF et al. #
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
# Visit our website for more info: https://msberends.gitlab.io/AMR. #
# ==================================================================== #
#' PCA biplot with `ggplot2`
#'
2020-04-13 21:09:56 +02:00
#' Produces a `ggplot2` variant of a so-called [biplot](https://en.wikipedia.org/wiki/Biplot) for PCA (principal component analysis), but is more flexible and more appealing than the base \R [biplot()] function.
2020-03-07 21:48:21 +01:00
#' @inheritSection lifecycle Maturing lifecycle
#' @param x an object returned by [pca()], [prcomp()] or [princomp()]
#' @inheritParams stats::biplot.prcomp
#' @param labels an optional vector of labels for the observations. If set, the labels will be placed below their respective points. When using the [pca()] function as input for `x`, this will be determined automatically based on the attribute `non_numeric_cols`, see [pca()].
#' @param labels_textsize the size of the text used for the labels
#' @param labels_text_placement adjustment factor the placement of the variable names (`>=1` means further away from the arrow head)
#' @param groups an optional vector of groups for the labels, with the same length as `labels`. If set, the points and labels will be coloured according to these groups. When using the [pca()] function as input for `x`, this will be determined automatically based on the attribute `non_numeric_cols`, see [pca()].
#' @param ellipse a logical to indicate whether a normal data ellipse should be drawn for each group (set with `groups`)
#' @param ellipse_prob statistical size of the ellipse in normal probability
#' @param ellipse_size the size of the ellipse line
#' @param ellipse_alpha the alpha (transparency) of the ellipse line
2020-03-08 09:12:11 +01:00
#' @param points_size the size of the points
2020-03-07 21:48:21 +01:00
#' @param points_alpha the alpha (transparency) of the points
#' @param arrows a logical to indicate whether arrows should be drawn
#' @param arrows_textsize the size of the text for variable names
#' @param arrows_colour the colour of the arrow and their text
#' @param arrows_size the size (thickness) of the arrow lines
#' @param arrows_textsize the size of the text at the end of the arrows
#' @param arrows_alpha the alpha (transparency) of the arrows and their text
#' @param base_textsize the text size for all plot elements except the labels and arrows
#' @param ... Parameters passed on to functions
2020-03-08 09:12:11 +01:00
#' @source The [ggplot_pca()] function is based on the `ggbiplot()` function from the `ggbiplot` package by Vince Vu, as found on GitHub: <https://github.com/vqv/ggbiplot> (retrieved: 2 March 2020, their latest commit: [`7325e88`](https://github.com/vqv/ggbiplot/commit/7325e880485bea4c07465a0304c470608fffb5d9); 12 February 2015).
2020-03-07 21:48:21 +01:00
#'
#' As per their GPL-2 licence that demands documentation of code changes, the changes made based on the source code were:
#' 1. Rewritten code to remove the dependency on packages `plyr`, `scales` and `grid`
#' 2. Parametrised more options, like arrow and ellipse settings
#' 3. Added total amount of explained variance as a caption in the plot
2020-03-08 11:18:59 +01:00
#' 4. Cleaned all syntax based on the `lintr` package and added integrity checks
2020-03-07 21:48:21 +01:00
#' 5. Updated documentation
2020-03-08 11:18:59 +01:00
#' @details The colours for labels and points can be changed by adding another scale layer for colour, like [scale_colour_viridis_d()] or [scale_colour_brewer()].
2020-03-07 21:48:21 +01:00
#' @rdname ggplot_pca
#' @export
#' @examples
#' # `example_isolates` is a dataset available in the AMR package.
#' # See ?example_isolates.
#'
#' # See ?pca for more info about Principal Component Analysis (PCA).
#' library(dplyr)
#' pca_model <- example_isolates %>%
#' filter(mo_genus(mo) == "Staphylococcus") %>%
#' group_by(species = mo_shortname(mo)) %>%
#' summarise_if (is.rsi, resistance) %>%
#' pca(FLC, AMC, CXM, GEN, TOB, TMP, SXT, CIP, TEC, TCY, ERY)
#'
#' # old
#' biplot(pca_model)
#'
#' # new
#' ggplot_pca(pca_model)
ggplot_pca <- function(x,
choices = 1:2,
scale = TRUE,
2020-03-08 11:18:59 +01:00
pc.biplot = TRUE,
2020-03-07 21:48:21 +01:00
labels = NULL,
labels_textsize = 3,
labels_text_placement = 1.5,
groups = NULL,
2020-03-08 11:18:59 +01:00
ellipse = TRUE,
2020-03-07 21:48:21 +01:00
ellipse_prob = 0.68,
ellipse_size = 0.5,
2020-03-08 11:18:59 +01:00
ellipse_alpha = 0.5,
2020-03-07 21:48:21 +01:00
points_size = 2,
points_alpha = 0.25,
arrows = TRUE,
arrows_colour = "darkblue",
arrows_size = 0.5,
arrows_textsize = 3,
arrows_alpha = 0.75,
base_textsize = 10,
...) {
stopifnot_installed_package("ggplot2")
2020-03-08 11:18:59 +01:00
stopifnot_msg(length(choices) == 2, "`choices` must be of length 2")
stopifnot_msg(is.logical(scale), "`scale` must be TRUE or FALSE")
stopifnot_msg(is.logical(pc.biplot), "`pc.biplot` must be TRUE or FALSE")
stopifnot_msg(is.numeric(choices), "`choices` must be numeric")
stopifnot_msg(is.numeric(labels_textsize), "`labels_textsize` must be numeric")
stopifnot_msg(is.numeric(labels_text_placement), "`labels_text_placement` must be numeric")
stopifnot_msg(is.logical(ellipse), "`ellipse` must be TRUE or FALSE")
stopifnot_msg(is.numeric(ellipse_prob), "`ellipse_prob` must be numeric")
stopifnot_msg(is.numeric(ellipse_size), "`ellipse_size` must be numeric")
stopifnot_msg(is.numeric(ellipse_alpha), "`ellipse_alpha` must be numeric")
stopifnot_msg(is.logical(arrows), "`arrows` must be TRUE or FALSE")
stopifnot_msg(is.numeric(arrows_size), "`arrows_size` must be numeric")
stopifnot_msg(is.numeric(arrows_textsize), "`arrows_textsize` must be numeric")
stopifnot_msg(is.numeric(arrows_alpha), "`arrows_alpha` must be numeric")
stopifnot_msg(is.numeric(base_textsize), "`base_textsize` must be numeric")
2020-03-07 21:48:21 +01:00
calculations <- pca_calculations(pca_model = x,
groups = groups,
groups_missing = missing(groups),
labels = labels,
labels_missing = missing(labels),
choices = choices,
scale = scale,
2020-03-08 11:18:59 +01:00
pc.biplot = pc.biplot,
2020-03-07 21:48:21 +01:00
ellipse_prob = ellipse_prob,
labels_text_placement = labels_text_placement)
nobs.factor <- calculations$nobs.factor
d <- calculations$d
u <- calculations$u
v <- calculations$v
choices <- calculations$choices
df.u <- calculations$df.u
df.v <- calculations$df.v
r <- calculations$r
ell <- calculations$ell
groups <- calculations$groups
group_name <- calculations$group_name
labels <- calculations$labels
# Append the proportion of explained variance to the axis labels
if ((1 - as.integer(scale)) == 0) {
2020-03-08 11:18:59 +01:00
u.axis.labs <- paste0("Standardised PC", choices)
2020-03-07 21:48:21 +01:00
} else {
2020-03-08 11:18:59 +01:00
u.axis.labs <- paste0("PC", choices)
2020-03-07 21:48:21 +01:00
}
2020-03-08 11:18:59 +01:00
u.axis.labs <- paste0(u.axis.labs,
paste0("\n(explained var: ",
percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)),
")"))
2020-03-07 21:48:21 +01:00
# Score Labels
if (!is.null(labels)) {
df.u$labels <- labels
}
# Grouping variable
if (!is.null(groups)) {
df.u$groups <- groups
}
# Base plot
g <- ggplot2::ggplot(data = df.u,
ggplot2::aes(x = xvar, y = yvar)) +
ggplot2::xlab(u.axis.labs[1]) +
ggplot2::ylab(u.axis.labs[2]) +
ggplot2::expand_limits(x = c(-1.15, 1.15),
y = c(-1.15, 1.15))
# Draw either labels or points
if (!is.null(df.u$labels)) {
if (!is.null(df.u$groups)) {
2020-03-08 11:18:59 +01:00
g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
alpha = points_alpha,
size = points_size) +
2020-03-07 21:48:21 +01:00
ggplot2::geom_text(ggplot2::aes(label = labels, colour = groups),
nudge_y = -0.05,
size = labels_textsize) +
ggplot2::labs(colour = group_name)
} else {
2020-03-08 11:18:59 +01:00
g <- g + ggplot2::geom_point(alpha = points_alpha,
size = points_size) +
2020-03-07 21:48:21 +01:00
ggplot2::geom_text(ggplot2::aes(label = labels),
nudge_y = -0.05,
size = labels_textsize)
}
} else {
if (!is.null(df.u$groups)) {
2020-03-08 11:18:59 +01:00
g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
alpha = points_alpha,
size = points_size) +
2020-03-07 21:48:21 +01:00
ggplot2::labs(colour = group_name)
} else {
g <- g + ggplot2::geom_point(alpha = points_alpha,
size = points_size)
}
}
# Overlay a concentration ellipse if there are groups
2020-03-08 11:18:59 +01:00
if (!is.null(df.u$groups) & !is.null(ell) & isTRUE(ellipse)) {
g <- g + ggplot2::geom_path(data = ell,
ggplot2::aes(colour = groups, group = groups),
size = ellipse_size,
alpha = points_alpha)
2020-03-07 21:48:21 +01:00
}
# Label the variable axes
if (arrows == TRUE) {
2020-03-08 11:18:59 +01:00
g <- g + ggplot2::geom_segment(data = df.v,
ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
angle = 20,
ends = "last",
type = "open"),
colour = arrows_colour,
size = arrows_size,
alpha = arrows_alpha) +
2020-03-07 21:48:21 +01:00
ggplot2::geom_text(data = df.v,
ggplot2::aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust),
colour = arrows_colour,
size = arrows_textsize,
alpha = arrows_alpha)
}
# Add caption label about total explained variance
g <- g + ggplot2::labs(caption = paste0("Total explained variance: ",
percentage(sum(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)))))
# mark-up nicely
g <- g + ggplot2::theme_minimal(base_size = base_textsize) +
ggplot2::theme(panel.grid.major = ggplot2::element_line(colour = "grey85"),
panel.grid.minor = ggplot2::element_blank(),
# centre title and subtitle
plot.title = ggplot2::element_text(hjust = 0.5),
plot.subtitle = ggplot2::element_text(hjust = 0.5))
g
}
#' @importFrom dplyr bind_rows
2020-03-08 11:18:59 +01:00
#' @importFrom stats qchisq var
2020-03-07 21:48:21 +01:00
pca_calculations <- function(pca_model,
groups = NULL,
groups_missing = TRUE,
labels = NULL,
labels_missing = TRUE,
choices = 1:2,
scale = 1,
2020-03-08 11:18:59 +01:00
pc.biplot = TRUE,
2020-03-07 21:48:21 +01:00
ellipse_prob = 0.68,
labels_text_placement = 1.5) {
non_numeric_cols <- attributes(pca_model)$non_numeric_cols
if (groups_missing) {
groups <- tryCatch(non_numeric_cols[[1]],
error = function(e) NULL)
group_name <- tryCatch(colnames(non_numeric_cols[1]),
error = function(e) NULL)
}
if (labels_missing) {
labels <- tryCatch(non_numeric_cols[[2]],
error = function(e) NULL)
}
if (!is.null(groups) & is.null(labels)) {
# turn them around
labels <- groups
groups <- NULL
group_name <- NULL
}
# Recover the SVD
if (inherits(pca_model, "prcomp")) {
nobs.factor <- sqrt(nrow(pca_model$x) - 1)
d <- pca_model$sdev
u <- sweep(pca_model$x, 2, 1 / (d * nobs.factor), FUN = "*")
v <- pca_model$rotation
} else if (inherits(pca_model, "princomp")) {
nobs.factor <- sqrt(pca_model$n.obs)
d <- pca_model$sdev
u <- sweep(pca_model$scores, 2, 1 / (d * nobs.factor), FUN = "*")
v <- pca_model$loadings
} else if (inherits(pca_model, "PCA")) {
nobs.factor <- sqrt(nrow(pca_model$call$X))
d <- unlist(sqrt(pca_model$eig)[1])
u <- sweep(pca_model$ind$coord, 2, 1 / (d * nobs.factor), FUN = "*")
v <- sweep(pca_model$var$coord, 2, sqrt(pca_model$eig[seq_len(ncol(pca_model$var$coord)), 1]), FUN = "/")
} else if (inherits(pca_model, "lda")) {
nobs.factor <- sqrt(pca_model$N)
d <- pca_model$svd
u <- predict(pca_model)$x / nobs.factor
v <- pca_model$scaling
d.total <- sum(d ^ 2)
} else {
stop("Expected a object of class prcomp, princomp, PCA, or lda")
}
# Scores
choices <- pmin(choices, ncol(u))
obs.scale <- 1 - as.integer(scale)
df.u <- as.data.frame(sweep(u[, choices], 2, d[choices] ^ obs.scale, FUN = "*"))
# Directions
v <- sweep(v, 2, d ^ as.integer(scale), FUN = "*")
df.v <- as.data.frame(v[, choices])
names(df.u) <- c("xvar", "yvar")
names(df.v) <- names(df.u)
2020-03-08 11:18:59 +01:00
if (isTRUE(pc.biplot)) {
df.u <- df.u * nobs.factor
}
2020-03-07 21:48:21 +01:00
# Scale the radius of the correlation circle so that it corresponds to
# a data ellipse for the standardized PC scores
circle_prob <- 0.69
r <- sqrt(qchisq(circle_prob, df = 2)) * prod(colMeans(df.u ^ 2)) ^ (0.25)
# Scale directions
v.scale <- rowSums(v ^ 2)
df.v <- r * df.v / sqrt(max(v.scale))
# Grouping variable
if (!is.null(groups)) {
df.u$groups <- groups
}
df.v$varname <- rownames(v)
# Variables for text label placement
df.v$angle <- with(df.v, (180 / pi) * atan(yvar / xvar))
df.v$hjust <- with(df.v, (1 - labels_text_placement * sign(xvar)) / 2)
if (!is.null(df.u$groups)) {
2020-03-08 11:18:59 +01:00
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <- cbind(cos(theta), sin(theta))
2020-03-07 21:48:21 +01:00
ell <- bind_rows(
sapply(unique(df.u$groups), function(g, df = df.u) {
x <- df[which(df$groups == g), , drop = FALSE]
if (nrow(x) <= 2) {
return(NULL)
}
sigma <- var(cbind(x$xvar, x$yvar))
mu <- c(mean(x$xvar), mean(x$yvar))
ed <- sqrt(qchisq(ellipse_prob, df = 2))
2020-03-08 11:18:59 +01:00
data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"),
groups = x$groups[1])
2020-03-07 21:48:21 +01:00
}))
2020-03-08 09:12:11 +01:00
if (NROW(ell) == 0) {
ell <- NULL
2020-03-08 11:18:59 +01:00
} else {
names(ell)[1:2] <- c("xvar", "yvar")
2020-03-08 09:12:11 +01:00
}
2020-03-07 21:48:21 +01:00
} else {
ell <- NULL
}
2020-03-08 11:18:59 +01:00
2020-03-07 21:48:21 +01:00
list(nobs.factor = nobs.factor,
d = d,
u = u,
v = v,
choices = choices,
df.u = df.u,
df.v = df.v,
r = r,
ell = ell,
groups = groups,
group_name = group_name,
labels = labels
)
}