mirror of
https://github.com/msberends/AMR.git
synced 2025-07-08 14:01:55 +02:00
(v1.0.1.9002) PCA unit tests
This commit is contained in:
@ -43,7 +43,7 @@ check_dataset_integrity <- function() {
|
||||
"iv_ddd", "iv_units", "loinc") %in% colnames(antibiotics),
|
||||
na.rm = TRUE)
|
||||
}, error = function(e)
|
||||
stop('Please use the command \'library("AMR")\' before using this function, to load the needed reference data.', call. = FALSE)
|
||||
stop('Please use the command \'library("AMR")\' before using this function, to load the required reference data.', call. = FALSE)
|
||||
)
|
||||
if (!check_microorganisms | !check_antibiotics) {
|
||||
stop("Data set `microorganisms` or data set `antibiotics` is overwritten by your global environment and prevents the AMR package from working correctly. Please rename your object before using this function.", call. = FALSE)
|
||||
@ -154,6 +154,13 @@ stopifnot_installed_package <- function(package) {
|
||||
return(invisible())
|
||||
}
|
||||
|
||||
stopifnot_msg <- function(expr, msg) {
|
||||
if (!isTRUE(expr)) {
|
||||
stop(msg, call. = FALSE)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
"%or%" <- function(x, y) {
|
||||
if (is.null(x) | is.null(y)) {
|
||||
if (is.null(x)) {
|
111
R/ggplot_pca.R
111
R/ggplot_pca.R
@ -49,9 +49,9 @@
|
||||
#' 1. Rewritten code to remove the dependency on packages `plyr`, `scales` and `grid`
|
||||
#' 2. Parametrised more options, like arrow and ellipse settings
|
||||
#' 3. Added total amount of explained variance as a caption in the plot
|
||||
#' 4. Cleaned all syntax based on the `lintr` package
|
||||
#' 4. Cleaned all syntax based on the `lintr` package and added integrity checks
|
||||
#' 5. Updated documentation
|
||||
#' @details The default colours for labels and points is set with [scale_colour_viridis_d()], but these can be changed by adding another scale for colour, like [scale_colour_brewer()].
|
||||
#' @details The colours for labels and points can be changed by adding another scale layer for colour, like [scale_colour_viridis_d()] or [scale_colour_brewer()].
|
||||
#' @rdname ggplot_pca
|
||||
#' @export
|
||||
#' @examples
|
||||
@ -74,14 +74,15 @@
|
||||
ggplot_pca <- function(x,
|
||||
choices = 1:2,
|
||||
scale = TRUE,
|
||||
pc.biplot = TRUE,
|
||||
labels = NULL,
|
||||
labels_textsize = 3,
|
||||
labels_text_placement = 1.5,
|
||||
groups = NULL,
|
||||
ellipse = FALSE,
|
||||
ellipse = TRUE,
|
||||
ellipse_prob = 0.68,
|
||||
ellipse_size = 0.5,
|
||||
ellipse_alpha = 0.25,
|
||||
ellipse_alpha = 0.5,
|
||||
points_size = 2,
|
||||
points_alpha = 0.25,
|
||||
arrows = TRUE,
|
||||
@ -93,6 +94,21 @@ ggplot_pca <- function(x,
|
||||
...) {
|
||||
|
||||
stopifnot_installed_package("ggplot2")
|
||||
stopifnot_msg(length(choices) == 2, "`choices` must be of length 2")
|
||||
stopifnot_msg(is.logical(scale), "`scale` must be TRUE or FALSE")
|
||||
stopifnot_msg(is.logical(pc.biplot), "`pc.biplot` must be TRUE or FALSE")
|
||||
stopifnot_msg(is.numeric(choices), "`choices` must be numeric")
|
||||
stopifnot_msg(is.numeric(labels_textsize), "`labels_textsize` must be numeric")
|
||||
stopifnot_msg(is.numeric(labels_text_placement), "`labels_text_placement` must be numeric")
|
||||
stopifnot_msg(is.logical(ellipse), "`ellipse` must be TRUE or FALSE")
|
||||
stopifnot_msg(is.numeric(ellipse_prob), "`ellipse_prob` must be numeric")
|
||||
stopifnot_msg(is.numeric(ellipse_size), "`ellipse_size` must be numeric")
|
||||
stopifnot_msg(is.numeric(ellipse_alpha), "`ellipse_alpha` must be numeric")
|
||||
stopifnot_msg(is.logical(arrows), "`arrows` must be TRUE or FALSE")
|
||||
stopifnot_msg(is.numeric(arrows_size), "`arrows_size` must be numeric")
|
||||
stopifnot_msg(is.numeric(arrows_textsize), "`arrows_textsize` must be numeric")
|
||||
stopifnot_msg(is.numeric(arrows_alpha), "`arrows_alpha` must be numeric")
|
||||
stopifnot_msg(is.numeric(base_textsize), "`base_textsize` must be numeric")
|
||||
|
||||
calculations <- pca_calculations(pca_model = x,
|
||||
groups = groups,
|
||||
@ -101,6 +117,7 @@ ggplot_pca <- function(x,
|
||||
labels_missing = missing(labels),
|
||||
choices = choices,
|
||||
scale = scale,
|
||||
pc.biplot = pc.biplot,
|
||||
ellipse_prob = ellipse_prob,
|
||||
labels_text_placement = labels_text_placement)
|
||||
nobs.factor <- calculations$nobs.factor
|
||||
@ -116,17 +133,16 @@ ggplot_pca <- function(x,
|
||||
group_name <- calculations$group_name
|
||||
labels <- calculations$labels
|
||||
|
||||
stopifnot(length(choices) == 2)
|
||||
|
||||
# Append the proportion of explained variance to the axis labels
|
||||
if ((1 - as.integer(scale)) == 0) {
|
||||
u.axis.labs <- paste("Standardised PC", choices, sep = "")
|
||||
u.axis.labs <- paste0("Standardised PC", choices)
|
||||
} else {
|
||||
u.axis.labs <- paste("PC", choices, sep = "")
|
||||
u.axis.labs <- paste0("PC", choices)
|
||||
}
|
||||
u.axis.labs <- paste(u.axis.labs,
|
||||
paste0("\n(explained var: ",
|
||||
percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)), ")"))
|
||||
u.axis.labs <- paste0(u.axis.labs,
|
||||
paste0("\n(explained var: ",
|
||||
percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)),
|
||||
")"))
|
||||
|
||||
# Score Labels
|
||||
if (!is.null(labels)) {
|
||||
@ -138,7 +154,6 @@ ggplot_pca <- function(x,
|
||||
df.u$groups <- groups
|
||||
}
|
||||
|
||||
|
||||
# Base plot
|
||||
g <- ggplot2::ggplot(data = df.u,
|
||||
ggplot2::aes(x = xvar, y = yvar)) +
|
||||
@ -150,30 +165,25 @@ ggplot_pca <- function(x,
|
||||
# Draw either labels or points
|
||||
if (!is.null(df.u$labels)) {
|
||||
if (!is.null(df.u$groups)) {
|
||||
g <- g +
|
||||
ggplot2::geom_point(ggplot2::aes(colour = groups),
|
||||
alpha = points_alpha,
|
||||
size = points_size) +
|
||||
g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
|
||||
alpha = points_alpha,
|
||||
size = points_size) +
|
||||
ggplot2::geom_text(ggplot2::aes(label = labels, colour = groups),
|
||||
nudge_y = -0.05,
|
||||
size = labels_textsize) +
|
||||
ggplot2::scale_colour_viridis_d() +
|
||||
ggplot2::labs(colour = group_name)
|
||||
} else {
|
||||
g <- g +
|
||||
ggplot2::geom_point(alpha = points_alpha,
|
||||
size = points_size) +
|
||||
g <- g + ggplot2::geom_point(alpha = points_alpha,
|
||||
size = points_size) +
|
||||
ggplot2::geom_text(ggplot2::aes(label = labels),
|
||||
nudge_y = -0.05,
|
||||
size = labels_textsize)
|
||||
}
|
||||
} else {
|
||||
if (!is.null(df.u$groups)) {
|
||||
g <- g +
|
||||
ggplot2::geom_point(ggplot2::aes(colour = groups),
|
||||
alpha = points_alpha,
|
||||
size = points_size) +
|
||||
ggplot2::scale_colour_viridis_d() +
|
||||
g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
|
||||
alpha = points_alpha,
|
||||
size = points_size) +
|
||||
ggplot2::labs(colour = group_name)
|
||||
} else {
|
||||
g <- g + ggplot2::geom_point(alpha = points_alpha,
|
||||
@ -182,26 +192,24 @@ ggplot_pca <- function(x,
|
||||
}
|
||||
|
||||
# Overlay a concentration ellipse if there are groups
|
||||
if (!is.null(df.u$groups) & isTRUE(ellipse)) {
|
||||
g <- g +
|
||||
ggplot2::geom_path(data = ell,
|
||||
ggplot2::aes(colour = groups, group = groups),
|
||||
size = ellipse_size,
|
||||
alpha = points_alpha)
|
||||
if (!is.null(df.u$groups) & !is.null(ell) & isTRUE(ellipse)) {
|
||||
g <- g + ggplot2::geom_path(data = ell,
|
||||
ggplot2::aes(colour = groups, group = groups),
|
||||
size = ellipse_size,
|
||||
alpha = points_alpha)
|
||||
}
|
||||
|
||||
# Label the variable axes
|
||||
if (arrows == TRUE) {
|
||||
g <- g +
|
||||
ggplot2::geom_segment(data = df.v,
|
||||
ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
|
||||
arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
|
||||
angle = 20,
|
||||
ends = "last",
|
||||
type = "open"),
|
||||
colour = arrows_colour,
|
||||
size = arrows_size,
|
||||
alpha = arrows_alpha) +
|
||||
g <- g + ggplot2::geom_segment(data = df.v,
|
||||
ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
|
||||
arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
|
||||
angle = 20,
|
||||
ends = "last",
|
||||
type = "open"),
|
||||
colour = arrows_colour,
|
||||
size = arrows_size,
|
||||
alpha = arrows_alpha) +
|
||||
ggplot2::geom_text(data = df.v,
|
||||
ggplot2::aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust),
|
||||
colour = arrows_colour,
|
||||
@ -225,6 +233,7 @@ ggplot_pca <- function(x,
|
||||
}
|
||||
|
||||
#' @importFrom dplyr bind_rows
|
||||
#' @importFrom stats qchisq var
|
||||
pca_calculations <- function(pca_model,
|
||||
groups = NULL,
|
||||
groups_missing = TRUE,
|
||||
@ -232,6 +241,7 @@ pca_calculations <- function(pca_model,
|
||||
labels_missing = TRUE,
|
||||
choices = 1:2,
|
||||
scale = 1,
|
||||
pc.biplot = TRUE,
|
||||
ellipse_prob = 0.68,
|
||||
labels_text_placement = 1.5) {
|
||||
|
||||
@ -291,7 +301,9 @@ pca_calculations <- function(pca_model,
|
||||
names(df.u) <- c("xvar", "yvar")
|
||||
names(df.v) <- names(df.u)
|
||||
|
||||
df.u <- df.u * nobs.factor
|
||||
if (isTRUE(pc.biplot)) {
|
||||
df.u <- df.u * nobs.factor
|
||||
}
|
||||
|
||||
# Scale the radius of the correlation circle so that it corresponds to
|
||||
# a data ellipse for the standardized PC scores
|
||||
@ -314,8 +326,8 @@ pca_calculations <- function(pca_model,
|
||||
df.v$hjust <- with(df.v, (1 - labels_text_placement * sign(xvar)) / 2)
|
||||
|
||||
if (!is.null(df.u$groups)) {
|
||||
theta <<- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
|
||||
circle <<- cbind(cos(theta), sin(theta))
|
||||
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
|
||||
circle <- cbind(cos(theta), sin(theta))
|
||||
ell <- bind_rows(
|
||||
sapply(unique(df.u$groups), function(g, df = df.u) {
|
||||
x <- df[which(df$groups == g), , drop = FALSE]
|
||||
@ -325,18 +337,18 @@ pca_calculations <- function(pca_model,
|
||||
sigma <- var(cbind(x$xvar, x$yvar))
|
||||
mu <- c(mean(x$xvar), mean(x$yvar))
|
||||
ed <- sqrt(qchisq(ellipse_prob, df = 2))
|
||||
el <- data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"),
|
||||
groups = x$groups[1])
|
||||
names(el)[1:2] <- c("xvar", "yvar")
|
||||
el
|
||||
data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"),
|
||||
groups = x$groups[1])
|
||||
}))
|
||||
if (NROW(ell) == 0) {
|
||||
ell <- NULL
|
||||
} else {
|
||||
names(ell)[1:2] <- c("xvar", "yvar")
|
||||
}
|
||||
} else {
|
||||
ell <- NULL
|
||||
}
|
||||
|
||||
|
||||
list(nobs.factor = nobs.factor,
|
||||
d = d,
|
||||
u = u,
|
||||
@ -350,5 +362,4 @@ pca_calculations <- function(pca_model,
|
||||
group_name = group_name,
|
||||
labels = labels
|
||||
)
|
||||
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ globalVariables(c(".",
|
||||
"ab",
|
||||
"ab_txt",
|
||||
"abbreviations",
|
||||
"angle",
|
||||
"antibiotic",
|
||||
"antibiotics",
|
||||
"CNS_CPS",
|
||||
@ -40,6 +41,7 @@ globalVariables(c(".",
|
||||
"genus",
|
||||
"gramstain",
|
||||
"group",
|
||||
"hjust",
|
||||
"index",
|
||||
"input",
|
||||
"interpretation",
|
||||
@ -100,7 +102,10 @@ globalVariables(c(".",
|
||||
"txt",
|
||||
"uncertainty_level",
|
||||
"value",
|
||||
"varname",
|
||||
"x",
|
||||
"xdr",
|
||||
"xvar",
|
||||
"y",
|
||||
"year"))
|
||||
"year",
|
||||
"yvar"))
|
||||
|
60
R/pca.R
60
R/pca.R
@ -21,17 +21,18 @@
|
||||
|
||||
#' Principal Component Analysis (for AMR)
|
||||
#'
|
||||
#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels.
|
||||
#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels, and automatic filtering on only suitable (i.e. non-empty and numeric) variables.
|
||||
#' @inheritSection lifecycle Experimental lifecycle
|
||||
#' @param x a [data.frame] containing numeric columns
|
||||
#' @param ... columns of `x` to be selected for PCA
|
||||
#' @inheritParams stats::prcomp
|
||||
#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the R function [prcomp()].
|
||||
#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the \R function [prcomp()].
|
||||
#'
|
||||
#' The result of the [pca()] function is a [`prcomp`] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()].
|
||||
#' @rdname pca
|
||||
#' @exportMethod prcomp.data.frame
|
||||
#' The result of the [pca()] function is a [prcomp] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()].
|
||||
#' @return An object of classes [pca] and [prcomp]
|
||||
#' @importFrom stats prcomp
|
||||
#' @importFrom dplyr ungroup %>% filter_all all_vars
|
||||
#' @importFrom rlang enquos eval_tidy
|
||||
#' @export
|
||||
#' @examples
|
||||
#' # `example_isolates` is a dataset available in the AMR package.
|
||||
@ -52,40 +53,19 @@
|
||||
#' summary(pca_result)
|
||||
#' biplot(pca_result)
|
||||
#' ggplot_pca(pca_result) # a new and convenient plot function
|
||||
prcomp.data.frame <- function(x,
|
||||
...,
|
||||
retx = TRUE,
|
||||
center = TRUE,
|
||||
scale. = TRUE,
|
||||
tol = NULL,
|
||||
rank. = NULL) {
|
||||
pca <- function(x,
|
||||
...,
|
||||
retx = TRUE,
|
||||
center = TRUE,
|
||||
scale. = TRUE,
|
||||
tol = NULL,
|
||||
rank. = NULL) {
|
||||
|
||||
x <- pca_transform_x(x = x, ... = ...)
|
||||
pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))]
|
||||
|
||||
message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"),
|
||||
".\n Total observations available: ", nrow(pca_data), ".")))
|
||||
|
||||
stats:::prcomp.default(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.)
|
||||
}
|
||||
|
||||
#' @rdname pca
|
||||
#' @export
|
||||
pca <- function(x, ...) {
|
||||
if (!is.data.frame(x)) {
|
||||
stop("this function only takes a data.frame as input")
|
||||
}
|
||||
pca_model <- prcomp(x, ...)
|
||||
|
||||
x <- pca_transform_x(x = x, ... = ...)
|
||||
attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE]
|
||||
pca_model
|
||||
}
|
||||
|
||||
#' @importFrom dplyr ungroup %>% filter_all all_vars
|
||||
#' @importFrom rlang enquos eval_tidy
|
||||
pca_transform_x <- function(x, ...) {
|
||||
# unset data.table, tbl_df, etc.
|
||||
# unset data.table, tibble, etc.
|
||||
# also removes groups made by dplyr::group_by
|
||||
x <- as.data.frame(x, stringsAsFactors = FALSE)
|
||||
x.bak <- x
|
||||
@ -123,7 +103,17 @@ pca_transform_x <- function(x, ...) {
|
||||
x <- cbind(x.bak[, sapply(x.bak, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE], x)
|
||||
}
|
||||
|
||||
x %>%
|
||||
x <- x %>%
|
||||
ungroup() %>% # would otherwise select the grouping vars
|
||||
filter_all(all_vars(!is.na(.)))
|
||||
|
||||
pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))]
|
||||
|
||||
message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"),
|
||||
".\n Total observations available: ", nrow(pca_data), ".")))
|
||||
|
||||
pca_model <- prcomp(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.)
|
||||
attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE]
|
||||
class(pca_model) <- c("pca", class(pca_model))
|
||||
pca_model
|
||||
}
|
||||
|
Reference in New Issue
Block a user