1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-08 14:01:55 +02:00

(v1.0.1.9002) PCA unit tests

This commit is contained in:
2020-03-08 11:18:59 +01:00
parent 9fc858f208
commit 77656a676c
20 changed files with 182 additions and 135 deletions

View File

@ -43,7 +43,7 @@ check_dataset_integrity <- function() {
"iv_ddd", "iv_units", "loinc") %in% colnames(antibiotics),
na.rm = TRUE)
}, error = function(e)
stop('Please use the command \'library("AMR")\' before using this function, to load the needed reference data.', call. = FALSE)
stop('Please use the command \'library("AMR")\' before using this function, to load the required reference data.', call. = FALSE)
)
if (!check_microorganisms | !check_antibiotics) {
stop("Data set `microorganisms` or data set `antibiotics` is overwritten by your global environment and prevents the AMR package from working correctly. Please rename your object before using this function.", call. = FALSE)
@ -154,6 +154,13 @@ stopifnot_installed_package <- function(package) {
return(invisible())
}
stopifnot_msg <- function(expr, msg) {
if (!isTRUE(expr)) {
stop(msg, call. = FALSE)
}
}
"%or%" <- function(x, y) {
if (is.null(x) | is.null(y)) {
if (is.null(x)) {

View File

@ -49,9 +49,9 @@
#' 1. Rewritten code to remove the dependency on packages `plyr`, `scales` and `grid`
#' 2. Parametrised more options, like arrow and ellipse settings
#' 3. Added total amount of explained variance as a caption in the plot
#' 4. Cleaned all syntax based on the `lintr` package
#' 4. Cleaned all syntax based on the `lintr` package and added integrity checks
#' 5. Updated documentation
#' @details The default colours for labels and points is set with [scale_colour_viridis_d()], but these can be changed by adding another scale for colour, like [scale_colour_brewer()].
#' @details The colours for labels and points can be changed by adding another scale layer for colour, like [scale_colour_viridis_d()] or [scale_colour_brewer()].
#' @rdname ggplot_pca
#' @export
#' @examples
@ -74,14 +74,15 @@
ggplot_pca <- function(x,
choices = 1:2,
scale = TRUE,
pc.biplot = TRUE,
labels = NULL,
labels_textsize = 3,
labels_text_placement = 1.5,
groups = NULL,
ellipse = FALSE,
ellipse = TRUE,
ellipse_prob = 0.68,
ellipse_size = 0.5,
ellipse_alpha = 0.25,
ellipse_alpha = 0.5,
points_size = 2,
points_alpha = 0.25,
arrows = TRUE,
@ -93,6 +94,21 @@ ggplot_pca <- function(x,
...) {
stopifnot_installed_package("ggplot2")
stopifnot_msg(length(choices) == 2, "`choices` must be of length 2")
stopifnot_msg(is.logical(scale), "`scale` must be TRUE or FALSE")
stopifnot_msg(is.logical(pc.biplot), "`pc.biplot` must be TRUE or FALSE")
stopifnot_msg(is.numeric(choices), "`choices` must be numeric")
stopifnot_msg(is.numeric(labels_textsize), "`labels_textsize` must be numeric")
stopifnot_msg(is.numeric(labels_text_placement), "`labels_text_placement` must be numeric")
stopifnot_msg(is.logical(ellipse), "`ellipse` must be TRUE or FALSE")
stopifnot_msg(is.numeric(ellipse_prob), "`ellipse_prob` must be numeric")
stopifnot_msg(is.numeric(ellipse_size), "`ellipse_size` must be numeric")
stopifnot_msg(is.numeric(ellipse_alpha), "`ellipse_alpha` must be numeric")
stopifnot_msg(is.logical(arrows), "`arrows` must be TRUE or FALSE")
stopifnot_msg(is.numeric(arrows_size), "`arrows_size` must be numeric")
stopifnot_msg(is.numeric(arrows_textsize), "`arrows_textsize` must be numeric")
stopifnot_msg(is.numeric(arrows_alpha), "`arrows_alpha` must be numeric")
stopifnot_msg(is.numeric(base_textsize), "`base_textsize` must be numeric")
calculations <- pca_calculations(pca_model = x,
groups = groups,
@ -101,6 +117,7 @@ ggplot_pca <- function(x,
labels_missing = missing(labels),
choices = choices,
scale = scale,
pc.biplot = pc.biplot,
ellipse_prob = ellipse_prob,
labels_text_placement = labels_text_placement)
nobs.factor <- calculations$nobs.factor
@ -116,17 +133,16 @@ ggplot_pca <- function(x,
group_name <- calculations$group_name
labels <- calculations$labels
stopifnot(length(choices) == 2)
# Append the proportion of explained variance to the axis labels
if ((1 - as.integer(scale)) == 0) {
u.axis.labs <- paste("Standardised PC", choices, sep = "")
u.axis.labs <- paste0("Standardised PC", choices)
} else {
u.axis.labs <- paste("PC", choices, sep = "")
u.axis.labs <- paste0("PC", choices)
}
u.axis.labs <- paste(u.axis.labs,
paste0("\n(explained var: ",
percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)), ")"))
u.axis.labs <- paste0(u.axis.labs,
paste0("\n(explained var: ",
percentage(x$sdev[choices] ^ 2 / sum(x$sdev ^ 2)),
")"))
# Score Labels
if (!is.null(labels)) {
@ -138,7 +154,6 @@ ggplot_pca <- function(x,
df.u$groups <- groups
}
# Base plot
g <- ggplot2::ggplot(data = df.u,
ggplot2::aes(x = xvar, y = yvar)) +
@ -150,30 +165,25 @@ ggplot_pca <- function(x,
# Draw either labels or points
if (!is.null(df.u$labels)) {
if (!is.null(df.u$groups)) {
g <- g +
ggplot2::geom_point(ggplot2::aes(colour = groups),
alpha = points_alpha,
size = points_size) +
g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
alpha = points_alpha,
size = points_size) +
ggplot2::geom_text(ggplot2::aes(label = labels, colour = groups),
nudge_y = -0.05,
size = labels_textsize) +
ggplot2::scale_colour_viridis_d() +
ggplot2::labs(colour = group_name)
} else {
g <- g +
ggplot2::geom_point(alpha = points_alpha,
size = points_size) +
g <- g + ggplot2::geom_point(alpha = points_alpha,
size = points_size) +
ggplot2::geom_text(ggplot2::aes(label = labels),
nudge_y = -0.05,
size = labels_textsize)
}
} else {
if (!is.null(df.u$groups)) {
g <- g +
ggplot2::geom_point(ggplot2::aes(colour = groups),
alpha = points_alpha,
size = points_size) +
ggplot2::scale_colour_viridis_d() +
g <- g + ggplot2::geom_point(ggplot2::aes(colour = groups),
alpha = points_alpha,
size = points_size) +
ggplot2::labs(colour = group_name)
} else {
g <- g + ggplot2::geom_point(alpha = points_alpha,
@ -182,26 +192,24 @@ ggplot_pca <- function(x,
}
# Overlay a concentration ellipse if there are groups
if (!is.null(df.u$groups) & isTRUE(ellipse)) {
g <- g +
ggplot2::geom_path(data = ell,
ggplot2::aes(colour = groups, group = groups),
size = ellipse_size,
alpha = points_alpha)
if (!is.null(df.u$groups) & !is.null(ell) & isTRUE(ellipse)) {
g <- g + ggplot2::geom_path(data = ell,
ggplot2::aes(colour = groups, group = groups),
size = ellipse_size,
alpha = points_alpha)
}
# Label the variable axes
if (arrows == TRUE) {
g <- g +
ggplot2::geom_segment(data = df.v,
ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
angle = 20,
ends = "last",
type = "open"),
colour = arrows_colour,
size = arrows_size,
alpha = arrows_alpha) +
g <- g + ggplot2::geom_segment(data = df.v,
ggplot2::aes(x = 0, y = 0, xend = xvar, yend = yvar),
arrow = ggplot2::arrow(length = ggplot2::unit(0.5, "picas"),
angle = 20,
ends = "last",
type = "open"),
colour = arrows_colour,
size = arrows_size,
alpha = arrows_alpha) +
ggplot2::geom_text(data = df.v,
ggplot2::aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust),
colour = arrows_colour,
@ -225,6 +233,7 @@ ggplot_pca <- function(x,
}
#' @importFrom dplyr bind_rows
#' @importFrom stats qchisq var
pca_calculations <- function(pca_model,
groups = NULL,
groups_missing = TRUE,
@ -232,6 +241,7 @@ pca_calculations <- function(pca_model,
labels_missing = TRUE,
choices = 1:2,
scale = 1,
pc.biplot = TRUE,
ellipse_prob = 0.68,
labels_text_placement = 1.5) {
@ -291,7 +301,9 @@ pca_calculations <- function(pca_model,
names(df.u) <- c("xvar", "yvar")
names(df.v) <- names(df.u)
df.u <- df.u * nobs.factor
if (isTRUE(pc.biplot)) {
df.u <- df.u * nobs.factor
}
# Scale the radius of the correlation circle so that it corresponds to
# a data ellipse for the standardized PC scores
@ -314,8 +326,8 @@ pca_calculations <- function(pca_model,
df.v$hjust <- with(df.v, (1 - labels_text_placement * sign(xvar)) / 2)
if (!is.null(df.u$groups)) {
theta <<- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <<- cbind(cos(theta), sin(theta))
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <- cbind(cos(theta), sin(theta))
ell <- bind_rows(
sapply(unique(df.u$groups), function(g, df = df.u) {
x <- df[which(df$groups == g), , drop = FALSE]
@ -325,18 +337,18 @@ pca_calculations <- function(pca_model,
sigma <- var(cbind(x$xvar, x$yvar))
mu <- c(mean(x$xvar), mean(x$yvar))
ed <- sqrt(qchisq(ellipse_prob, df = 2))
el <- data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"),
groups = x$groups[1])
names(el)[1:2] <- c("xvar", "yvar")
el
data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = "+"),
groups = x$groups[1])
}))
if (NROW(ell) == 0) {
ell <- NULL
} else {
names(ell)[1:2] <- c("xvar", "yvar")
}
} else {
ell <- NULL
}
list(nobs.factor = nobs.factor,
d = d,
u = u,
@ -350,5 +362,4 @@ pca_calculations <- function(pca_model,
group_name = group_name,
labels = labels
)
}

View File

@ -24,6 +24,7 @@ globalVariables(c(".",
"ab",
"ab_txt",
"abbreviations",
"angle",
"antibiotic",
"antibiotics",
"CNS_CPS",
@ -40,6 +41,7 @@ globalVariables(c(".",
"genus",
"gramstain",
"group",
"hjust",
"index",
"input",
"interpretation",
@ -100,7 +102,10 @@ globalVariables(c(".",
"txt",
"uncertainty_level",
"value",
"varname",
"x",
"xdr",
"xvar",
"y",
"year"))
"year",
"yvar"))

60
R/pca.R
View File

@ -21,17 +21,18 @@
#' Principal Component Analysis (for AMR)
#'
#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels.
#' Performs a principal component analysis (PCA) based on a data set with automatic determination for afterwards plotting the groups and labels, and automatic filtering on only suitable (i.e. non-empty and numeric) variables.
#' @inheritSection lifecycle Experimental lifecycle
#' @param x a [data.frame] containing numeric columns
#' @param ... columns of `x` to be selected for PCA
#' @inheritParams stats::prcomp
#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the R function [prcomp()].
#' @details The [pca()] function takes a [data.frame] as input and performs the actual PCA with the \R function [prcomp()].
#'
#' The result of the [pca()] function is a [`prcomp`] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()].
#' @rdname pca
#' @exportMethod prcomp.data.frame
#' The result of the [pca()] function is a [prcomp] object, with an additional attribute `non_numeric_cols` which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by [ggplot_pca()].
#' @return An object of classes [pca] and [prcomp]
#' @importFrom stats prcomp
#' @importFrom dplyr ungroup %>% filter_all all_vars
#' @importFrom rlang enquos eval_tidy
#' @export
#' @examples
#' # `example_isolates` is a dataset available in the AMR package.
@ -52,40 +53,19 @@
#' summary(pca_result)
#' biplot(pca_result)
#' ggplot_pca(pca_result) # a new and convenient plot function
prcomp.data.frame <- function(x,
...,
retx = TRUE,
center = TRUE,
scale. = TRUE,
tol = NULL,
rank. = NULL) {
pca <- function(x,
...,
retx = TRUE,
center = TRUE,
scale. = TRUE,
tol = NULL,
rank. = NULL) {
x <- pca_transform_x(x = x, ... = ...)
pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))]
message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"),
".\n Total observations available: ", nrow(pca_data), ".")))
stats:::prcomp.default(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.)
}
#' @rdname pca
#' @export
pca <- function(x, ...) {
if (!is.data.frame(x)) {
stop("this function only takes a data.frame as input")
}
pca_model <- prcomp(x, ...)
x <- pca_transform_x(x = x, ... = ...)
attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE]
pca_model
}
#' @importFrom dplyr ungroup %>% filter_all all_vars
#' @importFrom rlang enquos eval_tidy
pca_transform_x <- function(x, ...) {
# unset data.table, tbl_df, etc.
# unset data.table, tibble, etc.
# also removes groups made by dplyr::group_by
x <- as.data.frame(x, stringsAsFactors = FALSE)
x.bak <- x
@ -123,7 +103,17 @@ pca_transform_x <- function(x, ...) {
x <- cbind(x.bak[, sapply(x.bak, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE], x)
}
x %>%
x <- x %>%
ungroup() %>% # would otherwise select the grouping vars
filter_all(all_vars(!is.na(.)))
pca_data <- x[, which(sapply(x, function(x) is.numeric(x)))]
message(blue(paste0("NOTE: Columns selected for PCA: ", paste0(bold(colnames(pca_data)), collapse = "/"),
".\n Total observations available: ", nrow(pca_data), ".")))
pca_model <- prcomp(pca_data, retx = retx, center = center, scale. = scale., tol = tol, rank. = rank.)
attr(pca_model, "non_numeric_cols") <- x[, sapply(x, function(y) !is.numeric(y) & !all(is.na(y))), drop = FALSE]
class(pca_model) <- c("pca", class(pca_model))
pca_model
}