(v1.0.1.9000) first PCA implementation

2026-01-11 08:34:41 +01:00 · 2020-03-07 21:48:21 +01:00
parent f444c24ed3
commit fa0d9c58d9
40 changed files with 2224 additions and 172 deletions
--- a/man/age.Rd
+++ b/man/age.Rd
@@ -16,7 +16,7 @@ age(x, reference = Sys.Date(), exact = FALSE, na.rm = FALSE)
 \item{na.rm}{a logical to indicate whether missing values should be removed}
 }
 \value{
-An integer (no decimals) if \code{exact = FALSE}, a double (with decimals) otherwise
+An \link{integer} (no decimals) if \code{exact = FALSE}, a \link{double} (with decimals) otherwise
 }
 \description{
 Calculates age in years based on a reference date, which is the system date by default.
--- a/man/age_groups.Rd
+++ b/man/age_groups.Rd
@@ -11,16 +11,16 @@ age_groups(x, split_at = c(12, 25, 55, 75), na.rm = FALSE)

 \item{split_at}{values to split \code{x} at, defaults to age groups 0-11, 12-24, 25-54, 55-74 and 75+. See Details.}

-\item{na.rm}{a logical to indicate whether missing values should be removed}
+\item{na.rm}{a \link{logical} to indicate whether missing values should be removed}
 }
 \value{
-Ordered \code{\link{factor}}
+Ordered \link{factor}
 }
 \description{
 Split ages into age groups defined by the \code{split_at} parameter. This allows for easier demographic (antimicrobial resistance) analysis.
 }
 \details{
-To split ages, the input can be:
+To split ages, the input for the \code{split_at} parameter can be:
 \itemize{
 \item A numeric vector. A vector of e.g. \code{c(10, 20)} will split on 0-9, 10-19 and 20+. A value of only \code{50} will split on 0-49 and 50+.
 The default is to split on young children (0-11), youth (12-24), young adults (25-54), middle-aged adults (55-74) and elderly (75+).
@@ -29,7 +29,7 @@ The default is to split on young children (0-11), youth (12-24), young adults (2
 \item \code{"children"} or \code{"kids"}, equivalent of: \code{c(0, 1, 2, 4, 6, 13, 18)}. This will split on 0, 1, 2-3, 4-5, 6-12, 13-17 and 18+.
 \item \code{"elderly"} or \code{"seniors"}, equivalent of: \code{c(65, 75, 85)}. This will split on 0-64, 65-74, 75-84, 85+.
 \item \code{"fives"}, equivalent of: \code{1:20 * 5}. This will split on 0-4, 5-9, 10-14, ..., 90-94, 95-99, 100+.
-\item \code{"tens"}, equivalent of: \code{1:10 * 10}. This will split on 0-9, 10-19, 20-29, ... 80-89, 90-99, 100+.
+\item \code{"tens"}, equivalent of: \code{1:10 * 10}. This will split on 0-9, 10-19, 20-29, ..., 80-89, 90-99, 100+.
 }
 }
 }
--- a/man/ggplot_pca.Rd
+++ b/man/ggplot_pca.Rd
@@ -0,0 +1,119 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ggplot_pca.R
+\name{ggplot_pca}
+\alias{ggplot_pca}
+\title{PCA biplot with \code{ggplot2}}
+\source{
+The \code{\link[=ggplot_pca]{ggplot_pca()}} function is based on the \code{\link[=ggbiplot]{ggbiplot()}} function from the \code{ggbiplot} package by Vince Vu, as found on GitHub: \url{https://github.com/vqv/ggbiplot} (retrieved: 2 March 2020, their latest commit: \href{https://github.com/vqv/ggbiplot/commit/7325e880485bea4c07465a0304c470608fffb5d9}{\code{7325e88}}; 12 February 2015).
+
+As per their GPL-2 licence that demands documentation of code changes, the changes made based on the source code were:
+\enumerate{
+\item Rewritten code to remove the dependency on packages \code{plyr}, \code{scales} and \code{grid}
+\item Parametrised more options, like arrow and ellipse settings
+\item Added total amount of explained variance as a caption in the plot
+\item Cleaned all syntax based on the \code{lintr} package
+\item Updated documentation
+}
+}
+\usage{
+ggplot_pca(
+  x,
+  choices = 1:2,
+  scale = TRUE,
+  labels = NULL,
+  labels_textsize = 3,
+  labels_text_placement = 1.5,
+  groups = NULL,
+  ellipse = FALSE,
+  ellipse_prob = 0.68,
+  ellipse_size = 0.5,
+  ellipse_alpha = 0.25,
+  points_size = 2,
+  points_alpha = 0.25,
+  arrows = TRUE,
+  arrows_colour = "darkblue",
+  arrows_size = 0.5,
+  arrows_textsize = 3,
+  arrows_alpha = 0.75,
+  base_textsize = 10,
+  ...
+)
+}
+\arguments{
+\item{x}{an object returned by \code{\link[=pca]{pca()}}, \code{\link[=prcomp]{prcomp()}} or \code{\link[=princomp]{princomp()}}}
+
+\item{choices}{
+    length 2 vector specifying the components to plot. Only the default
+    is a biplot in the strict sense.
+  }
+
+\item{scale}{
+    The variables are scaled by \code{lambda ^ scale} and the
+    observations are scaled by \code{lambda ^ (1-scale)} where
+    \code{lambda} are the singular values as computed by
+    \code{\link[stats]{princomp}}. Normally \code{0 <= scale <= 1}, and a warning
+    will be issued if the specified \code{scale} is outside this range.
+  }
+
+\item{labels}{an optional vector of labels for the observations. If set, the labels will be placed below their respective points. When using the \code{\link[=pca]{pca()}} function as input for \code{x}, this will be determined automatically based on the attribute \code{non_numeric_cols}, see \code{\link[=pca]{pca()}}.}
+
+\item{labels_textsize}{the size of the text used for the labels}
+
+\item{labels_text_placement}{adjustment factor for the placement of the variable names (\verb{>=1} means further away from the arrow head)}
+
+\item{groups}{an optional vector of groups for the labels, with the same length as \code{labels}. If set, the points and labels will be coloured according to these groups. When using the \code{\link[=pca]{pca()}} function as input for \code{x}, this will be determined automatically based on the attribute \code{non_numeric_cols}, see \code{\link[=pca]{pca()}}.}
+
+\item{ellipse}{a logical to indicate whether a normal data ellipse should be drawn for each group (set with \code{groups})}
+
+\item{ellipse_prob}{statistical size of the ellipse in normal probability}
+
+\item{ellipse_size}{the size of the ellipse line}
+
+\item{ellipse_alpha}{the alpha (transparency) of the ellipse line}
+
+\item{points_alpha}{the alpha (transparency) of the points}
+
+\item{arrows}{a logical to indicate whether arrows should be drawn}
+
+\item{arrows_colour}{the colour of the arrows and their text}
+
+\item{arrows_size}{the size (thickness) of the arrow lines}
+
+\item{arrows_textsize}{the size of the text at the end of the arrows}
+
+\item{arrows_alpha}{the alpha (transparency) of the arrows and their text}
+
+\item{base_textsize}{the text size for all plot elements except the labels and arrows}
+
+\item{...}{Parameters passed on to functions}
+}
+\description{
+Produces a \code{ggplot2} variant of a so-called \href{https://en.wikipedia.org/wiki/Biplot}{biplot} for PCA (principal component analysis), which is more flexible and more appealing than the base \R \code{\link[=biplot]{biplot()}} function.
+}
+\details{
+The default colours for labels and points are set with \code{\link[=scale_colour_viridis_d]{scale_colour_viridis_d()}}, but these can be changed by adding another scale for colour, like \code{\link[=scale_colour_brewer]{scale_colour_brewer()}}.
+}
+\section{Maturing lifecycle}{
+
+\if{html}{\figure{lifecycle_maturing.svg}{options: style=margin-bottom:5px} \cr}
+The \link[AMR:lifecycle]{lifecycle} of this function is \strong{maturing}. The underlying code of a maturing function has been roughed out, but finer details might still change. We will strive to maintain backward compatibility, but the function needs wider usage and more extensive testing in order to optimise the underlying code.
+}
+
+\examples{
+# `example_isolates` is a dataset available in the AMR package.
+# See ?example_isolates.
+
+# See ?pca for more info about Principal Component Analysis (PCA).
+library(dplyr)
+pca_model <- example_isolates \%>\% 
+  filter(mo_genus(mo) == "Staphylococcus") \%>\% 
+  group_by(species = mo_shortname(mo)) \%>\%
+  summarise_if (is.rsi, resistance) \%>\%
+  pca(FLC, AMC, CXM, GEN, TOB, TMP, SXT, CIP, TEC, TCY, ERY)
+  
+# old
+biplot(pca_model)
+
+# new 
+ggplot_pca(pca_model)
+}
--- a/man/lifecycle.Rd
+++ b/man/lifecycle.Rd
@@ -12,7 +12,7 @@ This page contains a section for every lifecycle (with text borrowed from the af
 \section{Experimental lifecycle}{

 \if{html}{\figure{lifecycle_experimental.svg}{options: style=margin-bottom:5px} \cr}
-The \link[AMR:lifecycle]{lifecycle} of this function is \strong{experimental}. An experimental function is in the very early stages of development. The unlying code might be changing frequently as we rapidly iterate and explore variations in search of the best fit. Experimental functions might be removed without deprecation, so you are generally best off waiting until a function is more mature before you use it in production code. Experimental functions will not be included in releases we submit to CRAN.
+The \link[AMR:lifecycle]{lifecycle} of this function is \strong{experimental}. An experimental function is in the very early stages of development. The underlying code might be changing frequently as we rapidly iterate and explore variations in search of the best fit. Experimental functions might be removed without deprecation, so you are generally best off waiting until a function is more mature before you use it in production code. Experimental functions will not be included in releases we submit to CRAN, since they have not yet matured enough.
 }

 \section{Maturing lifecycle}{
--- a/man/pca.Rd
+++ b/man/pca.Rd
@@ -0,0 +1,87 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pca.R
+\name{prcomp.data.frame}
+\alias{prcomp.data.frame}
+\alias{pca}
+\title{Principal Component Analysis (for AMR)}
+\usage{
+\method{prcomp}{data.frame}(
+  x,
+  ...,
+  retx = TRUE,
+  center = TRUE,
+  scale. = TRUE,
+  tol = NULL,
+  rank. = NULL
+)
+
+pca(x, ...)
+}
+\arguments{
+\item{x}{a \link{data.frame} containing numeric columns}
+
+\item{...}{columns of \code{x} to be selected for PCA}
+
+\item{retx}{a logical value indicating whether the rotated variables
+    should be returned.}
+
+\item{center}{a logical value indicating whether the variables
+    should be shifted to be zero centered. Alternately, a vector of
+    length equal the number of columns of \code{x} can be supplied.
+    The value is passed to \code{scale}.}
+
+\item{scale.}{a logical value indicating whether the variables should
+    be scaled to have unit variance before the analysis takes
+    place.  The default here is \code{TRUE} (unlike the \code{FALSE} default of
+    \code{stats::prcomp()}), as in general scaling is advisable.  Alternatively, a vector of length
+    equal the number of columns of \code{x} can be supplied.  The
+    value is passed to \code{\link{scale}}.}
+
+\item{tol}{a value indicating the magnitude below which components
+    should be omitted. (Components are omitted if their
+    standard deviations are less than or equal to \code{tol} times the
+    standard deviation of the first component.)  With the default null
+    setting, no components are omitted (unless \code{rank.} is specified
+    less than \code{min(dim(x))}.).  Other settings for tol could be
+    \code{tol = 0} or \code{tol = sqrt(.Machine$double.eps)}, which
+    would omit essentially constant components.}
+
+\item{rank.}{optionally, a number specifying the maximal rank, i.e.,
+    maximal number of principal components to be used.  Can be set as
+    alternative or in addition to \code{tol}, useful notably when the
+    desired rank is considerably smaller than the dimensions of the matrix.}
+}
+\description{
+Performs a principal component analysis (PCA) based on a data set, and automatically determines the groups and labels to be used when plotting the result afterwards.
+}
+\details{
+The \code{\link[=pca]{pca()}} function takes a \link{data.frame} as input and performs the actual PCA with the R function \code{\link[=prcomp]{prcomp()}}.
+
+The result of the \code{\link[=pca]{pca()}} function is a \code{\link{prcomp}} object, with an additional attribute \code{non_numeric_cols} which is a vector with the column names of all columns that do not contain numeric values. These are probably the groups and labels, and will be used by \code{\link[=ggplot_pca]{ggplot_pca()}}.
+}
+\section{Experimental lifecycle}{
+
+\if{html}{\figure{lifecycle_experimental.svg}{options: style=margin-bottom:5px} \cr}
+The \link[AMR:lifecycle]{lifecycle} of this function is \strong{experimental}. An experimental function is in the very early stages of development. The underlying code might be changing frequently as we rapidly iterate and explore variations in search of the best fit. Experimental functions might be removed without deprecation, so you are generally best off waiting until a function is more mature before you use it in production code. Experimental functions will not be included in releases we submit to CRAN, since they have not yet matured enough.
+}
+
+\examples{
+# `example_isolates` is a dataset available in the AMR package.
+# See ?example_isolates.
+
+# calculate the resistance per group first
+library(dplyr)
+resistance_data <- example_isolates \%>\% 
+  group_by(order = mo_order(mo),       # group on anything, like order
+           genus = mo_genus(mo)) \%>\%   #  and genus as we do here
+  summarise_if(is.rsi, resistance)     # then get resistance of all drugs
+  
+# now conduct PCA for certain antimicrobial agents
+pca_result <- resistance_data \%>\%         
+  pca(AMC, CXM, CTX, CAZ, GEN, TOB, TMP, SXT) 
+  
+pca_result
+summary(pca_result)
+biplot(pca_result)
+ggplot_pca(pca_result) # a new and convenient plot function
+}