# Forked from GRIAC/system_genetics (R script, 81 lines, 2.1 KiB)
# Principal Component Analysis
# Normalized with limma::voom
library(limma)
library(tidyverse)

# expression.data: file name of the count table. The table has columns:
#  - Gene (gene identifier)
#  - [Sample identifiers]
expression.data <- "mrna_counts_table.csv"

# results.dir: the directory where the resulting tables (and later your
# plots) are put.
results.dir <- "results"

# PCA variables: forwarded to stats::prcomp() as `center` and `scale.`.
do.center <- TRUE
do.scale <- FALSE

# The analysis
# We use prcomp to calculate the principal components. Afterwards you should
# plot the results.

# `expression.data` may hold either the count table itself or the name of the
# CSV file that contains it. A file name is read from disk first, because
# tibble::column_to_rownames() only accepts a data frame.
norm.expr.data <- expression.data
if (is.character(norm.expr.data)) {
  norm.expr.data <- readr::read_csv(norm.expr.data)
}

# Move the gene identifiers into the row names so that only the per-sample
# count columns remain.
norm.expr.data <- norm.expr.data %>%
  tibble::column_to_rownames("Gene")

# Keep genes whose counts summed over all samples reach at least 10, then
# normalize with voom. `drop = FALSE` keeps the data-frame shape even when
# there is only a single sample column. as.matrix() turns the voom result
# into a plain numeric matrix (limma provides the method for its EList).
keep.genes <- rowSums(norm.expr.data) >= 10
norm.expr.data <- norm.expr.data[keep.genes, , drop = FALSE] %>%
  limma::voom() %>%
  as.matrix()

# Principal Component analysis
# NOTE: the directory name keeps its historical spelling so existing output
# locations stay valid.
results.dir.pca <- file.path(results.dir, "principle.components")

# Only create the output directory when it is missing, so re-running the
# script does not raise an "already exists" warning.
if (!dir.exists(results.dir.pca)) {
  dir.create(results.dir.pca, recursive = TRUE)
}

# prcomp() treats rows as observations, so the gene x sample matrix is
# transposed to sample x gene before the decomposition.
norm.expr.data.pcs <- stats::prcomp(
  t(norm.expr.data),
  center = do.center,
  scale. = do.scale
)

# Write summary of the principal components to files
pcs.summery <- summary(norm.expr.data.pcs)

# `importance` has one column per principal component; transpose it so that
# every component becomes a row, labelled in the `PC.name` column.
importance.table <- pcs.summery$importance %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("PC.name")

readr::write_csv(
  importance.table,
  file.path(results.dir.pca, "importance.csv")
)

# Scores: `x` has one row per sample and one column per principal component.
# It is written transposed, so each row of values.csv is a principal
# component and each remaining column is a sample. The identifier column
# therefore holds PC names (it used to be mislabelled "ensembl.id").
pcs.summery$x %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("PC.name") %>%
  readr::write_csv(
    file.path(results.dir.pca, "values.csv")
  )

# Loadings: `rotation` has one row per gene and one column per principal
# component. It is written transposed, so each row of rotation.csv is a
# principal component and each remaining column is a gene. The identifier
# column therefore holds PC names (it used to be mislabelled "sample.id").
pcs.summery$rotation %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("PC.name") %>%
  readr::write_csv(
    file.path(results.dir.pca, "rotation.csv")
  )

# Centering and scaling values that prcomp() applied to each gene.
# The gene identifiers are taken from the row names of the rotation matrix
# rather than from names(center): when centering (or scaling) is switched
# off, prcomp() stores a bare FALSE there, which carries no names. In that
# case the FALSE is recycled over all genes in the written table.
data.frame(
  rownames = rownames(pcs.summery$rotation),
  center = pcs.summery$center,
  scale = pcs.summery$scale
) %>%
  readr::write_csv(
    file.path(results.dir.pca, "rest.csv")
  )

# Not saved: pcs.summery$sdev

# Next thing to do:
# - (Optional) scree plot - to determine the optimal cutoff for PC inclusion
#   based on explanation of variance
# - (Optional) eigencorplot - to correlate PCs to clinical variables so that
#   you know which PCs to include for which analysis
# - (Optional) pairsplot - plot multiple PCs against each other in a single
#   figure
# - Plot the first couple of PCs against each other