system_genetics/rnaseq/step6_overall_QC/01 - Principle Component An...

81 lines
2.1 KiB
R
Raw Normal View History

# Principal Component Analysis
# Normalized with limma::voom
library(limma)
library(tidyverse)
# Input settings ----
# expression.data: path to the CSV count table. Expected columns:
# - Gene (gene identifier)
# - [Sample identifiers]
expression.data <- "mrna_counts_table.csv"
# results.dir: the directory in which to put the resulting tables (and later
# your plots).
results.dir <- "results"

# PCA settings ----
# Passed to stats::prcomp() as its `center` and `scale.` arguments.
do.center <- TRUE
do.scale <- FALSE
# The analysis
# We use prcomp to calculate the principal components (PCs). Afterwards you
# should plot the results.
# Load and normalise the expression data.
# `expression.data` may be a path to a CSV file; tibble::column_to_rownames()
# needs a data frame, so a path is read from disk first. An already-loaded
# data frame is used unchanged.
raw.expr.data <- if (is.character(expression.data)) {
  readr::read_csv(expression.data)
} else {
  expression.data
}
# Move the gene identifiers into the row names so that only the per-sample
# columns remain.
norm.expr.data <- raw.expr.data %>%
  tibble::column_to_rownames("Gene")
# Keep genes whose values sum to at least 10 across all samples.
# `drop = FALSE` keeps the table two-dimensional even with a single sample.
keep.genes <- rowSums(norm.expr.data) >= 10
norm.expr.data <- norm.expr.data[keep.genes, , drop = FALSE] %>%
  limma::voom() %>%
  as.matrix()
# Principal component analysis
# Output directory for the PCA tables; created together with any missing
# parent directories.
results.dir.pca <- file.path(results.dir, "principle.components")
dir.create(results.dir.pca, recursive = TRUE)

# prcomp() treats rows as observations, so the genes-by-samples matrix is
# transposed to put the samples in the rows.
norm.expr.data.pcs <- stats::prcomp(
  t(norm.expr.data),
  center = do.center,
  scale. = do.scale
)
# Write the PCA results to CSV files inside results.dir.pca.
pcs.summary <- summary(norm.expr.data.pcs)

# importance.csv: the importance table, transposed so that every row is one
# principal component (named in the PC.name column).
importance.table <- as.data.frame(t(pcs.summary$importance))
importance.table <- tibble::rownames_to_column(importance.table, "PC.name")
readr::write_csv(
  importance.table,
  file.path(results.dir.pca, "importance.csv")
)

# values.csv: the rotated data (`x`), transposed.
# NOTE(review): after t() the rows are principal components, so the
# "ensembl.id" column actually holds PC names. The header is kept as-is in
# case other scripts read it -- confirm and rename if nothing depends on it.
values.table <- as.data.frame(t(pcs.summary$x))
values.table <- tibble::rownames_to_column(values.table, "ensembl.id")
readr::write_csv(
  values.table,
  file.path(results.dir.pca, "values.csv")
)

# rotation.csv: the loadings (`rotation`), transposed.
# NOTE(review): after t() the rows are principal components here as well, so
# the "sample.id" column holds PC names -- confirm and rename if nothing
# depends on it.
rotation.table <- as.data.frame(t(pcs.summary$rotation))
rotation.table <- tibble::rownames_to_column(rotation.table, "sample.id")
readr::write_csv(
  rotation.table,
  file.path(results.dir.pca, "rotation.csv")
)

# rest.csv: the centering and scaling values stored on the PCA object.
rest.table <- data.frame(
  rownames = names(pcs.summary$center),
  center = pcs.summary$center,
  scale = pcs.summary$scale
)
readr::write_csv(
  rest.table,
  file.path(results.dir.pca, "rest.csv")
)
# Not saved: pcs.summary$sdev
# Next things to do:
# - (Optional) scree plot - to determine the optimal cutoff for PC inclusion based on the explained variance
# - (Optional) eigencorplot - to correlate PCs with clinical variables so that you know which PCs to include for which analysis
# - (Optional) pairsplot - plot multiple PCs against each other in a single figure
# - Plot the first couple of PCs against each other