fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions.

This commit is contained in:
Jos van Nijnatten
2021-02-10 13:42:51 +01:00
parent 135f231c86
commit 648cd09a0e
6 changed files with 64 additions and 813 deletions

View File

@@ -1,16 +1,28 @@
# Gender QC
# Normalized with limma::voom
library(GSVA)
library(limma)
library(edgeR)
library(tidyverse)
source("__ - Preloader.R", verbose=T)
# expression.data. Has columns:
# - Gene (gene identifier)
# - [Sample identifiers]
expression.data <- "mrna_counts_table.csv"
# master.Table. Has columns:
# - GenomeScan_ID
# - gender, levels = c("male", "female")
# - age
# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
master.Table <- "patient_table.csv"
# results.dir. The directory of where to put the resulting tables (and later your plots.)
results.dir <- "results"
# The analysis
norm.expr.data <- expression.data %>%
tibble::column_to_rownames("Gene")
norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
limma::voom() %>%
as.matrix()
# We first do a differential expression analysis on gender using EdgeR.
# Afterwards you should plot these results.
x.genes <- gene.data %>%
dplyr::filter(chromosome_name == "X") %>%
dplyr::pull(ensembl_gene_id)
@@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>%
chromosome_name %in% c("X", "Y") &
FDR < 0.05 &
dplyr::row_number() <= 5
) |
hgnc_symbol %in% c(
"XIST",
"TSIX",
"KDM6A",
"ZFX",
"KDM5C",
"ZFY-AS1",
"ARSDP1",
"GYG2P1",
"RBMY2JP",
"ARSLP1"
)
)
gender.qc.data <- as.data.frame(norm.expr.data) %>%
rownames_to_column("ensembl.id") %>%
tidyr::gather(
key = "rna.seq.sample.id",
value = "expr.value",
-ensembl.id
) %>%
dplyr::filter(
ensembl.id %in% gender.qc.genes.to.plot$ensembl.id
) %>%
dplyr::left_join(
y = gender.qc.patients,
by = c("rna.seq.sample.id" = "GenomeScan_ID")
) %>%
#dplyr::filter(
# !is.na(gender)
#) %>%
readr::write_csv(
file.path(results.dir.gender.plot, "plot.data.voom.csv")
)
for (chr in gender.qc.genes.to.plot$chromosome_name) {
current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>%
dplyr::filter(chromosome_name == chr)
chromosome_name <- chr
i <- 0
for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) {
i <- i + 1
hgnc_symbol <- gene.data %>%
dplyr::filter(
ensembl_gene_id == current.ensembl.id
) %>%
dplyr::pull(hgnc_symbol)
# calculate outliers, kinda
plot.data <- gender.qc.data %>%
dplyr::filter(
ensembl.id == current.ensembl.id
) %>%
dplyr::mutate(
gender = dplyr::case_when(
is.na(gender) | (stringr::str_trim(gender) == "") ~ "other",
TRUE ~ gender
)
)
if (nrow(plot.data) <= 0) {
next
}
outliers <- boxplot(
formula = expr.value ~ gender,
data = plot.data,
plot = FALSE
)$out
result.to.annotate <- plot.data %>%
dplyr::filter(
expr.value %in% outliers
)
# Visual: plot range (for t-test p-value)
plot.y.range <- c(
"min" = as.integer(min(plot.data$expr.value) - 1) ,
"max" = as.integer(max(plot.data$expr.value) + 1)
)
plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05)
plot.y.range["min"] <- plot.y.range["min"] - plot.margin
plot.y.range["max"] <- plot.y.range["max"] + plot.margin
# Plot the damn thing as if it is Graphpad Prism
stat.table <- rstatix::t_test(plot.data, expr.value ~ gender)
plt <- plot.data %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
x = gender,
y = expr.value
)
) +
ggplot2::geom_jitter(
mapping = ggplot2::aes(
colour = gender,
shape = gender
),
width = 0.1
) +
ggrepel::geom_text_repel(
data = result.to.annotate,
mapping = ggplot2::aes(
label = sample.id
),
size = 2,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
) +
ggplot2::stat_summary(
fun = "mean",
geom = "crossbar",
width = 0.3,
size = 0.3
) +
ggplot2::scale_y_continuous(
limits = plot.y.range,
guide = "prism_offset"
) +
#ggprism::add_pvalue(
# stat.table,
# y.position = plot.y.range["max"]
#) +
ggprism::theme_prism() +
ggprism::scale_colour_prism() +
ggprism::scale_shape_prism() +
ggplot2::theme(
legend.position = "none"
) +
ggplot2::labs(
subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"),
x = "Gender",
y = "Normalised Expression Values"
)
ggplot2::ggsave(
filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")),
plot = plt,
width = 12.5,
height = 12.5,
unit = "cm"
)
}
}
# Let's try a GSVA
gsva.groups <- list(
X = gender.qc.genes.to.plot %>%
dplyr::filter(chromosome_name == "X") %>%
dplyr::pull(ensembl.id),
Y = gender.qc.genes.to.plot %>%
dplyr::filter(chromosome_name == "Y") %>%
dplyr::pull(ensembl.id)
)
gsva_res = GSVA::gsva(
norm.expr.data,
gsva.groups,
mx.diff = TRUE,
verbose = FALSE,
parallel.sz = 1
)
gender.qc.gsva.data <- as.data.frame(gsva_res) %>%
rownames_to_column("gsva.group") %>%
tidyr::gather(
key = "rna.seq.sample.id",
value = "gsva.value",
-gsva.group
) %>%
dplyr::left_join(
y = gender.qc.patients %>%
dplyr::select(
GenomeScan_ID,
sample.id,
gender
),
by = c("rna.seq.sample.id" = "GenomeScan_ID")
) %>%
readr::write_csv(
file.path(results.dir.gender.plot, "plot.data.gsva.csv")
)
for (c.gender in unique(gender.qc.gsva.data$gender)) {
if (is.na(c.gender)) {
next
}
c.plot.data <- gender.qc.gsva.data %>%
dplyr::filter(
gender == c.gender
)
outliers <- boxplot(
formula = gsva.value ~ gsva.group,
data = c.plot.data,
plot = FALSE
)$out
result.to.annotate <- c.plot.data %>%
dplyr::filter(
gsva.value %in% outliers
)
plt <- c.plot.data %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
x = gsva.group,
y = gsva.value
)
) +
ggplot2::geom_boxplot() +
ggrepel::geom_text_repel(
data = result.to.annotate,
mapping = ggplot2::aes(
label = sample.id
),
size = 2,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
) +
ggprism::theme_prism() +
ggprism::scale_colour_prism() +
ggprism::scale_shape_prism() +
ggplot2::theme(
legend.position = "none"
) +
ggplot2::labs(
subtitle = paste0("", toupper(c.gender)),
x = "Chromosome",
y = "GSVA Values"
)
ggplot2::ggsave(
filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")),
plot = plt,
width = 12.5,
height = 12.5,
unit = "cm"
)
}
# Next thing to do:
# - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender.
# - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point.
# - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point.
# - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point.