forked from GRIAC/system_genetics
335 lines
9.2 KiB
R
335 lines
9.2 KiB
R
|
# Gender QC
|
||
|
# Normalized with limma::voom
|
||
|
|
||
|
source("__ - Preloader.R", verbose=T)
|
||
|
|
||
|
|
||
|
# The analysis
|
||
|
norm.expr.data <- expression.data %>%
|
||
|
tibble::column_to_rownames("Gene")
|
||
|
norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
|
||
|
limma::voom() %>%
|
||
|
as.matrix()
|
||
|
|
||
|
x.genes <- gene.data %>%
|
||
|
dplyr::filter(chromosome_name == "X") %>%
|
||
|
dplyr::pull(ensembl_gene_id)
|
||
|
|
||
|
y.genes <- gene.data %>%
|
||
|
dplyr::filter(chromosome_name == "Y") %>%
|
||
|
dplyr::pull(ensembl_gene_id)
|
||
|
|
||
|
# Gender QC
|
||
|
results.dir.gender <- file.path(results.dir, "gender.check")
|
||
|
dir.create(results.dir.gender, recursive=TRUE)
|
||
|
|
||
|
# Differential Expression on Gender
|
||
|
gender.qc.patients <- master.Table %>%
|
||
|
dplyr::filter(
|
||
|
!is.na(GenomeScan_ID)
|
||
|
) %>%
|
||
|
dplyr::mutate(
|
||
|
gender = factor(gender, levels = c("male", "female")),
|
||
|
age = as.numeric(age),
|
||
|
smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
|
||
|
) %>%
|
||
|
dplyr::filter(
|
||
|
!is.na(gender)
|
||
|
)
|
||
|
gender.qc.sample.order <- gender.qc.patients %>%
|
||
|
dplyr::pull(GenomeScan_ID)
|
||
|
|
||
|
gender.qc.expression.data <- expression.data %>%
|
||
|
tibble::column_to_rownames("Gene") %>%
|
||
|
select.columns.in.order(gender.qc.sample.order) %>%
|
||
|
as.matrix()
|
||
|
|
||
|
design <- model.matrix( ~0 + gender, data = gender.qc.patients)
|
||
|
|
||
|
DGEL <- edgeR::DGEList(gender.qc.expression.data)
|
||
|
keep <- edgeR::filterByExpr(DGEL)
|
||
|
keep[names(keep) %in% x.genes] <- TRUE
|
||
|
keep[names(keep) %in% y.genes] <- TRUE
|
||
|
DGEL <- DGEL[keep, , keep.lib.sizes=FALSE]
|
||
|
DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM")
|
||
|
|
||
|
DGEL <- edgeR::estimateDisp(DGEL, design)
|
||
|
fit <- edgeR::glmQLFit(DGEL,design)
|
||
|
|
||
|
contrasts <- limma::makeContrasts(
|
||
|
gender = gendermale - genderfemale,
|
||
|
levels = design
|
||
|
)
|
||
|
qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"])
|
||
|
|
||
|
gender.qc.results <- edgeR::topTags(
|
||
|
qlf,
|
||
|
n=nrow(DGEL)
|
||
|
)$table %>%
|
||
|
tibble::rownames_to_column("ensembl.id") %>%
|
||
|
dplyr::left_join(
|
||
|
y = gene.data,
|
||
|
by = c("ensembl.id" = "ensembl_gene_id")
|
||
|
) %>%
|
||
|
readr::write_csv(
|
||
|
file.path(results.dir.gender, "differential.expression.on.gender.csv")
|
||
|
)
|
||
|
|
||
|
# Plotting of gender expression
|
||
|
results.dir.gender.plot <- file.path(results.dir.gender, "img")
|
||
|
dir.create(results.dir.gender.plot, recursive=TRUE)
|
||
|
|
||
|
gender.qc.genes.to.plot <- gender.qc.results %>%
|
||
|
dplyr::arrange(PValue) %>%
|
||
|
dplyr::group_by(chromosome_name) %>%
|
||
|
dplyr::filter(
|
||
|
(
|
||
|
chromosome_name %in% c("X", "Y") &
|
||
|
FDR < 0.05 &
|
||
|
dplyr::row_number() <= 5
|
||
|
) |
|
||
|
hgnc_symbol %in% c(
|
||
|
"XIST",
|
||
|
"TSIX",
|
||
|
"KDM6A",
|
||
|
"ZFX",
|
||
|
"KDM5C",
|
||
|
"ZFY-AS1",
|
||
|
"ARSDP1",
|
||
|
"GYG2P1",
|
||
|
"RBMY2JP",
|
||
|
"ARSLP1"
|
||
|
)
|
||
|
)
|
||
|
|
||
|
gender.qc.data <- as.data.frame(norm.expr.data) %>%
|
||
|
rownames_to_column("ensembl.id") %>%
|
||
|
tidyr::gather(
|
||
|
key = "rna.seq.sample.id",
|
||
|
value = "expr.value",
|
||
|
-ensembl.id
|
||
|
) %>%
|
||
|
dplyr::filter(
|
||
|
ensembl.id %in% gender.qc.genes.to.plot$ensembl.id
|
||
|
) %>%
|
||
|
dplyr::left_join(
|
||
|
y = gender.qc.patients,
|
||
|
by = c("rna.seq.sample.id" = "GenomeScan_ID")
|
||
|
) %>%
|
||
|
#dplyr::filter(
|
||
|
# !is.na(gender)
|
||
|
#) %>%
|
||
|
readr::write_csv(
|
||
|
file.path(results.dir.gender.plot, "plot.data.voom.csv")
|
||
|
)
|
||
|
|
||
|
for (chr in gender.qc.genes.to.plot$chromosome_name) {
|
||
|
current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>%
|
||
|
dplyr::filter(chromosome_name == chr)
|
||
|
chromosome_name <- chr
|
||
|
i <- 0
|
||
|
for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) {
|
||
|
i <- i + 1
|
||
|
|
||
|
hgnc_symbol <- gene.data %>%
|
||
|
dplyr::filter(
|
||
|
ensembl_gene_id == current.ensembl.id
|
||
|
) %>%
|
||
|
dplyr::pull(hgnc_symbol)
|
||
|
|
||
|
# calculate outliers, kinda
|
||
|
plot.data <- gender.qc.data %>%
|
||
|
dplyr::filter(
|
||
|
ensembl.id == current.ensembl.id
|
||
|
) %>%
|
||
|
dplyr::mutate(
|
||
|
gender = dplyr::case_when(
|
||
|
is.na(gender) | (stringr::str_trim(gender) == "") ~ "other",
|
||
|
TRUE ~ gender
|
||
|
)
|
||
|
)
|
||
|
|
||
|
if (nrow(plot.data) <= 0) {
|
||
|
next
|
||
|
}
|
||
|
|
||
|
outliers <- boxplot(
|
||
|
formula = expr.value ~ gender,
|
||
|
data = plot.data,
|
||
|
plot = FALSE
|
||
|
)$out
|
||
|
|
||
|
result.to.annotate <- plot.data %>%
|
||
|
dplyr::filter(
|
||
|
expr.value %in% outliers
|
||
|
)
|
||
|
|
||
|
# Visual: plot range (for t-test p-value)
|
||
|
plot.y.range <- c(
|
||
|
"min" = as.integer(min(plot.data$expr.value) - 1) ,
|
||
|
"max" = as.integer(max(plot.data$expr.value) + 1)
|
||
|
)
|
||
|
plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05)
|
||
|
plot.y.range["min"] <- plot.y.range["min"] - plot.margin
|
||
|
plot.y.range["max"] <- plot.y.range["max"] + plot.margin
|
||
|
|
||
|
# Plot the damn thing as if it is Graphpad Prism
|
||
|
stat.table <- rstatix::t_test(plot.data, expr.value ~ gender)
|
||
|
plt <- plot.data %>%
|
||
|
ggplot2::ggplot(
|
||
|
mapping = ggplot2::aes(
|
||
|
x = gender,
|
||
|
y = expr.value
|
||
|
)
|
||
|
) +
|
||
|
ggplot2::geom_jitter(
|
||
|
mapping = ggplot2::aes(
|
||
|
colour = gender,
|
||
|
shape = gender
|
||
|
),
|
||
|
width = 0.1
|
||
|
) +
|
||
|
ggrepel::geom_text_repel(
|
||
|
data = result.to.annotate,
|
||
|
mapping = ggplot2::aes(
|
||
|
label = sample.id
|
||
|
),
|
||
|
size = 2,
|
||
|
box.padding = unit(0.35, "lines"),
|
||
|
point.padding = unit(0.3, "lines")
|
||
|
) +
|
||
|
ggplot2::stat_summary(
|
||
|
fun = "mean",
|
||
|
geom = "crossbar",
|
||
|
width = 0.3,
|
||
|
size = 0.3
|
||
|
) +
|
||
|
ggplot2::scale_y_continuous(
|
||
|
limits = plot.y.range,
|
||
|
guide = "prism_offset"
|
||
|
) +
|
||
|
#ggprism::add_pvalue(
|
||
|
# stat.table,
|
||
|
# y.position = plot.y.range["max"]
|
||
|
#) +
|
||
|
ggprism::theme_prism() +
|
||
|
ggprism::scale_colour_prism() +
|
||
|
ggprism::scale_shape_prism() +
|
||
|
ggplot2::theme(
|
||
|
legend.position = "none"
|
||
|
) +
|
||
|
ggplot2::labs(
|
||
|
subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"),
|
||
|
x = "Gender",
|
||
|
y = "Normalised Expression Values"
|
||
|
)
|
||
|
|
||
|
ggplot2::ggsave(
|
||
|
filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")),
|
||
|
plot = plt,
|
||
|
width = 12.5,
|
||
|
height = 12.5,
|
||
|
unit = "cm"
|
||
|
)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
# Let's try a GSVA
|
||
|
gsva.groups <- list(
|
||
|
X = gender.qc.genes.to.plot %>%
|
||
|
dplyr::filter(chromosome_name == "X") %>%
|
||
|
dplyr::pull(ensembl.id),
|
||
|
Y = gender.qc.genes.to.plot %>%
|
||
|
dplyr::filter(chromosome_name == "Y") %>%
|
||
|
dplyr::pull(ensembl.id)
|
||
|
)
|
||
|
|
||
|
gsva_res = GSVA::gsva(
|
||
|
norm.expr.data,
|
||
|
gsva.groups,
|
||
|
mx.diff = TRUE,
|
||
|
verbose = FALSE,
|
||
|
parallel.sz = 1
|
||
|
)
|
||
|
|
||
|
|
||
|
gender.qc.gsva.data <- as.data.frame(gsva_res) %>%
|
||
|
rownames_to_column("gsva.group") %>%
|
||
|
tidyr::gather(
|
||
|
key = "rna.seq.sample.id",
|
||
|
value = "gsva.value",
|
||
|
-gsva.group
|
||
|
) %>%
|
||
|
dplyr::left_join(
|
||
|
y = gender.qc.patients %>%
|
||
|
dplyr::select(
|
||
|
GenomeScan_ID,
|
||
|
sample.id,
|
||
|
gender
|
||
|
),
|
||
|
by = c("rna.seq.sample.id" = "GenomeScan_ID")
|
||
|
) %>%
|
||
|
readr::write_csv(
|
||
|
file.path(results.dir.gender.plot, "plot.data.gsva.csv")
|
||
|
)
|
||
|
|
||
|
|
||
|
for (c.gender in unique(gender.qc.gsva.data$gender)) {
|
||
|
if (is.na(c.gender)) {
|
||
|
next
|
||
|
}
|
||
|
|
||
|
c.plot.data <- gender.qc.gsva.data %>%
|
||
|
dplyr::filter(
|
||
|
gender == c.gender
|
||
|
)
|
||
|
|
||
|
outliers <- boxplot(
|
||
|
formula = gsva.value ~ gsva.group,
|
||
|
data = c.plot.data,
|
||
|
plot = FALSE
|
||
|
)$out
|
||
|
|
||
|
result.to.annotate <- c.plot.data %>%
|
||
|
dplyr::filter(
|
||
|
gsva.value %in% outliers
|
||
|
)
|
||
|
|
||
|
plt <- c.plot.data %>%
|
||
|
ggplot2::ggplot(
|
||
|
mapping = ggplot2::aes(
|
||
|
x = gsva.group,
|
||
|
y = gsva.value
|
||
|
)
|
||
|
) +
|
||
|
ggplot2::geom_boxplot() +
|
||
|
ggrepel::geom_text_repel(
|
||
|
data = result.to.annotate,
|
||
|
mapping = ggplot2::aes(
|
||
|
label = sample.id
|
||
|
),
|
||
|
size = 2,
|
||
|
box.padding = unit(0.35, "lines"),
|
||
|
point.padding = unit(0.3, "lines")
|
||
|
) +
|
||
|
ggprism::theme_prism() +
|
||
|
ggprism::scale_colour_prism() +
|
||
|
ggprism::scale_shape_prism() +
|
||
|
ggplot2::theme(
|
||
|
legend.position = "none"
|
||
|
) +
|
||
|
ggplot2::labs(
|
||
|
subtitle = paste0("", toupper(c.gender)),
|
||
|
x = "Chromosome",
|
||
|
y = "GSVA Values"
|
||
|
)
|
||
|
|
||
|
ggplot2::ggsave(
|
||
|
filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")),
|
||
|
plot = plt,
|
||
|
width = 12.5,
|
||
|
height = 12.5,
|
||
|
unit = "cm"
|
||
|
)
|
||
|
}
|