113 lines
3.6 KiB
R
113 lines
3.6 KiB
R
# Gender QC
|
|
# Normalized with limma::voom
|
|
library(GSVA)
|
|
library(limma)
|
|
library(edgeR)
|
|
library(tidyverse)
|
|
|
|
|
|
# expression.data. Has columns:
|
|
# - Gene (gene identifier)
|
|
# - [Sample identifiers]
|
|
expression.data <- "mrna_counts_table.csv"
|
|
# master.Table. Has columns:
|
|
# - GenomeScan_ID
|
|
# - gender, levels = c("male", "female")
|
|
# - age
|
|
# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
|
|
master.Table <- "patient_table.csv"
|
|
# results.dir. The directory of where to put the resulting tables (and later your plots.)
|
|
results.dir <- "results"
|
|
|
|
|
|
# The analysis
|
|
# We first do a differential expression analysis on gender using EdgeR.
|
|
# Afterwards you should plot these results.
|
|
x.genes <- gene.data %>%
|
|
dplyr::filter(chromosome_name == "X") %>%
|
|
dplyr::pull(ensembl_gene_id)
|
|
|
|
y.genes <- gene.data %>%
|
|
dplyr::filter(chromosome_name == "Y") %>%
|
|
dplyr::pull(ensembl_gene_id)
|
|
|
|
# Gender QC
|
|
results.dir.gender <- file.path(results.dir, "gender.check")
|
|
dir.create(results.dir.gender, recursive=TRUE)
|
|
|
|
# Differential Expression on Gender
|
|
gender.qc.patients <- master.Table %>%
|
|
dplyr::filter(
|
|
!is.na(GenomeScan_ID)
|
|
) %>%
|
|
dplyr::mutate(
|
|
gender = factor(gender, levels = c("male", "female")),
|
|
age = as.numeric(age),
|
|
smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
|
|
) %>%
|
|
dplyr::filter(
|
|
!is.na(gender)
|
|
)
|
|
gender.qc.sample.order <- gender.qc.patients %>%
|
|
dplyr::pull(GenomeScan_ID)
|
|
|
|
gender.qc.expression.data <- expression.data %>%
|
|
tibble::column_to_rownames("Gene") %>%
|
|
select.columns.in.order(gender.qc.sample.order) %>%
|
|
as.matrix()
|
|
|
|
design <- model.matrix( ~0 + gender, data = gender.qc.patients)
|
|
|
|
DGEL <- edgeR::DGEList(gender.qc.expression.data)
|
|
keep <- edgeR::filterByExpr(DGEL)
|
|
keep[names(keep) %in% x.genes] <- TRUE
|
|
keep[names(keep) %in% y.genes] <- TRUE
|
|
DGEL <- DGEL[keep, , keep.lib.sizes=FALSE]
|
|
DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM")
|
|
|
|
DGEL <- edgeR::estimateDisp(DGEL, design)
|
|
fit <- edgeR::glmQLFit(DGEL,design)
|
|
|
|
contrasts <- limma::makeContrasts(
|
|
gender = gendermale - genderfemale,
|
|
levels = design
|
|
)
|
|
qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"])
|
|
|
|
gender.qc.results <- edgeR::topTags(
|
|
qlf,
|
|
n=nrow(DGEL)
|
|
)$table %>%
|
|
tibble::rownames_to_column("ensembl.id") %>%
|
|
dplyr::left_join(
|
|
y = gene.data,
|
|
by = c("ensembl.id" = "ensembl_gene_id")
|
|
) %>%
|
|
readr::write_csv(
|
|
file.path(results.dir.gender, "differential.expression.on.gender.csv")
|
|
)
|
|
|
|
# Plotting of gender expression
|
|
results.dir.gender.plot <- file.path(results.dir.gender, "img")
|
|
dir.create(results.dir.gender.plot, recursive=TRUE)
|
|
|
|
gender.qc.genes.to.plot <- gender.qc.results %>%
|
|
dplyr::arrange(PValue) %>%
|
|
dplyr::group_by(chromosome_name) %>%
|
|
dplyr::filter(
|
|
(
|
|
chromosome_name %in% c("X", "Y") &
|
|
FDR < 0.05 &
|
|
dplyr::row_number() <= 5
|
|
)
|
|
)
|
|
|
|
|
|
# Next thing to do:
|
|
# - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender.
|
|
# - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point.
|
|
# - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point.
|
|
# x=$(samtools view -q 20 -f 2 $bam_file X | wc -l)
|
|
# y=$(samtools view -q 20 -f 2 $bam_file Y | wc -l)
|
|
# - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point.
|