# Gender QC
# Normalized with limma::voom
# NOTE(review): the differential expression in this section normalizes with
# edgeR TMM factors; voom itself is not called here.

library(GSVA)
library(limma)
library(edgeR)
library(tidyverse)

# Inputs ----

# expression.data. Has columns:
# - Gene (gene identifier)
# - [Sample identifiers]
expression.data <- "mrna_counts_table.csv"

# master.Table. Has columns:
# - GenomeScan_ID
# - gender, levels = c("male", "female")
# - age
# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
master.Table <- "patient_table.csv"

# results.dir. The directory of where to put the resulting tables (and later
# your plots.)
results.dir <- "results"

# Everything below uses both inputs as tables. When an input is still a file
# path, load it; an already-loaded table is passed through untouched.
if (is.character(expression.data) && length(expression.data) == 1L) {
  expression.data <- readr::read_csv(expression.data)
}
if (is.character(master.Table) && length(master.Table) == 1L) {
  master.Table <- readr::read_csv(master.Table)
}

# The analysis ----
# We first do a differential expression analysis on gender using EdgeR.
# Afterwards you should plot these results.

# NOTE(review): gene.data (columns used here: ensembl_gene_id and
# chromosome_name) and select.columns.in.order() are not defined in this
# section; they have to exist before this point.
x.genes <- gene.data %>%
  dplyr::filter(chromosome_name == "X") %>%
  dplyr::pull(ensembl_gene_id)

y.genes <- gene.data %>%
  dplyr::filter(chromosome_name == "Y") %>%
  dplyr::pull(ensembl_gene_id)

# Gender QC ----
results.dir.gender <- file.path(results.dir, "gender.check")
# showWarnings = FALSE: rerunning the script must not warn about an existing
# directory.
dir.create(results.dir.gender, recursive = TRUE, showWarnings = FALSE)

# Differential Expression on Gender ----

# Patients with a sequencing ID and a gender that is one of the two expected
# levels; any other gender value becomes NA in factor() and is dropped.
gender.qc.patients <- master.Table %>%
  dplyr::filter(
    !is.na(GenomeScan_ID)
  ) %>%
  dplyr::mutate(
    gender = factor(gender, levels = c("male", "female")),
    age = as.numeric(age),
    smoking.status = factor(
      smoking.status,
      levels = c("Ex-smoker", "Current smoker")
    )
  ) %>%
  dplyr::filter(
    !is.na(gender)
  )

gender.qc.sample.order <- gender.qc.patients %>%
  dplyr::pull(GenomeScan_ID)

# Count matrix: genes in rows, samples in columns.
gender.qc.expression.data <- expression.data %>%
  tibble::column_to_rownames("Gene") %>%
  select.columns.in.order(gender.qc.sample.order) %>%
  as.matrix()

# The design rows follow gender.qc.patients, so the count columns must be in
# exactly that sample order. select.columns.in.order() presumably guarantees
# this; verify it rather than silently fitting a misaligned model.
if (!identical(
  colnames(gender.qc.expression.data),
  as.character(gender.qc.sample.order)
)) {
  stop(
    "Columns of the expression matrix do not match the patient sample order.",
    call. = FALSE
  )
}

# One column per gender level (gendermale, genderfemale), no intercept.
design <- model.matrix(~0 + gender, data = gender.qc.patients)

DGEL <- edgeR::DGEList(gender.qc.expression.data)

# Pass the design so the minimum group size is derived from the gender groups
# instead of treating all samples as a single group.
keep <- edgeR::filterByExpr(DGEL, design = design)
# Always retain sex-chromosome genes, whatever their expression level.
keep[names(keep) %in% c(x.genes, y.genes)] <- TRUE
DGEL <- DGEL[keep, , keep.lib.sizes = FALSE]

DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM")
DGEL <- edgeR::estimateDisp(DGEL, design)
fit <- edgeR::glmQLFit(DGEL, design)

contrasts <- limma::makeContrasts(
  gender = gendermale - genderfemale,
  levels = design
)
qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[, "gender"])

# Full result table (all tested genes), annotated with gene.data and written
# to disk. write_csv() returns its input, so the table is also kept in memory.
gender.qc.results <- edgeR::topTags(
  qlf,
  n = nrow(DGEL)
)$table %>%
  tibble::rownames_to_column("ensembl.id") %>%
  dplyr::left_join(
    y = gene.data,
    by = c("ensembl.id" = "ensembl_gene_id")
  ) %>%
  readr::write_csv(
    file.path(results.dir.gender, "differential.expression.on.gender.csv")
  )

# Plotting of gender expression ----
results.dir.gender.plot <- file.path(results.dir.gender, "img")
dir.create(results.dir.gender.plot, recursive = TRUE, showWarnings = FALSE)

# Per chromosome, the five genes with the smallest p-values; of those, only
# X/Y genes with FDR < 0.05 are kept.
gender.qc.genes.to.plot <- gender.qc.results %>%
  dplyr::arrange(PValue) %>%
  dplyr::group_by(chromosome_name) %>%
  dplyr::filter(
    (
      chromosome_name %in% c("X", "Y") &
        FDR < 0.05 &
        dplyr::row_number() <= 5
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per chromosome.
  dplyr::ungroup()

# Next thing to do:
# - Plot the normalized expression values for the genes in
#   gender.qc.genes.to.plot in a boxplot, split and colored by gender.
# - (Optional) Do a GSVA with as genesets the genes found in
#   gender.qc.genes.to.plot. Plot the boxplots as per the previous point.
# - (Optional) Plot the number of Y-chromosome reads divided by the number of
#   X-chromosome reads in a boxplot as per the first point.
#     x=$(samtools view -q 20 -f 2 $bam_file X | wc -l)
#     y=$(samtools view -q 20 -f 2 $bam_file Y | wc -l)
# - (Optional) Plot the number of Y-chromosome SNPs divided by the number of
#   X-chromosome SNPs in a boxplot as per the first point.