# Gender QC # Normalized with limma::voom source("__ - Preloader.R", verbose=T) # The analysis norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% limma::voom() %>% as.matrix() x.genes <- gene.data %>% dplyr::filter(chromosome_name == "X") %>% dplyr::pull(ensembl_gene_id) y.genes <- gene.data %>% dplyr::filter(chromosome_name == "Y") %>% dplyr::pull(ensembl_gene_id) # Gender QC results.dir.gender <- file.path(results.dir, "gender.check") dir.create(results.dir.gender, recursive=TRUE) # Differential Expression on Gender gender.qc.patients <- master.Table %>% dplyr::filter( !is.na(GenomeScan_ID) ) %>% dplyr::mutate( gender = factor(gender, levels = c("male", "female")), age = as.numeric(age), smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) ) %>% dplyr::filter( !is.na(gender) ) gender.qc.sample.order <- gender.qc.patients %>% dplyr::pull(GenomeScan_ID) gender.qc.expression.data <- expression.data %>% tibble::column_to_rownames("Gene") %>% select.columns.in.order(gender.qc.sample.order) %>% as.matrix() design <- model.matrix( ~0 + gender, data = gender.qc.patients) DGEL <- edgeR::DGEList(gender.qc.expression.data) keep <- edgeR::filterByExpr(DGEL) keep[names(keep) %in% x.genes] <- TRUE keep[names(keep) %in% y.genes] <- TRUE DGEL <- DGEL[keep, , keep.lib.sizes=FALSE] DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM") DGEL <- edgeR::estimateDisp(DGEL, design) fit <- edgeR::glmQLFit(DGEL,design) contrasts <- limma::makeContrasts( gender = gendermale - genderfemale, levels = design ) qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"]) gender.qc.results <- edgeR::topTags( qlf, n=nrow(DGEL) )$table %>% tibble::rownames_to_column("ensembl.id") %>% dplyr::left_join( y = gene.data, by = c("ensembl.id" = "ensembl_gene_id") ) %>% readr::write_csv( file.path(results.dir.gender, "differential.expression.on.gender.csv") ) # Plotting of gender expression results.dir.gender.plot <- file.path(results.dir.gender, "img") dir.create(results.dir.gender.plot, recursive=TRUE) gender.qc.genes.to.plot <- gender.qc.results %>% dplyr::arrange(PValue) %>% dplyr::group_by(chromosome_name) %>% dplyr::filter( ( chromosome_name %in% c("X", "Y") & FDR < 0.05 & dplyr::row_number() <= 5 ) | hgnc_symbol %in% c( "XIST", "TSIX", "KDM6A", "ZFX", "KDM5C", "ZFY-AS1", "ARSDP1", "GYG2P1", "RBMY2JP", "ARSLP1" ) ) gender.qc.data <- as.data.frame(norm.expr.data) %>% rownames_to_column("ensembl.id") %>% tidyr::gather( key = "rna.seq.sample.id", value = "expr.value", -ensembl.id ) %>% dplyr::filter( ensembl.id %in% gender.qc.genes.to.plot$ensembl.id ) %>% dplyr::left_join( y = gender.qc.patients, by = c("rna.seq.sample.id" = "GenomeScan_ID") ) %>% #dplyr::filter( # !is.na(gender) #) %>% readr::write_csv( file.path(results.dir.gender.plot, "plot.data.voom.csv") ) for (chr in gender.qc.genes.to.plot$chromosome_name) { current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% dplyr::filter(chromosome_name == chr) chromosome_name <- chr i <- 0 for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { i <- i + 1 hgnc_symbol <- gene.data %>% dplyr::filter( ensembl_gene_id == current.ensembl.id ) %>% dplyr::pull(hgnc_symbol) # calculate outliers, kinda plot.data <- gender.qc.data %>% dplyr::filter( ensembl.id == current.ensembl.id ) %>% dplyr::mutate( gender = dplyr::case_when( is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", TRUE ~ gender ) ) if (nrow(plot.data) <= 0) { next } outliers <- boxplot( formula = expr.value ~ gender, data = plot.data, plot = FALSE )$out result.to.annotate <- plot.data %>% dplyr::filter( expr.value %in% outliers ) # Visual: plot range (for t-test p-value) plot.y.range <- c( "min" = as.integer(min(plot.data$expr.value) - 1) , "max" = as.integer(max(plot.data$expr.value) + 1) ) plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) plot.y.range["min"] <- plot.y.range["min"] - plot.margin plot.y.range["max"] <- plot.y.range["max"] + plot.margin # Plot the damn thing as if it is Graphpad Prism stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) plt <- plot.data %>% ggplot2::ggplot( mapping = ggplot2::aes( x = gender, y = expr.value ) ) + ggplot2::geom_jitter( mapping = ggplot2::aes( colour = gender, shape = gender ), width = 0.1 ) + ggrepel::geom_text_repel( data = result.to.annotate, mapping = ggplot2::aes( label = sample.id ), size = 2, box.padding = unit(0.35, "lines"), point.padding = unit(0.3, "lines") ) + ggplot2::stat_summary( fun = "mean", geom = "crossbar", width = 0.3, size = 0.3 ) + ggplot2::scale_y_continuous( limits = plot.y.range, guide = "prism_offset" ) + #ggprism::add_pvalue( # stat.table, # y.position = plot.y.range["max"] #) + ggprism::theme_prism() + ggprism::scale_colour_prism() + ggprism::scale_shape_prism() + ggplot2::theme( legend.position = "none" ) + ggplot2::labs( subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), x = "Gender", y = "Normalised Expression Values" ) ggplot2::ggsave( filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), plot = plt, width = 12.5, height = 12.5, unit = "cm" ) } } # Let's try a GSVA gsva.groups <- list( X = gender.qc.genes.to.plot %>% dplyr::filter(chromosome_name == "X") %>% dplyr::pull(ensembl.id), Y = gender.qc.genes.to.plot %>% dplyr::filter(chromosome_name == "Y") %>% dplyr::pull(ensembl.id) ) gsva_res = GSVA::gsva( norm.expr.data, gsva.groups, mx.diff = TRUE, verbose = FALSE, parallel.sz = 1 ) gender.qc.gsva.data <- as.data.frame(gsva_res) %>% rownames_to_column("gsva.group") %>% tidyr::gather( key = "rna.seq.sample.id", value = "gsva.value", -gsva.group ) %>% dplyr::left_join( y = gender.qc.patients %>% dplyr::select( GenomeScan_ID, sample.id, gender ), by = c("rna.seq.sample.id" = "GenomeScan_ID") ) %>% readr::write_csv( file.path(results.dir.gender.plot, "plot.data.gsva.csv") ) for (c.gender in unique(gender.qc.gsva.data$gender)) { if (is.na(c.gender)) { next } c.plot.data <- gender.qc.gsva.data %>% dplyr::filter( gender == c.gender ) outliers <- boxplot( formula = gsva.value ~ gsva.group, data = c.plot.data, plot = FALSE )$out result.to.annotate <- c.plot.data %>% dplyr::filter( gsva.value %in% outliers ) plt <- c.plot.data %>% ggplot2::ggplot( mapping = ggplot2::aes( x = gsva.group, y = gsva.value ) ) + ggplot2::geom_boxplot() + ggrepel::geom_text_repel( data = result.to.annotate, mapping = ggplot2::aes( label = sample.id ), size = 2, box.padding = unit(0.35, "lines"), point.padding = unit(0.3, "lines") ) + ggprism::theme_prism() + ggprism::scale_colour_prism() + ggprism::scale_shape_prism() + ggplot2::theme( legend.position = "none" ) + ggplot2::labs( subtitle = paste0("", toupper(c.gender)), x = "Chromosome", y = "GSVA Values" ) ggplot2::ggsave( filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), plot = plt, width = 12.5, height = 12.5, unit = "cm" ) }