From 3d7c9e3b1cef3336adcb424ed52b224de9b194c5 Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Tue, 9 Feb 2021 12:52:44 +0100 Subject: [PATCH 1/5] FastQC, Trimming, Overall QC (initial) --- .gitignore | 4 + rnaseq/.DS_Store | Bin 0 -> 6148 bytes rnaseq/step1_fastqc/snippet.sh | 34 ++ rnaseq/step2_trim/snippet.sh | 64 ++++ .../01 - Principle Component Analysis.R | 200 +++++++++++ rnaseq/step6_overall_QC/02 - Gender Check.R | 334 ++++++++++++++++++ rnaseq/step6_overall_QC/03 - Sample Counts.R | 163 +++++++++ rnaseq/step6_overall_QC/__ - Preloader.R | 189 ++++++++++ rnaseq/step6_overall_QC/snippet.R | 0 9 files changed, 988 insertions(+) create mode 100644 .gitignore create mode 100644 rnaseq/.DS_Store create mode 100644 rnaseq/step6_overall_QC/01 - Principle Component Analysis.R create mode 100644 rnaseq/step6_overall_QC/02 - Gender Check.R create mode 100644 rnaseq/step6_overall_QC/03 - Sample Counts.R create mode 100644 rnaseq/step6_overall_QC/__ - Preloader.R delete mode 100644 rnaseq/step6_overall_QC/snippet.R diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57ef2cc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ + +.DS_Store +*.nosync +*.RData diff --git a/rnaseq/.DS_Store b/rnaseq/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8933a0f0cb19a47a85e31c0e56aa08d8ead71b08 GIT binary patch literal 6148 zcmeHK!A=`75FLlKYy=fKRD$DPxKSxNfXV@B73hH*)gm~wRkF(lSZTXXvm2r|1nqCM zf7CDN@AQrBMvc-6aftxUMD`ooGamU(WbY7>>Wq>$QIm)~G{#y3-9Loaxh+V|&D4R) z?9rw!T~b5^l?&D~@faE4+^w5@^=UwV=kE($B*jG*Yn>sv_=j*(*=P?*)^r1Xh*W3z zb?H4F#^YiTXF7)M&v=lfMcHcoQni)(i0yU<|iHn%2|xBGiL{+rfx z$G2*`wZA)^HoVR4_Q7%Q^Ve^a+v(kRn-awbWZAGV>wiGI_uvm==_VZJna;1U@t>fx zXrc%M!hkTaEC$>K-#6x@0*lfM1H!;O11ome z> Executing trimming of $GSID (${#GSID})" + if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools + for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do + newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} + if [ -f "${newFilename}" ]; then + echo ">>File exists ${newFilename}. Skipping" + else + echo ">>File: ${fqFile}" + if grep -q "_R1" <<< $(basename "$fqFile"); then + adapter_seq=$adapter_3p + elif grep -q "_R2" <<< $(basename "$fqFile"); then + adapter_seq=$adapter_5p + fi + echo ">>Trimming with sequence ${adapter_seq}" + trim_galore \ + --adapter "$adapter_seq" \ + --length $(printf "$adapter_seq" | wc -m) \ + --output_dir "${dir_trimmed_fastq}" \ + --fastqc_args "--noextract" \ + "${fqFile}" & + fi + done + else + echo "Skipped." + fi +done < samples.csv + + +wait +mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" +mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" +mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R new file mode 100644 index 0000000..09e4dff --- /dev/null +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -0,0 +1,200 @@ +# Principle Component Analysis +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# PCA variables +do.center = TRUE +do.scale = FALSE + + +# The analysis +master.Table %>% readr::write_csv( + file.path(results.dir, "patient.table.csv") +) + +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +# Principle Component analysis +results.dir.pca <- file.path(results.dir, "principle.components") +dir.create(results.dir.pca, recursive=TRUE) + +norm.expr.data.pcs <- norm.expr.data %>% + t() %>% + stats::prcomp( + center = do.center, + scale. = do.scale + ) + +# Write summary of PCAs to files +pcs.summery <- summary(norm.expr.data.pcs) +pcs.summery$importance %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("PC.name") %>% + readr::write_csv( + file.path(results.dir.pca, "importance.csv") + ) + +pcs.summery$x %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("ensembl.id") %>% + readr::write_csv( + file.path(results.dir.pca, "values.csv") + ) + +pcs.summery$rotation %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("sample.id") %>% + readr::write_csv( + file.path(results.dir.pca, "rotation.csv") + ) + +data.frame( + rownames = names(pcs.summery$center), + center = pcs.summery$center, + scale = pcs.summery$scale + ) %>% + readr::write_csv( + file.path(results.dir.pca, "rest.csv") + ) + +# Not saved: pcs.summery$sdev, + + + +# Plot PCAs +# https://github.com/kevinblighe/PCAtools +results.dir.pca.plot <- file.path(results.dir.pca, "img") +dir.create(results.dir.pca.plot) + +metadata <- master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + tibble::column_to_rownames("GenomeScan_ID") %>% + select.rows.in.order( + colnames(norm.expr.data) + ) + + +p <- pca(norm.expr.data, + metadata = metadata, + center = do.center, + scale = do.scale +) +elbow <- findElbowPoint(p$variance) +metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') + + +png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), + width = 800, height = 800, + units = "px", pointsize = 12, + type = "Xlib") +print(screeplot(p, + axisLabSize = 12, + titleLabSize = 12, + components = getComponents(p, 1:(elbow+5)), + vline = c(elbow) + ) + + geom_label( + aes( + x = elbow + 1, + y = 50, + label = 'Elbow method', + vjust = -1, + size = 8 + ) + )) +dev.off() + + +png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), + width = 1200, height = 1200, + units = "px", pointsize = 12, + type = "Xlib") +print(eigencorplot(p, + metavars = metavars +)) +dev.off() + + +dir.create(file.path(results.dir.pca.plot, "pairsplot")) +for (var in metavars) { + png( + filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), + width = 1200, height = 1200, + units = "px", pointsize = 12, + type = "Xlib" + ) + print(pairsplot(p, + components = getComponents(p, c(1:(elbow+1))), + triangle = TRUE, + trianglelabSize = 12, + hline = 0, vline = 0, + pointSize = 0.4, + gridlines.major = FALSE, + gridlines.minor = FALSE, + colby = var, + title = paste0('Pairs plot: ',var), + plotaxes = TRUE + )) + dev.off() +} + + + +# Plot PCAs - old failure +pca.combinations <- combinations( + n = (elbow+1), + r = 2, + v = 1:(elbow+1), + repeats.allowed = FALSE + ) + +dir.create(file.path(results.dir.pca.plot, "biplots")) +for (var in metavars) { + for (i in 1:nrow(pca.combinations)) { + pca.combi <- pca.combinations[i,] + pca.title <- paste(paste0("PC", pca.combi), collapse="_") + png( + filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), + width = 800, height = 800, + units = "px", pointsize = 12, + type = "Xlib" + ) + print( + autoplot( + norm.expr.data.pcs, + data = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + tibble::column_to_rownames("GenomeScan_ID") %>% + select.rows.in.order( + rownames(norm.expr.data.pcs$x) + ), + x = pca.combi[1], + y = pca.combi[2], + colour = var, + loadings = FALSE, + loadings.label = FALSE, + #label = FALSE, + label.size = 3 + ) + + ggprism::theme_prism() + + #ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) + ) + dev.off() + } +} + diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R new file mode 100644 index 0000000..af7536d --- /dev/null +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -0,0 +1,334 @@ +# Gender QC +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# The analysis +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +x.genes <- gene.data %>% + dplyr::filter(chromosome_name == "X") %>% + dplyr::pull(ensembl_gene_id) + +y.genes <- gene.data %>% + dplyr::filter(chromosome_name == "Y") %>% + dplyr::pull(ensembl_gene_id) + +# Gender QC +results.dir.gender <- file.path(results.dir, "gender.check") +dir.create(results.dir.gender, recursive=TRUE) + +# Differential Expression on Gender +gender.qc.patients <- master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + gender = factor(gender, levels = c("male", "female")), + age = as.numeric(age), + smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) + ) %>% + dplyr::filter( + !is.na(gender) + ) +gender.qc.sample.order <- gender.qc.patients %>% + dplyr::pull(GenomeScan_ID) + +gender.qc.expression.data <- expression.data %>% + tibble::column_to_rownames("Gene") %>% + select.columns.in.order(gender.qc.sample.order) %>% + as.matrix() + +design <- model.matrix( ~0 + gender, data = gender.qc.patients) + +DGEL <- edgeR::DGEList(gender.qc.expression.data) +keep <- edgeR::filterByExpr(DGEL) +keep[names(keep) %in% x.genes] <- TRUE +keep[names(keep) %in% y.genes] <- TRUE +DGEL <- DGEL[keep, , keep.lib.sizes=FALSE] +DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM") + +DGEL <- edgeR::estimateDisp(DGEL, design) +fit <- edgeR::glmQLFit(DGEL,design) + +contrasts <- limma::makeContrasts( + gender = gendermale - genderfemale, + levels = design +) +qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"]) + +gender.qc.results <- edgeR::topTags( + qlf, + n=nrow(DGEL) + )$table %>% + tibble::rownames_to_column("ensembl.id") %>% + dplyr::left_join( + y = gene.data, + by = c("ensembl.id" = "ensembl_gene_id") + ) %>% + readr::write_csv( + file.path(results.dir.gender, "differential.expression.on.gender.csv") + ) + +# Plotting of gender expression +results.dir.gender.plot <- file.path(results.dir.gender, "img") +dir.create(results.dir.gender.plot, recursive=TRUE) + +gender.qc.genes.to.plot <- gender.qc.results %>% + dplyr::arrange(PValue) %>% + dplyr::group_by(chromosome_name) %>% + dplyr::filter( + ( + chromosome_name %in% c("X", "Y") & + FDR < 0.05 & + dplyr::row_number() <= 5 + ) | + hgnc_symbol %in% c( + "XIST", + "TSIX", + "KDM6A", + "ZFX", + "KDM5C", + "ZFY-AS1", + "ARSDP1", + "GYG2P1", + "RBMY2JP", + "ARSLP1" + ) + ) + +gender.qc.data <- as.data.frame(norm.expr.data) %>% + rownames_to_column("ensembl.id") %>% + tidyr::gather( + key = "rna.seq.sample.id", + value = "expr.value", + -ensembl.id + ) %>% + dplyr::filter( + ensembl.id %in% gender.qc.genes.to.plot$ensembl.id + ) %>% + dplyr::left_join( + y = gender.qc.patients, + by = c("rna.seq.sample.id" = "GenomeScan_ID") + ) %>% + #dplyr::filter( + # !is.na(gender) + #) %>% + readr::write_csv( + file.path(results.dir.gender.plot, "plot.data.voom.csv") + ) + +for (chr in gender.qc.genes.to.plot$chromosome_name) { + current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == chr) + chromosome_name <- chr + i <- 0 + for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { + i <- i + 1 + + hgnc_symbol <- gene.data %>% + dplyr::filter( + ensembl_gene_id == current.ensembl.id + ) %>% + dplyr::pull(hgnc_symbol) + + # calculate outliers, kinda + plot.data <- gender.qc.data %>% + dplyr::filter( + ensembl.id == current.ensembl.id + ) %>% + dplyr::mutate( + gender = dplyr::case_when( + is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", + TRUE ~ gender + ) + ) + + if (nrow(plot.data) <= 0) { + next + } + + outliers <- boxplot( + formula = expr.value ~ gender, + data = plot.data, + plot = FALSE + )$out + + result.to.annotate <- plot.data %>% + dplyr::filter( + expr.value %in% outliers + ) + + # Visual: plot range (for t-test p-value) + plot.y.range <- c( + "min" = as.integer(min(plot.data$expr.value) - 1) , + "max" = as.integer(max(plot.data$expr.value) + 1) + ) + plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) + plot.y.range["min"] <- plot.y.range["min"] - plot.margin + plot.y.range["max"] <- plot.y.range["max"] + plot.margin + + # Plot the damn thing as if it is Graphpad Prism + stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) + plt <- plot.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = gender, + y = expr.value + ) + ) + + ggplot2::geom_jitter( + mapping = ggplot2::aes( + colour = gender, + shape = gender + ), + width = 0.1 + ) + + ggrepel::geom_text_repel( + data = result.to.annotate, + mapping = ggplot2::aes( + label = sample.id + ), + size = 2, + box.padding = unit(0.35, "lines"), + point.padding = unit(0.3, "lines") + ) + + ggplot2::stat_summary( + fun = "mean", + geom = "crossbar", + width = 0.3, + size = 0.3 + ) + + ggplot2::scale_y_continuous( + limits = plot.y.range, + guide = "prism_offset" + ) + + #ggprism::add_pvalue( + # stat.table, + # y.position = plot.y.range["max"] + #) + + ggprism::theme_prism() + + ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::theme( + legend.position = "none" + ) + + ggplot2::labs( + subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), + x = "Gender", + y = "Normalised Expression Values" + ) + + ggplot2::ggsave( + filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), + plot = plt, + width = 12.5, + height = 12.5, + unit = "cm" + ) + } +} + +# Let's try a GSVA +gsva.groups <- list( + X = gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == "X") %>% + dplyr::pull(ensembl.id), + Y = gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == "Y") %>% + dplyr::pull(ensembl.id) +) + +gsva_res = GSVA::gsva( + norm.expr.data, + gsva.groups, + mx.diff = TRUE, + verbose = FALSE, + parallel.sz = 1 +) + + +gender.qc.gsva.data <- as.data.frame(gsva_res) %>% + rownames_to_column("gsva.group") %>% + tidyr::gather( + key = "rna.seq.sample.id", + value = "gsva.value", + -gsva.group + ) %>% + dplyr::left_join( + y = gender.qc.patients %>% + dplyr::select( + GenomeScan_ID, + sample.id, + gender + ), + by = c("rna.seq.sample.id" = "GenomeScan_ID") + ) %>% + readr::write_csv( + file.path(results.dir.gender.plot, "plot.data.gsva.csv") + ) + + +for (c.gender in unique(gender.qc.gsva.data$gender)) { + if (is.na(c.gender)) { + next + } + + c.plot.data <- gender.qc.gsva.data %>% + dplyr::filter( + gender == c.gender + ) + + outliers <- boxplot( + formula = gsva.value ~ gsva.group, + data = c.plot.data, + plot = FALSE + )$out + + result.to.annotate <- c.plot.data %>% + dplyr::filter( + gsva.value %in% outliers + ) + + plt <- c.plot.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = gsva.group, + y = gsva.value + ) + ) + + ggplot2::geom_boxplot() + + ggrepel::geom_text_repel( + data = result.to.annotate, + mapping = ggplot2::aes( + label = sample.id + ), + size = 2, + box.padding = unit(0.35, "lines"), + point.padding = unit(0.3, "lines") + ) + + ggprism::theme_prism() + + ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::theme( + legend.position = "none" + ) + + ggplot2::labs( + subtitle = paste0("", toupper(c.gender)), + x = "Chromosome", + y = "GSVA Values" + ) + + ggplot2::ggsave( + filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), + plot = plt, + width = 12.5, + height = 12.5, + unit = "cm" + ) +} diff --git a/rnaseq/step6_overall_QC/03 - Sample Counts.R b/rnaseq/step6_overall_QC/03 - Sample Counts.R new file mode 100644 index 0000000..90e805a --- /dev/null +++ b/rnaseq/step6_overall_QC/03 - Sample Counts.R @@ -0,0 +1,163 @@ +# Total counts per sample +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# The analysis +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +# Total counts per sample +total.count.per.sample <- expression.data %>% + tibble::column_to_rownames("Gene") %>% + colSums() + +data.frame( + sample = names(total.count.per.sample), + counts = as.numeric(total.count.per.sample) + ) %>% + readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) + + +norm.data <- norm.expr.data %>% + as.data.frame() %>% + tibble::rownames_to_column( + "Gene" + ) %>% + tidyr::gather( + key = "sample.id", + value = "expr.value", + -Gene + ) %>% + dplyr::left_join( + y = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + id = dplyr::case_when( + stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), + TRUE ~ sample.id + ), + gender = dplyr::case_when( + stringr::str_trim(gender) == "" ~ "water", + !is.na(gender) ~ as.character(gender) + ) + ) %>% + dplyr::select( + GenomeScan_ID, + gender, + id + ), + by = c("sample.id" = "GenomeScan_ID") + ) + +norm.plot <- norm.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = id, + y = expr.value, + fill = gender + ) + ) + + ggplot2::geom_boxplot() + + ggplot2::scale_fill_manual( + values = c( + "male" = "blue", + "female" = "red", + "water" = "green" + ) + ) + + ggplot2::labs( + title = "Normalized expression values distribution", + y = "Normalized expression values (limma::voom)", + x = "Sample", + gender = "Gender" + ) + + ggprism::theme_prism() + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90) + ) + +ggplot2::ggsave( + filename = file.path(results.dir, "counts.per.sample.normalised.png"), + plot = norm.plot, + width = 40, + height = 20, + units = "cm" +) + + + +expr.data <- expression.data %>% + tidyr::gather( + key = "sample.id", + value = "expr.value", + -Gene + ) %>% + dplyr::filter( + expr.value != 0 + ) %>% + dplyr::left_join( + y = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + id = dplyr::case_when( + stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), + TRUE ~ sample.id + ), + gender = dplyr::case_when( + stringr::str_trim(gender) == "" ~ "water", + !is.na(gender) ~ as.character(gender) + ) + ) %>% + dplyr::select( + GenomeScan_ID, + gender, + id + ), + by = c("sample.id" = "GenomeScan_ID") + ) + +expr.plot <- expr.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = id, + y = expr.value, + fill = gender + ) + ) + + ggplot2::geom_boxplot() + + ggplot2::scale_fill_manual( + values = c( + "male" = "blue", + "female" = "red", + "water" = "green" + ) + ) + + ggplot2::scale_y_continuous(trans='log2') + + ggplot2::labs( + title = "Raw expression values distribution, without zero's", + y = "Expression values", + x = "Sample", + gender = "Gender" + ) + + ggprism::theme_prism() + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90) + ) + +ggplot2::ggsave( + filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), + plot = expr.plot, + width = 40, + height = 20, + units = "cm" +) + diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R new file mode 100644 index 0000000..ce96a0f --- /dev/null +++ b/rnaseq/step6_overall_QC/__ - Preloader.R @@ -0,0 +1,189 @@ +library(tidyverse) +library(ggfortify) +library(ggprism) +library(limma) +library(biomaRt) +library(PCAtools) +library(gtools) +library(edgeR) +library(ggprism) +library(foreign) + + +# Global variables +results.dir <- file.path("results.nosync", "RNA-Seq QC") +data.dir <- "Data" +patient.dir <- file.path(data.dir, "Patients") +sample.dir <- file.path(data.dir, "Samples") +expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") + + +dir.create(results.dir, recursive = TRUE) + + +#### +# Helper functions +#### +select.columns.in.order <- function(dataframe, columns) { + dataframe[, columns] +} + +select.rows.in.order <- function(dataframe, rows) { + dataframe[rows,] +} + +getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { + file.name <- file.path(file.location, "genes_info_hg38.csv") + if (!file.exists(file.name)) { + if (!("mart" %in% ls())) { + assign("mart", useEnsembl( + biomart = "ENSEMBL_MART_ENSEMBL", + dataset = "hsapiens_gene_ensembl" + )) + } + gene.list <- getBM( + filters = "ensembl_gene_id", + attributes = c( + "hgnc_symbol", + "ensembl_gene_id", + "ensembl_transcript_id", + "chromosome_name", + "start_position", + "end_position", + "strand", + "transcription_start_site", + "transcript_start", + "transcript_end", + "external_gene_name" + ), + values = as.character(ensemblIds), + mart = mart + ) + readr::write_csv(gene.list, path = file.name) + } + return( + readr::read_csv( + file.name, + col_types = readr::cols() + ) + ) +} + +#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { +# dataframe %>% +# dplyr::filter( +# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount +# ) +#} + +limma.voom.convert.column <- function(dataframe, columnname) { + dataframe %>% + tibble::column_to_rownames(columnname) %>% + limma::voom() %>% + as.data.frame() %>% + tibble::rownames_to_column(columnname) +} + +select.columns.in.order <- function(dataframe, columns) { + dataframe[, columns] +} + +drop.columns.if.all.same.value <- function(dataframe) { + for (name in colnames(dataframe)) { + is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 + if (is.all.same) { + dataframe <- dataframe %>% + dplyr::select( + -tidyselect::one_of(name) + ) + } + } + dataframe +} + + + +# Load data +master.Table <- foreign::read.spss( + file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") + ) %>% + as.data.frame() %>% + dplyr::mutate( + GenomeScan_ID = stringr::str_trim(GenomeScan_ID), + gender = forcats::fct_recode( + Gender, + female = "f", + male = "m", + other = "" + ), + age = as.numeric(Age), + smoking.status = forcats::fct_recode( + Smoking_status, + `Ex-smoker` = "ES ", + `Current smoker` = "CS ", + other = " " + ) + ) + +expression.data <- readr::read_tsv( + file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), + col_types = readr::cols() + ) + +gene.data <- getGenedataByEnsemblId38( + ensemblIds = expression.data$Gene, + file.location = expression.dir + ) %>% + dplyr::group_by(hgnc_symbol) %>% + dplyr::filter( + dplyr::row_number() == 1, + !is.na(hgnc_symbol), + hgnc_symbol != "" + ) %>% + dplyr::ungroup() %>% + dplyr::select( + hgnc_symbol, + ensembl_gene_id, + chromosome_name, + transcript_start, + transcript_end + ) + + +master.Table <- master.Table %>% + dplyr::mutate( + Group_simple2 = stringr::str_trim(Group_simple2), + Group_simple = stringr::str_trim(Group_simple), + T_number = as.character(T_number), + sample.id = stringr::str_trim(PRESTO_ID) + ) %>% + dplyr::filter( + # # According to Niek, I should not include this, for whatever reason + #!(GenomeScan_ID %in% c( + # "T02-01796", + # "T02-03095", + # "T02-10683", + # "T10-18671", + # "T12-12036" + # ) + # ), + # + # # (According to Niek, don't include) Water Controls + #!stringr::str_detect(T_number, pattern="Water"), + # + # # (According to Niek, don't include)never smoker controles + #!(Group_simple2 == "NS_Ctrl"), + # + # # (According to Niek, don't include)ALFA1 patiƫnten + #!(Group_simple == "ALFA"), + # + #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", + GenomeScan_ID %in% colnames(expression.data), + !is.na(sample.id), + sample.id != "" + ) + +expression.data <- expression.data %>% + select.columns.in.order( + c("Gene", master.Table$GenomeScan_ID) + ) diff --git a/rnaseq/step6_overall_QC/snippet.R b/rnaseq/step6_overall_QC/snippet.R deleted file mode 100644 index e69de29..0000000 From 58bd93171c9dc7406f8a3f0931e1edb6db8ff3af Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Tue, 9 Feb 2021 15:15:59 +0100 Subject: [PATCH 2/5] Test change --- rnaseq/step6_overall_QC/__ - Preloader.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R index ce96a0f..d11175a 100644 --- a/rnaseq/step6_overall_QC/__ - Preloader.R +++ b/rnaseq/step6_overall_QC/__ - Preloader.R @@ -8,7 +8,7 @@ library(gtools) library(edgeR) library(ggprism) library(foreign) - + # Global variables results.dir <- file.path("results.nosync", "RNA-Seq QC") From 135f231c861cc015e7db3cd4d12fc68408826071 Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Tue, 9 Feb 2021 15:39:35 +0100 Subject: [PATCH 3/5] Remove whitespace, MacOS specific files. --- rnaseq/.DS_Store | Bin 6148 -> 0 bytes rnaseq/step6_overall_QC/__ - Preloader.R | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 rnaseq/.DS_Store diff --git a/rnaseq/.DS_Store b/rnaseq/.DS_Store deleted file mode 100644 index 8933a0f0cb19a47a85e31c0e56aa08d8ead71b08..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!A=`75FLlKYy=fKRD$DPxKSxNfXV@B73hH*)gm~wRkF(lSZTXXvm2r|1nqCM zf7CDN@AQrBMvc-6aftxUMD`ooGamU(WbY7>>Wq>$QIm)~G{#y3-9Loaxh+V|&D4R) z?9rw!T~b5^l?&D~@faE4+^w5@^=UwV=kE($B*jG*Yn>sv_=j*(*=P?*)^r1Xh*W3z zb?H4F#^YiTXF7)M&v=lfMcHcoQni)(i0yU<|iHn%2|xBGiL{+rfx z$G2*`wZA)^HoVR4_Q7%Q^Ve^a+v(kRn-awbWZAGV>wiGI_uvm==_VZJna;1U@t>fx zXrc%M!hkTaEC$>K-#6x@0*lfM1H!;O11ome z Date: Wed, 10 Feb 2021 13:42:51 +0100 Subject: [PATCH 4/5] fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions. --- rnaseq/step1_fastqc/snippet.sh | 37 +-- rnaseq/step2_trim/snippet.sh | 64 ----- .../01 - Principle Component Analysis.R | 149 +--------- rnaseq/step6_overall_QC/02 - Gender Check.R | 272 ++---------------- rnaseq/step6_overall_QC/03 - Sample Counts.R | 166 ++--------- rnaseq/step6_overall_QC/__ - Preloader.R | 189 ------------ 6 files changed, 64 insertions(+), 813 deletions(-) delete mode 100644 rnaseq/step6_overall_QC/__ - Preloader.R diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index 0eb5213..b0d7dcb 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,34 +1,5 @@ -#!/bin/bash -#SBATCH --job-name=fastqc -#SBATCH --time=0-12:00:00 -#SBATCH --ntasks=1 -#SBATCH --mem=15G -#SBATCH --qos=regular +# The command to do a FastQC on a fastq file is +file="file_to_analyse.fq.gz" +fastqc_out="./path/to/fastqc/output/dir" - -set -e -set -u -set -x -set -o pipefail - - -module purge -module load Java -module load FastQC - - -dir_raw_fastq="$(pwd)/fastq/raw" -dir_fastqc="$(pwd)/fastqc" - - -[ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}" - - -# Run FastQC -files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz") -for file in $files; do - filename=$(basename "$file") - fastqc -o "$dir_fastqc" "$file" & -done - -wait +fastqc -o "$fastqc_out" "$file" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 6c4eb2e..e69de29 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,64 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=trimming -#SBATCH --time=2-00:00:00 -#SBATCH --ntasks=1 -#SBATCH --mem=15G -#SBATCH --qos=regular - - -set -e -set -u -set -x -set -o pipefail - - -module purge -module load TrimGalore - - -dir_raw_fastq="$(pwd)/fastq/raw" -dir_trimmed_fastq="$(pwd)/fastq/trimmed" -dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports" - -adapter_3p="TGGAATTCTCGG" # is _R1 -adapter_5p="GATCGTCGGACT" # is _R2 - - -[ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}" - - -# Trim all adapters from the sequences -while IFS=, read -r GSID Sample -do - echo ">> Executing trimming of $GSID (${#GSID})" - if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools - for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do - newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} - if [ -f "${newFilename}" ]; then - echo ">>File exists ${newFilename}. Skipping" - else - echo ">>File: ${fqFile}" - if grep -q "_R1" <<< $(basename "$fqFile"); then - adapter_seq=$adapter_3p - elif grep -q "_R2" <<< $(basename "$fqFile"); then - adapter_seq=$adapter_5p - fi - echo ">>Trimming with sequence ${adapter_seq}" - trim_galore \ - --adapter "$adapter_seq" \ - --length $(printf "$adapter_seq" | wc -m) \ - --output_dir "${dir_trimmed_fastq}" \ - --fastqc_args "--noextract" \ - "${fqFile}" & - fi - done - else - echo "Skipped." - fi -done < samples.csv - - -wait -mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" -mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" -mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 09e4dff..0c42844 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -1,7 +1,15 @@ # Principle Component Analysis # Normalized with limma::voom +library(limma) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # PCA variables @@ -10,10 +18,7 @@ do.scale = FALSE # The analysis -master.Table %>% readr::write_csv( - file.path(results.dir, "patient.table.csv") -) - +# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -69,132 +74,8 @@ data.frame( # Not saved: pcs.summery$sdev, - -# Plot PCAs -# https://github.com/kevinblighe/PCAtools -results.dir.pca.plot <- file.path(results.dir.pca, "img") -dir.create(results.dir.pca.plot) - -metadata <- master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - tibble::column_to_rownames("GenomeScan_ID") %>% - select.rows.in.order( - colnames(norm.expr.data) - ) - - -p <- pca(norm.expr.data, - metadata = metadata, - center = do.center, - scale = do.scale -) -elbow <- findElbowPoint(p$variance) -metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') - - -png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), - width = 800, height = 800, - units = "px", pointsize = 12, - type = "Xlib") -print(screeplot(p, - axisLabSize = 12, - titleLabSize = 12, - components = getComponents(p, 1:(elbow+5)), - vline = c(elbow) - ) + - geom_label( - aes( - x = elbow + 1, - y = 50, - label = 'Elbow method', - vjust = -1, - size = 8 - ) - )) -dev.off() - - -png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), - width = 1200, height = 1200, - units = "px", pointsize = 12, - type = "Xlib") -print(eigencorplot(p, - metavars = metavars -)) -dev.off() - - -dir.create(file.path(results.dir.pca.plot, "pairsplot")) -for (var in metavars) { - png( - filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), - width = 1200, height = 1200, - units = "px", pointsize = 12, - type = "Xlib" - ) - print(pairsplot(p, - components = getComponents(p, c(1:(elbow+1))), - triangle = TRUE, - trianglelabSize = 12, - hline = 0, vline = 0, - pointSize = 0.4, - gridlines.major = FALSE, - gridlines.minor = FALSE, - colby = var, - title = paste0('Pairs plot: ',var), - plotaxes = TRUE - )) - dev.off() -} - - - -# Plot PCAs - old failure -pca.combinations <- combinations( - n = (elbow+1), - r = 2, - v = 1:(elbow+1), - repeats.allowed = FALSE - ) - -dir.create(file.path(results.dir.pca.plot, "biplots")) -for (var in metavars) { - for (i in 1:nrow(pca.combinations)) { - pca.combi <- pca.combinations[i,] - pca.title <- paste(paste0("PC", pca.combi), collapse="_") - png( - filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), - width = 800, height = 800, - units = "px", pointsize = 12, - type = "Xlib" - ) - print( - autoplot( - norm.expr.data.pcs, - data = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - tibble::column_to_rownames("GenomeScan_ID") %>% - select.rows.in.order( - rownames(norm.expr.data.pcs$x) - ), - x = pca.combi[1], - y = pca.combi[2], - colour = var, - loadings = FALSE, - loadings.label = FALSE, - #label = FALSE, - label.size = 3 - ) + - ggprism::theme_prism() + - #ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) - ) - dev.off() - } -} - +# Next thing to do: +# - (Optional) scree plot - to determine the optimal cutoff for PCA inclusion based on explaination of variance +# - (Optional) eigencorplot - to correlate PCAs to clinical variables so that you know which PCA to include for which analysis +# - (Optional) pairsplot - plot multiple PCAs against each other in a single figure +# - Plot the first couple of PCAs against each other \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R index af7536d..f02f19d 100644 --- a/rnaseq/step6_overall_QC/02 - Gender Check.R +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -1,16 +1,28 @@ # Gender QC # Normalized with limma::voom +library(GSVA) +library(limma) +library(edgeR) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# master.Table. Has columns: +# - GenomeScan_ID +# - gender, levels = c("male", "female") +# - age +# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) +master.Table <- "patient_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # The analysis -norm.expr.data <- expression.data %>% - tibble::column_to_rownames("Gene") -norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% - limma::voom() %>% - as.matrix() - +# We first do a differential expression analysis on gender using EdgeR. +# Afterwards you should plot these results. x.genes <- gene.data %>% dplyr::filter(chromosome_name == "X") %>% dplyr::pull(ensembl_gene_id) @@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>% chromosome_name %in% c("X", "Y") & FDR < 0.05 & dplyr::row_number() <= 5 - ) | - hgnc_symbol %in% c( - "XIST", - "TSIX", - "KDM6A", - "ZFX", - "KDM5C", - "ZFY-AS1", - "ARSDP1", - "GYG2P1", - "RBMY2JP", - "ARSLP1" ) ) -gender.qc.data <- as.data.frame(norm.expr.data) %>% - rownames_to_column("ensembl.id") %>% - tidyr::gather( - key = "rna.seq.sample.id", - value = "expr.value", - -ensembl.id - ) %>% - dplyr::filter( - ensembl.id %in% gender.qc.genes.to.plot$ensembl.id - ) %>% - dplyr::left_join( - y = gender.qc.patients, - by = c("rna.seq.sample.id" = "GenomeScan_ID") - ) %>% - #dplyr::filter( - # !is.na(gender) - #) %>% - readr::write_csv( - file.path(results.dir.gender.plot, "plot.data.voom.csv") - ) -for (chr in gender.qc.genes.to.plot$chromosome_name) { - current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == chr) - chromosome_name <- chr - i <- 0 - for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { - i <- i + 1 - - hgnc_symbol <- gene.data %>% - dplyr::filter( - ensembl_gene_id == current.ensembl.id - ) %>% - dplyr::pull(hgnc_symbol) - - # calculate outliers, kinda - plot.data <- gender.qc.data %>% - dplyr::filter( - ensembl.id == current.ensembl.id - ) %>% - dplyr::mutate( - gender = dplyr::case_when( - is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", - TRUE ~ gender - ) - ) - - if (nrow(plot.data) <= 0) { - next - } - - outliers <- boxplot( - formula = expr.value ~ gender, - data = plot.data, - plot = FALSE - )$out - - result.to.annotate <- plot.data %>% - dplyr::filter( - expr.value %in% outliers - ) - - # Visual: plot range (for t-test p-value) - plot.y.range <- c( - "min" = as.integer(min(plot.data$expr.value) - 1) , - "max" = as.integer(max(plot.data$expr.value) + 1) - ) - plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) - plot.y.range["min"] <- plot.y.range["min"] - plot.margin - plot.y.range["max"] <- plot.y.range["max"] + plot.margin - - # Plot the damn thing as if it is Graphpad Prism - stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) - plt <- plot.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = gender, - y = expr.value - ) - ) + - ggplot2::geom_jitter( - mapping = ggplot2::aes( - colour = gender, - shape = gender - ), - width = 0.1 - ) + - ggrepel::geom_text_repel( - data = result.to.annotate, - mapping = ggplot2::aes( - label = sample.id - ), - size = 2, - box.padding = unit(0.35, "lines"), - point.padding = unit(0.3, "lines") - ) + - ggplot2::stat_summary( - fun = "mean", - geom = "crossbar", - width = 0.3, - size = 0.3 - ) + - ggplot2::scale_y_continuous( - limits = plot.y.range, - guide = "prism_offset" - ) + - #ggprism::add_pvalue( - # stat.table, - # y.position = plot.y.range["max"] - #) + - ggprism::theme_prism() + - ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::theme( - legend.position = "none" - ) + - ggplot2::labs( - subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), - x = "Gender", - y = "Normalised Expression Values" - ) - - ggplot2::ggsave( - filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), - plot = plt, - width = 12.5, - height = 12.5, - unit = "cm" - ) - } -} - -# Let's try a GSVA -gsva.groups <- list( - X = gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == "X") %>% - dplyr::pull(ensembl.id), - Y = gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == "Y") %>% - dplyr::pull(ensembl.id) -) - -gsva_res = GSVA::gsva( - norm.expr.data, - gsva.groups, - mx.diff = TRUE, - verbose = FALSE, - parallel.sz = 1 -) - - -gender.qc.gsva.data <- as.data.frame(gsva_res) %>% - rownames_to_column("gsva.group") %>% - tidyr::gather( - key = "rna.seq.sample.id", - value = "gsva.value", - -gsva.group - ) %>% - dplyr::left_join( - y = gender.qc.patients %>% - dplyr::select( - GenomeScan_ID, - sample.id, - gender - ), - by = c("rna.seq.sample.id" = "GenomeScan_ID") - ) %>% - readr::write_csv( - file.path(results.dir.gender.plot, "plot.data.gsva.csv") - ) - - -for (c.gender in unique(gender.qc.gsva.data$gender)) { - if (is.na(c.gender)) { - next - } - - c.plot.data <- gender.qc.gsva.data %>% - dplyr::filter( - gender == c.gender - ) - - outliers <- boxplot( - formula = gsva.value ~ gsva.group, - data = c.plot.data, - plot = FALSE - )$out - - result.to.annotate <- c.plot.data %>% - dplyr::filter( - gsva.value %in% outliers - ) - - plt <- c.plot.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = gsva.group, - y = gsva.value - ) - ) + - ggplot2::geom_boxplot() + - ggrepel::geom_text_repel( - data = result.to.annotate, - mapping = ggplot2::aes( - label = sample.id - ), - size = 2, - box.padding = unit(0.35, "lines"), - point.padding = unit(0.3, "lines") - ) + - ggprism::theme_prism() + - ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::theme( - legend.position = "none" - ) + - ggplot2::labs( - subtitle = paste0("", toupper(c.gender)), - x = "Chromosome", - y = "GSVA Values" - ) - - ggplot2::ggsave( - filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), - plot = plt, - width = 12.5, - height = 12.5, - unit = "cm" - ) -} +# Next thing to do: +# - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. +# - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. +# - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. +# - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point. diff --git a/rnaseq/step6_overall_QC/03 - Sample Counts.R b/rnaseq/step6_overall_QC/03 - Sample Counts.R index 90e805a..3f4c5c2 100644 --- a/rnaseq/step6_overall_QC/03 - Sample Counts.R +++ b/rnaseq/step6_overall_QC/03 - Sample Counts.R @@ -1,17 +1,25 @@ # Total counts per sample # Normalized with limma::voom +library(limma) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# master.Table. Has columns: +# - GenomeScan_ID +# - gender, levels = c("male", "female") +# - age +# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) +master.Table <- "patient_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # The analysis -norm.expr.data <- expression.data %>% - tibble::column_to_rownames("Gene") -norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% - limma::voom() %>% - as.matrix() - -# Total counts per sample +# We calculate the number of mapped reads per sample. total.count.per.sample <- expression.data %>% tibble::column_to_rownames("Gene") %>% colSums() @@ -23,141 +31,9 @@ data.frame( readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) -norm.data <- norm.expr.data %>% - as.data.frame() %>% - tibble::rownames_to_column( - "Gene" - ) %>% - tidyr::gather( - key = "sample.id", - value = "expr.value", - -Gene - ) %>% - dplyr::left_join( - y = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - dplyr::mutate( - id = dplyr::case_when( - stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), - TRUE ~ sample.id - ), - gender = dplyr::case_when( - stringr::str_trim(gender) == "" ~ "water", - !is.na(gender) ~ as.character(gender) - ) - ) %>% - dplyr::select( - GenomeScan_ID, - gender, - id - ), - by = c("sample.id" = "GenomeScan_ID") - ) - -norm.plot <- norm.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = id, - y = expr.value, - fill = gender - ) - ) + - ggplot2::geom_boxplot() + - ggplot2::scale_fill_manual( - values = c( - "male" = "blue", - "female" = "red", - "water" = "green" - ) - ) + - ggplot2::labs( - title = "Normalized expression values distribution", - y = "Normalized expression values (limma::voom)", - x = "Sample", - gender = "Gender" - ) + - ggprism::theme_prism() + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 90) - ) - -ggplot2::ggsave( - filename = file.path(results.dir, "counts.per.sample.normalised.png"), - plot = norm.plot, - width = 40, - height = 20, - units = "cm" -) - - - -expr.data <- expression.data %>% - tidyr::gather( - key = "sample.id", - value = "expr.value", - -Gene - ) %>% - dplyr::filter( - expr.value != 0 - ) %>% - dplyr::left_join( - y = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - dplyr::mutate( - id = dplyr::case_when( - stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), - TRUE ~ sample.id - ), - gender = dplyr::case_when( - stringr::str_trim(gender) == "" ~ "water", - !is.na(gender) ~ as.character(gender) - ) - ) %>% - dplyr::select( - GenomeScan_ID, - gender, - id - ), - by = c("sample.id" = "GenomeScan_ID") - ) - -expr.plot <- expr.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = id, - y = expr.value, - fill = gender - ) - ) + - ggplot2::geom_boxplot() + - ggplot2::scale_fill_manual( - values = c( - "male" = "blue", - "female" = "red", - "water" = "green" - ) - ) + - ggplot2::scale_y_continuous(trans='log2') + - ggplot2::labs( - title = "Raw expression values distribution, without zero's", - y = "Expression values", - x = "Sample", - gender = "Gender" - ) + - ggprism::theme_prism() + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 90) - ) - -ggplot2::ggsave( - filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), - plot = expr.plot, - width = 40, - height = 20, - units = "cm" -) +# Next thing to do: +# - Check the number of reads per sample in total.counts.per.sample.csv +# - Plot the reads distribution (all reads) per sample in a boxplot. +# - (Optional) Calculate the number of unmapped, multimapped, unique mapped to +# feature and unique mapped to no feature and plot these in a stacked bar graph. diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R deleted file mode 100644 index ce96a0f..0000000 --- a/rnaseq/step6_overall_QC/__ - Preloader.R +++ /dev/null @@ -1,189 +0,0 @@ -library(tidyverse) -library(ggfortify) -library(ggprism) -library(limma) -library(biomaRt) -library(PCAtools) -library(gtools) -library(edgeR) -library(ggprism) -library(foreign) - - -# Global variables -results.dir <- file.path("results.nosync", "RNA-Seq QC") -data.dir <- "Data" -patient.dir <- file.path(data.dir, "Patients") -sample.dir <- file.path(data.dir, "Samples") -expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") - - -dir.create(results.dir, recursive = TRUE) - - -#### -# Helper functions -#### -select.columns.in.order <- function(dataframe, columns) { - dataframe[, columns] -} - -select.rows.in.order <- function(dataframe, rows) { - dataframe[rows,] -} - -getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { - file.name <- file.path(file.location, "genes_info_hg38.csv") - if (!file.exists(file.name)) { - if (!("mart" %in% ls())) { - assign("mart", useEnsembl( - biomart = "ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl" - )) - } - gene.list <- getBM( - filters = "ensembl_gene_id", - attributes = c( - "hgnc_symbol", - "ensembl_gene_id", - "ensembl_transcript_id", - "chromosome_name", - "start_position", - "end_position", - "strand", - "transcription_start_site", - "transcript_start", - "transcript_end", - "external_gene_name" - ), - values = as.character(ensemblIds), - mart = mart - ) - readr::write_csv(gene.list, path = file.name) - } - return( - readr::read_csv( - file.name, - col_types = readr::cols() - ) - ) -} - -#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { -# dataframe %>% -# dplyr::filter( -# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount -# ) -#} - -limma.voom.convert.column <- function(dataframe, columnname) { - dataframe %>% - tibble::column_to_rownames(columnname) %>% - limma::voom() %>% - as.data.frame() %>% - tibble::rownames_to_column(columnname) -} - -select.columns.in.order <- function(dataframe, columns) { - dataframe[, columns] -} - -drop.columns.if.all.same.value <- function(dataframe) { - for (name in colnames(dataframe)) { - is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 - if (is.all.same) { - dataframe <- dataframe %>% - dplyr::select( - -tidyselect::one_of(name) - ) - } - } - dataframe -} - - - -# Load data -master.Table <- foreign::read.spss( - file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") - ) %>% - as.data.frame() %>% - dplyr::mutate( - GenomeScan_ID = stringr::str_trim(GenomeScan_ID), - gender = forcats::fct_recode( - Gender, - female = "f", - male = "m", - other = "" - ), - age = as.numeric(Age), - smoking.status = forcats::fct_recode( - Smoking_status, - `Ex-smoker` = "ES ", - `Current smoker` = "CS ", - other = " " - ) - ) - -expression.data <- readr::read_tsv( - file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), - col_types = readr::cols() - ) - -gene.data <- getGenedataByEnsemblId38( - ensemblIds = expression.data$Gene, - file.location = expression.dir - ) %>% - dplyr::group_by(hgnc_symbol) %>% - dplyr::filter( - dplyr::row_number() == 1, - !is.na(hgnc_symbol), - hgnc_symbol != "" - ) %>% - dplyr::ungroup() %>% - dplyr::select( - hgnc_symbol, - ensembl_gene_id, - chromosome_name, - transcript_start, - transcript_end - ) - - -master.Table <- master.Table %>% - dplyr::mutate( - Group_simple2 = stringr::str_trim(Group_simple2), - Group_simple = stringr::str_trim(Group_simple), - T_number = as.character(T_number), - sample.id = stringr::str_trim(PRESTO_ID) - ) %>% - dplyr::filter( - # # According to Niek, I should not include this, for whatever reason - #!(GenomeScan_ID %in% c( - # "T02-01796", - # "T02-03095", - # "T02-10683", - # "T10-18671", - # "T12-12036" - # ) - # ), - # - # # (According to Niek, don't include) Water Controls - #!stringr::str_detect(T_number, pattern="Water"), - # - # # (According to Niek, don't include)never smoker controles - #!(Group_simple2 == "NS_Ctrl"), - # - # # (According to Niek, don't include)ALFA1 patiƫnten - #!(Group_simple == "ALFA"), - # - #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", - GenomeScan_ID %in% colnames(expression.data), - !is.na(sample.id), - sample.id != "" - ) - -expression.data <- expression.data %>% - select.columns.in.order( - c("Gene", master.Table$GenomeScan_ID) - ) From 170a001729326564e350dc6ee4cf79624345dc5f Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Mon, 15 Feb 2021 12:54:31 +0100 Subject: [PATCH 5/5] Count reads in X and Y chromosome. --- rnaseq/step6_overall_QC/02 - Gender Check.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R index f02f19d..92f4e4d 100644 --- a/rnaseq/step6_overall_QC/02 - Gender Check.R +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -107,4 +107,6 @@ gender.qc.genes.to.plot <- gender.qc.results %>% # - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. # - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. # - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. +# x=$(samtools view -q 20 -f 2 $bam_file X | wc -l) +# y=$(samtools view -q 20 -f 2 $bam_file Y | wc -l) # - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point.