From 3d7c9e3b1cef3336adcb424ed52b224de9b194c5 Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Tue, 9 Feb 2021 12:52:44 +0100 Subject: [PATCH 01/11] FastQC, Trimming, Overall QC (initial) --- .gitignore | 4 + rnaseq/.DS_Store | Bin 0 -> 6148 bytes rnaseq/step1_fastqc/snippet.sh | 34 ++ rnaseq/step2_trim/snippet.sh | 64 ++++ .../01 - Principle Component Analysis.R | 200 +++++++++++ rnaseq/step6_overall_QC/02 - Gender Check.R | 334 ++++++++++++++++++ rnaseq/step6_overall_QC/03 - Sample Counts.R | 163 +++++++++ rnaseq/step6_overall_QC/__ - Preloader.R | 189 ++++++++++ rnaseq/step6_overall_QC/snippet.R | 0 9 files changed, 988 insertions(+) create mode 100644 .gitignore create mode 100644 rnaseq/.DS_Store create mode 100644 rnaseq/step6_overall_QC/01 - Principle Component Analysis.R create mode 100644 rnaseq/step6_overall_QC/02 - Gender Check.R create mode 100644 rnaseq/step6_overall_QC/03 - Sample Counts.R create mode 100644 rnaseq/step6_overall_QC/__ - Preloader.R delete mode 100644 rnaseq/step6_overall_QC/snippet.R diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57ef2cc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ + +.DS_Store +*.nosync +*.RData diff --git a/rnaseq/.DS_Store b/rnaseq/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8933a0f0cb19a47a85e31c0e56aa08d8ead71b08 GIT binary patch literal 6148 zcmeHK!A=`75FLlKYy=fKRD$DPxKSxNfXV@B73hH*)gm~wRkF(lSZTXXvm2r|1nqCM zf7CDN@AQrBMvc-6aftxUMD`ooGamU(WbY7>>Wq>$QIm)~G{#y3-9Loaxh+V|&D4R) z?9rw!T~b5^l?&D~@faE4+^w5@^=UwV=kE($B*jG*Yn>sv_=j*(*=P?*)^r1Xh*W3z zb?H4F#^YiTXF7)M&v=lfMcHcoQni)(i0yU<|iHn%2|xBGiL{+rfx z$G2*`wZA)^HoVR4_Q7%Q^Ve^a+v(kRn-awbWZAGV>wiGI_uvm==_VZJna;1U@t>fx zXrc%M!hkTaEC$>K-#6x@0*lfM1H!;O11ome z> Executing trimming of $GSID (${#GSID})" + if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools + for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do + newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} + if [ -f "${newFilename}" ]; then + echo ">>File exists ${newFilename}. Skipping" + else + echo ">>File: ${fqFile}" + if grep -q "_R1" <<< $(basename "$fqFile"); then + adapter_seq=$adapter_3p + elif grep -q "_R2" <<< $(basename "$fqFile"); then + adapter_seq=$adapter_5p + fi + echo ">>Trimming with sequence ${adapter_seq}" + trim_galore \ + --adapter "$adapter_seq" \ + --length $(printf "$adapter_seq" | wc -m) \ + --output_dir "${dir_trimmed_fastq}" \ + --fastqc_args "--noextract" \ + "${fqFile}" & + fi + done + else + echo "Skipped." + fi +done < samples.csv + + +wait +mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" +mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" +mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R new file mode 100644 index 0000000..09e4dff --- /dev/null +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -0,0 +1,200 @@ +# Principle Component Analysis +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# PCA variables +do.center = TRUE +do.scale = FALSE + + +# The analysis +master.Table %>% readr::write_csv( + file.path(results.dir, "patient.table.csv") +) + +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +# Principle Component analysis +results.dir.pca <- file.path(results.dir, "principle.components") +dir.create(results.dir.pca, recursive=TRUE) + +norm.expr.data.pcs <- norm.expr.data %>% + t() %>% + stats::prcomp( + center = do.center, + scale. = do.scale + ) + +# Write summary of PCAs to files +pcs.summery <- summary(norm.expr.data.pcs) +pcs.summery$importance %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("PC.name") %>% + readr::write_csv( + file.path(results.dir.pca, "importance.csv") + ) + +pcs.summery$x %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("ensembl.id") %>% + readr::write_csv( + file.path(results.dir.pca, "values.csv") + ) + +pcs.summery$rotation %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("sample.id") %>% + readr::write_csv( + file.path(results.dir.pca, "rotation.csv") + ) + +data.frame( + rownames = names(pcs.summery$center), + center = pcs.summery$center, + scale = pcs.summery$scale + ) %>% + readr::write_csv( + file.path(results.dir.pca, "rest.csv") + ) + +# Not saved: pcs.summery$sdev, + + + +# Plot PCAs +# https://github.com/kevinblighe/PCAtools +results.dir.pca.plot <- file.path(results.dir.pca, "img") +dir.create(results.dir.pca.plot) + +metadata <- master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + tibble::column_to_rownames("GenomeScan_ID") %>% + select.rows.in.order( + colnames(norm.expr.data) + ) + + +p <- pca(norm.expr.data, + metadata = metadata, + center = do.center, + scale = do.scale +) +elbow <- findElbowPoint(p$variance) +metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') + + +png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), + width = 800, height = 800, + units = "px", pointsize = 12, + type = "Xlib") +print(screeplot(p, + axisLabSize = 12, + titleLabSize = 12, + components = getComponents(p, 1:(elbow+5)), + vline = c(elbow) + ) + + geom_label( + aes( + x = elbow + 1, + y = 50, + label = 'Elbow method', + vjust = -1, + size = 8 + ) + )) +dev.off() + + +png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), + width = 1200, height = 1200, + units = "px", pointsize = 12, + type = "Xlib") +print(eigencorplot(p, + metavars = metavars +)) +dev.off() + + +dir.create(file.path(results.dir.pca.plot, "pairsplot")) +for (var in metavars) { + png( + filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), + width = 1200, height = 1200, + units = "px", pointsize = 12, + type = "Xlib" + ) + print(pairsplot(p, + components = getComponents(p, c(1:(elbow+1))), + triangle = TRUE, + trianglelabSize = 12, + hline = 0, vline = 0, + pointSize = 0.4, + gridlines.major = FALSE, + gridlines.minor = FALSE, + colby = var, + title = paste0('Pairs plot: ',var), + plotaxes = TRUE + )) + dev.off() +} + + + +# Plot PCAs - old failure +pca.combinations <- combinations( + n = (elbow+1), + r = 2, + v = 1:(elbow+1), + repeats.allowed = FALSE + ) + +dir.create(file.path(results.dir.pca.plot, "biplots")) +for (var in metavars) { + for (i in 1:nrow(pca.combinations)) { + pca.combi <- pca.combinations[i,] + pca.title <- paste(paste0("PC", pca.combi), collapse="_") + png( + filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), + width = 800, height = 800, + units = "px", pointsize = 12, + type = "Xlib" + ) + print( + autoplot( + norm.expr.data.pcs, + data = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + tibble::column_to_rownames("GenomeScan_ID") %>% + select.rows.in.order( + rownames(norm.expr.data.pcs$x) + ), + x = pca.combi[1], + y = pca.combi[2], + colour = var, + loadings = FALSE, + loadings.label = FALSE, + #label = FALSE, + label.size = 3 + ) + + ggprism::theme_prism() + + #ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) + ) + dev.off() + } +} + diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R new file mode 100644 index 0000000..af7536d --- /dev/null +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -0,0 +1,334 @@ +# Gender QC +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# The analysis +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +x.genes <- gene.data %>% + dplyr::filter(chromosome_name == "X") %>% + dplyr::pull(ensembl_gene_id) + +y.genes <- gene.data %>% + dplyr::filter(chromosome_name == "Y") %>% + dplyr::pull(ensembl_gene_id) + +# Gender QC +results.dir.gender <- file.path(results.dir, "gender.check") +dir.create(results.dir.gender, recursive=TRUE) + +# Differential Expression on Gender +gender.qc.patients <- master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + gender = factor(gender, levels = c("male", "female")), + age = as.numeric(age), + smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) + ) %>% + dplyr::filter( + !is.na(gender) + ) +gender.qc.sample.order <- gender.qc.patients %>% + dplyr::pull(GenomeScan_ID) + +gender.qc.expression.data <- expression.data %>% + tibble::column_to_rownames("Gene") %>% + select.columns.in.order(gender.qc.sample.order) %>% + as.matrix() + +design <- model.matrix( ~0 + gender, data = gender.qc.patients) + +DGEL <- edgeR::DGEList(gender.qc.expression.data) +keep <- edgeR::filterByExpr(DGEL) +keep[names(keep) %in% x.genes] <- TRUE +keep[names(keep) %in% y.genes] <- TRUE +DGEL <- DGEL[keep, , keep.lib.sizes=FALSE] +DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM") + +DGEL <- edgeR::estimateDisp(DGEL, design) +fit <- edgeR::glmQLFit(DGEL,design) + +contrasts <- limma::makeContrasts( + gender = gendermale - genderfemale, + levels = design +) +qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"]) + +gender.qc.results <- edgeR::topTags( + qlf, + n=nrow(DGEL) + )$table %>% + tibble::rownames_to_column("ensembl.id") %>% + dplyr::left_join( + y = gene.data, + by = c("ensembl.id" = "ensembl_gene_id") + ) %>% + readr::write_csv( + file.path(results.dir.gender, "differential.expression.on.gender.csv") + ) + +# Plotting of gender expression +results.dir.gender.plot <- file.path(results.dir.gender, "img") +dir.create(results.dir.gender.plot, recursive=TRUE) + +gender.qc.genes.to.plot <- gender.qc.results %>% + dplyr::arrange(PValue) %>% + dplyr::group_by(chromosome_name) %>% + dplyr::filter( + ( + chromosome_name %in% c("X", "Y") & + FDR < 0.05 & + dplyr::row_number() <= 5 + ) | + hgnc_symbol %in% c( + "XIST", + "TSIX", + "KDM6A", + "ZFX", + "KDM5C", + "ZFY-AS1", + "ARSDP1", + "GYG2P1", + "RBMY2JP", + "ARSLP1" + ) + ) + +gender.qc.data <- as.data.frame(norm.expr.data) %>% + rownames_to_column("ensembl.id") %>% + tidyr::gather( + key = "rna.seq.sample.id", + value = "expr.value", + -ensembl.id + ) %>% + dplyr::filter( + ensembl.id %in% gender.qc.genes.to.plot$ensembl.id + ) %>% + dplyr::left_join( + y = gender.qc.patients, + by = c("rna.seq.sample.id" = "GenomeScan_ID") + ) %>% + #dplyr::filter( + # !is.na(gender) + #) %>% + readr::write_csv( + file.path(results.dir.gender.plot, "plot.data.voom.csv") + ) + +for (chr in gender.qc.genes.to.plot$chromosome_name) { + current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == chr) + chromosome_name <- chr + i <- 0 + for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { + i <- i + 1 + + hgnc_symbol <- gene.data %>% + dplyr::filter( + ensembl_gene_id == current.ensembl.id + ) %>% + dplyr::pull(hgnc_symbol) + + # calculate outliers, kinda + plot.data <- gender.qc.data %>% + dplyr::filter( + ensembl.id == current.ensembl.id + ) %>% + dplyr::mutate( + gender = dplyr::case_when( + is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", + TRUE ~ gender + ) + ) + + if (nrow(plot.data) <= 0) { + next + } + + outliers <- boxplot( + formula = expr.value ~ gender, + data = plot.data, + plot = FALSE + )$out + + result.to.annotate <- plot.data %>% + dplyr::filter( + expr.value %in% outliers + ) + + # Visual: plot range (for t-test p-value) + plot.y.range <- c( + "min" = as.integer(min(plot.data$expr.value) - 1) , + "max" = as.integer(max(plot.data$expr.value) + 1) + ) + plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) + plot.y.range["min"] <- plot.y.range["min"] - plot.margin + plot.y.range["max"] <- plot.y.range["max"] + plot.margin + + # Plot the damn thing as if it is Graphpad Prism + stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) + plt <- plot.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = gender, + y = expr.value + ) + ) + + ggplot2::geom_jitter( + mapping = ggplot2::aes( + colour = gender, + shape = gender + ), + width = 0.1 + ) + + ggrepel::geom_text_repel( + data = result.to.annotate, + mapping = ggplot2::aes( + label = sample.id + ), + size = 2, + box.padding = unit(0.35, "lines"), + point.padding = unit(0.3, "lines") + ) + + ggplot2::stat_summary( + fun = "mean", + geom = "crossbar", + width = 0.3, + size = 0.3 + ) + + ggplot2::scale_y_continuous( + limits = plot.y.range, + guide = "prism_offset" + ) + + #ggprism::add_pvalue( + # stat.table, + # y.position = plot.y.range["max"] + #) + + ggprism::theme_prism() + + ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::theme( + legend.position = "none" + ) + + ggplot2::labs( + subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), + x = "Gender", + y = "Normalised Expression Values" + ) + + ggplot2::ggsave( + filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), + plot = plt, + width = 12.5, + height = 12.5, + unit = "cm" + ) + } +} + +# Let's try a GSVA +gsva.groups <- list( + X = gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == "X") %>% + dplyr::pull(ensembl.id), + Y = gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == "Y") %>% + dplyr::pull(ensembl.id) +) + +gsva_res = GSVA::gsva( + norm.expr.data, + gsva.groups, + mx.diff = TRUE, + verbose = FALSE, + parallel.sz = 1 +) + + +gender.qc.gsva.data <- as.data.frame(gsva_res) %>% + rownames_to_column("gsva.group") %>% + tidyr::gather( + key = "rna.seq.sample.id", + value = "gsva.value", + -gsva.group + ) %>% + dplyr::left_join( + y = gender.qc.patients %>% + dplyr::select( + GenomeScan_ID, + sample.id, + gender + ), + by = c("rna.seq.sample.id" = "GenomeScan_ID") + ) %>% + readr::write_csv( + file.path(results.dir.gender.plot, "plot.data.gsva.csv") + ) + + +for (c.gender in unique(gender.qc.gsva.data$gender)) { + if (is.na(c.gender)) { + next + } + + c.plot.data <- gender.qc.gsva.data %>% + dplyr::filter( + gender == c.gender + ) + + outliers <- boxplot( + formula = gsva.value ~ gsva.group, + data = c.plot.data, + plot = FALSE + )$out + + result.to.annotate <- c.plot.data %>% + dplyr::filter( + gsva.value %in% outliers + ) + + plt <- c.plot.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = gsva.group, + y = gsva.value + ) + ) + + ggplot2::geom_boxplot() + + ggrepel::geom_text_repel( + data = result.to.annotate, + mapping = ggplot2::aes( + label = sample.id + ), + size = 2, + box.padding = unit(0.35, "lines"), + point.padding = unit(0.3, "lines") + ) + + ggprism::theme_prism() + + ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::theme( + legend.position = "none" + ) + + ggplot2::labs( + subtitle = paste0("", toupper(c.gender)), + x = "Chromosome", + y = "GSVA Values" + ) + + ggplot2::ggsave( + filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), + plot = plt, + width = 12.5, + height = 12.5, + unit = "cm" + ) +} diff --git a/rnaseq/step6_overall_QC/03 - Sample Counts.R b/rnaseq/step6_overall_QC/03 - Sample Counts.R new file mode 100644 index 0000000..90e805a --- /dev/null +++ b/rnaseq/step6_overall_QC/03 - Sample Counts.R @@ -0,0 +1,163 @@ +# Total counts per sample +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# The analysis +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +# Total counts per sample +total.count.per.sample <- expression.data %>% + tibble::column_to_rownames("Gene") %>% + colSums() + +data.frame( + sample = names(total.count.per.sample), + counts = as.numeric(total.count.per.sample) + ) %>% + readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) + + +norm.data <- norm.expr.data %>% + as.data.frame() %>% + tibble::rownames_to_column( + "Gene" + ) %>% + tidyr::gather( + key = "sample.id", + value = "expr.value", + -Gene + ) %>% + dplyr::left_join( + y = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + id = dplyr::case_when( + stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), + TRUE ~ sample.id + ), + gender = dplyr::case_when( + stringr::str_trim(gender) == "" ~ "water", + !is.na(gender) ~ as.character(gender) + ) + ) %>% + dplyr::select( + GenomeScan_ID, + gender, + id + ), + by = c("sample.id" = "GenomeScan_ID") + ) + +norm.plot <- norm.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = id, + y = expr.value, + fill = gender + ) + ) + + ggplot2::geom_boxplot() + + ggplot2::scale_fill_manual( + values = c( + "male" = "blue", + "female" = "red", + "water" = "green" + ) + ) + + ggplot2::labs( + title = "Normalized expression values distribution", + y = "Normalized expression values (limma::voom)", + x = "Sample", + gender = "Gender" + ) + + ggprism::theme_prism() + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90) + ) + +ggplot2::ggsave( + filename = file.path(results.dir, "counts.per.sample.normalised.png"), + plot = norm.plot, + width = 40, + height = 20, + units = "cm" +) + + + +expr.data <- expression.data %>% + tidyr::gather( + key = "sample.id", + value = "expr.value", + -Gene + ) %>% + dplyr::filter( + expr.value != 0 + ) %>% + dplyr::left_join( + y = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + id = dplyr::case_when( + stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), + TRUE ~ sample.id + ), + gender = dplyr::case_when( + stringr::str_trim(gender) == "" ~ "water", + !is.na(gender) ~ as.character(gender) + ) + ) %>% + dplyr::select( + GenomeScan_ID, + gender, + id + ), + by = c("sample.id" = "GenomeScan_ID") + ) + +expr.plot <- expr.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = id, + y = expr.value, + fill = gender + ) + ) + + ggplot2::geom_boxplot() + + ggplot2::scale_fill_manual( + values = c( + "male" = "blue", + "female" = "red", + "water" = "green" + ) + ) + + ggplot2::scale_y_continuous(trans='log2') + + ggplot2::labs( + title = "Raw expression values distribution, without zero's", + y = "Expression values", + x = "Sample", + gender = "Gender" + ) + + ggprism::theme_prism() + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90) + ) + +ggplot2::ggsave( + filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), + plot = expr.plot, + width = 40, + height = 20, + units = "cm" +) + diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R new file mode 100644 index 0000000..ce96a0f --- /dev/null +++ b/rnaseq/step6_overall_QC/__ - Preloader.R @@ -0,0 +1,189 @@ +library(tidyverse) +library(ggfortify) +library(ggprism) +library(limma) +library(biomaRt) +library(PCAtools) +library(gtools) +library(edgeR) +library(ggprism) +library(foreign) + + +# Global variables +results.dir <- file.path("results.nosync", "RNA-Seq QC") +data.dir <- "Data" +patient.dir <- file.path(data.dir, "Patients") +sample.dir <- file.path(data.dir, "Samples") +expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") + + +dir.create(results.dir, recursive = TRUE) + + +#### +# Helper functions +#### +select.columns.in.order <- function(dataframe, columns) { + dataframe[, columns] +} + +select.rows.in.order <- function(dataframe, rows) { + dataframe[rows,] +} + +getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { + file.name <- file.path(file.location, "genes_info_hg38.csv") + if (!file.exists(file.name)) { + if (!("mart" %in% ls())) { + assign("mart", useEnsembl( + biomart = "ENSEMBL_MART_ENSEMBL", + dataset = "hsapiens_gene_ensembl" + )) + } + gene.list <- getBM( + filters = "ensembl_gene_id", + attributes = c( + "hgnc_symbol", + "ensembl_gene_id", + "ensembl_transcript_id", + "chromosome_name", + "start_position", + "end_position", + "strand", + "transcription_start_site", + "transcript_start", + "transcript_end", + "external_gene_name" + ), + values = as.character(ensemblIds), + mart = mart + ) + readr::write_csv(gene.list, path = file.name) + } + return( + readr::read_csv( + file.name, + col_types = readr::cols() + ) + ) +} + +#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { +# dataframe %>% +# dplyr::filter( +# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount +# ) +#} + +limma.voom.convert.column <- function(dataframe, columnname) { + dataframe %>% + tibble::column_to_rownames(columnname) %>% + limma::voom() %>% + as.data.frame() %>% + tibble::rownames_to_column(columnname) +} + +select.columns.in.order <- function(dataframe, columns) { + dataframe[, columns] +} + +drop.columns.if.all.same.value <- function(dataframe) { + for (name in colnames(dataframe)) { + is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 + if (is.all.same) { + dataframe <- dataframe %>% + dplyr::select( + -tidyselect::one_of(name) + ) + } + } + dataframe +} + + + +# Load data +master.Table <- foreign::read.spss( + file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") + ) %>% + as.data.frame() %>% + dplyr::mutate( + GenomeScan_ID = stringr::str_trim(GenomeScan_ID), + gender = forcats::fct_recode( + Gender, + female = "f", + male = "m", + other = "" + ), + age = as.numeric(Age), + smoking.status = forcats::fct_recode( + Smoking_status, + `Ex-smoker` = "ES ", + `Current smoker` = "CS ", + other = " " + ) + ) + +expression.data <- readr::read_tsv( + file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), + col_types = readr::cols() + ) + +gene.data <- getGenedataByEnsemblId38( + ensemblIds = expression.data$Gene, + file.location = expression.dir + ) %>% + dplyr::group_by(hgnc_symbol) %>% + dplyr::filter( + dplyr::row_number() == 1, + !is.na(hgnc_symbol), + hgnc_symbol != "" + ) %>% + dplyr::ungroup() %>% + dplyr::select( + hgnc_symbol, + ensembl_gene_id, + chromosome_name, + transcript_start, + transcript_end + ) + + +master.Table <- master.Table %>% + dplyr::mutate( + Group_simple2 = stringr::str_trim(Group_simple2), + Group_simple = stringr::str_trim(Group_simple), + T_number = as.character(T_number), + sample.id = stringr::str_trim(PRESTO_ID) + ) %>% + dplyr::filter( + # # According to Niek, I should not include this, for whatever reason + #!(GenomeScan_ID %in% c( + # "T02-01796", + # "T02-03095", + # "T02-10683", + # "T10-18671", + # "T12-12036" + # ) + # ), + # + # # (According to Niek, don't include) Water Controls + #!stringr::str_detect(T_number, pattern="Water"), + # + # # (According to Niek, don't include)never smoker controles + #!(Group_simple2 == "NS_Ctrl"), + # + # # (According to Niek, don't include)ALFA1 patiƫnten + #!(Group_simple == "ALFA"), + # + #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", + GenomeScan_ID %in% colnames(expression.data), + !is.na(sample.id), + sample.id != "" + ) + +expression.data <- expression.data %>% + select.columns.in.order( + c("Gene", master.Table$GenomeScan_ID) + ) diff --git a/rnaseq/step6_overall_QC/snippet.R b/rnaseq/step6_overall_QC/snippet.R deleted file mode 100644 index e69de29..0000000 From 58bd93171c9dc7406f8a3f0931e1edb6db8ff3af Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Tue, 9 Feb 2021 15:15:59 +0100 Subject: [PATCH 02/11] Test change --- rnaseq/step6_overall_QC/__ - Preloader.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R index ce96a0f..d11175a 100644 --- a/rnaseq/step6_overall_QC/__ - Preloader.R +++ b/rnaseq/step6_overall_QC/__ - Preloader.R @@ -8,7 +8,7 @@ library(gtools) library(edgeR) library(ggprism) library(foreign) - + # Global variables results.dir <- file.path("results.nosync", "RNA-Seq QC") From 135f231c861cc015e7db3cd4d12fc68408826071 Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Tue, 9 Feb 2021 15:39:35 +0100 Subject: [PATCH 03/11] Remove whitespace, MacOS specific files. --- rnaseq/.DS_Store | Bin 6148 -> 0 bytes rnaseq/step6_overall_QC/__ - Preloader.R | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 rnaseq/.DS_Store diff --git a/rnaseq/.DS_Store b/rnaseq/.DS_Store deleted file mode 100644 index 8933a0f0cb19a47a85e31c0e56aa08d8ead71b08..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!A=`75FLlKYy=fKRD$DPxKSxNfXV@B73hH*)gm~wRkF(lSZTXXvm2r|1nqCM zf7CDN@AQrBMvc-6aftxUMD`ooGamU(WbY7>>Wq>$QIm)~G{#y3-9Loaxh+V|&D4R) z?9rw!T~b5^l?&D~@faE4+^w5@^=UwV=kE($B*jG*Yn>sv_=j*(*=P?*)^r1Xh*W3z zb?H4F#^YiTXF7)M&v=lfMcHcoQni)(i0yU<|iHn%2|xBGiL{+rfx z$G2*`wZA)^HoVR4_Q7%Q^Ve^a+v(kRn-awbWZAGV>wiGI_uvm==_VZJna;1U@t>fx zXrc%M!hkTaEC$>K-#6x@0*lfM1H!;O11ome z Date: Wed, 10 Feb 2021 13:42:51 +0100 Subject: [PATCH 04/11] fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions. --- rnaseq/step1_fastqc/snippet.sh | 37 +-- rnaseq/step2_trim/snippet.sh | 64 ----- .../01 - Principle Component Analysis.R | 149 +--------- rnaseq/step6_overall_QC/02 - Gender Check.R | 272 ++---------------- rnaseq/step6_overall_QC/03 - Sample Counts.R | 166 ++--------- rnaseq/step6_overall_QC/__ - Preloader.R | 189 ------------ 6 files changed, 64 insertions(+), 813 deletions(-) delete mode 100644 rnaseq/step6_overall_QC/__ - Preloader.R diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index 0eb5213..b0d7dcb 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,34 +1,5 @@ -#!/bin/bash -#SBATCH --job-name=fastqc -#SBATCH --time=0-12:00:00 -#SBATCH --ntasks=1 -#SBATCH --mem=15G -#SBATCH --qos=regular +# The command to do a FastQC on a fastq file is +file="file_to_analyse.fq.gz" +fastqc_out="./path/to/fastqc/output/dir" - -set -e -set -u -set -x -set -o pipefail - - -module purge -module load Java -module load FastQC - - -dir_raw_fastq="$(pwd)/fastq/raw" -dir_fastqc="$(pwd)/fastqc" - - -[ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}" - - -# Run FastQC -files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz") -for file in $files; do - filename=$(basename "$file") - fastqc -o "$dir_fastqc" "$file" & -done - -wait +fastqc -o "$fastqc_out" "$file" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 6c4eb2e..e69de29 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,64 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=trimming -#SBATCH --time=2-00:00:00 -#SBATCH --ntasks=1 -#SBATCH --mem=15G -#SBATCH --qos=regular - - -set -e -set -u -set -x -set -o pipefail - - -module purge -module load TrimGalore - - -dir_raw_fastq="$(pwd)/fastq/raw" -dir_trimmed_fastq="$(pwd)/fastq/trimmed" -dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports" - -adapter_3p="TGGAATTCTCGG" # is _R1 -adapter_5p="GATCGTCGGACT" # is _R2 - - -[ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}" - - -# Trim all adapters from the sequences -while IFS=, read -r GSID Sample -do - echo ">> Executing trimming of $GSID (${#GSID})" - if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools - for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do - newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} - if [ -f "${newFilename}" ]; then - echo ">>File exists ${newFilename}. Skipping" - else - echo ">>File: ${fqFile}" - if grep -q "_R1" <<< $(basename "$fqFile"); then - adapter_seq=$adapter_3p - elif grep -q "_R2" <<< $(basename "$fqFile"); then - adapter_seq=$adapter_5p - fi - echo ">>Trimming with sequence ${adapter_seq}" - trim_galore \ - --adapter "$adapter_seq" \ - --length $(printf "$adapter_seq" | wc -m) \ - --output_dir "${dir_trimmed_fastq}" \ - --fastqc_args "--noextract" \ - "${fqFile}" & - fi - done - else - echo "Skipped." - fi -done < samples.csv - - -wait -mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" -mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" -mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 09e4dff..0c42844 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -1,7 +1,15 @@ # Principle Component Analysis # Normalized with limma::voom +library(limma) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # PCA variables @@ -10,10 +18,7 @@ do.scale = FALSE # The analysis -master.Table %>% readr::write_csv( - file.path(results.dir, "patient.table.csv") -) - +# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -69,132 +74,8 @@ data.frame( # Not saved: pcs.summery$sdev, - -# Plot PCAs -# https://github.com/kevinblighe/PCAtools -results.dir.pca.plot <- file.path(results.dir.pca, "img") -dir.create(results.dir.pca.plot) - -metadata <- master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - tibble::column_to_rownames("GenomeScan_ID") %>% - select.rows.in.order( - colnames(norm.expr.data) - ) - - -p <- pca(norm.expr.data, - metadata = metadata, - center = do.center, - scale = do.scale -) -elbow <- findElbowPoint(p$variance) -metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') - - -png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), - width = 800, height = 800, - units = "px", pointsize = 12, - type = "Xlib") -print(screeplot(p, - axisLabSize = 12, - titleLabSize = 12, - components = getComponents(p, 1:(elbow+5)), - vline = c(elbow) - ) + - geom_label( - aes( - x = elbow + 1, - y = 50, - label = 'Elbow method', - vjust = -1, - size = 8 - ) - )) -dev.off() - - -png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), - width = 1200, height = 1200, - units = "px", pointsize = 12, - type = "Xlib") -print(eigencorplot(p, - metavars = metavars -)) -dev.off() - - -dir.create(file.path(results.dir.pca.plot, "pairsplot")) -for (var in metavars) { - png( - filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), - width = 1200, height = 1200, - units = "px", pointsize = 12, - type = "Xlib" - ) - print(pairsplot(p, - components = getComponents(p, c(1:(elbow+1))), - triangle = TRUE, - trianglelabSize = 12, - hline = 0, vline = 0, - pointSize = 0.4, - gridlines.major = FALSE, - gridlines.minor = FALSE, - colby = var, - title = paste0('Pairs plot: ',var), - plotaxes = TRUE - )) - dev.off() -} - - - -# Plot PCAs - old failure -pca.combinations <- combinations( - n = (elbow+1), - r = 2, - v = 1:(elbow+1), - repeats.allowed = FALSE - ) - -dir.create(file.path(results.dir.pca.plot, "biplots")) -for (var in metavars) { - for (i in 1:nrow(pca.combinations)) { - pca.combi <- pca.combinations[i,] - pca.title <- paste(paste0("PC", pca.combi), collapse="_") - png( - filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), - width = 800, height = 800, - units = "px", pointsize = 12, - type = "Xlib" - ) - print( - autoplot( - norm.expr.data.pcs, - data = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - tibble::column_to_rownames("GenomeScan_ID") %>% - select.rows.in.order( - rownames(norm.expr.data.pcs$x) - ), - x = pca.combi[1], - y = pca.combi[2], - colour = var, - loadings = FALSE, - loadings.label = FALSE, - #label = FALSE, - label.size = 3 - ) + - ggprism::theme_prism() + - #ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) - ) - dev.off() - } -} - +# Next thing to do: +# - (Optional) scree plot - to determine the optimal cutoff for PCA inclusion based on explaination of variance +# - (Optional) eigencorplot - to correlate PCAs to clinical variables so that you know which PCA to include for which analysis +# - (Optional) pairsplot - plot multiple PCAs against each other in a single figure +# - Plot the first couple of PCAs against each other \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R index af7536d..f02f19d 100644 --- a/rnaseq/step6_overall_QC/02 - Gender Check.R +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -1,16 +1,28 @@ # Gender QC # Normalized with limma::voom +library(GSVA) +library(limma) +library(edgeR) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# master.Table. Has columns: +# - GenomeScan_ID +# - gender, levels = c("male", "female") +# - age +# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) +master.Table <- "patient_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # The analysis -norm.expr.data <- expression.data %>% - tibble::column_to_rownames("Gene") -norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% - limma::voom() %>% - as.matrix() - +# We first do a differential expression analysis on gender using EdgeR. +# Afterwards you should plot these results. x.genes <- gene.data %>% dplyr::filter(chromosome_name == "X") %>% dplyr::pull(ensembl_gene_id) @@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>% chromosome_name %in% c("X", "Y") & FDR < 0.05 & dplyr::row_number() <= 5 - ) | - hgnc_symbol %in% c( - "XIST", - "TSIX", - "KDM6A", - "ZFX", - "KDM5C", - "ZFY-AS1", - "ARSDP1", - "GYG2P1", - "RBMY2JP", - "ARSLP1" ) ) -gender.qc.data <- as.data.frame(norm.expr.data) %>% - rownames_to_column("ensembl.id") %>% - tidyr::gather( - key = "rna.seq.sample.id", - value = "expr.value", - -ensembl.id - ) %>% - dplyr::filter( - ensembl.id %in% gender.qc.genes.to.plot$ensembl.id - ) %>% - dplyr::left_join( - y = gender.qc.patients, - by = c("rna.seq.sample.id" = "GenomeScan_ID") - ) %>% - #dplyr::filter( - # !is.na(gender) - #) %>% - readr::write_csv( - file.path(results.dir.gender.plot, "plot.data.voom.csv") - ) -for (chr in gender.qc.genes.to.plot$chromosome_name) { - current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == chr) - chromosome_name <- chr - i <- 0 - for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { - i <- i + 1 - - hgnc_symbol <- gene.data %>% - dplyr::filter( - ensembl_gene_id == current.ensembl.id - ) %>% - dplyr::pull(hgnc_symbol) - - # calculate outliers, kinda - plot.data <- gender.qc.data %>% - dplyr::filter( - ensembl.id == current.ensembl.id - ) %>% - dplyr::mutate( - gender = dplyr::case_when( - is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", - TRUE ~ gender - ) - ) - - if (nrow(plot.data) <= 0) { - next - } - - outliers <- boxplot( - formula = expr.value ~ gender, - data = plot.data, - plot = FALSE - )$out - - result.to.annotate <- plot.data %>% - dplyr::filter( - expr.value %in% outliers - ) - - # Visual: plot range (for t-test p-value) - plot.y.range <- c( - "min" = as.integer(min(plot.data$expr.value) - 1) , - "max" = as.integer(max(plot.data$expr.value) + 1) - ) - plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) - plot.y.range["min"] <- plot.y.range["min"] - plot.margin - plot.y.range["max"] <- plot.y.range["max"] + plot.margin - - # Plot the damn thing as if it is Graphpad Prism - stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) - plt <- plot.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = gender, - y = expr.value - ) - ) + - ggplot2::geom_jitter( - mapping = ggplot2::aes( - colour = gender, - shape = gender - ), - width = 0.1 - ) + - ggrepel::geom_text_repel( - data = result.to.annotate, - mapping = ggplot2::aes( - label = sample.id - ), - size = 2, - box.padding = unit(0.35, "lines"), - point.padding = unit(0.3, "lines") - ) + - ggplot2::stat_summary( - fun = "mean", - geom = "crossbar", - width = 0.3, - size = 0.3 - ) + - ggplot2::scale_y_continuous( - limits = plot.y.range, - guide = "prism_offset" - ) + - #ggprism::add_pvalue( - # stat.table, - # y.position = plot.y.range["max"] - #) + - ggprism::theme_prism() + - ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::theme( - legend.position = "none" - ) + - ggplot2::labs( - subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), - x = "Gender", - y = "Normalised Expression Values" - ) - - ggplot2::ggsave( - filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), - plot = plt, - width = 12.5, - height = 12.5, - unit = "cm" - ) - } -} - -# Let's try a GSVA -gsva.groups <- list( - X = gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == "X") %>% - dplyr::pull(ensembl.id), - Y = gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == "Y") %>% - dplyr::pull(ensembl.id) -) - -gsva_res = GSVA::gsva( - norm.expr.data, - gsva.groups, - mx.diff = TRUE, - verbose = FALSE, - parallel.sz = 1 -) - - -gender.qc.gsva.data <- as.data.frame(gsva_res) %>% - rownames_to_column("gsva.group") %>% - tidyr::gather( - key = "rna.seq.sample.id", - value = "gsva.value", - -gsva.group - ) %>% - dplyr::left_join( - y = gender.qc.patients %>% - dplyr::select( - GenomeScan_ID, - sample.id, - gender - ), - by = c("rna.seq.sample.id" = "GenomeScan_ID") - ) %>% - readr::write_csv( - file.path(results.dir.gender.plot, "plot.data.gsva.csv") - ) - - -for (c.gender in unique(gender.qc.gsva.data$gender)) { - if (is.na(c.gender)) { - next - } - - c.plot.data <- gender.qc.gsva.data %>% - dplyr::filter( - gender == c.gender - ) - - outliers <- boxplot( - formula = gsva.value ~ gsva.group, - data = c.plot.data, - plot = FALSE - )$out - - result.to.annotate <- c.plot.data %>% - dplyr::filter( - gsva.value %in% outliers - ) - - plt <- c.plot.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = gsva.group, - y = gsva.value - ) - ) + - ggplot2::geom_boxplot() + - ggrepel::geom_text_repel( - data = result.to.annotate, - mapping = ggplot2::aes( - label = sample.id - ), - size = 2, - box.padding = unit(0.35, "lines"), - point.padding = unit(0.3, "lines") - ) + - ggprism::theme_prism() + - ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::theme( - legend.position = "none" - ) + - ggplot2::labs( - subtitle = paste0("", toupper(c.gender)), - x = "Chromosome", - y = "GSVA Values" - ) - - ggplot2::ggsave( - filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), - plot = plt, - width = 12.5, - height = 12.5, - unit = "cm" - ) -} +# Next thing to do: +# - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. +# - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. +# - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. +# - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point. diff --git a/rnaseq/step6_overall_QC/03 - Sample Counts.R b/rnaseq/step6_overall_QC/03 - Sample Counts.R index 90e805a..3f4c5c2 100644 --- a/rnaseq/step6_overall_QC/03 - Sample Counts.R +++ b/rnaseq/step6_overall_QC/03 - Sample Counts.R @@ -1,17 +1,25 @@ # Total counts per sample # Normalized with limma::voom +library(limma) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# master.Table. Has columns: +# - GenomeScan_ID +# - gender, levels = c("male", "female") +# - age +# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) +master.Table <- "patient_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # The analysis -norm.expr.data <- expression.data %>% - tibble::column_to_rownames("Gene") -norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% - limma::voom() %>% - as.matrix() - -# Total counts per sample +# We calculate the number of mapped reads per sample. total.count.per.sample <- expression.data %>% tibble::column_to_rownames("Gene") %>% colSums() @@ -23,141 +31,9 @@ data.frame( readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) -norm.data <- norm.expr.data %>% - as.data.frame() %>% - tibble::rownames_to_column( - "Gene" - ) %>% - tidyr::gather( - key = "sample.id", - value = "expr.value", - -Gene - ) %>% - dplyr::left_join( - y = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - dplyr::mutate( - id = dplyr::case_when( - stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), - TRUE ~ sample.id - ), - gender = dplyr::case_when( - stringr::str_trim(gender) == "" ~ "water", - !is.na(gender) ~ as.character(gender) - ) - ) %>% - dplyr::select( - GenomeScan_ID, - gender, - id - ), - by = c("sample.id" = "GenomeScan_ID") - ) - -norm.plot <- norm.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = id, - y = expr.value, - fill = gender - ) - ) + - ggplot2::geom_boxplot() + - ggplot2::scale_fill_manual( - values = c( - "male" = "blue", - "female" = "red", - "water" = "green" - ) - ) + - ggplot2::labs( - title = "Normalized expression values distribution", - y = "Normalized expression values (limma::voom)", - x = "Sample", - gender = "Gender" - ) + - ggprism::theme_prism() + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 90) - ) - -ggplot2::ggsave( - filename = file.path(results.dir, "counts.per.sample.normalised.png"), - plot = norm.plot, - width = 40, - height = 20, - units = "cm" -) - - - -expr.data <- expression.data %>% - tidyr::gather( - key = "sample.id", - value = "expr.value", - -Gene - ) %>% - dplyr::filter( - expr.value != 0 - ) %>% - dplyr::left_join( - y = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - dplyr::mutate( - id = dplyr::case_when( - stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), - TRUE ~ sample.id - ), - gender = dplyr::case_when( - stringr::str_trim(gender) == "" ~ "water", - !is.na(gender) ~ as.character(gender) - ) - ) %>% - dplyr::select( - GenomeScan_ID, - gender, - id - ), - by = c("sample.id" = "GenomeScan_ID") - ) - -expr.plot <- expr.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = id, - y = expr.value, - fill = gender - ) - ) + - ggplot2::geom_boxplot() + - ggplot2::scale_fill_manual( - values = c( - "male" = "blue", - "female" = "red", - "water" = "green" - ) - ) + - ggplot2::scale_y_continuous(trans='log2') + - ggplot2::labs( - title = "Raw expression values distribution, without zero's", - y = "Expression values", - x = "Sample", - gender = "Gender" - ) + - ggprism::theme_prism() + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 90) - ) - -ggplot2::ggsave( - filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), - plot = expr.plot, - width = 40, - height = 20, - units = "cm" -) +# Next thing to do: +# - Check the number of reads per sample in total.counts.per.sample.csv +# - Plot the reads distribution (all reads) per sample in a boxplot. +# - (Optional) Calculate the number of unmapped, multimapped, unique mapped to +# feature and unique mapped to no feature and plot these in a stacked bar graph. diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R deleted file mode 100644 index ce96a0f..0000000 --- a/rnaseq/step6_overall_QC/__ - Preloader.R +++ /dev/null @@ -1,189 +0,0 @@ -library(tidyverse) -library(ggfortify) -library(ggprism) -library(limma) -library(biomaRt) -library(PCAtools) -library(gtools) -library(edgeR) -library(ggprism) -library(foreign) - - -# Global variables -results.dir <- file.path("results.nosync", "RNA-Seq QC") -data.dir <- "Data" -patient.dir <- file.path(data.dir, "Patients") -sample.dir <- file.path(data.dir, "Samples") -expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") - - -dir.create(results.dir, recursive = TRUE) - - -#### -# Helper functions -#### -select.columns.in.order <- function(dataframe, columns) { - dataframe[, columns] -} - -select.rows.in.order <- function(dataframe, rows) { - dataframe[rows,] -} - -getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { - file.name <- file.path(file.location, "genes_info_hg38.csv") - if (!file.exists(file.name)) { - if (!("mart" %in% ls())) { - assign("mart", useEnsembl( - biomart = "ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl" - )) - } - gene.list <- getBM( - filters = "ensembl_gene_id", - attributes = c( - "hgnc_symbol", - "ensembl_gene_id", - "ensembl_transcript_id", - "chromosome_name", - "start_position", - "end_position", - "strand", - "transcription_start_site", - "transcript_start", - "transcript_end", - "external_gene_name" - ), - values = as.character(ensemblIds), - mart = mart - ) - readr::write_csv(gene.list, path = file.name) - } - return( - readr::read_csv( - file.name, - col_types = readr::cols() - ) - ) -} - -#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { -# dataframe %>% -# dplyr::filter( -# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount -# ) -#} - -limma.voom.convert.column <- function(dataframe, columnname) { - dataframe %>% - tibble::column_to_rownames(columnname) %>% - limma::voom() %>% - as.data.frame() %>% - tibble::rownames_to_column(columnname) -} - -select.columns.in.order <- function(dataframe, columns) { - dataframe[, columns] -} - -drop.columns.if.all.same.value <- function(dataframe) { - for (name in colnames(dataframe)) { - is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 - if (is.all.same) { - dataframe <- dataframe %>% - dplyr::select( - -tidyselect::one_of(name) - ) - } - } - dataframe -} - - - -# Load data -master.Table <- foreign::read.spss( - file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") - ) %>% - as.data.frame() %>% - dplyr::mutate( - GenomeScan_ID = stringr::str_trim(GenomeScan_ID), - gender = forcats::fct_recode( - Gender, - female = "f", - male = "m", - other = "" - ), - age = as.numeric(Age), - smoking.status = forcats::fct_recode( - Smoking_status, - `Ex-smoker` = "ES ", - `Current smoker` = "CS ", - other = " " - ) - ) - -expression.data <- readr::read_tsv( - file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), - col_types = readr::cols() - ) - -gene.data <- getGenedataByEnsemblId38( - ensemblIds = expression.data$Gene, - file.location = expression.dir - ) %>% - dplyr::group_by(hgnc_symbol) %>% - dplyr::filter( - dplyr::row_number() == 1, - !is.na(hgnc_symbol), - hgnc_symbol != "" - ) %>% - dplyr::ungroup() %>% - dplyr::select( - hgnc_symbol, - ensembl_gene_id, - chromosome_name, - transcript_start, - transcript_end - ) - - -master.Table <- master.Table %>% - dplyr::mutate( - Group_simple2 = stringr::str_trim(Group_simple2), - Group_simple = stringr::str_trim(Group_simple), - T_number = as.character(T_number), - sample.id = stringr::str_trim(PRESTO_ID) - ) %>% - dplyr::filter( - # # According to Niek, I should not include this, for whatever reason - #!(GenomeScan_ID %in% c( - # "T02-01796", - # "T02-03095", - # "T02-10683", - # "T10-18671", - # "T12-12036" - # ) - # ), - # - # # (According to Niek, don't include) Water Controls - #!stringr::str_detect(T_number, pattern="Water"), - # - # # (According to Niek, don't include)never smoker controles - #!(Group_simple2 == "NS_Ctrl"), - # - # # (According to Niek, don't include)ALFA1 patiƫnten - #!(Group_simple == "ALFA"), - # - #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", - GenomeScan_ID %in% colnames(expression.data), - !is.na(sample.id), - sample.id != "" - ) - -expression.data <- expression.data %>% - select.columns.in.order( - c("Gene", master.Table$GenomeScan_ID) - ) From 1037d154e397a460d6fe19a1aba41b311d367bff Mon Sep 17 00:00:00 2001 From: "Hylke C. Donker" Date: Tue, 9 Feb 2021 17:31:42 +0100 Subject: [PATCH 05/11] Small snippet that generates a genome index and aligns the RNAseq reads. --- rnaseq/step3_align/snippet.sh | 53 +++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index e69de29..5ad5e64 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Align reads against reference genome. + +STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" +# Store genome index in this location:. +GENOME_INDEX="${STORAGE}/genome_index" +mkdir -p "${GENOME_INDEX}" +# Store the generated `Aligned.sortedByCoord.out.bam` in this dir. +ALIGNMENT_OUTPUT="${STORAGE}/alignment" +mkdir -p "${ALIGNMENT_OUTPUT}" + +# 1) Generate genome index. +# +# N.B.: +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the +# previous quality control steps if you are unsure about the size. In case of +# reads of varying length, the ideal value is max(ReadLength)-1. +# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., +# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. + +# Storage location reference data (in this case on calculon). +REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" +GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" +FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + +STAR \ + --runThreadN 8 \ + --runMode genomeGenerate \ + --readFilesCommand zcat \ + --sjdbOverhang 100 \ + --genomeFastaFiles ${FASTA_FILE} \ + --sjdbGTFfile ${GTF_FILE} \ + --genomeDir ${GENOME_INDEX} + + +# 2) Do the actual alignment. +# +# N.B.: +# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ +# files. + +# THe compressed paired-end FastQ's that we are aligning. +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" + +STAR \ + --runThreadN 8 \ + --readFilesCommand zcat \ + --readFilesIn "${R1}" "${R2}" \ + --outSAMtype BAM SortedByCoordinate \ + --genomeDir ${GENOME_INDEX} \ + --outFileNamePrefix "${ALIGNMENT_OUTPUT}" From 4b61c099f632956a6a063e9bf57e3e98fa49109c Mon Sep 17 00:00:00 2001 From: "C. Qi" Date: Sat, 13 Feb 2021 19:13:13 +0100 Subject: [PATCH 06/11] Update 'rnaseq/step2_trim/snippet.sh' --- rnaseq/step2_trim/snippet.sh | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index e69de29..810c087 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -0,0 +1,36 @@ +## this script is to generate jobs of trimming for each samples on the cluster +## please run this script first and then submit the jobs for each samples +## reference: http://www.usadellab.org/cms/?page=trimmomatic + +#!/bin/bash +# $1 indicates the path of raw samples. +# In the input folder, one sample has one independent folder with two pair-end f +astq files. +# The folder name should be the sample name. +# the fastq file should be sample_1.fastq and sample_2.fastq +# please prepare a sample.list that include file names for each sample + +out="/ * your output folder * /" +input="/ * your input folder * /" +cat sample.list | while read line + +do +sample=$(echo $line) +echo '#!/bin/bash' > rnaseq.${sample}.sh +echo "#SBATCH --job-name=RNAseq.${sample}" >> rnaseq.${sample}.sh +echo "#SBATCH --error=RNAseq.${sample}.err" >> rnaseq.${sample}.sh +echo "#SBATCH --output=RNAseq.${sample}.out" >> rnaseq.${sample}.sh +echo "#SBATCH --mem=15gb" >> rnaseq.${sample}.sh +echo "#SBATCH --time=6:00:00" >> rnaseq.${sample}.sh +echo "#SBATCH --cpus-per-task=6" >> rnaseq.${sample}.sh + +echo "ml Java" >>rnaseq.${sample}.sh + +echo "java -jar /* your folder of software */trimmomatic-0.36.jar PE \ + -phred33 /$input/${sample}\_1.fq.gz /$input/${sample}\_2.fq.gz \ + $out/trimmomatic/${sample}\_1_paired.fq $out/trimmomatic/${sample}\_1_unpaired.fq \ + $out/trimmomatic/${sample}\_2_paired.fq $out/trimmomatic/${sample}\_2_unpaired.fq \ + ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ + LEADING:3 TRAILING:3 SLIDINGWINDOW:4:25 HEADCROP:8 MINLEN:50" >> rnaseq.${sample}.sh + +done \ No newline at end of file From 170a001729326564e350dc6ee4cf79624345dc5f Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Mon, 15 Feb 2021 12:54:31 +0100 Subject: [PATCH 07/11] Count reads in X and Y chromosome. --- rnaseq/step6_overall_QC/02 - Gender Check.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R index f02f19d..92f4e4d 100644 --- a/rnaseq/step6_overall_QC/02 - Gender Check.R +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -107,4 +107,6 @@ gender.qc.genes.to.plot <- gender.qc.results %>% # - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. # - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. # - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. +# x=$(samtools view -q 20 -f 2 $bam_file X | wc -l) +# y=$(samtools view -q 20 -f 2 $bam_file Y | wc -l) # - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point. From 51abdcdb26e9da5b7d74b0f2626e56d3cbfa254d Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Mon, 15 Feb 2021 16:48:39 +0100 Subject: [PATCH 08/11] Trimmomatic update (simplified) --- rnaseq/step2_trim/snippet.sh | 60 ++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 810c087..ebf0200 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,36 +1,36 @@ -## this script is to generate jobs of trimming for each samples on the cluster -## please run this script first and then submit the jobs for each samples -## reference: http://www.usadellab.org/cms/?page=trimmomatic - #!/bin/bash -# $1 indicates the path of raw samples. -# In the input folder, one sample has one independent folder with two pair-end f -astq files. -# The folder name should be the sample name. -# the fastq file should be sample_1.fastq and sample_2.fastq -# please prepare a sample.list that include file names for each sample -out="/ * your output folder * /" -input="/ * your input folder * /" -cat sample.list | while read line +# reference: http://www.usadellab.org/cms/?page=trimmomatic -do -sample=$(echo $line) -echo '#!/bin/bash' > rnaseq.${sample}.sh -echo "#SBATCH --job-name=RNAseq.${sample}" >> rnaseq.${sample}.sh -echo "#SBATCH --error=RNAseq.${sample}.err" >> rnaseq.${sample}.sh -echo "#SBATCH --output=RNAseq.${sample}.out" >> rnaseq.${sample}.sh -echo "#SBATCH --mem=15gb" >> rnaseq.${sample}.sh -echo "#SBATCH --time=6:00:00" >> rnaseq.${sample}.sh -echo "#SBATCH --cpus-per-task=6" >> rnaseq.${sample}.sh -echo "ml Java" >>rnaseq.${sample}.sh +module load Trimmomatic -echo "java -jar /* your folder of software */trimmomatic-0.36.jar PE \ - -phred33 /$input/${sample}\_1.fq.gz /$input/${sample}\_2.fq.gz \ - $out/trimmomatic/${sample}\_1_paired.fq $out/trimmomatic/${sample}\_1_unpaired.fq \ - $out/trimmomatic/${sample}\_2_paired.fq $out/trimmomatic/${sample}\_2_unpaired.fq \ - ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ - LEADING:3 TRAILING:3 SLIDINGWINDOW:4:25 HEADCROP:8 MINLEN:50" >> rnaseq.${sample}.sh -done \ No newline at end of file +# (Example) Paired end +java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ + -phred33 \ + input_forward.fq.gz \ + input_reverse.fq.gz \ + output_forward_paired.fq.gz \ + output_forward_unpaired.fq.gz \ + output_reverse_paired.fq.gz \ + output_reverse_unpaired.fq.gz \ + ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ + LEADING:3 \ + TRAILING:3 \ + SLIDINGWINDOW:4:25 \ + HEADCROP:8 \ + MINLEN:50 + + +# (Example) Single end +java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar SE \ + -phred33 \ + input.fq.gz \ + output.fq.gz \ + ILLUMINACLIP:TruSeq3-SE:2:30:10 \ + LEADING:3 \ + TRAILING:3 \ + SLIDINGWINDOW:4:15 \ + HEADCROP:8 \ + MINLEN:50 From 6cf40804c47c573c965aadbff3f4b4e55d6285ba Mon Sep 17 00:00:00 2001 From: Jos van Nijnatten Date: Mon, 15 Feb 2021 17:03:34 +0100 Subject: [PATCH 09/11] adapter comments --- rnaseq/step2_trim/snippet.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index ebf0200..1ae866f 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -6,6 +6,11 @@ module load Trimmomatic +# Adapters can be found at +# https://github.com/timflutre/trimmomatic/tree/master/adapters +# But should be verified with FastQC, or in another way. + + # (Example) Paired end java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ -phred33 \ From a2bc31351578a289f4bbb3bb19ec2a3dd255af25 Mon Sep 17 00:00:00 2001 From: Marijn Berg Date: Mon, 22 Feb 2021 14:52:54 +0100 Subject: [PATCH 10/11] Simple snippet on running MultiQC to collate FastQC results --- rnaseq/step4_multiqc/snippet.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rnaseq/step4_multiqc/snippet.sh b/rnaseq/step4_multiqc/snippet.sh index e69de29..e16b342 100644 --- a/rnaseq/step4_multiqc/snippet.sh +++ b/rnaseq/step4_multiqc/snippet.sh @@ -0,0 +1,5 @@ +#!/bin/bash +module load multiqc + +# assuming you have all your fastQC result files in the ./fastQCresults folder +multiqc ./fastQCresults From f93596dd879a585d2f8c7565dfebd2dda35ca896 Mon Sep 17 00:00:00 2001 From: "Hylke C. Donker" Date: Mon, 22 Feb 2021 13:18:18 +0100 Subject: [PATCH 11/11] Made snippet input- and output files more harmonised. --- rnaseq/step1_fastqc/snippet.sh | 15 ++++++-- rnaseq/step2_trim/snippet.sh | 37 +++++++++++++------ rnaseq/step3_align/snippet.sh | 19 +++++----- rnaseq/step4_multiqc/snippet.sh | 4 +- .../01 - Principle Component Analysis.R | 18 ++++----- 5 files changed, 57 insertions(+), 36 deletions(-) diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index b0d7dcb..1c987c7 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,5 +1,12 @@ -# The command to do a FastQC on a fastq file is -file="file_to_analyse.fq.gz" -fastqc_out="./path/to/fastqc/output/dir" +#!/bin/bash +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" -fastqc -o "$fastqc_out" "$file" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQC_OUT="${PROJECT_DIRECTORY}/step1/" +mkdir -p "${FASTQC_OUT}" + +# Run FastQC on paired-end data. +fastqc \ + -o "${FASTQC_OUT}" \ + "${R1}" "${R2}" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 1ae866f..a65bc51 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,9 +1,12 @@ #!/bin/bash - -# reference: http://www.usadellab.org/cms/?page=trimmomatic +# +# Reference: http://www.usadellab.org/cms/?page=trimmomatic module load Trimmomatic +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQ_OUT="${PROJECT_DIRECTORY}/step2/" +mkdir -p "${FASTQ_OUT}" # Adapters can be found at @@ -11,15 +14,25 @@ module load Trimmomatic # But should be verified with FastQC, or in another way. -# (Example) Paired end +# Trimmomatic example Paired end data. +# +# Flags: +# - ILLUMINACLIP: Cut adapter and other illumina-specific sequences from the +# read. +# - SLIDINGWINDOW: Perform a sliding window trimming, cutting once the average +# quality within the window falls below a threshold. +# - LEADING: Cut bases off the start of a read, if below a threshold quality. +# - TRAILING: Cut bases off the end of a read, if below a threshold quality. +# - HEADCROP: Cut the specified number of bases from the start of the read. +# - MINLEN: Drop the read if it is below a specified length. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ -phred33 \ - input_forward.fq.gz \ - input_reverse.fq.gz \ - output_forward_paired.fq.gz \ - output_forward_unpaired.fq.gz \ - output_reverse_paired.fq.gz \ - output_reverse_unpaired.fq.gz \ + sample1_R1.fastq.gz \ + sample1_R2.fastq.gz \ + "${FASTQ_OUT}/sample1_R1_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R1_unpaired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_unpaired.fastq.gz" \ ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ LEADING:3 \ TRAILING:3 \ @@ -28,11 +41,11 @@ java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ MINLEN:50 -# (Example) Single end +# Example single end data. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar SE \ -phred33 \ - input.fq.gz \ - output.fq.gz \ + sample1.fastq.gz \ + output.fastq.gz \ ILLUMINACLIP:TruSeq3-SE:2:30:10 \ LEADING:3 \ TRAILING:3 \ diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index 5ad5e64..547867a 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -2,25 +2,26 @@ # # Align reads against reference genome. -STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" # Store genome index in this location:. -GENOME_INDEX="${STORAGE}/genome_index" +GENOME_INDEX="${PROJECT_DIRECTORY}/step3/genome_index" mkdir -p "${GENOME_INDEX}" # Store the generated `Aligned.sortedByCoord.out.bam` in this dir. -ALIGNMENT_OUTPUT="${STORAGE}/alignment" +ALIGNMENT_OUTPUT="${PROJECT_DIRECTORY}/step3/alignment/" mkdir -p "${ALIGNMENT_OUTPUT}" # 1) Generate genome index. # # N.B.: -# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). An alternative +# cut-off is 150, for low-input methods. In general, refer back to the # previous quality control steps if you are unsure about the size. In case of # reads of varying length, the ideal value is max(ReadLength)-1. # - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., # .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. -# Storage location reference data (in this case on calculon). -REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" +# Storage location reference data (in this case on Gearshift). +REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" @@ -40,9 +41,9 @@ STAR \ # - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ # files. -# THe compressed paired-end FastQ's that we are aligning. -R1="sample1_R1.fastq.gz" -R2="sample1_R2.fastq.gz" +# The compressed, paired-end, FastQ's after trimming (step 2). +R1="${PROJECT_DIRECTORY}/step2/sample1_R1_paired.fastq.gz" +R2="${PROJECT_DIRECTORY}/step2/sample1_R2_paired.fastq.gz" STAR \ --runThreadN 8 \ diff --git a/rnaseq/step4_multiqc/snippet.sh b/rnaseq/step4_multiqc/snippet.sh index e16b342..ceb2c25 100644 --- a/rnaseq/step4_multiqc/snippet.sh +++ b/rnaseq/step4_multiqc/snippet.sh @@ -1,5 +1,5 @@ #!/bin/bash module load multiqc -# assuming you have all your fastQC result files in the ./fastQCresults folder -multiqc ./fastQCresults +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +multiqc "${PROJECT_DIRECTORY}" diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 0c42844..ee8c249 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -18,7 +18,7 @@ do.scale = FALSE # The analysis -# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. +# We use prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -37,8 +37,8 @@ norm.expr.data.pcs <- norm.expr.data %>% ) # Write summary of PCAs to files -pcs.summery <- summary(norm.expr.data.pcs) -pcs.summery$importance %>% +pcs.summary <- summary(norm.expr.data.pcs) +pcs.summary$importance %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("PC.name") %>% @@ -46,7 +46,7 @@ pcs.summery$importance %>% file.path(results.dir.pca, "importance.csv") ) -pcs.summery$x %>% +pcs.summary$x %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("ensembl.id") %>% @@ -54,7 +54,7 @@ pcs.summery$x %>% file.path(results.dir.pca, "values.csv") ) -pcs.summery$rotation %>% +pcs.summary$rotation %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("sample.id") %>% @@ -63,15 +63,15 @@ pcs.summery$rotation %>% ) data.frame( - rownames = names(pcs.summery$center), - center = pcs.summery$center, - scale = pcs.summery$scale + rownames = names(pcs.summary$center), + center = pcs.summary$center, + scale = pcs.summary$scale ) %>% readr::write_csv( file.path(results.dir.pca, "rest.csv") ) -# Not saved: pcs.summery$sdev, +# Not saved: pcs.summary$sdev, # Next thing to do: