fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions.
This commit is contained in:
		| @@ -1,34 +1,5 @@ | ||||
| #!/bin/bash | ||||
| #SBATCH --job-name=fastqc | ||||
| #SBATCH --time=0-12:00:00 | ||||
| #SBATCH --ntasks=1 | ||||
| #SBATCH --mem=15G | ||||
| #SBATCH --qos=regular | ||||
| # The command to do a FastQC on a fastq file is | ||||
| file="file_to_analyse.fq.gz" | ||||
| fastqc_out="./path/to/fastqc/output/dir" | ||||
|  | ||||
|  | ||||
| set -e | ||||
| set -u | ||||
| set -x | ||||
| set -o pipefail | ||||
|  | ||||
|  | ||||
| module purge | ||||
| module load Java | ||||
| module load FastQC | ||||
|  | ||||
|  | ||||
| dir_raw_fastq="$(pwd)/fastq/raw" | ||||
| dir_fastqc="$(pwd)/fastqc" | ||||
|  | ||||
|  | ||||
| [ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}" | ||||
|  | ||||
|  | ||||
| # Run FastQC | ||||
| files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz") | ||||
| for file in $files; do | ||||
| 	filename=$(basename "$file") | ||||
| 	fastqc -o "$dir_fastqc" "$file" & | ||||
| done | ||||
|  | ||||
| wait | ||||
| fastqc -o "$fastqc_out" "$file" | ||||
|   | ||||
| @@ -1,64 +0,0 @@ | ||||
| #!/bin/bash | ||||
| #SBATCH --job-name=trimming | ||||
| #SBATCH --time=2-00:00:00 | ||||
| #SBATCH --ntasks=1 | ||||
| #SBATCH --mem=15G | ||||
| #SBATCH --qos=regular | ||||
|  | ||||
|  | ||||
| set -e | ||||
| set -u | ||||
| set -x | ||||
| set -o pipefail | ||||
|  | ||||
|  | ||||
| module purge | ||||
| module load TrimGalore | ||||
|  | ||||
|  | ||||
| dir_raw_fastq="$(pwd)/fastq/raw" | ||||
| dir_trimmed_fastq="$(pwd)/fastq/trimmed" | ||||
| dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports" | ||||
|  | ||||
| adapter_3p="TGGAATTCTCGG" # is _R1 | ||||
| adapter_5p="GATCGTCGGACT" # is _R2 | ||||
|  | ||||
|  | ||||
| [ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}" | ||||
|  | ||||
|  | ||||
| # Trim all adapters from the sequences | ||||
| while IFS=, read -r GSID Sample | ||||
| do | ||||
|   echo ">> Executing trimming of $GSID (${#GSID})" | ||||
|   if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools | ||||
|     for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do | ||||
|       newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} | ||||
|       if [ -f "${newFilename}" ]; then | ||||
|         echo ">>File exists ${newFilename}. Skipping" | ||||
|       else | ||||
|         echo ">>File: ${fqFile}" | ||||
|         if grep -q "_R1" <<< $(basename "$fqFile"); then | ||||
|           adapter_seq=$adapter_3p | ||||
|         elif grep -q "_R2" <<< $(basename "$fqFile"); then | ||||
|           adapter_seq=$adapter_5p | ||||
|         fi | ||||
|         echo ">>Trimming with sequence ${adapter_seq}" | ||||
|         trim_galore \ | ||||
|           --adapter "$adapter_seq" \ | ||||
|           --length $(printf "$adapter_seq" | wc -m) \ | ||||
|           --output_dir "${dir_trimmed_fastq}" \ | ||||
|           --fastqc_args "--noextract" \ | ||||
|           "${fqFile}" & | ||||
|       fi | ||||
|     done | ||||
|   else | ||||
|     echo "Skipped." | ||||
|   fi | ||||
| done < samples.csv | ||||
|  | ||||
|  | ||||
| wait | ||||
| mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" | ||||
| mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" | ||||
| mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" | ||||
| @@ -1,7 +1,15 @@ | ||||
| # Principle Component Analysis | ||||
| # Normalized with limma::voom | ||||
| library(limma) | ||||
| library(tidyverse) | ||||
|  | ||||
| source("__ - Preloader.R", verbose=T) | ||||
|  | ||||
| # expression.data. Has columns: | ||||
| # - Gene (gene identifier) | ||||
| # - [Sample identifiers] | ||||
| expression.data <- "mrna_counts_table.csv" | ||||
| # results.dir. The directory of where to put the resulting tables (and later your plots.) | ||||
| results.dir <- "results" | ||||
|  | ||||
|  | ||||
| # PCA variables | ||||
| @@ -10,10 +18,7 @@ do.scale = FALSE | ||||
|  | ||||
|  | ||||
| # The analysis | ||||
| master.Table %>% readr::write_csv( | ||||
| 	file.path(results.dir, "patient.table.csv") | ||||
| ) | ||||
|  | ||||
| # We ise prcomp to calculate the PCAs. Afterwards you should plot the results. | ||||
| norm.expr.data <- expression.data %>% | ||||
| 	tibble::column_to_rownames("Gene") | ||||
| norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% | ||||
| @@ -69,132 +74,8 @@ data.frame( | ||||
| # Not saved: pcs.summery$sdev, | ||||
|  | ||||
|  | ||||
|  | ||||
| # Plot PCAs | ||||
| # https://github.com/kevinblighe/PCAtools | ||||
| results.dir.pca.plot <- file.path(results.dir.pca, "img") | ||||
| dir.create(results.dir.pca.plot) | ||||
|  | ||||
| metadata <- master.Table %>% | ||||
| 	dplyr::filter( | ||||
| 		!is.na(GenomeScan_ID) | ||||
| 	) %>% | ||||
| 	tibble::column_to_rownames("GenomeScan_ID") %>% | ||||
| 	select.rows.in.order( | ||||
| 		colnames(norm.expr.data) | ||||
| 	) | ||||
|  | ||||
|  | ||||
| p <- pca(norm.expr.data, | ||||
| 	metadata = metadata,  | ||||
|   	center = do.center, | ||||
|   	scale = do.scale | ||||
| ) | ||||
| elbow <- findElbowPoint(p$variance) | ||||
| metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') | ||||
|  | ||||
|  | ||||
| png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), | ||||
|     width = 800, height = 800, | ||||
|     units = "px", pointsize = 12, | ||||
|     type = "Xlib") | ||||
| print(screeplot(p, | ||||
| 		axisLabSize = 12, | ||||
| 		titleLabSize = 12, | ||||
| 	    components = getComponents(p, 1:(elbow+5)), | ||||
| 	    vline = c(elbow) | ||||
|     ) + | ||||
|     geom_label( | ||||
|     	aes( | ||||
|     		x = elbow + 1, | ||||
|     		y = 50, | ||||
|     		label = 'Elbow method', | ||||
|     		vjust = -1, | ||||
|     		size = 8 | ||||
|     	) | ||||
|     )) | ||||
| dev.off() | ||||
|  | ||||
|  | ||||
| png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), | ||||
|     width = 1200, height = 1200, | ||||
|     units = "px", pointsize = 12, | ||||
|     type = "Xlib") | ||||
| print(eigencorplot(p, | ||||
|     metavars = metavars | ||||
| )) | ||||
| dev.off() | ||||
|  | ||||
|  | ||||
| dir.create(file.path(results.dir.pca.plot, "pairsplot")) | ||||
| for (var in metavars) { | ||||
| 	png( | ||||
| 		filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), | ||||
| 	    width = 1200, height = 1200, | ||||
| 	    units = "px", pointsize = 12, | ||||
| 	    type = "Xlib" | ||||
|     ) | ||||
| 	print(pairsplot(p, | ||||
| 	    components = getComponents(p, c(1:(elbow+1))), | ||||
| 	    triangle = TRUE, | ||||
| 	    trianglelabSize = 12, | ||||
| 	    hline = 0, vline = 0, | ||||
| 	    pointSize = 0.4, | ||||
| 	    gridlines.major = FALSE, | ||||
| 	    gridlines.minor = FALSE, | ||||
| 	    colby = var, | ||||
| 	    title = paste0('Pairs plot: ',var), | ||||
| 	    plotaxes = TRUE | ||||
| 	)) | ||||
| 	dev.off() | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # Plot PCAs - old failure | ||||
| pca.combinations <- combinations( | ||||
| 		n = (elbow+1), | ||||
| 		r = 2, | ||||
| 		v = 1:(elbow+1), | ||||
| 		repeats.allowed = FALSE | ||||
| 	) | ||||
|  | ||||
| dir.create(file.path(results.dir.pca.plot, "biplots")) | ||||
| for (var in metavars) { | ||||
| 	for (i in 1:nrow(pca.combinations)) { | ||||
| 		pca.combi <- pca.combinations[i,] | ||||
| 		pca.title <- paste(paste0("PC", pca.combi), collapse="_") | ||||
| 		png( | ||||
| 			filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), | ||||
| 		    width = 800, height = 800, | ||||
| 		    units = "px", pointsize = 12, | ||||
| 		    type = "Xlib" | ||||
| 	    ) | ||||
| 		print( | ||||
| 			autoplot( | ||||
| 				norm.expr.data.pcs, | ||||
| 				data = master.Table %>% | ||||
| 					dplyr::filter( | ||||
| 						!is.na(GenomeScan_ID) | ||||
| 					) %>% | ||||
| 					tibble::column_to_rownames("GenomeScan_ID") %>% | ||||
| 					select.rows.in.order( | ||||
| 						rownames(norm.expr.data.pcs$x) | ||||
| 					), | ||||
| 				x = pca.combi[1], | ||||
| 				y = pca.combi[2], | ||||
| 				colour = var, | ||||
| 				loadings = FALSE, | ||||
| 				loadings.label = FALSE, | ||||
| 				#label = FALSE, | ||||
| 				label.size = 3 | ||||
| 		    ) + | ||||
| 		    ggprism::theme_prism() +  | ||||
| 		    #ggprism::scale_colour_prism() +  | ||||
| 		    ggprism::scale_shape_prism() + | ||||
| 		    ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) | ||||
| 		) | ||||
| 		dev.off() | ||||
| 	} | ||||
| } | ||||
|  | ||||
| # Next thing to do: | ||||
| # - (Optional) scree plot - to determine the optimal cutoff for PCA inclusion based on explaination of variance | ||||
| # - (Optional) eigencorplot - to correlate PCAs to clinical variables so that you know which PCA to include for which analysis | ||||
| # - (Optional) pairsplot - plot multiple PCAs against each other in a single figure | ||||
| # - Plot the first couple of PCAs against each other | ||||
| @@ -1,16 +1,28 @@ | ||||
| # Gender QC | ||||
| # Normalized with limma::voom | ||||
| library(GSVA) | ||||
| library(limma) | ||||
| library(edgeR) | ||||
| library(tidyverse) | ||||
|  | ||||
| source("__ - Preloader.R", verbose=T) | ||||
|  | ||||
| # expression.data. Has columns: | ||||
| # - Gene (gene identifier) | ||||
| # - [Sample identifiers] | ||||
| expression.data <- "mrna_counts_table.csv" | ||||
| # master.Table. Has columns: | ||||
| # - GenomeScan_ID | ||||
| # - gender, levels = c("male", "female") | ||||
| # - age | ||||
| # - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) | ||||
| master.Table <- "patient_table.csv" | ||||
| # results.dir. The directory of where to put the resulting tables (and later your plots.) | ||||
| results.dir <- "results" | ||||
|  | ||||
|  | ||||
| # The analysis | ||||
| norm.expr.data <- expression.data %>% | ||||
|     tibble::column_to_rownames("Gene") | ||||
| norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% | ||||
|     limma::voom() %>% | ||||
|     as.matrix() | ||||
|  | ||||
| # We first do a differential expression analysis on gender using EdgeR. | ||||
| # Afterwards you should plot these results. | ||||
| x.genes <- gene.data %>% | ||||
|     dplyr::filter(chromosome_name == "X") %>% | ||||
|     dplyr::pull(ensembl_gene_id) | ||||
| @@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>% | ||||
|             chromosome_name %in% c("X", "Y") & | ||||
|             FDR < 0.05 & | ||||
|             dplyr::row_number() <= 5 | ||||
|         ) | | ||||
|         hgnc_symbol %in% c( | ||||
|             "XIST", | ||||
|             "TSIX", | ||||
|             "KDM6A", | ||||
|             "ZFX", | ||||
|             "KDM5C", | ||||
|             "ZFY-AS1", | ||||
|             "ARSDP1", | ||||
|             "GYG2P1", | ||||
|             "RBMY2JP", | ||||
|             "ARSLP1" | ||||
|         ) | ||||
|     ) | ||||
|  | ||||
| gender.qc.data <- as.data.frame(norm.expr.data) %>% | ||||
|     rownames_to_column("ensembl.id") %>% | ||||
|     tidyr::gather( | ||||
|         key = "rna.seq.sample.id", | ||||
|         value = "expr.value", | ||||
|         -ensembl.id | ||||
|     ) %>% | ||||
|     dplyr::filter( | ||||
|         ensembl.id %in% gender.qc.genes.to.plot$ensembl.id | ||||
|     ) %>% | ||||
|     dplyr::left_join( | ||||
|         y = gender.qc.patients, | ||||
|         by = c("rna.seq.sample.id" = "GenomeScan_ID") | ||||
|     ) %>% | ||||
|     #dplyr::filter( | ||||
|     #    !is.na(gender) | ||||
|     #) %>% | ||||
|     readr::write_csv( | ||||
|         file.path(results.dir.gender.plot, "plot.data.voom.csv") | ||||
|     ) | ||||
|  | ||||
| for (chr in gender.qc.genes.to.plot$chromosome_name) { | ||||
|     current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% | ||||
|         dplyr::filter(chromosome_name == chr) | ||||
|     chromosome_name <- chr | ||||
|     i <- 0 | ||||
|     for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { | ||||
|         i <- i + 1 | ||||
|  | ||||
|         hgnc_symbol <- gene.data %>% | ||||
|             dplyr::filter( | ||||
|                 ensembl_gene_id == current.ensembl.id | ||||
|             ) %>% | ||||
|             dplyr::pull(hgnc_symbol) | ||||
|  | ||||
|         # calculate outliers, kinda | ||||
|         plot.data <- gender.qc.data %>% | ||||
|             dplyr::filter( | ||||
|                 ensembl.id == current.ensembl.id | ||||
|             ) %>% | ||||
|             dplyr::mutate( | ||||
|                 gender = dplyr::case_when( | ||||
|                     is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", | ||||
|                     TRUE ~ gender | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|         if (nrow(plot.data) <= 0) { | ||||
|             next | ||||
|         } | ||||
|  | ||||
|         outliers <- boxplot( | ||||
|             formula = expr.value ~ gender, | ||||
|             data = plot.data, | ||||
|             plot = FALSE | ||||
|         )$out | ||||
|  | ||||
|         result.to.annotate <- plot.data %>% | ||||
|             dplyr::filter( | ||||
|                 expr.value %in% outliers | ||||
|             ) | ||||
|  | ||||
|         # Visual: plot range (for t-test p-value) | ||||
|         plot.y.range <- c( | ||||
|             "min" = as.integer(min(plot.data$expr.value) - 1) , | ||||
|             "max" = as.integer(max(plot.data$expr.value) + 1) | ||||
|         ) | ||||
|         plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) | ||||
|         plot.y.range["min"] <- plot.y.range["min"] - plot.margin | ||||
|         plot.y.range["max"] <- plot.y.range["max"] + plot.margin | ||||
|  | ||||
|         # Plot the damn thing as if it is Graphpad Prism | ||||
|         stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) | ||||
|         plt <- plot.data %>% | ||||
|             ggplot2::ggplot( | ||||
|                 mapping = ggplot2::aes( | ||||
|                     x = gender, | ||||
|                     y = expr.value | ||||
|                 )  | ||||
|             ) + | ||||
|             ggplot2::geom_jitter( | ||||
|                 mapping = ggplot2::aes( | ||||
|                     colour = gender, | ||||
|                     shape = gender | ||||
|                 ), | ||||
|                 width = 0.1 | ||||
|             ) + | ||||
|             ggrepel::geom_text_repel( | ||||
|                 data = result.to.annotate, | ||||
|                 mapping = ggplot2::aes( | ||||
|                     label = sample.id | ||||
|                 ), | ||||
|                 size = 2, | ||||
|                 box.padding = unit(0.35, "lines"), | ||||
|                 point.padding = unit(0.3, "lines") | ||||
|             ) + | ||||
|               ggplot2::stat_summary( | ||||
|                   fun = "mean", | ||||
|                   geom = "crossbar", | ||||
|                   width = 0.3, | ||||
|                   size = 0.3 | ||||
|               ) +  | ||||
|             ggplot2::scale_y_continuous( | ||||
|                 limits = plot.y.range, | ||||
|                 guide = "prism_offset" | ||||
|             ) + | ||||
|             #ggprism::add_pvalue( | ||||
|             #    stat.table, | ||||
|             #    y.position = plot.y.range["max"] | ||||
|             #) + | ||||
|             ggprism::theme_prism() +  | ||||
|             ggprism::scale_colour_prism() +  | ||||
|             ggprism::scale_shape_prism() + | ||||
|             ggplot2::theme( | ||||
|                 legend.position = "none" | ||||
|             ) +  | ||||
|             ggplot2::labs( | ||||
|                 subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), | ||||
|                 x = "Gender", | ||||
|                 y = "Normalised Expression Values" | ||||
|             ) | ||||
|  | ||||
|         ggplot2::ggsave( | ||||
|             filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), | ||||
|             plot = plt, | ||||
|             width = 12.5, | ||||
|             height = 12.5, | ||||
|             unit = "cm" | ||||
|         ) | ||||
|     } | ||||
| } | ||||
|  | ||||
| # Let's try a GSVA | ||||
| gsva.groups <- list( | ||||
|     X = gender.qc.genes.to.plot %>% | ||||
|         dplyr::filter(chromosome_name == "X") %>% | ||||
|         dplyr::pull(ensembl.id), | ||||
|     Y = gender.qc.genes.to.plot %>% | ||||
|         dplyr::filter(chromosome_name == "Y") %>% | ||||
|         dplyr::pull(ensembl.id) | ||||
| ) | ||||
|  | ||||
| gsva_res = GSVA::gsva( | ||||
|   norm.expr.data, | ||||
|   gsva.groups, | ||||
|   mx.diff = TRUE, | ||||
|   verbose = FALSE, | ||||
|   parallel.sz = 1 | ||||
| ) | ||||
|  | ||||
|  | ||||
| gender.qc.gsva.data <- as.data.frame(gsva_res) %>% | ||||
|     rownames_to_column("gsva.group") %>% | ||||
|     tidyr::gather( | ||||
|         key = "rna.seq.sample.id", | ||||
|         value = "gsva.value", | ||||
|         -gsva.group | ||||
|     ) %>% | ||||
|     dplyr::left_join( | ||||
|         y = gender.qc.patients %>% | ||||
|           dplyr::select( | ||||
|               GenomeScan_ID, | ||||
|               sample.id, | ||||
|               gender | ||||
|           ), | ||||
|         by = c("rna.seq.sample.id" = "GenomeScan_ID") | ||||
|     ) %>% | ||||
|     readr::write_csv( | ||||
|         file.path(results.dir.gender.plot, "plot.data.gsva.csv") | ||||
|     ) | ||||
|  | ||||
|  | ||||
| for (c.gender in unique(gender.qc.gsva.data$gender)) { | ||||
|     if (is.na(c.gender)) { | ||||
|         next | ||||
|     } | ||||
|  | ||||
|     c.plot.data <- gender.qc.gsva.data %>% | ||||
|         dplyr::filter( | ||||
|             gender == c.gender | ||||
|         ) | ||||
|  | ||||
|     outliers <- boxplot( | ||||
|         formula = gsva.value ~ gsva.group, | ||||
|         data = c.plot.data, | ||||
|         plot = FALSE | ||||
|     )$out | ||||
|  | ||||
|     result.to.annotate <- c.plot.data %>% | ||||
|         dplyr::filter( | ||||
|             gsva.value %in% outliers | ||||
|         ) | ||||
|  | ||||
|     plt <- c.plot.data %>% | ||||
|         ggplot2::ggplot( | ||||
|             mapping = ggplot2::aes( | ||||
|                 x = gsva.group, | ||||
|                 y = gsva.value | ||||
|             )  | ||||
|         ) + | ||||
|         ggplot2::geom_boxplot() + | ||||
|         ggrepel::geom_text_repel( | ||||
|             data = result.to.annotate, | ||||
|             mapping = ggplot2::aes( | ||||
|                 label = sample.id | ||||
|             ), | ||||
|             size = 2, | ||||
|             box.padding = unit(0.35, "lines"), | ||||
|             point.padding = unit(0.3, "lines") | ||||
|         ) + | ||||
|         ggprism::theme_prism() +  | ||||
|         ggprism::scale_colour_prism() +  | ||||
|         ggprism::scale_shape_prism() + | ||||
|         ggplot2::theme( | ||||
|             legend.position = "none" | ||||
|         ) +  | ||||
|         ggplot2::labs( | ||||
|             subtitle = paste0("", toupper(c.gender)), | ||||
|             x = "Chromosome", | ||||
|             y = "GSVA Values" | ||||
|         ) | ||||
|  | ||||
|     ggplot2::ggsave( | ||||
|         filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), | ||||
|         plot = plt, | ||||
|         width = 12.5, | ||||
|         height = 12.5, | ||||
|         unit = "cm" | ||||
|     ) | ||||
| } | ||||
| # Next thing to do: | ||||
| # - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. | ||||
| # - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. | ||||
| # - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. | ||||
| # - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point. | ||||
|   | ||||
| @@ -1,17 +1,25 @@ | ||||
| # Total counts per sample | ||||
| # Normalized with limma::voom | ||||
| library(limma) | ||||
| library(tidyverse) | ||||
|  | ||||
| source("__ - Preloader.R", verbose=T) | ||||
|  | ||||
| # expression.data. Has columns: | ||||
| # - Gene (gene identifier) | ||||
| # - [Sample identifiers] | ||||
| expression.data <- "mrna_counts_table.csv" | ||||
| # master.Table. Has columns: | ||||
| # - GenomeScan_ID | ||||
| # - gender, levels = c("male", "female") | ||||
| # - age | ||||
| # - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) | ||||
| master.Table <- "patient_table.csv" | ||||
| # results.dir. The directory of where to put the resulting tables (and later your plots.) | ||||
| results.dir <- "results" | ||||
|  | ||||
|  | ||||
| # The analysis | ||||
| norm.expr.data <- expression.data %>% | ||||
| 	tibble::column_to_rownames("Gene") | ||||
| norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% | ||||
| 	limma::voom() %>% | ||||
| 	as.matrix() | ||||
|  | ||||
| # Total counts per sample | ||||
| # We calculate the number of mapped reads per sample. | ||||
| total.count.per.sample <- expression.data %>% | ||||
| 	tibble::column_to_rownames("Gene") %>% | ||||
| 	colSums() | ||||
| @@ -23,141 +31,9 @@ data.frame( | ||||
| 	readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) | ||||
|  | ||||
|  | ||||
| norm.data <- norm.expr.data %>% | ||||
|   as.data.frame() %>% | ||||
|   tibble::rownames_to_column( | ||||
|     "Gene" | ||||
|   ) %>% | ||||
|   tidyr::gather( | ||||
|     key = "sample.id", | ||||
|     value = "expr.value", | ||||
|     -Gene | ||||
|   ) %>% | ||||
|   dplyr::left_join( | ||||
|     y = master.Table %>% | ||||
|       dplyr::filter( | ||||
|           !is.na(GenomeScan_ID) | ||||
|       ) %>% | ||||
|       dplyr::mutate( | ||||
|           id = dplyr::case_when( | ||||
|             stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), | ||||
|             TRUE ~ sample.id | ||||
|           ), | ||||
|           gender = dplyr::case_when( | ||||
|             stringr::str_trim(gender) == "" ~ "water", | ||||
|             !is.na(gender) ~ as.character(gender) | ||||
|           ) | ||||
|       ) %>% | ||||
|       dplyr::select( | ||||
|         GenomeScan_ID, | ||||
|         gender, | ||||
|         id | ||||
|       ), | ||||
|     by = c("sample.id" = "GenomeScan_ID") | ||||
|   ) | ||||
|  | ||||
| norm.plot <- norm.data %>% | ||||
|   ggplot2::ggplot( | ||||
|     mapping = ggplot2::aes( | ||||
|       x = id, | ||||
|       y = expr.value, | ||||
|       fill = gender | ||||
|     ) | ||||
|   ) + | ||||
|   ggplot2::geom_boxplot() + | ||||
|   ggplot2::scale_fill_manual( | ||||
|     values = c( | ||||
|       "male" = "blue", | ||||
|       "female" = "red", | ||||
|       "water" = "green" | ||||
|     ) | ||||
|   ) + | ||||
|   ggplot2::labs( | ||||
|     title = "Normalized expression values distribution", | ||||
|     y = "Normalized expression values (limma::voom)", | ||||
|     x = "Sample", | ||||
|     gender = "Gender" | ||||
|   ) + | ||||
|   ggprism::theme_prism() + | ||||
|   ggplot2::theme( | ||||
|     axis.text.x = ggplot2::element_text(angle = 90) | ||||
|   ) | ||||
|  | ||||
| ggplot2::ggsave( | ||||
|   filename = file.path(results.dir, "counts.per.sample.normalised.png"), | ||||
|   plot = norm.plot, | ||||
|   width = 40, | ||||
|   height = 20, | ||||
|   units = "cm" | ||||
| ) | ||||
|  | ||||
|  | ||||
|  | ||||
| expr.data <- expression.data %>% | ||||
|   tidyr::gather( | ||||
|     key = "sample.id", | ||||
|     value = "expr.value", | ||||
|     -Gene | ||||
|   ) %>% | ||||
|   dplyr::filter( | ||||
|     expr.value != 0 | ||||
|   ) %>% | ||||
|   dplyr::left_join( | ||||
|     y = master.Table %>% | ||||
|       dplyr::filter( | ||||
|           !is.na(GenomeScan_ID) | ||||
|       ) %>% | ||||
|       dplyr::mutate( | ||||
|           id = dplyr::case_when( | ||||
|             stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), | ||||
|             TRUE ~ sample.id | ||||
|           ), | ||||
|           gender = dplyr::case_when( | ||||
|             stringr::str_trim(gender) == "" ~ "water", | ||||
|             !is.na(gender) ~ as.character(gender) | ||||
|           ) | ||||
|       ) %>% | ||||
|       dplyr::select( | ||||
|         GenomeScan_ID, | ||||
|         gender, | ||||
|         id | ||||
|       ), | ||||
|     by = c("sample.id" = "GenomeScan_ID") | ||||
|   ) | ||||
|  | ||||
| expr.plot <- expr.data %>% | ||||
|   ggplot2::ggplot( | ||||
|     mapping = ggplot2::aes( | ||||
|       x = id, | ||||
|       y = expr.value, | ||||
|       fill = gender | ||||
|     ) | ||||
|   ) + | ||||
|   ggplot2::geom_boxplot() + | ||||
|   ggplot2::scale_fill_manual( | ||||
|     values = c( | ||||
|       "male" = "blue", | ||||
|       "female" = "red", | ||||
|       "water" = "green" | ||||
|     ) | ||||
|   ) + | ||||
|   ggplot2::scale_y_continuous(trans='log2')  + | ||||
|   ggplot2::labs( | ||||
|     title = "Raw expression values distribution, without zero's", | ||||
|     y = "Expression values", | ||||
|     x = "Sample", | ||||
|     gender = "Gender" | ||||
|   ) + | ||||
|   ggprism::theme_prism() + | ||||
|   ggplot2::theme( | ||||
|     axis.text.x = ggplot2::element_text(angle = 90) | ||||
|   ) | ||||
|  | ||||
| ggplot2::ggsave( | ||||
|   filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), | ||||
|   plot = expr.plot, | ||||
|   width = 40, | ||||
|   height = 20, | ||||
|   units = "cm" | ||||
| ) | ||||
| # Next thing to do: | ||||
| # - Check the number of reads per sample in total.counts.per.sample.csv | ||||
| # - Plot the reads distribution (all reads) per sample in a boxplot. | ||||
| # - (Optional) Calculate the number of unmapped, multimapped, unique mapped to | ||||
| #   feature and unique mapped to no feature and plot these in a stacked bar graph. | ||||
|  | ||||
|   | ||||
| @@ -1,189 +0,0 @@ | ||||
| library(tidyverse) | ||||
| library(ggfortify) | ||||
| library(ggprism) | ||||
| library(limma) | ||||
| library(biomaRt) | ||||
| library(PCAtools) | ||||
| library(gtools) | ||||
| library(edgeR) | ||||
| library(ggprism) | ||||
| library(foreign) | ||||
|  | ||||
|  | ||||
| # Global variables | ||||
| results.dir <- file.path("results.nosync", "RNA-Seq QC") | ||||
| data.dir <- "Data" | ||||
| patient.dir <- file.path(data.dir, "Patients") | ||||
| sample.dir <- file.path(data.dir, "Samples") | ||||
| expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") | ||||
|  | ||||
|  | ||||
| dir.create(results.dir, recursive = TRUE) | ||||
|  | ||||
|  | ||||
| #### | ||||
| # Helper functions | ||||
| #### | ||||
| select.columns.in.order <- function(dataframe, columns) { | ||||
|   dataframe[, columns] | ||||
| } | ||||
|  | ||||
| select.rows.in.order <- function(dataframe, rows) { | ||||
|   dataframe[rows,] | ||||
| } | ||||
|  | ||||
| getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { | ||||
|   file.name <- file.path(file.location, "genes_info_hg38.csv") | ||||
|   if (!file.exists(file.name)) { | ||||
|     if (!("mart" %in% ls())) { | ||||
|       assign("mart", useEnsembl( | ||||
|         biomart = "ENSEMBL_MART_ENSEMBL", | ||||
|         dataset = "hsapiens_gene_ensembl" | ||||
|       )) | ||||
|     } | ||||
|     gene.list <- getBM( | ||||
|       filters = "ensembl_gene_id", | ||||
|       attributes = c( | ||||
|         "hgnc_symbol", | ||||
|         "ensembl_gene_id", | ||||
|         "ensembl_transcript_id", | ||||
|         "chromosome_name", | ||||
|         "start_position", | ||||
|         "end_position", | ||||
|         "strand", | ||||
|         "transcription_start_site", | ||||
|         "transcript_start", | ||||
|         "transcript_end", | ||||
|         "external_gene_name" | ||||
|       ), | ||||
|       values = as.character(ensemblIds), | ||||
|       mart = mart | ||||
|     ) | ||||
|     readr::write_csv(gene.list, path = file.name) | ||||
|   } | ||||
|   return( | ||||
|     readr::read_csv( | ||||
|     	file.name, | ||||
| 		col_types = readr::cols() | ||||
| 	) | ||||
|   ) | ||||
| } | ||||
|  | ||||
| #remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { | ||||
| #  dataframe %>% | ||||
| #  dplyr::filter( | ||||
| #    rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount | ||||
| #  ) | ||||
| #} | ||||
|  | ||||
| limma.voom.convert.column <- function(dataframe, columnname) { | ||||
|   dataframe %>% | ||||
|   tibble::column_to_rownames(columnname) %>% | ||||
|   limma::voom() %>% | ||||
|   as.data.frame() %>% | ||||
|   tibble::rownames_to_column(columnname) | ||||
| } | ||||
|  | ||||
| select.columns.in.order <- function(dataframe, columns) { | ||||
|   dataframe[, columns] | ||||
| } | ||||
|  | ||||
| drop.columns.if.all.same.value <- function(dataframe) { | ||||
|   for (name in colnames(dataframe)) { | ||||
|     is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 | ||||
|     if (is.all.same) { | ||||
|       dataframe <- dataframe %>% | ||||
|         dplyr::select( | ||||
|           -tidyselect::one_of(name) | ||||
|         ) | ||||
|     } | ||||
|   } | ||||
|   dataframe | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # Load data | ||||
| master.Table <- foreign::read.spss( | ||||
| 		file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") | ||||
| 	) %>% | ||||
|   as.data.frame() %>% | ||||
|   dplyr::mutate( | ||||
|     GenomeScan_ID = stringr::str_trim(GenomeScan_ID), | ||||
|     gender = forcats::fct_recode( | ||||
|       Gender, | ||||
|       female = "f", | ||||
|       male = "m", | ||||
|       other = "" | ||||
|     ), | ||||
|     age = as.numeric(Age), | ||||
|     smoking.status = forcats::fct_recode( | ||||
|       Smoking_status, | ||||
|       `Ex-smoker` = "ES        ", | ||||
|       `Current smoker` = "CS        ", | ||||
|       other = "          " | ||||
|     ) | ||||
|   ) | ||||
|  | ||||
| expression.data <- readr::read_tsv( | ||||
| 		file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), | ||||
| 		col_types = readr::cols() | ||||
| 	) | ||||
|  | ||||
| gene.data <- getGenedataByEnsemblId38( | ||||
|     ensemblIds = expression.data$Gene, | ||||
|     file.location = expression.dir | ||||
|   ) %>% | ||||
|   dplyr::group_by(hgnc_symbol) %>% | ||||
|   dplyr::filter( | ||||
|     dplyr::row_number() == 1, | ||||
|     !is.na(hgnc_symbol), | ||||
|     hgnc_symbol != "" | ||||
|   ) %>% | ||||
|   dplyr::ungroup() %>% | ||||
|   dplyr::select( | ||||
|     hgnc_symbol, | ||||
|     ensembl_gene_id, | ||||
|     chromosome_name, | ||||
|     transcript_start, | ||||
|     transcript_end | ||||
|   ) | ||||
|  | ||||
|  | ||||
| master.Table <- master.Table %>% | ||||
|   dplyr::mutate( | ||||
|     Group_simple2 = stringr::str_trim(Group_simple2), | ||||
|     Group_simple  = stringr::str_trim(Group_simple), | ||||
|     T_number = as.character(T_number), | ||||
|     sample.id = stringr::str_trim(PRESTO_ID) | ||||
|   ) %>% | ||||
|   dplyr::filter( | ||||
|     # # According to Niek, I should not include this, for whatever reason | ||||
|     #!(GenomeScan_ID %in% c( | ||||
|     #    "T02-01796", | ||||
|     #    "T02-03095", | ||||
|     #    "T02-10683", | ||||
|     #    "T10-18671", | ||||
|     #    "T12-12036" | ||||
|     #  ) | ||||
|     # ), | ||||
|     #  | ||||
|     # # (According to Niek, don't include) Water Controls | ||||
|     #!stringr::str_detect(T_number, pattern="Water"), | ||||
|     #  | ||||
|     # # (According to Niek, don't include)never smoker controles | ||||
|     #!(Group_simple2 == "NS_Ctrl"), | ||||
|     #  | ||||
|     # # (According to Niek, don't include)ALFA1 patiënten  | ||||
|     #!(Group_simple == "ALFA"), | ||||
|     #  | ||||
|     #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", | ||||
|     GenomeScan_ID %in% colnames(expression.data), | ||||
|     !is.na(sample.id), | ||||
|     sample.id != "" | ||||
|   ) | ||||
|  | ||||
| expression.data <- expression.data %>% | ||||
|   select.columns.in.order( | ||||
|     c("Gene", master.Table$GenomeScan_ID) | ||||
|   ) | ||||
		Reference in New Issue
	
	Block a user