From f93596dd879a585d2f8c7565dfebd2dda35ca896 Mon Sep 17 00:00:00 2001 From: "Hylke C. Donker" Date: Mon, 22 Feb 2021 13:18:18 +0100 Subject: [PATCH] Made snippet input- and output files more harmonised. --- rnaseq/step1_fastqc/snippet.sh | 15 ++++++-- rnaseq/step2_trim/snippet.sh | 37 +++++++++++++------ rnaseq/step3_align/snippet.sh | 19 +++++----- rnaseq/step4_multiqc/snippet.sh | 4 +- .../01 - Principle Component Analysis.R | 18 ++++----- 5 files changed, 57 insertions(+), 36 deletions(-) diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index b0d7dcb..1c987c7 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,5 +1,12 @@ -# The command to do a FastQC on a fastq file is -file="file_to_analyse.fq.gz" -fastqc_out="./path/to/fastqc/output/dir" +#!/bin/bash +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" -fastqc -o "$fastqc_out" "$file" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQC_OUT="${PROJECT_DIRECTORY}/step1/" +mkdir -p "${FASTQC_OUT}" + +# Run FastQC on paired-end data. +fastqc \ + -o "${FASTQC_OUT}" \ + "${R1}" "${R2}" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 1ae866f..a65bc51 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,9 +1,12 @@ #!/bin/bash - -# reference: http://www.usadellab.org/cms/?page=trimmomatic +# +# Reference: http://www.usadellab.org/cms/?page=trimmomatic module load Trimmomatic +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQ_OUT="${PROJECT_DIRECTORY}/step2/" +mkdir -p "${FASTQ_OUT}" # Adapters can be found at @@ -11,15 +14,25 @@ module load Trimmomatic # But should be verified with FastQC, or in another way. -# (Example) Paired end +# Trimmomatic example Paired end data. +# +# Flags: +# - ILLUMINACLIP: Cut adapter and other illumina-specific sequences from the +# read. +# - SLIDINGWINDOW: Perform a sliding window trimming, cutting once the average +# quality within the window falls below a threshold. +# - LEADING: Cut bases off the start of a read, if below a threshold quality. +# - TRAILING: Cut bases off the end of a read, if below a threshold quality. +# - HEADCROP: Cut the specified number of bases from the start of the read. +# - MINLEN: Drop the read if it is below a specified length. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ -phred33 \ - input_forward.fq.gz \ - input_reverse.fq.gz \ - output_forward_paired.fq.gz \ - output_forward_unpaired.fq.gz \ - output_reverse_paired.fq.gz \ - output_reverse_unpaired.fq.gz \ + sample1_R1.fastq.gz \ + sample1_R2.fastq.gz \ + "${FASTQ_OUT}/sample1_R1_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R1_unpaired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_unpaired.fastq.gz" \ ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ LEADING:3 \ TRAILING:3 \ @@ -28,11 +41,11 @@ java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ MINLEN:50 -# (Example) Single end +# Example single end data. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar SE \ -phred33 \ - input.fq.gz \ - output.fq.gz \ + sample1.fastq.gz \ + output.fastq.gz \ ILLUMINACLIP:TruSeq3-SE:2:30:10 \ LEADING:3 \ TRAILING:3 \ diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index 5ad5e64..547867a 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -2,25 +2,26 @@ # # Align reads against reference genome. -STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" # Store genome index in this location:. -GENOME_INDEX="${STORAGE}/genome_index" +GENOME_INDEX="${PROJECT_DIRECTORY}/step3/genome_index" mkdir -p "${GENOME_INDEX}" # Store the generated `Aligned.sortedByCoord.out.bam` in this dir. -ALIGNMENT_OUTPUT="${STORAGE}/alignment" +ALIGNMENT_OUTPUT="${PROJECT_DIRECTORY}/step3/alignment/" mkdir -p "${ALIGNMENT_OUTPUT}" # 1) Generate genome index. # # N.B.: -# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). An alternative +# cut-off is 150, for low-input methods. In general, refer back to the # previous quality control steps if you are unsure about the size. In case of # reads of varying length, the ideal value is max(ReadLength)-1. # - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., # .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. -# Storage location reference data (in this case on calculon). -REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" +# Storage location reference data (in this case on Gearshift). +REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" @@ -40,9 +41,9 @@ STAR \ # - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ # files. -# THe compressed paired-end FastQ's that we are aligning. -R1="sample1_R1.fastq.gz" -R2="sample1_R2.fastq.gz" +# The compressed, paired-end, FastQ's after trimming (step 2). +R1="${PROJECT_DIRECTORY}/step2/sample1_R1_paired.fastq.gz" +R2="${PROJECT_DIRECTORY}/step2/sample1_R2_paired.fastq.gz" STAR \ --runThreadN 8 \ diff --git a/rnaseq/step4_multiqc/snippet.sh b/rnaseq/step4_multiqc/snippet.sh index e16b342..ceb2c25 100644 --- a/rnaseq/step4_multiqc/snippet.sh +++ b/rnaseq/step4_multiqc/snippet.sh @@ -1,5 +1,5 @@ #!/bin/bash module load multiqc -# assuming you have all your fastQC result files in the ./fastQCresults folder -multiqc ./fastQCresults +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +multiqc "${PROJECT_DIRECTORY}" diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 0c42844..ee8c249 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -18,7 +18,7 @@ do.scale = FALSE # The analysis -# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. +# We use prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -37,8 +37,8 @@ norm.expr.data.pcs <- norm.expr.data %>% ) # Write summary of PCAs to files -pcs.summery <- summary(norm.expr.data.pcs) -pcs.summery$importance %>% +pcs.summary <- summary(norm.expr.data.pcs) +pcs.summary$importance %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("PC.name") %>% @@ -46,7 +46,7 @@ pcs.summery$importance %>% file.path(results.dir.pca, "importance.csv") ) -pcs.summery$x %>% +pcs.summary$x %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("ensembl.id") %>% @@ -54,7 +54,7 @@ pcs.summery$x %>% file.path(results.dir.pca, "values.csv") ) -pcs.summery$rotation %>% +pcs.summary$rotation %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("sample.id") %>% @@ -63,15 +63,15 @@ pcs.summery$rotation %>% ) data.frame( - rownames = names(pcs.summery$center), - center = pcs.summery$center, - scale = pcs.summery$scale + rownames = names(pcs.summary$center), + center = pcs.summary$center, + scale = pcs.summary$scale ) %>% readr::write_csv( file.path(results.dir.pca, "rest.csv") ) -# Not saved: pcs.summery$sdev, +# Not saved: pcs.summary$sdev, # Next thing to do: