Small snippet that generates a genome index and aligns the RNAseq reads.

2021-02-12 14:46:05 +01:00
5 changed files with 53 additions and 86 deletions
--- a/rnaseq/general_files/describing_general_files.docx
+++ b/rnaseq/general_files/describing_general_files.docx
--- a/rnaseq/step1_fastqc/00_fastqc.pl
+++ b/rnaseq/step1_fastqc/00_fastqc.pl
@@ -1,23 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-# this script was made with consideration for UMI-deduplicating.
-# this is because there are three .fastq files for each sample. 
-# the provider states the info about which file contains which info,
-# but in our case, from GenomeScan in Leiden, R2 contains the UMI read.
-# R1 and R3 contain sequencing information from paired-end sequencing
-
-foreach my $file1 ( <*_R1.fastq.gz> ) {
-    my $file2 = $file1;
-    $file2 =~ s/\_R1\./_R2./;
-    my $file3 =~ s/\_R3\./_R2./;
-    die "file1==file2" if $file1 eq $file2;
-    my $sample = $file1;
-    $sample =~ s/\_R1\.fastq\.gz$//;
-    mkdir $sample.'_R1', 0700;
-    system join(' ', 'fastqc', '-o', $sample.'_R1', $file1);
-    mkdir $sample.'_R2', 0700;
-    system join(' ', 'fastqc', '-o', $sample.'_R2', $file2);
-    mkdir $sample.'_R3', 0700;
-    system join(' ', 'fastqc', '-o', $sample.'_R3', $file3)
-}
--- a/rnaseq/step1_fastqc/00_fastqc.sh
+++ b/rnaseq/step1_fastqc/00_fastqc.sh
@@ -1,27 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=FastQC.for.alveolar_type_2
-#SBATCH --comment=FastQC.for.alveolar_type_2
-#SBATCH --time=48:00:00
-#SBATCH --mincpus=2
-#SBATCH --mem=20G
-#SBATCH --qos=priority
-
-# For 173 samples, it will take about 24 hrs to run with about 15Gb of memory.
-# Should probably parallelize the perl script/make it a bash/slurm script.
-
-module purge
-module load Perl/5.26.2-foss-2015b-bare
-module load BioPerl/1.6.924-foss-2015b-Perl-5.22.0
-module load Java/11.0.2
-module load FastQC/0.11.7-Java-1.8.0_144-unlimited_JCE
-
-# Please see
-# https://www.youtube.com/watch?v=0Rj_xNuyOyQ
-
-cd /groups/umcg-griac/tmp04/projects/umcg-rbults/alveolar_type2_fastq/
-perl scripts/00_fastqc.pl
-
-mkdir rene_FastQC.results
-find . -maxdepth 1 -type d -iname "*_R[123]" -exec mv {} ./rene_FastQC.results/ \;
-#find . -maxdepth 1 -type f -iname "*.htm*" -exec mv {} ./FastQC.results/ \;
-
--- a/rnaseq/step2_trim/snippet.sh
+++ b/rnaseq/step2_trim/snippet.sh
@@ -1,36 +0,0 @@
-## this script is to generate jobs of trimming for each samples on the cluster
-## please run this script first and then submit the jobs for each samples
-## reference: http://www.usadellab.org/cms/?page=trimmomatic
-
-#!/bin/bash
-# $1 indicates the path of raw samples. 
-# In the input folder, one sample has one independent folder with two pair-end f
-astq files. 
-# The folder name should be the sample name. 
-# the fastq file should be sample_1.fastq and sample_2.fastq 
-# please prepare a sample.list that include file names for each sample
-
-out="/ * your output folder * /"
-input="/ * your input folder * /"
-cat sample.list | while read line
-
-do
-sample=$(echo $line)
-echo '#!/bin/bash'  > rnaseq.${sample}.sh
-echo "#SBATCH --job-name=RNAseq.${sample}" >> rnaseq.${sample}.sh
-echo "#SBATCH --error=RNAseq.${sample}.err" >> rnaseq.${sample}.sh
-echo "#SBATCH --output=RNAseq.${sample}.out" >> rnaseq.${sample}.sh
-echo "#SBATCH --mem=15gb" >> rnaseq.${sample}.sh
-echo "#SBATCH --time=6:00:00" >> rnaseq.${sample}.sh
-echo "#SBATCH --cpus-per-task=6" >> rnaseq.${sample}.sh
-
-echo "ml Java" >>rnaseq.${sample}.sh
-
-echo "java -jar /* your folder of software */trimmomatic-0.36.jar PE \
-  -phred33 /$input/${sample}\_1.fq.gz /$input/${sample}\_2.fq.gz \
-   $out/trimmomatic/${sample}\_1_paired.fq $out/trimmomatic/${sample}\_1_unpaired.fq \
-   $out/trimmomatic/${sample}\_2_paired.fq $out/trimmomatic/${sample}\_2_unpaired.fq \
-   ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \
-   LEADING:3 TRAILING:3 SLIDINGWINDOW:4:25 HEADCROP:8 MINLEN:50" >> rnaseq.${sample}.sh
-
-done
--- a/rnaseq/step3_align/snippet.sh
+++ b/rnaseq/step3_align/snippet.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Align reads against reference genome.
+
+STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3"
+# Store genome index in this location:.
+GENOME_INDEX="${STORAGE}/genome_index"
+mkdir -p "${GENOME_INDEX}"
+# Store the generated `Aligned.sortedByCoord.out.bam` in this dir.
+ALIGNMENT_OUTPUT="${STORAGE}/alignment"
+mkdir -p "${ALIGNMENT_OUTPUT}"
+
+# 1) Generate genome index.
+#
+# N.B.:
+# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the
+#   previous quality control steps if you are unsure about the size. In case of
+#   reads of varying length, the ideal value is max(ReadLength)-1.
+# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e.,
+#   .gtf.gz and fa.gz. If not, you can remove the `zcat` flag.
+
+# Storage location reference data (in this case on calculon).
+REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome"
+GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz"
+FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
+
+STAR \
+    --runThreadN 8 \
+    --runMode genomeGenerate \
+    --readFilesCommand zcat \
+    --sjdbOverhang 100 \
+    --genomeFastaFiles ${FASTA_FILE} \
+    --sjdbGTFfile ${GTF_FILE} \
+    --genomeDir ${GENOME_INDEX}
+
+
+# 2) Do the actual alignment.
+#
+# N.B.:
+# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat)  FastQ
+#   files.
+
+# THe compressed paired-end FastQ's that we are aligning.
+R1="sample1_R1.fastq.gz"
+R2="sample1_R2.fastq.gz"
+
+STAR \
+    --runThreadN 8 \
+    --readFilesCommand zcat \
+    --readFilesIn "${R1}" "${R2}" \
+    --outSAMtype BAM SortedByCoordinate \
+    --genomeDir ${GENOME_INDEX} \
+    --outFileNamePrefix "${ALIGNMENT_OUTPUT}"