diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index e69de29..5ad5e64 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Align reads against reference genome. + +STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" +# Store genome index in this location:. +GENOME_INDEX="${STORAGE}/genome_index" +mkdir -p "${GENOME_INDEX}" +# Store the generated `Aligned.sortedByCoord.out.bam` in this dir. +ALIGNMENT_OUTPUT="${STORAGE}/alignment" +mkdir -p "${ALIGNMENT_OUTPUT}" + +# 1) Generate genome index. +# +# N.B.: +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the +# previous quality control steps if you are unsure about the size. In case of +# reads of varying length, the ideal value is max(ReadLength)-1. +# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., +# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. + +# Storage location reference data (in this case on calculon). +REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" +GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" +FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + +STAR \ + --runThreadN 8 \ + --runMode genomeGenerate \ + --readFilesCommand zcat \ + --sjdbOverhang 100 \ + --genomeFastaFiles ${FASTA_FILE} \ + --sjdbGTFfile ${GTF_FILE} \ + --genomeDir ${GENOME_INDEX} + + +# 2) Do the actual alignment. +# +# N.B.: +# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ +# files. + +# THe compressed paired-end FastQ's that we are aligning. +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" + +STAR \ + --runThreadN 8 \ + --readFilesCommand zcat \ + --readFilesIn "${R1}" "${R2}" \ + --outSAMtype BAM SortedByCoordinate \ + --genomeDir ${GENOME_INDEX} \ + --outFileNamePrefix "${ALIGNMENT_OUTPUT}"