#!/bin/bash # # Align reads against reference genome. STORAGE="/groups/umcg-griac/tmp01/rawdata/$(whoami)/step3" # Store genome index in this location:. GENOME_INDEX="${STORAGE}/genome_index" mkdir -p "${GENOME_INDEX}" # Store the generated `Aligned.sortedByCoord.out.bam` in this dir. ALIGNMENT_OUTPUT="${STORAGE}/alignment" mkdir -p "${OUTPUT}" # 1) Generate genome index. # # N.B.: # - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the # previous quality control steps if you are unsure about the size. In case of # reads of varying length, the ideal value is max(ReadLength)-1. # Where the reference data is stored. REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf" FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa" STAR \ --runThreadN 8 \ --runMode genomeGenerate \ --sjdbOverhang 100 \ --genomeFastaFiles ${FASTA_FILE} \ --sjdbGTFfile ${GTF_FILE} \ --genomeDir ${GENOME_INDEX} # 2) Do the actual alignment. # # N.B.: # - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ # files. # THe compressed paired-end FastQ's that we are aligning. R1="sample1_R1.fastq.gz" R2="sample1_R2.fastq.gz" STAR \ --runThreadN 8 \ --readFilesCommand zcat \ --readFilesIn "${R1}" "${R2}" \ --outSAMtype BAM SortedByCoordinate \ --outFileNamePrefix "${ALIGNMENT_OUTPUT}"