diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index 547867a..ac5e38a 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -2,33 +2,42 @@ # # Align reads against reference genome. +REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" + PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" -# Store genome index in this location:. -GENOME_INDEX="${PROJECT_DIRECTORY}/step3/genome_index" -mkdir -p "${GENOME_INDEX}" -# Store the generated `Aligned.sortedByCoord.out.bam` in this dir. -ALIGNMENT_OUTPUT="${PROJECT_DIRECTORY}/step3/alignment/" +# Store the generated `sample1_Aligned.sortedByCoord.out.bam` in this dir. +ALIGNMENT_OUTPUT="${PROJECT_DIRECTORY}/step3/alignment" mkdir -p "${ALIGNMENT_OUTPUT}" -# 1) Generate genome index. +# 1) Generate genome index (optional). +# +# Depending on your read size, reference genome and annotation, you may need to +# generate a new genome index. In most cases, this is not necessary and you can +# directly use the pre-build genome index from the cluster: +# +# GENOME_INDEX="${REFERENCE_DATA}/index_GRCh38_gtf100_overhang100" +# +# and ignore the first STAR command below. # # N.B.: # - We're assuming a read size of 100 bp (--sjdbOverhang 100). An alternative # cut-off is 150, for low-input methods. In general, refer back to the # previous quality control steps if you are unsure about the size. In case of # reads of varying length, the ideal value is max(ReadLength)-1. -# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., -# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. +# - If you're using gzip compressed reference data, i.e., .gtf.gz and fa.gz, +# pass the `--readFilesCommand zcat` flag. -# Storage location reference data (in this case on Gearshift). -REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" -GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" -FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" +# Store created genome index in this location:. +GENOME_INDEX="${PROJECT_DIRECTORY}/step3/genome_index" +mkdir -p "${GENOME_INDEX}" + +# Storage location reference data on Gearshift. +GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf" +FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa" STAR \ --runThreadN 8 \ --runMode genomeGenerate \ - --readFilesCommand zcat \ --sjdbOverhang 100 \ --genomeFastaFiles ${FASTA_FILE} \ --sjdbGTFfile ${GTF_FILE} \ @@ -51,4 +60,4 @@ STAR \ --readFilesIn "${R1}" "${R2}" \ --outSAMtype BAM SortedByCoordinate \ --genomeDir ${GENOME_INDEX} \ - --outFileNamePrefix "${ALIGNMENT_OUTPUT}" + --outFileNamePrefix "${ALIGNMENT_OUTPUT}/sample1_" diff --git a/rnaseq/step5_count/snippet.sh b/rnaseq/step5_count/snippet.sh index e69de29..ec910ee 100644 --- a/rnaseq/step5_count/snippet.sh +++ b/rnaseq/step5_count/snippet.sh @@ -0,0 +1,21 @@ +#!/bin/bash +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +COUNT_OUTPUT="${PROJECT_DIRECTORY}/step5" +mkdir -p "${COUNT_OUTPUT}" + +# Storage location of annotation on Gearshift. +REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" +GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf" + +# Where our alignment file was stored. +BAM="${PROJECT_DIRECTORY}/step3/alignment/sample1_Aligned.sortedByCoord.out.bam" + +# Compute counts using htseq-count. +# +# N.B.: +# - If you are processing multiple files, you can use the --nprocesses flag to +# compute the files in parallel. +htseq-count \ + ${BAM} \ + ${GTF_FILE} \ + > ${COUNT_OUTPUT}/counts.txt \ No newline at end of file