forked from GRIAC/system_genetics
Compare commits
3 Commits
step3/star
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
e3c88c059f | ||
|
1110d00e8c | ||
4b61c099f6 |
BIN
rnaseq/general_files/describing_general_files.docx
Normal file
BIN
rnaseq/general_files/describing_general_files.docx
Normal file
Binary file not shown.
23
rnaseq/step1_fastqc/00_fastqc.pl
Executable file
23
rnaseq/step1_fastqc/00_fastqc.pl
Executable file
@ -0,0 +1,23 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
# this script was made with consideration for UMI-deduplicating.
|
||||||
|
# this is because there are three .fastq files for each sample.
|
||||||
|
# the provider states the info about which file contains which info,
|
||||||
|
# but in our case, from GenomeScan in Leiden, R2 contains the UMI read.
|
||||||
|
# R1 and R3 contain sequencing information from paired-end sequencing
|
||||||
|
|
||||||
|
foreach my $file1 ( <*_R1.fastq.gz> ) {
|
||||||
|
my $file2 = $file1;
|
||||||
|
$file2 =~ s/\_R1\./_R2./;
|
||||||
|
my $file3 =~ s/\_R3\./_R2./;
|
||||||
|
die "file1==file2" if $file1 eq $file2;
|
||||||
|
my $sample = $file1;
|
||||||
|
$sample =~ s/\_R1\.fastq\.gz$//;
|
||||||
|
mkdir $sample.'_R1', 0700;
|
||||||
|
system join(' ', 'fastqc', '-o', $sample.'_R1', $file1);
|
||||||
|
mkdir $sample.'_R2', 0700;
|
||||||
|
system join(' ', 'fastqc', '-o', $sample.'_R2', $file2);
|
||||||
|
mkdir $sample.'_R3', 0700;
|
||||||
|
system join(' ', 'fastqc', '-o', $sample.'_R3', $file3)
|
||||||
|
}
|
27
rnaseq/step1_fastqc/00_fastqc.sh
Normal file
27
rnaseq/step1_fastqc/00_fastqc.sh
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH --job-name=FastQC.for.alveolar_type_2
|
||||||
|
#SBATCH --comment=FastQC.for.alveolar_type_2
|
||||||
|
#SBATCH --time=48:00:00
|
||||||
|
#SBATCH --mincpus=2
|
||||||
|
#SBATCH --mem=20G
|
||||||
|
#SBATCH --qos=priority
|
||||||
|
|
||||||
|
# For 173 samples, it will take about 24 hrs to run with about 15Gb of memory.
|
||||||
|
# Should probably parallelize the perl script/make it a bash/slurm script.
|
||||||
|
|
||||||
|
module purge
|
||||||
|
module load Perl/5.26.2-foss-2015b-bare
|
||||||
|
module load BioPerl/1.6.924-foss-2015b-Perl-5.22.0
|
||||||
|
module load Java/11.0.2
|
||||||
|
module load FastQC/0.11.7-Java-1.8.0_144-unlimited_JCE
|
||||||
|
|
||||||
|
# Please see
|
||||||
|
# https://www.youtube.com/watch?v=0Rj_xNuyOyQ
|
||||||
|
|
||||||
|
cd /groups/umcg-griac/tmp04/projects/umcg-rbults/alveolar_type2_fastq/
|
||||||
|
perl scripts/00_fastqc.pl
|
||||||
|
|
||||||
|
mkdir rene_FastQC.results
|
||||||
|
find . -maxdepth 1 -type d -iname "*_R[123]" -exec mv {} ./rene_FastQC.results/ \;
|
||||||
|
#find . -maxdepth 1 -type f -iname "*.htm*" -exec mv {} ./FastQC.results/ \;
|
||||||
|
|
@ -0,0 +1,36 @@
|
|||||||
|
## this script is to generate jobs of trimming for each samples on the cluster
|
||||||
|
## please run this script first and then submit the jobs for each samples
|
||||||
|
## reference: http://www.usadellab.org/cms/?page=trimmomatic
|
||||||
|
|
||||||
|
#!/bin/bash
|
||||||
|
# $1 indicates the path of raw samples.
|
||||||
|
# In the input folder, one sample has one independent folder with two pair-end f
|
||||||
|
astq files.
|
||||||
|
# The folder name should be the sample name.
|
||||||
|
# the fastq file should be sample_1.fastq and sample_2.fastq
|
||||||
|
# please prepare a sample.list that include file names for each sample
|
||||||
|
|
||||||
|
out="/ * your output folder * /"
|
||||||
|
input="/ * your input folder * /"
|
||||||
|
cat sample.list | while read line
|
||||||
|
|
||||||
|
do
|
||||||
|
sample=$(echo $line)
|
||||||
|
echo '#!/bin/bash' > rnaseq.${sample}.sh
|
||||||
|
echo "#SBATCH --job-name=RNAseq.${sample}" >> rnaseq.${sample}.sh
|
||||||
|
echo "#SBATCH --error=RNAseq.${sample}.err" >> rnaseq.${sample}.sh
|
||||||
|
echo "#SBATCH --output=RNAseq.${sample}.out" >> rnaseq.${sample}.sh
|
||||||
|
echo "#SBATCH --mem=15gb" >> rnaseq.${sample}.sh
|
||||||
|
echo "#SBATCH --time=6:00:00" >> rnaseq.${sample}.sh
|
||||||
|
echo "#SBATCH --cpus-per-task=6" >> rnaseq.${sample}.sh
|
||||||
|
|
||||||
|
echo "ml Java" >>rnaseq.${sample}.sh
|
||||||
|
|
||||||
|
echo "java -jar /* your folder of software */trimmomatic-0.36.jar PE \
|
||||||
|
-phred33 /$input/${sample}\_1.fq.gz /$input/${sample}\_2.fq.gz \
|
||||||
|
$out/trimmomatic/${sample}\_1_paired.fq $out/trimmomatic/${sample}\_1_unpaired.fq \
|
||||||
|
$out/trimmomatic/${sample}\_2_paired.fq $out/trimmomatic/${sample}\_2_unpaired.fq \
|
||||||
|
ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \
|
||||||
|
LEADING:3 TRAILING:3 SLIDINGWINDOW:4:25 HEADCROP:8 MINLEN:50" >> rnaseq.${sample}.sh
|
||||||
|
|
||||||
|
done
|
@ -1,53 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
#
|
|
||||||
# Align reads against reference genome.
|
|
||||||
|
|
||||||
STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3"
|
|
||||||
# Store genome index in this location:.
|
|
||||||
GENOME_INDEX="${STORAGE}/genome_index"
|
|
||||||
mkdir -p "${GENOME_INDEX}"
|
|
||||||
# Store the generated `Aligned.sortedByCoord.out.bam` in this dir.
|
|
||||||
ALIGNMENT_OUTPUT="${STORAGE}/alignment"
|
|
||||||
mkdir -p "${ALIGNMENT_OUTPUT}"
|
|
||||||
|
|
||||||
# 1) Generate genome index.
|
|
||||||
#
|
|
||||||
# N.B.:
|
|
||||||
# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the
|
|
||||||
# previous quality control steps if you are unsure about the size. In case of
|
|
||||||
# reads of varying length, the ideal value is max(ReadLength)-1.
|
|
||||||
# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e.,
|
|
||||||
# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag.
|
|
||||||
|
|
||||||
# Storage location reference data (in this case on calculon).
|
|
||||||
REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome"
|
|
||||||
GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz"
|
|
||||||
FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
|
|
||||||
|
|
||||||
STAR \
|
|
||||||
--runThreadN 8 \
|
|
||||||
--runMode genomeGenerate \
|
|
||||||
--readFilesCommand zcat \
|
|
||||||
--sjdbOverhang 100 \
|
|
||||||
--genomeFastaFiles ${FASTA_FILE} \
|
|
||||||
--sjdbGTFfile ${GTF_FILE} \
|
|
||||||
--genomeDir ${GENOME_INDEX}
|
|
||||||
|
|
||||||
|
|
||||||
# 2) Do the actual alignment.
|
|
||||||
#
|
|
||||||
# N.B.:
|
|
||||||
# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ
|
|
||||||
# files.
|
|
||||||
|
|
||||||
# THe compressed paired-end FastQ's that we are aligning.
|
|
||||||
R1="sample1_R1.fastq.gz"
|
|
||||||
R2="sample1_R2.fastq.gz"
|
|
||||||
|
|
||||||
STAR \
|
|
||||||
--runThreadN 8 \
|
|
||||||
--readFilesCommand zcat \
|
|
||||||
--readFilesIn "${R1}" "${R2}" \
|
|
||||||
--outSAMtype BAM SortedByCoordinate \
|
|
||||||
--genomeDir ${GENOME_INDEX} \
|
|
||||||
--outFileNamePrefix "${ALIGNMENT_OUTPUT}"
|
|
Loading…
Reference in New Issue
Block a user