forked from GRIAC/system_genetics
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			master
			...
			step3/star
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 1037d154e3 | 
										
											Binary file not shown.
										
									
								
							| @@ -1,23 +0,0 @@ | ||||
| #!/usr/bin/perl -w | ||||
| use strict; | ||||
|  | ||||
| # this script was made with consideration for UMI-deduplicating. | ||||
| # this is because there are three .fastq files for each sample.  | ||||
| # the provider states the info about which file contains which info, | ||||
| # but in our case, from GenomeScan in Leiden, R2 contains the UMI read. | ||||
| # R1 and R3 contain sequencing information from paired-end sequencing | ||||
|  | ||||
| foreach my $file1 ( <*_R1.fastq.gz> ) { | ||||
|     my $file2 = $file1; | ||||
|     $file2 =~ s/\_R1\./_R2./; | ||||
|     my $file3 =~ s/\_R3\./_R2./; | ||||
|     die "file1==file2" if $file1 eq $file2; | ||||
|     my $sample = $file1; | ||||
|     $sample =~ s/\_R1\.fastq\.gz$//; | ||||
|     mkdir $sample.'_R1', 0700; | ||||
|     system join(' ', 'fastqc', '-o', $sample.'_R1', $file1); | ||||
|     mkdir $sample.'_R2', 0700; | ||||
|     system join(' ', 'fastqc', '-o', $sample.'_R2', $file2); | ||||
|     mkdir $sample.'_R3', 0700; | ||||
|     system join(' ', 'fastqc', '-o', $sample.'_R3', $file3) | ||||
| } | ||||
| @@ -1,27 +0,0 @@ | ||||
| #!/bin/bash | ||||
| #SBATCH --job-name=FastQC.for.alveolar_type_2 | ||||
| #SBATCH --comment=FastQC.for.alveolar_type_2 | ||||
| #SBATCH --time=48:00:00 | ||||
| #SBATCH --mincpus=2 | ||||
| #SBATCH --mem=20G | ||||
| #SBATCH --qos=priority | ||||
|  | ||||
| # For 173 samples, it will take about 24 hrs to run with about 15Gb of memory. | ||||
| # Should probably parallelize the perl script/make it a bash/slurm script. | ||||
|  | ||||
| module purge | ||||
| module load Perl/5.26.2-foss-2015b-bare | ||||
| module load BioPerl/1.6.924-foss-2015b-Perl-5.22.0 | ||||
| module load Java/11.0.2 | ||||
| module load FastQC/0.11.7-Java-1.8.0_144-unlimited_JCE | ||||
|  | ||||
| # Please see | ||||
| # https://www.youtube.com/watch?v=0Rj_xNuyOyQ | ||||
|  | ||||
| cd /groups/umcg-griac/tmp04/projects/umcg-rbults/alveolar_type2_fastq/ | ||||
| perl scripts/00_fastqc.pl | ||||
|  | ||||
| mkdir rene_FastQC.results | ||||
| find . -maxdepth 1 -type d -iname "*_R[123]" -exec mv {} ./rene_FastQC.results/ \; | ||||
| #find . -maxdepth 1 -type f -iname "*.htm*" -exec mv {} ./FastQC.results/ \; | ||||
|  | ||||
| @@ -1,36 +0,0 @@ | ||||
| ## this script is to generate jobs of trimming for each samples on the cluster | ||||
| ## please run this script first and then submit the jobs for each samples | ||||
| ## reference: http://www.usadellab.org/cms/?page=trimmomatic | ||||
|  | ||||
| #!/bin/bash | ||||
| # $1 indicates the path of raw samples.  | ||||
| # In the input folder, one sample has one independent folder with two pair-end f | ||||
| astq files.  | ||||
| # The folder name should be the sample name.  | ||||
| # the fastq file should be sample_1.fastq and sample_2.fastq  | ||||
| # please prepare a sample.list that include file names for each sample | ||||
|  | ||||
| out="/ * your output folder * /" | ||||
| input="/ * your input folder * /" | ||||
| cat sample.list | while read line | ||||
|  | ||||
| do | ||||
| sample=$(echo $line) | ||||
| echo '#!/bin/bash'  > rnaseq.${sample}.sh | ||||
| echo "#SBATCH --job-name=RNAseq.${sample}" >> rnaseq.${sample}.sh | ||||
| echo "#SBATCH --error=RNAseq.${sample}.err" >> rnaseq.${sample}.sh | ||||
| echo "#SBATCH --output=RNAseq.${sample}.out" >> rnaseq.${sample}.sh | ||||
| echo "#SBATCH --mem=15gb" >> rnaseq.${sample}.sh | ||||
| echo "#SBATCH --time=6:00:00" >> rnaseq.${sample}.sh | ||||
| echo "#SBATCH --cpus-per-task=6" >> rnaseq.${sample}.sh | ||||
|  | ||||
| echo "ml Java" >>rnaseq.${sample}.sh | ||||
|  | ||||
| echo "java -jar /* your folder of software */trimmomatic-0.36.jar PE \ | ||||
|   -phred33 /$input/${sample}\_1.fq.gz /$input/${sample}\_2.fq.gz \ | ||||
|    $out/trimmomatic/${sample}\_1_paired.fq $out/trimmomatic/${sample}\_1_unpaired.fq \ | ||||
|    $out/trimmomatic/${sample}\_2_paired.fq $out/trimmomatic/${sample}\_2_unpaired.fq \ | ||||
|    ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ | ||||
|    LEADING:3 TRAILING:3 SLIDINGWINDOW:4:25 HEADCROP:8 MINLEN:50" >> rnaseq.${sample}.sh | ||||
|  | ||||
| done | ||||
| @@ -0,0 +1,53 @@ | ||||
| #!/bin/bash | ||||
| # | ||||
| # Align reads against reference genome. | ||||
|  | ||||
| STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" | ||||
| # Store genome index in this location:. | ||||
| GENOME_INDEX="${STORAGE}/genome_index" | ||||
| mkdir -p "${GENOME_INDEX}" | ||||
| # Store the generated `Aligned.sortedByCoord.out.bam` in this dir. | ||||
| ALIGNMENT_OUTPUT="${STORAGE}/alignment" | ||||
| mkdir -p "${ALIGNMENT_OUTPUT}" | ||||
|  | ||||
| # 1) Generate genome index. | ||||
| # | ||||
| # N.B.: | ||||
| # - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the | ||||
| #   previous quality control steps if you are unsure about the size. In case of | ||||
| #   reads of varying length, the ideal value is max(ReadLength)-1. | ||||
| # - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., | ||||
| #   .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. | ||||
|  | ||||
| # Storage location reference data (in this case on calculon). | ||||
| REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" | ||||
| GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" | ||||
| FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" | ||||
|  | ||||
| STAR \ | ||||
|     --runThreadN 8 \ | ||||
|     --runMode genomeGenerate \ | ||||
|     --readFilesCommand zcat \ | ||||
|     --sjdbOverhang 100 \ | ||||
|     --genomeFastaFiles ${FASTA_FILE} \ | ||||
|     --sjdbGTFfile ${GTF_FILE} \ | ||||
|     --genomeDir ${GENOME_INDEX} | ||||
|  | ||||
|  | ||||
| # 2) Do the actual alignment. | ||||
| # | ||||
| # N.B.: | ||||
| # - We are assuming paired-end, gzip compressed (--readFilesCommand zcat)  FastQ | ||||
| #   files. | ||||
|  | ||||
| # THe compressed paired-end FastQ's that we are aligning. | ||||
| R1="sample1_R1.fastq.gz" | ||||
| R2="sample1_R2.fastq.gz" | ||||
|  | ||||
| STAR \ | ||||
|     --runThreadN 8 \ | ||||
|     --readFilesCommand zcat \ | ||||
|     --readFilesIn "${R1}" "${R2}" \ | ||||
|     --outSAMtype BAM SortedByCoordinate \ | ||||
|     --genomeDir ${GENOME_INDEX} \ | ||||
|     --outFileNamePrefix "${ALIGNMENT_OUTPUT}" | ||||
|   | ||||
		Reference in New Issue
	
	Block a user