diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index b0d7dcb..1c987c7 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,5 +1,12 @@ -# The command to do a FastQC on a fastq file is -file="file_to_analyse.fq.gz" -fastqc_out="./path/to/fastqc/output/dir" +#!/bin/bash +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" -fastqc -o "$fastqc_out" "$file" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQC_OUT="${PROJECT_DIRECTORY}/step1/" +mkdir -p "${FASTQC_OUT}" + +# Run FastQC on paired-end data. +fastqc \ + -o "${FASTQC_OUT}" \ + "${R1}" "${R2}" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 1ae866f..a65bc51 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,9 +1,12 @@ #!/bin/bash - -# reference: http://www.usadellab.org/cms/?page=trimmomatic +# +# Reference: http://www.usadellab.org/cms/?page=trimmomatic module load Trimmomatic +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQ_OUT="${PROJECT_DIRECTORY}/step2/" +mkdir -p "${FASTQ_OUT}" # Adapters can be found at @@ -11,15 +14,25 @@ module load Trimmomatic # But should be verified with FastQC, or in another way. -# (Example) Paired end +# Trimmomatic example Paired end data. +# +# Flags: +# - ILLUMINACLIP: Cut adapter and other illumina-specific sequences from the +# read. +# - SLIDINGWINDOW: Perform a sliding window trimming, cutting once the average +# quality within the window falls below a threshold. +# - LEADING: Cut bases off the start of a read, if below a threshold quality. +# - TRAILING: Cut bases off the end of a read, if below a threshold quality. +# - HEADCROP: Cut the specified number of bases from the start of the read. +# - MINLEN: Drop the read if it is below a specified length. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ -phred33 \ - input_forward.fq.gz \ - input_reverse.fq.gz \ - output_forward_paired.fq.gz \ - output_forward_unpaired.fq.gz \ - output_reverse_paired.fq.gz \ - output_reverse_unpaired.fq.gz \ + sample1_R1.fastq.gz \ + sample1_R2.fastq.gz \ + "${FASTQ_OUT}/sample1_R1_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R1_unpaired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_unpaired.fastq.gz" \ ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ LEADING:3 \ TRAILING:3 \ @@ -28,11 +41,11 @@ java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ MINLEN:50 -# (Example) Single end +# Example single end data. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar SE \ -phred33 \ - input.fq.gz \ - output.fq.gz \ + sample1.fastq.gz \ + output.fastq.gz \ ILLUMINACLIP:TruSeq3-SE:2:30:10 \ LEADING:3 \ TRAILING:3 \ diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index e69de29..547867a 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Align reads against reference genome. + +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +# Store genome index in this location:. +GENOME_INDEX="${PROJECT_DIRECTORY}/step3/genome_index" +mkdir -p "${GENOME_INDEX}" +# Store the generated `Aligned.sortedByCoord.out.bam` in this dir. +ALIGNMENT_OUTPUT="${PROJECT_DIRECTORY}/step3/alignment/" +mkdir -p "${ALIGNMENT_OUTPUT}" + +# 1) Generate genome index. +# +# N.B.: +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). An alternative +# cut-off is 150, for low-input methods. In general, refer back to the +# previous quality control steps if you are unsure about the size. In case of +# reads of varying length, the ideal value is max(ReadLength)-1. +# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., +# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. + +# Storage location reference data (in this case on Gearshift). +REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" +GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" +FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + +STAR \ + --runThreadN 8 \ + --runMode genomeGenerate \ + --readFilesCommand zcat \ + --sjdbOverhang 100 \ + --genomeFastaFiles ${FASTA_FILE} \ + --sjdbGTFfile ${GTF_FILE} \ + --genomeDir ${GENOME_INDEX} + + +# 2) Do the actual alignment. +# +# N.B.: +# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ +# files. + +# The compressed, paired-end, FastQ's after trimming (step 2). +R1="${PROJECT_DIRECTORY}/step2/sample1_R1_paired.fastq.gz" +R2="${PROJECT_DIRECTORY}/step2/sample1_R2_paired.fastq.gz" + +STAR \ + --runThreadN 8 \ + --readFilesCommand zcat \ + --readFilesIn "${R1}" "${R2}" \ + --outSAMtype BAM SortedByCoordinate \ + --genomeDir ${GENOME_INDEX} \ + --outFileNamePrefix "${ALIGNMENT_OUTPUT}" diff --git a/rnaseq/step4_multiqc/snippet.sh b/rnaseq/step4_multiqc/snippet.sh index e69de29..ceb2c25 100644 --- a/rnaseq/step4_multiqc/snippet.sh +++ b/rnaseq/step4_multiqc/snippet.sh @@ -0,0 +1,5 @@ +#!/bin/bash +module load multiqc + +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +multiqc "${PROJECT_DIRECTORY}" diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 0c42844..ee8c249 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -18,7 +18,7 @@ do.scale = FALSE # The analysis -# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. +# We use prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -37,8 +37,8 @@ norm.expr.data.pcs <- norm.expr.data %>% ) # Write summary of PCAs to files -pcs.summery <- summary(norm.expr.data.pcs) -pcs.summery$importance %>% +pcs.summary <- summary(norm.expr.data.pcs) +pcs.summary$importance %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("PC.name") %>% @@ -46,7 +46,7 @@ pcs.summery$importance %>% file.path(results.dir.pca, "importance.csv") ) -pcs.summery$x %>% +pcs.summary$x %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("ensembl.id") %>% @@ -54,7 +54,7 @@ pcs.summery$x %>% file.path(results.dir.pca, "values.csv") ) -pcs.summery$rotation %>% +pcs.summary$rotation %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("sample.id") %>% @@ -63,15 +63,15 @@ pcs.summery$rotation %>% ) data.frame( - rownames = names(pcs.summery$center), - center = pcs.summery$center, - scale = pcs.summery$scale + rownames = names(pcs.summary$center), + center = pcs.summary$center, + scale = pcs.summary$scale ) %>% readr::write_csv( file.path(results.dir.pca, "rest.csv") ) -# Not saved: pcs.summery$sdev, +# Not saved: pcs.summary$sdev, # Next thing to do: diff --git a/rnaseq/umi/07_remove_dups.pl b/rnaseq/umi/07_remove_dups.pl new file mode 100644 index 0000000..2aa2859 --- /dev/null +++ b/rnaseq/umi/07_remove_dups.pl @@ -0,0 +1,60 @@ +#!/usr/bin/perl -w +use strict; +use Parallel::ForkManager; +# this script creats one file with UMI unique reads and one with UMI duplicated reads +my @torun = (); +foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { + push @torun, $file; +} + +my $pm = Parallel::ForkManager->new( 12 ); +foreach my $file ( @torun ) { + my $sample = $file; + $sample =~ s/Aligned\.sortedByCoord\.out\.bam$//; + next if -s $sample.'_uniq.bam'; + warn "Parsing $sample\n"; + $pm->start and next; + my %seen = (); + my %duplicates = (); + my ( $uniqs, $dups ) = ( 0, 0 ); + open F, 'samtools view -h '.$file.' |'; + open F1, '| samtools view -bS - >'.$sample.'_uniq.bam'; + open F2, '| samtools view -bS - >'.$sample.'_dups.bam'; + while ( ) { + if ( m/^\@/ ) { + print F1; + print F2; + next; + } + my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; + next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate +# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } + my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode + my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); + my $pos_ = $pos-1; + while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { + $pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N'; + } + my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc ); + if ( exists($duplicates{$id}) or # already marked as duplicate + ( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand + ( $flag&16 and ++$seen{ $uniq2 } > 1 ) + ) { + print F2; + $dups++; + $duplicates{$id} = 1; + } + else { + print F1; + $uniqs++; + } + } + close F; + close F1; + close F2; + warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n"; + system( 'samtools', 'index', $sample.'_uniq.bam' ); +# last; + $pm->finish; +} +$pm->wait_all_children; diff --git a/rnaseq/umi/snippet.pl b/rnaseq/umi/snippet.pl deleted file mode 100644 index e69de29..0000000