From a7651f6a21e27867e9c36cd03d4439a22ace3fae Mon Sep 17 00:00:00 2001 From: TatiKarp Date: Fri, 12 Feb 2021 13:52:56 +0100 Subject: [PATCH 1/5] UMI deduplication script from Victor --- rnaseq/umi/07_remove_dups.pl | 60 ++++++++++++++++++++++++++++++++++++ rnaseq/umi/snippet.pl | 0 2 files changed, 60 insertions(+) create mode 100644 rnaseq/umi/07_remove_dups.pl delete mode 100644 rnaseq/umi/snippet.pl diff --git a/rnaseq/umi/07_remove_dups.pl b/rnaseq/umi/07_remove_dups.pl new file mode 100644 index 0000000..94f3acc --- /dev/null +++ b/rnaseq/umi/07_remove_dups.pl @@ -0,0 +1,60 @@ +#!/usr/bin/perl -w +use strict; +use Parallel::ForkManager; + +my @torun = (); +foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { + push @torun, $file; +} + +my $pm = Parallel::ForkManager->new( 12 ); +foreach my $file ( @torun ) { + my $sample = $file; + $sample =~ s/Aligned\.sortedByCoord\.out\.bam$//; + next if -s $sample.'_uniq.bam'; + warn "Parsing $sample\n"; + $pm->start and next; + my %seen = (); + my %duplicates = (); + my ( $uniqs, $dups ) = ( 0, 0 ); + open F, 'samtools view -h '.$file.' |'; + open F1, '| samtools view -bS - >'.$sample.'_uniq.bam'; + open F2, '| samtools view -bS - >'.$sample.'_dups.bam'; + while ( ) { + if ( m/^\@/ ) { + print F1; + print F2; + next; + } + my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; + next if $flag & 256 or $flag & 512 or $flag & 1024; +# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } + my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; + my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); + my $pos_ = $pos-1; + while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { + $pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N'; + } + my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc ); + if ( exists($duplicates{$id}) or # already marked as duplicate + ( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand + ( $flag&16 and ++$seen{ $uniq2 } > 1 ) + ) { + print F2; + $dups++; + $duplicates{$id} = 1; + } + else { + print F1; + $uniqs++; + } + } + close F; + close F1; + close F2; + warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n"; + system( 'samtools', 'index', $sample.'_uniq.bam' ); +# last; + $pm->finish; +} +$pm->wait_all_children; diff --git a/rnaseq/umi/snippet.pl b/rnaseq/umi/snippet.pl deleted file mode 100644 index e69de29..0000000 From 1037d154e397a460d6fe19a1aba41b311d367bff Mon Sep 17 00:00:00 2001 From: "Hylke C. Donker" Date: Tue, 9 Feb 2021 17:31:42 +0100 Subject: [PATCH 2/5] Small snippet that generates a genome index and aligns the RNAseq reads. --- rnaseq/step3_align/snippet.sh | 53 +++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index e69de29..5ad5e64 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Align reads against reference genome. + +STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" +# Store genome index in this location:. +GENOME_INDEX="${STORAGE}/genome_index" +mkdir -p "${GENOME_INDEX}" +# Store the generated `Aligned.sortedByCoord.out.bam` in this dir. +ALIGNMENT_OUTPUT="${STORAGE}/alignment" +mkdir -p "${ALIGNMENT_OUTPUT}" + +# 1) Generate genome index. +# +# N.B.: +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the +# previous quality control steps if you are unsure about the size. In case of +# reads of varying length, the ideal value is max(ReadLength)-1. +# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., +# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. + +# Storage location reference data (in this case on calculon). +REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" +GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" +FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + +STAR \ + --runThreadN 8 \ + --runMode genomeGenerate \ + --readFilesCommand zcat \ + --sjdbOverhang 100 \ + --genomeFastaFiles ${FASTA_FILE} \ + --sjdbGTFfile ${GTF_FILE} \ + --genomeDir ${GENOME_INDEX} + + +# 2) Do the actual alignment. +# +# N.B.: +# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ +# files. + +# THe compressed paired-end FastQ's that we are aligning. +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" + +STAR \ + --runThreadN 8 \ + --readFilesCommand zcat \ + --readFilesIn "${R1}" "${R2}" \ + --outSAMtype BAM SortedByCoordinate \ + --genomeDir ${GENOME_INDEX} \ + --outFileNamePrefix "${ALIGNMENT_OUTPUT}" From c05d3f78681795dddcfeaccd3a5dd2093ba17610 Mon Sep 17 00:00:00 2001 From: TatiKarp Date: Mon, 15 Feb 2021 17:18:48 +0100 Subject: [PATCH 3/5] Added description and some comments for UMi deduplication --- rnaseq/umi/07_remove_dups.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rnaseq/umi/07_remove_dups.pl b/rnaseq/umi/07_remove_dups.pl index 94f3acc..2aa2859 100644 --- a/rnaseq/umi/07_remove_dups.pl +++ b/rnaseq/umi/07_remove_dups.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl -w use strict; use Parallel::ForkManager; - +# this script creats one file with UMI unique reads and one with UMI duplicated reads my @torun = (); foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { push @torun, $file; @@ -27,9 +27,9 @@ foreach my $file ( @torun ) { next; } my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; - next if $flag & 256 or $flag & 512 or $flag & 1024; + next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate # foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } - my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; + my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); my $pos_ = $pos-1; while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { From a2bc31351578a289f4bbb3bb19ec2a3dd255af25 Mon Sep 17 00:00:00 2001 From: Marijn Berg Date: Mon, 22 Feb 2021 14:52:54 +0100 Subject: [PATCH 4/5] Simple snippet on running MultiQC to collate FastQC results --- rnaseq/step4_multiqc/snippet.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rnaseq/step4_multiqc/snippet.sh b/rnaseq/step4_multiqc/snippet.sh index e69de29..e16b342 100644 --- a/rnaseq/step4_multiqc/snippet.sh +++ b/rnaseq/step4_multiqc/snippet.sh @@ -0,0 +1,5 @@ +#!/bin/bash +module load multiqc + +# assuming you have all your fastQC result files in the ./fastQCresults folder +multiqc ./fastQCresults From f93596dd879a585d2f8c7565dfebd2dda35ca896 Mon Sep 17 00:00:00 2001 From: "Hylke C. Donker" Date: Mon, 22 Feb 2021 13:18:18 +0100 Subject: [PATCH 5/5] Made snippet input- and output files more harmonised. --- rnaseq/step1_fastqc/snippet.sh | 15 ++++++-- rnaseq/step2_trim/snippet.sh | 37 +++++++++++++------ rnaseq/step3_align/snippet.sh | 19 +++++----- rnaseq/step4_multiqc/snippet.sh | 4 +- .../01 - Principle Component Analysis.R | 18 ++++----- 5 files changed, 57 insertions(+), 36 deletions(-) diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index b0d7dcb..1c987c7 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,5 +1,12 @@ -# The command to do a FastQC on a fastq file is -file="file_to_analyse.fq.gz" -fastqc_out="./path/to/fastqc/output/dir" +#!/bin/bash +R1="sample1_R1.fastq.gz" +R2="sample1_R2.fastq.gz" -fastqc -o "$fastqc_out" "$file" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQC_OUT="${PROJECT_DIRECTORY}/step1/" +mkdir -p "${FASTQC_OUT}" + +# Run FastQC on paired-end data. +fastqc \ + -o "${FASTQC_OUT}" \ + "${R1}" "${R2}" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 1ae866f..a65bc51 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,9 +1,12 @@ #!/bin/bash - -# reference: http://www.usadellab.org/cms/?page=trimmomatic +# +# Reference: http://www.usadellab.org/cms/?page=trimmomatic module load Trimmomatic +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +FASTQ_OUT="${PROJECT_DIRECTORY}/step2/" +mkdir -p "${FASTQ_OUT}" # Adapters can be found at @@ -11,15 +14,25 @@ module load Trimmomatic # But should be verified with FastQC, or in another way. -# (Example) Paired end +# Trimmomatic example Paired end data. +# +# Flags: +# - ILLUMINACLIP: Cut adapter and other illumina-specific sequences from the +# read. +# - SLIDINGWINDOW: Perform a sliding window trimming, cutting once the average +# quality within the window falls below a threshold. +# - LEADING: Cut bases off the start of a read, if below a threshold quality. +# - TRAILING: Cut bases off the end of a read, if below a threshold quality. +# - HEADCROP: Cut the specified number of bases from the start of the read. +# - MINLEN: Drop the read if it is below a specified length. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ -phred33 \ - input_forward.fq.gz \ - input_reverse.fq.gz \ - output_forward_paired.fq.gz \ - output_forward_unpaired.fq.gz \ - output_reverse_paired.fq.gz \ - output_reverse_unpaired.fq.gz \ + sample1_R1.fastq.gz \ + sample1_R2.fastq.gz \ + "${FASTQ_OUT}/sample1_R1_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R1_unpaired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_paired.fastq.gz" \ + "${FASTQ_OUT}/sample1_R2_unpaired.fastq.gz" \ ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ LEADING:3 \ TRAILING:3 \ @@ -28,11 +41,11 @@ java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ MINLEN:50 -# (Example) Single end +# Example single end data. java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar SE \ -phred33 \ - input.fq.gz \ - output.fq.gz \ + sample1.fastq.gz \ + output.fastq.gz \ ILLUMINACLIP:TruSeq3-SE:2:30:10 \ LEADING:3 \ TRAILING:3 \ diff --git a/rnaseq/step3_align/snippet.sh b/rnaseq/step3_align/snippet.sh index 5ad5e64..547867a 100644 --- a/rnaseq/step3_align/snippet.sh +++ b/rnaseq/step3_align/snippet.sh @@ -2,25 +2,26 @@ # # Align reads against reference genome. -STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" # Store genome index in this location:. -GENOME_INDEX="${STORAGE}/genome_index" +GENOME_INDEX="${PROJECT_DIRECTORY}/step3/genome_index" mkdir -p "${GENOME_INDEX}" # Store the generated `Aligned.sortedByCoord.out.bam` in this dir. -ALIGNMENT_OUTPUT="${STORAGE}/alignment" +ALIGNMENT_OUTPUT="${PROJECT_DIRECTORY}/step3/alignment/" mkdir -p "${ALIGNMENT_OUTPUT}" # 1) Generate genome index. # # N.B.: -# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the +# - We're assuming a read size of 100 bp (--sjdbOverhang 100). An alternative +# cut-off is 150, for low-input methods. In general, refer back to the # previous quality control steps if you are unsure about the size. In case of # reads of varying length, the ideal value is max(ReadLength)-1. # - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., # .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. -# Storage location reference data (in this case on calculon). -REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" +# Storage location reference data (in this case on Gearshift). +REFERENCE_DATA="/groups/umcg-griac/prm03/rawdata/reference/genome" GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" @@ -40,9 +41,9 @@ STAR \ # - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ # files. -# THe compressed paired-end FastQ's that we are aligning. -R1="sample1_R1.fastq.gz" -R2="sample1_R2.fastq.gz" +# The compressed, paired-end, FastQ's after trimming (step 2). +R1="${PROJECT_DIRECTORY}/step2/sample1_R1_paired.fastq.gz" +R2="${PROJECT_DIRECTORY}/step2/sample1_R2_paired.fastq.gz" STAR \ --runThreadN 8 \ diff --git a/rnaseq/step4_multiqc/snippet.sh b/rnaseq/step4_multiqc/snippet.sh index e16b342..ceb2c25 100644 --- a/rnaseq/step4_multiqc/snippet.sh +++ b/rnaseq/step4_multiqc/snippet.sh @@ -1,5 +1,5 @@ #!/bin/bash module load multiqc -# assuming you have all your fastQC result files in the ./fastQCresults folder -multiqc ./fastQCresults +PROJECT_DIRECTORY="/groups/umcg-griac/tmp01/rawdata/$(whoami)/rnaseq" +multiqc "${PROJECT_DIRECTORY}" diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 0c42844..ee8c249 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -18,7 +18,7 @@ do.scale = FALSE # The analysis -# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. +# We use prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -37,8 +37,8 @@ norm.expr.data.pcs <- norm.expr.data %>% ) # Write summary of PCAs to files -pcs.summery <- summary(norm.expr.data.pcs) -pcs.summery$importance %>% +pcs.summary <- summary(norm.expr.data.pcs) +pcs.summary$importance %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("PC.name") %>% @@ -46,7 +46,7 @@ pcs.summery$importance %>% file.path(results.dir.pca, "importance.csv") ) -pcs.summery$x %>% +pcs.summary$x %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("ensembl.id") %>% @@ -54,7 +54,7 @@ pcs.summery$x %>% file.path(results.dir.pca, "values.csv") ) -pcs.summery$rotation %>% +pcs.summary$rotation %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("sample.id") %>% @@ -63,15 +63,15 @@ pcs.summery$rotation %>% ) data.frame( - rownames = names(pcs.summery$center), - center = pcs.summery$center, - scale = pcs.summery$scale + rownames = names(pcs.summary$center), + center = pcs.summary$center, + scale = pcs.summary$scale ) %>% readr::write_csv( file.path(results.dir.pca, "rest.csv") ) -# Not saved: pcs.summery$sdev, +# Not saved: pcs.summary$sdev, # Next thing to do: