2 Commits

Author SHA1 Message Date
Hylke C. Donker
d71460a7dd Tailored storage locations to calculon. 2021-02-11 10:47:21 +01:00
Hylke C. Donker
ef4a25f54b Small snippet that generates a genome index and aligns the RNAseq reads. 2021-02-09 17:31:42 +01:00
3 changed files with 53 additions and 60 deletions

View File

@@ -0,0 +1,53 @@
#!/bin/bash
#
# Align reads against reference genome.
STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3"
# Store genome index in this location:.
GENOME_INDEX="${STORAGE}/genome_index"
mkdir -p "${GENOME_INDEX}"
# Store the generated `Aligned.sortedByCoord.out.bam` in this dir.
ALIGNMENT_OUTPUT="${STORAGE}/alignment"
mkdir -p "${OUTPUT}"
# 1) Generate genome index.
#
# N.B.:
# - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the
# previous quality control steps if you are unsure about the size. In case of
# reads of varying length, the ideal value is max(ReadLength)-1.
# - We're using gzip compressed reference data (--readFilesCommand zcat), i.e.,
# .gtf.gz and fa.gz. If not, you can remove the `zcat` flag.
# Storage location reference data (in this case on calculon).
REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome"
GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz"
FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
STAR \
--runThreadN 8 \
--runMode genomeGenerate \
--readFilesCommand zcat \
--sjdbOverhang 100 \
--genomeFastaFiles ${FASTA_FILE} \
--sjdbGTFfile ${GTF_FILE} \
--genomeDir ${GENOME_INDEX}
# 2) Do the actual alignment.
#
# N.B.:
# - We are assuming paired-end, gzip compressed (--readFilesCommand zcat) FastQ
# files.
# THe compressed paired-end FastQ's that we are aligning.
R1="sample1_R1.fastq.gz"
R2="sample1_R2.fastq.gz"
STAR \
--runThreadN 8 \
--readFilesCommand zcat \
--readFilesIn "${R1}" "${R2}" \
--outSAMtype BAM SortedByCoordinate \
--outFileNamePrefix "${ALIGNMENT_OUTPUT}"

View File

@@ -1,60 +0,0 @@
#!/usr/bin/perl -w
use strict;
use Parallel::ForkManager;
# this script creats one file with UMI unique reads and one with UMI duplicated reads
my @torun = ();
foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) {
push @torun, $file;
}
my $pm = Parallel::ForkManager->new( 12 );
foreach my $file ( @torun ) {
my $sample = $file;
$sample =~ s/Aligned\.sortedByCoord\.out\.bam$//;
next if -s $sample.'_uniq.bam';
warn "Parsing $sample\n";
$pm->start and next;
my %seen = ();
my %duplicates = ();
my ( $uniqs, $dups ) = ( 0, 0 );
open F, 'samtools view -h '.$file.' |';
open F1, '| samtools view -bS - >'.$sample.'_uniq.bam';
open F2, '| samtools view -bS - >'.$sample.'_dups.bam';
while ( <F> ) {
if ( m/^\@/ ) {
print F1;
print F2;
next;
}
my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/;
next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate
# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ }
my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode
my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc );
my $pos_ = $pos-1;
while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) {
$pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N';
}
my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc );
if ( exists($duplicates{$id}) or # already marked as duplicate
( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand
( $flag&16 and ++$seen{ $uniq2 } > 1 )
) {
print F2;
$dups++;
$duplicates{$id} = 1;
}
else {
print F1;
$uniqs++;
}
}
close F;
close F1;
close F2;
warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n";
system( 'samtools', 'index', $sample.'_uniq.bam' );
# last;
$pm->finish;
}
$pm->wait_all_children;

0
rnaseq/umi/snippet.pl Normal file
View File