system_genetics/rnaseq/umi/07_remove_dups.pl

#!/usr/bin/perl -w
use strict;
use Parallel::ForkManager;
# this script creats one file with UMI unique reads and one with UMI duplicated reads
my @torun = ();
foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) {
    push @torun, $file;
}

my $pm = Parallel::ForkManager->new( 12 );
foreach my $file ( @torun ) {
    my $sample = $file;
    $sample =~ s/Aligned\.sortedByCoord\.out\.bam$//;
    next if -s $sample.'_uniq.bam';
    warn "Parsing $sample\n";
    $pm->start and next;
    my %seen = ();
    my %duplicates = ();
    my ( $uniqs, $dups ) = ( 0, 0 );
    open F, 'samtools view -h '.$file.' |';
    open F1, '| samtools view -bS - >'.$sample.'_uniq.bam';
    open F2, '| samtools view -bS - >'.$sample.'_dups.bam';
    while ( <F> ) {
        if ( m/^\@/ ) {
            print F1;
            print F2;
            next;
        }
        my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/;
        next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate
#        foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ }
        my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode
        my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc );
        my $pos_ = $pos-1;
        while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) {
            $pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N';
        }
        my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc );
        if ( exists($duplicates{$id}) or # already marked as duplicate
            ( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand
            ( $flag&16 and ++$seen{ $uniq2 } > 1 )
            ) {
            print F2;
            $dups++;
            $duplicates{$id} = 1;
        }
        else {
            print F1;
            $uniqs++;
        }
    }
    close F;
    close F1;
    close F2;
    warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n";
    system( 'samtools', 'index', $sample.'_uniq.bam' );
#    last;
    $pm->finish;
}
$pm->wait_all_children;