system_genetics/rnaseq/umi/07_remove_dups.pl

61 lines
2.1 KiB
Perl

#!/usr/bin/perl -w
use strict;
use Parallel::ForkManager;
# this script creats one file with UMI unique reads and one with UMI duplicated reads
my @torun = ();
foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) {
push @torun, $file;
}
my $pm = Parallel::ForkManager->new( 12 );
foreach my $file ( @torun ) {
my $sample = $file;
$sample =~ s/Aligned\.sortedByCoord\.out\.bam$//;
next if -s $sample.'_uniq.bam';
warn "Parsing $sample\n";
$pm->start and next;
my %seen = ();
my %duplicates = ();
my ( $uniqs, $dups ) = ( 0, 0 );
open F, 'samtools view -h '.$file.' |';
open F1, '| samtools view -bS - >'.$sample.'_uniq.bam';
open F2, '| samtools view -bS - >'.$sample.'_dups.bam';
while ( <F> ) {
if ( m/^\@/ ) {
print F1;
print F2;
next;
}
my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/;
next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate
# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ }
my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode
my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc );
my $pos_ = $pos-1;
while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) {
$pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N';
}
my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc );
if ( exists($duplicates{$id}) or # already marked as duplicate
( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand
( $flag&16 and ++$seen{ $uniq2 } > 1 )
) {
print F2;
$dups++;
$duplicates{$id} = 1;
}
else {
print F1;
$uniqs++;
}
}
close F;
close F1;
close F2;
warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n";
system( 'samtools', 'index', $sample.'_uniq.bam' );
# last;
$pm->finish;
}
$pm->wait_all_children;