forked from GRIAC/system_genetics
61 lines
2.1 KiB
Perl
61 lines
2.1 KiB
Perl
#!/usr/bin/perl -w
|
|
use strict;
|
|
use Parallel::ForkManager;
|
|
# this script creats one file with UMI unique reads and one with UMI duplicated reads
|
|
my @torun = ();
|
|
foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) {
|
|
push @torun, $file;
|
|
}
|
|
|
|
my $pm = Parallel::ForkManager->new( 12 );
|
|
foreach my $file ( @torun ) {
|
|
my $sample = $file;
|
|
$sample =~ s/Aligned\.sortedByCoord\.out\.bam$//;
|
|
next if -s $sample.'_uniq.bam';
|
|
warn "Parsing $sample\n";
|
|
$pm->start and next;
|
|
my %seen = ();
|
|
my %duplicates = ();
|
|
my ( $uniqs, $dups ) = ( 0, 0 );
|
|
open F, 'samtools view -h '.$file.' |';
|
|
open F1, '| samtools view -bS - >'.$sample.'_uniq.bam';
|
|
open F2, '| samtools view -bS - >'.$sample.'_dups.bam';
|
|
while ( <F> ) {
|
|
if ( m/^\@/ ) {
|
|
print F1;
|
|
print F2;
|
|
next;
|
|
}
|
|
my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/;
|
|
next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate
|
|
# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ }
|
|
my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode
|
|
my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc );
|
|
my $pos_ = $pos-1;
|
|
while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) {
|
|
$pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N';
|
|
}
|
|
my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc );
|
|
if ( exists($duplicates{$id}) or # already marked as duplicate
|
|
( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand
|
|
( $flag&16 and ++$seen{ $uniq2 } > 1 )
|
|
) {
|
|
print F2;
|
|
$dups++;
|
|
$duplicates{$id} = 1;
|
|
}
|
|
else {
|
|
print F1;
|
|
$uniqs++;
|
|
}
|
|
}
|
|
close F;
|
|
close F1;
|
|
close F2;
|
|
warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n";
|
|
system( 'samtools', 'index', $sample.'_uniq.bam' );
|
|
# last;
|
|
$pm->finish;
|
|
}
|
|
$pm->wait_all_children;
|