2021-02-12 13:52:56 +01:00
#!/usr/bin/perl -w
use strict ;
use Parallel::ForkManager ;
2021-02-15 17:18:48 +01:00
# this script creats one file with UMI unique reads and one with UMI duplicated reads
2021-02-23 15:18:35 +01:00
# as input you need aligned sorted by coordinate bam file
2021-02-12 13:52:56 +01:00
my @ torun = ( ) ;
foreach my $ file ( <*Aligned.sortedByCoord.out.bam> ) {
push @ torun , $ file ;
}
my $ pm = Parallel::ForkManager - > new ( 12 ) ;
foreach my $ file ( @ torun ) {
my $ sample = $ file ;
$ sample =~ s/Aligned\.sortedByCoord\.out\.bam$// ;
next if - s $ sample . '_uniq.bam' ;
warn "Parsing $sample\n" ;
$ pm - > start and next ;
my % seen = ( ) ;
my % duplicates = ( ) ;
my ( $ uniqs , $ dups ) = ( 0 , 0 ) ;
open F , 'samtools view -h ' . $ file . ' |' ;
open F1 , '| samtools view -bS - >' . $ sample . '_uniq.bam' ;
open F2 , '| samtools view -bS - >' . $ sample . '_dups.bam' ;
while ( <F> ) {
if ( m/^\@/ ) {
print F1 ;
print F2 ;
next ;
}
my ( $ id , $ flag , $ chr , $ pos , $ mapq , $ cigar , $ chr2 , $ pos2 , $ tlen ) = split /\t/ ;
2021-02-15 17:18:48 +01:00
next if $ flag & 256 or $ flag & 512 or $ flag & 1024 ; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate
2021-02-12 13:52:56 +01:00
# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ }
2021-02-15 17:18:48 +01:00
my ( $ bc ) = $ id =~ m/\:([GATCN\d]+)$/ ; #extract UMI barcode
2021-02-12 13:52:56 +01:00
my $ uniq = join ( ':' , $ chr , $ pos , $ flag , $ tlen , $ bc ) ;
my $ pos_ = $ pos - 1 ;
while ( $ cigar =~ m/(\d+)([SHMDIN=])/g ) {
2021-02-23 15:18:35 +01:00
$ pos_ += $ 1 if $ 2 eq 'M' or $ 2 eq '=' or $ 2 eq 'D' or $ 2 eq 'N' ; # find position for minus strand
2021-02-12 13:52:56 +01:00
}
my $ uniq2 = join ( ':' , $ chr , $ pos_ , $ flag , $ tlen , $ bc ) ;
if ( exists ( $ duplicates { $ id } ) or # already marked as duplicate
( not ( $ flag & 16 ) and + + $ seen { $ uniq } > 1 ) or # plus strand
2021-02-23 15:18:35 +01:00
( $ flag & 16 and + + $ seen { $ uniq2 } > 1 ) #minus strand
2021-02-12 13:52:56 +01:00
) {
print F2 ;
$ dups + + ;
$ duplicates { $ id } = 1 ;
}
else {
print F1 ;
$ uniqs + + ;
}
}
close F ;
close F1 ;
close F2 ;
warn "Unique: $uniqs (" , 100 * $ uniqs / ( $ uniqs + $ dups ) , ")\n" ;
system ( 'samtools' , 'index' , $ sample . '_uniq.bam' ) ;
# last;
$ pm - > finish ;
}
$ pm - > wait_all_children ;