#!/usr/bin/perl -w use strict; use Parallel::ForkManager; # this script creats one file with UMI unique reads and one with UMI duplicated reads # as input you need aligned sorted by coordinate bam file my @torun = (); foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { push @torun, $file; } my $pm = Parallel::ForkManager->new( 12 ); foreach my $file ( @torun ) { my $sample = $file; $sample =~ s/Aligned\.sortedByCoord\.out\.bam$//; next if -s $sample.'_uniq.bam'; warn "Parsing $sample\n"; $pm->start and next; my %seen = (); my %duplicates = (); my ( $uniqs, $dups ) = ( 0, 0 ); open F, 'samtools view -h '.$file.' |'; open F1, '| samtools view -bS - >'.$sample.'_uniq.bam'; open F2, '| samtools view -bS - >'.$sample.'_dups.bam'; while ( ) { if ( m/^\@/ ) { print F1; print F2; next; } my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate # foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); my $pos_ = $pos-1; while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { $pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N'; # find position for minus strand } my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc ); if ( exists($duplicates{$id}) or # already marked as duplicate ( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand ( $flag&16 and ++$seen{ $uniq2 } > 1 ) #minus strand ) { print F2; $dups++; $duplicates{$id} = 1; } else { print F1; $uniqs++; } } close F; close F1; close F2; warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n"; system( 'samtools', 'index', $sample.'_uniq.bam' ); # last; $pm->finish; } $pm->wait_all_children;