diff --git a/rnaseq/umi/07_remove_dups.pl b/rnaseq/umi/07_remove_dups.pl new file mode 100644 index 0000000..94f3acc --- /dev/null +++ b/rnaseq/umi/07_remove_dups.pl @@ -0,0 +1,60 @@ +#!/usr/bin/perl -w +use strict; +use Parallel::ForkManager; + +my @torun = (); +foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { + push @torun, $file; +} + +my $pm = Parallel::ForkManager->new( 12 ); +foreach my $file ( @torun ) { + my $sample = $file; + $sample =~ s/Aligned\.sortedByCoord\.out\.bam$//; + next if -s $sample.'_uniq.bam'; + warn "Parsing $sample\n"; + $pm->start and next; + my %seen = (); + my %duplicates = (); + my ( $uniqs, $dups ) = ( 0, 0 ); + open F, 'samtools view -h '.$file.' |'; + open F1, '| samtools view -bS - >'.$sample.'_uniq.bam'; + open F2, '| samtools view -bS - >'.$sample.'_dups.bam'; + while ( ) { + if ( m/^\@/ ) { + print F1; + print F2; + next; + } + my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; + next if $flag & 256 or $flag & 512 or $flag & 1024; +# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } + my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; + my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); + my $pos_ = $pos-1; + while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { + $pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N'; + } + my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc ); + if ( exists($duplicates{$id}) or # already marked as duplicate + ( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand + ( $flag&16 and ++$seen{ $uniq2 } > 1 ) + ) { + print F2; + $dups++; + $duplicates{$id} = 1; + } + else { + print F1; + $uniqs++; + } + } + close F; + close F1; + close F2; + warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n"; + system( 'samtools', 'index', $sample.'_uniq.bam' ); +# last; + $pm->finish; +} +$pm->wait_all_children; diff --git a/rnaseq/umi/snippet.pl b/rnaseq/umi/snippet.pl deleted file mode 100644 index e69de29..0000000