From a7651f6a21e27867e9c36cd03d4439a22ace3fae Mon Sep 17 00:00:00 2001 From: TatiKarp Date: Fri, 12 Feb 2021 13:52:56 +0100 Subject: [PATCH 1/2] UMI deduplication script from Victor --- rnaseq/umi/07_remove_dups.pl | 60 ++++++++++++++++++++++++++++++++++++ rnaseq/umi/snippet.pl | 0 2 files changed, 60 insertions(+) create mode 100644 rnaseq/umi/07_remove_dups.pl delete mode 100644 rnaseq/umi/snippet.pl diff --git a/rnaseq/umi/07_remove_dups.pl b/rnaseq/umi/07_remove_dups.pl new file mode 100644 index 0000000..94f3acc --- /dev/null +++ b/rnaseq/umi/07_remove_dups.pl @@ -0,0 +1,60 @@ +#!/usr/bin/perl -w +use strict; +use Parallel::ForkManager; + +my @torun = (); +foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { + push @torun, $file; +} + +my $pm = Parallel::ForkManager->new( 12 ); +foreach my $file ( @torun ) { + my $sample = $file; + $sample =~ s/Aligned\.sortedByCoord\.out\.bam$//; + next if -s $sample.'_uniq.bam'; + warn "Parsing $sample\n"; + $pm->start and next; + my %seen = (); + my %duplicates = (); + my ( $uniqs, $dups ) = ( 0, 0 ); + open F, 'samtools view -h '.$file.' |'; + open F1, '| samtools view -bS - >'.$sample.'_uniq.bam'; + open F2, '| samtools view -bS - >'.$sample.'_dups.bam'; + while ( ) { + if ( m/^\@/ ) { + print F1; + print F2; + next; + } + my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; + next if $flag & 256 or $flag & 512 or $flag & 1024; +# foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } + my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; + my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); + my $pos_ = $pos-1; + while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { + $pos_+=$1 if $2 eq 'M' or $2 eq '=' or $2 eq 'D' or $2 eq 'N'; + } + my $uniq2 = join( ':', $chr, $pos_, $flag, $tlen, $bc ); + if ( exists($duplicates{$id}) or # already marked as duplicate + ( not($flag&16) and ++$seen{ $uniq } > 1 ) or # plus strand + ( $flag&16 and ++$seen{ $uniq2 } > 1 ) + ) { + print F2; + $dups++; + $duplicates{$id} = 1; + } + else { + print F1; + $uniqs++; + } + } + close F; + close F1; + close F2; + warn "Unique: $uniqs (", 100*$uniqs/($uniqs+$dups), ")\n"; + system( 'samtools', 'index', $sample.'_uniq.bam' ); +# last; + $pm->finish; +} +$pm->wait_all_children; diff --git a/rnaseq/umi/snippet.pl b/rnaseq/umi/snippet.pl deleted file mode 100644 index e69de29..0000000 -- 2.40.1 From c05d3f78681795dddcfeaccd3a5dd2093ba17610 Mon Sep 17 00:00:00 2001 From: TatiKarp Date: Mon, 15 Feb 2021 17:18:48 +0100 Subject: [PATCH 2/2] Added description and some comments for UMi deduplication --- rnaseq/umi/07_remove_dups.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rnaseq/umi/07_remove_dups.pl b/rnaseq/umi/07_remove_dups.pl index 94f3acc..2aa2859 100644 --- a/rnaseq/umi/07_remove_dups.pl +++ b/rnaseq/umi/07_remove_dups.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl -w use strict; use Parallel::ForkManager; - +# this script creats one file with UMI unique reads and one with UMI duplicated reads my @torun = (); foreach my $file ( <*Aligned.sortedByCoord.out.bam> ) { push @torun, $file; @@ -27,9 +27,9 @@ foreach my $file ( @torun ) { next; } my ( $id, $flag, $chr, $pos, $mapq, $cigar, $chr2, $pos2, $tlen ) = split /\t/; - next if $flag & 256 or $flag & 512 or $flag & 1024; + next if $flag & 256 or $flag & 512 or $flag & 1024; #skip if the read is not primary alignment/read fails platform/vendor quality checks/read is PCR or optical duplicate # foreach ( 256, 512, 1024 ) { $flag-=$_ if $flag&$_ } - my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; + my ( $bc ) = $id =~ m/\:([GATCN\d]+)$/; #extract UMI barcode my $uniq = join( ':', $chr, $pos, $flag, $tlen, $bc ); my $pos_ = $pos-1; while ( $cigar =~ m/(\d+)([SHMDIN=])/g ) { -- 2.40.1