This is a read-only IO module designed to convert Bowtie
(
http://bowtie-bio.sourceforge.net/) formatted alignments to
Bio::Assembly::Scaffold representations, containing
Bio::Assembly::Contig and
Bio::Assembly::Singlet objects.
It is a wrapper that converts the Bowtie format to BAM format taken
by the Bio::Assembly::IO::sam module which in turn uses lstein's
Bio::DB::Sam to parse binary formatted SAM (.bam) files guided by a
reference sequence fasta database.
Some information is lost in conversion from bowtie format to SAM/BAM format
that is provided by Bowtie using the SAM output option and the conversion
to SAM format from bowtie format is slower than using bowtie's SAM option.
If you plan to use SAM/BAM format it is preferable to use this Bowtie
option rather than convert the format after the fact.
See the Bio::Assembly::IO::sam documentation for relevant details.
sub _bowtie_to_sam
{ my ($self, $file, $refdb) = @_;
$self->throw("'$file' does not exist or is not readable.")
unless ( -e $file && -r _ );
if ($file =~ m/\.gz[^.]*$/) { $file = $self->_uncompress($file); $self->close;
open (my $fh,$file);
$self->file($file);
$self->_fh($fh);
}
my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$file);
$self->throw("'$file' is not a bowtie formatted file.") unless $guesser->guess =~ m/^bowtie$/;
my %SQ;
my $mapq = 255;
my $in_pair;
my @mate_line;
my $mlen;
my ($sam_tmp_h, $sam_tmp_f) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.sam' );
while (my $line = $self->_readline) {
chomp($line);
my ($qname,$strand,$rname,$pos,$seq,$qual,$m,$details)=split("\cI",$line);
$SQ{$rname} = 1;
my $paired_f = ($qname =~ m#/[12]#) ? 0x03 : 0; my $strand_f = ($strand eq '-') ? 0x10 : 0;
my $op_strand_f = ($strand eq '+' && $paired_f) ? 0x20 : 0;
my $first_f = ($qname =~ m#/1#) ? 0x40 : 0; my $second_f = ($qname =~ m#/2#) ? 0x80 : 0; my $flag = $paired_f | $strand_f | $op_strand_f | $first_f | $second_f;
$pos++;
my $len = length $seq;
die unless $len == length $qual;
my $cigar = $len.'M';
my @detail = split(',',$details);
my $dist = 'NM:i:'.scalar @detail;
my @mismatch;
my $last_pos = 0;
for (@detail) {
m/(\d+):(\w)>\w/; my $err = ($1-$last_pos);
$last_pos = $1+1;
push @mismatch,($err,$2);
}
push @mismatch, $len-$last_pos;
@mismatch = reverse @mismatch if $strand eq '-';
my $mismatch = join('',('MD:Z:',@mismatch));
if ($paired_f) {
my $mrnm = '=';
if ($in_pair) {
my $mpos = $mate_line[3];
$mate_line[7] = $pos;
my $isize = $mpos-$pos-$len;
$mate_line[8] = -$isize;
print $sam_tmp_h join("\t",@mate_line),"\n";
print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
$in_pair = 0;
} else {
$mlen = $len;
@mate_line = ($qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, undef, undef, $seq, $qual, $mismatch, $dist);
$in_pair = 1;
}
} else {
my $mrnm = '*';
my $mpos = 0;
my $isize = 0;
print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
}
}
$sam_tmp_h->close;
return $sam_tmp_f if $self->{'_no_head'};
my ($samh, $samf) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.sam' );
print $samh $HD;
unless ($self->{'_no_sq'}) {
my $db = Bio::SeqIO->new( -file => $refdb, -format => 'fasta' );
while ( my $seq = $db->next_seq() ) {
$SQ{$seq->id} = $seq->length if $SQ{$seq->id};
}
map { print $samh join("\t", ('@SQ', "SN:$_", "LN:$SQ{$_}")), "\n" } keys %SQ;
}
print $samh $PG;
open($sam_tmp_h, $sam_tmp_f) or
$self->throw("Can not open '$sam_tmp_f' for reading: $!");
print $samh $_ while (<$sam_tmp_h>);
close($sam_tmp_h);
$samh->close;
return $samf;} |
sub _make_bam
{ my ($self, $file) = @_;
$self->throw("'$file' does not exist or is not readable")
unless ( -e $file && -r _ );
my ($bamh, $bamf) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.bam' );
my ($srth, $srtf) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.srt' );
$_->close for ($bamh, $srth);
my $samt = Bio::Tools::Run::Samtools->new( -command => 'view',
-sam_input => 1,
-bam_output => 1 );
$samt->run( -bam => $file, -out => $bamf );
$samt = Bio::Tools::Run::Samtools->new( -command => 'sort' );
$samt->run( -bam => $bamf, -pfx => $srtf);
return $srtf.'.bam'
}
1;} |
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _