Bio::SearchIO
gmap_f9
Toolbar
Summary
Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
# Do not use this object directly - it is used as part of the
# Bio::SearchIO system.
use Bio::SearchIO;
my $searchio = Bio::SearchIO->new(-format => 'gmap_f9',
-file => 't/data/her2.gmapz');
while( my $result = $searchio->next_result ) {
while( my $hit = $result->next_hit ) {
while( my $hsp = $hit->next_hsp ) {
# ...
}
}
}
Description
This object encapsulates the necessary methods for generating events
suitable for building Bio::Search objects from a GMAP "compressed"
report (from gmap run with the -Z flag). Read the
Bio::SearchIO documentation for more
information about how to use this.
I believe that I'm doing the correct thing when reporting hits on the
negative strand of the genome. In particular, I've compared the
"exons" this code generates with the set returned by ncbi's megablast
web service. NCBI's hsp's are ordered differently and have a
different genomic location (off by ~18,000,000 bases, padding?) but
the starts, ends, and lengths were similar and my strand handling
matches theirs. E.g.
CDNA GENOME
start end strand start end strand
blast
1913 2989 1 86236731 86237808 -1
1 475 1 86260509 86260983 -1
1510 1727 1 86240259 86240476 -1
841 989 1 86243034 86243182 -1
1381 1514 1 86240630 86240763 -1
989 1122 1 86242457 86242590 -1
599 729 1 86247470 86247600 -1
473 608 1 86259972 86260107 -1
1255 1382 1 86240837 86240964 -1
730 842 1 86244040 86244152 -1
1813 1921 1 86238123 86238231 -1
1725 1814 1 86239747 86239836 -1
1167 1256 1 86241294 86241383 -1
1120 1188 1 86242319 86242387 -1
gmap
1 475 1 104330509 104330983 -1
476 600 1 104329980 104330104 -1
601 729 1 104317470 104317598 -1
730 841 1 104314041 104314152 -1
842 989 1 104313034 104313181 -1
990 1121 1 104312458 104312589 -1
1122 1187 1 104312320 104312385 -1
1188 1256 1 104311294 104311362 -1
1257 1382 1 104310837 104310962 -1
1383 1511 1 104310633 104310761 -1
1512 1726 1 104310260 104310474 -1
1727 1814 1 104309747 104309834 -1
1815 1917 1 104308127 104308229 -1
1918 2989 1 104306731 104307802 -1
Methods
| next_result | Description | Code |
| _hsp_from_info | No description | Code |
| _parse_path_header | No description | Code |
| _parse_alignment_line | No description | Code |
Methods description
Title : next_result Usage : $result = $stream->next_result Function: Reads the next ResultI object from the stream and returns it. Returns : A Bio::Search::Result::ResultI object Args : n/a |
Methods code
# Title   : next_result
# Usage   : $result = $stream->next_result
# Function: Reads the next result from the stream.  A line starting with
#           '>' opens a result; every following line is parsed as one
#           aligned position.  Positions whose genomic coordinate differs
#           by at most 1 from the previous one are collected into the same
#           HSP; a larger jump closes the current HSP and starts a new one.
#           All HSPs are attached to a single hit.
# Returns : A Bio::Search::Result::GenericResult object, or undef if no
#           '>' header line was read.
# Args    : n/a
sub next_result {
    my $self = shift;

    my $result;
    my $hit;
    my @hsp_info;            # position records of the HSP being collected
    my $previous_hit_pos;    # genomic position of the last parsed line

    # A lexical line variable is used so the caller's $_ is not clobbered.
  LINE:
    while ( my $line = $self->_readline ) {
        if ( $line =~ /^>/ ) {
            if ($result) {
                # This header belongs to the following result: put it back
                # for the next call and finish the current result.
                $self->_pushback($line);
                last LINE;
            }

            my ( $id, $desc ) = $line =~ m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|;

            $result = Bio::Search::Result::GenericResult->new();
            $result->algorithm('gmap');
            $result->query_name($id);
            $result->query_accession($id);
            $result->query_description($desc);
            $hit ||= Bio::Search::Hit::GenericHit->new(
                -name => "NONE_SPECIFIED" );
            next LINE;
        }

        # One aligned position per line.  The pattern must stay multi-line:
        # under /x a '#' comment runs to the end of the line.
        # Note: with '|' as delimiter, '\|' reaches the regex engine as a
        # plain '|', i.e. alternation ('+' or '-', chromosome or 'MT').
        my $c = {};
        @{$c}{
            qw( query_aa_pos query_aa query_pos query_base
                hit_strand hit_chromo hit_pos hit_concat_pos
                hit_base hit_aa )
          }
          = $line =~ m|
                (\d+)[ ]?(.)?[\t]
                (\d+)[ ]?(.)?[\t]
                # strand, chromosome (digits, X, Y or MT), positions, base
                (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.)
                [\t]?(.)?
            |x;

        # A line that does not parse would otherwise be stored as a record
        # of undefs and corrupt the HSP it lands in; skip it instead.
        next LINE unless defined $c->{hit_pos};

        # A jump of more than one base in the genome closes the current HSP.
        if ( $previous_hit_pos
            && ( abs( $c->{hit_pos} - $previous_hit_pos ) > 1 ) )
        {
            $hit ||= Bio::Search::Hit::GenericHit->new(
                -name => "NONE_SPECIFIED",
            );
            $hit->add_hsp( $self->_hsp_from_info( \@hsp_info ) );
            @hsp_info = ();
        }
        push @hsp_info, $c;
        $previous_hit_pos = $c->{hit_pos};
    }

    if ($result) {
        # Flush the HSP that was still being collected.
        $hit->add_hsp( $self->_hsp_from_info( \@hsp_info ) ) if @hsp_info;

        my ( $hit_length, $query_length );
        for my $hsp ( $hit->hsps ) {
            $hit_length   += $hsp->length();
            $query_length += $hsp->length('query');
        }
        $hit->length($hit_length);
        $hit->query_length($query_length);

        # The hit is named after the chromosome of the last HSP's records.
        $hit->name( $hsp_info[0]->{hit_chromo} ) if @hsp_info;
        $result->add_hit($hit);
    }

    return ($result);
}
# Build one HSP from a run of per-position alignment records.
#
# Args    : $info - array ref of hash refs; the keys read here are
#           query_base, hit_base, query_pos and hit_pos.
# Returns : the object returned by Bio::Search::HSP::GenericHSP->new.
sub _hsp_from_info {
    my $self = shift;
    my $info = shift;

    my %hsp_args = ( -algorithm => 'GMAP' );

    # Start at 0 so an HSP without any identical position reports 0
    # rather than an undefined count.
    my $identical = 0;

    for my $c ( @{$info} ) {
        my $is_match = $c->{query_base} eq $c->{hit_base};

        $hsp_args{-query_seq}    .= $c->{query_base};
        $hsp_args{-hit_seq}      .= $c->{hit_base};
        $hsp_args{-homology_seq} .= $is_match ? $c->{hit_base} : ' ';
        $identical++ if $is_match;
    }

    # Blanks in the sequences become '-' gap characters.
    $hsp_args{-query_seq} =~ s/ /-/g;
    $hsp_args{-hit_seq}   =~ s/ /-/g;

    $hsp_args{-conserved} = $hsp_args{-identical} = $identical;
    $hsp_args{-stranded}  = 'both';

    # Coordinates come from the first and last record of the run.
    $hsp_args{-query_start} = $info->[0]->{query_pos};
    $hsp_args{-query_end}   = $info->[-1]->{query_pos};
    $hsp_args{-query_length} =
      $hsp_args{-query_end} - $hsp_args{-query_start} + 1;

    # abs(): the hit end may be smaller than the hit start.
    $hsp_args{-hit_start} = $info->[0]->{hit_pos};
    $hsp_args{-hit_end}   = $info->[-1]->{hit_pos};
    $hsp_args{-hit_length} =
      abs( $hsp_args{-hit_end} - $hsp_args{-hit_start} ) + 1;

    # The HSP length is the longer of the query and hit lengths.
    $hsp_args{-hsp_length} =
        $hsp_args{-query_length} > $hsp_args{-hit_length}
      ? $hsp_args{-query_length}
      : $hsp_args{-hit_length};

    return Bio::Search::HSP::GenericHSP->new(%hsp_args);
}
} |
# Parse a '>' path header line into its fields.
#
# Args    : $path_line - the header line to parse.  When omitted, $_ is
#           parsed instead (the only input the previous implementation
#           looked at).
# Returns : hash ref with the keys query, db, path_num, path_total_num,
#           query_length, exon_count, trimmed_coverage, percent_identity,
#           query_start, query_end, whole_genome_start, whole_genome_end,
#           chromosome, chromo_start, chromo_end, strand, sense and md5.
# Throws  : via $self->throw when any field other than md5 is missing.
sub _parse_path_header {
    my $self      = shift;
    my $path_line = shift;
    $path_line = $_ unless defined $path_line;

    my @fields = qw(
      query db path_num path_total_num query_length exon_count
      trimmed_coverage percent_identity query_start query_end
      whole_genome_start whole_genome_end chromosome chromo_start
      chromo_end strand sense md5
    );

    # The pattern must stay multi-line: under /x every '#' comment runs to
    # the end of its line.  A failed match leaves every field undefined.
    my $path = {};
    @{$path}{@fields} = $path_line =~ m{
        >
        ([^ ]*)[ ]              # the query id, followed by a space
        ([^ ]*)[ ]              # the genome database, followed by a space
        (\d+)/(\d+)[ ]          # path_num/path_total_num (e.g. 3/12)
        (\d+)[ ]                # query length, followed by a space
        (\d+)[ ]                # hsp/exon count, followed by a space
        (\d+\.\d*)[ ]           # trimmed coverage
        (\d+\.\d*)[ ]           # percent identity
        (\d+)\.\.(\d+)[ ]       # query start .. query end, then a space
        (\d+)\.\.(\d+)[ ]       # whole genome s..e, then a space
        ([\dxXyY]+|MT):         # chromosome: digits, X, Y or MT
        (\d+)\.\.(\d+)[ ]       # chromo s..e, followed by a space
        ([+-])[ ]               # strand, followed by a space
        dir:(.*)                # dir:sense or dir:antisense
        [ ]md5:([\dabcdefg]+)   # md5 signature
    }x;

    # md5 is deliberately not required.  Fields are tested for presence,
    # not truth, so a legitimate value of 0 is accepted.
    for my $field ( grep { $_ ne 'md5' } @fields ) {
        my $value = $path->{$field};
        $self->throw("$field was not found in path line.")
          unless defined $value && length $value;
    }

    return $path;
}
| _parse_alignment_line | description | prev | next | Top |
# Parse one tab-introduced alignment-block line into its fields.
#
# Args    : $a_line - the line to parse.  When omitted, $_ is parsed
#           instead (the only input the previous implementation looked at).
# Returns : hash ref with the keys chromo_start, chromo_end, query_start,
#           query_end, percent_identity, align_length and intron_length
#           (intron_length is undef when the line carries none).
# Throws  : via $self->throw when any field other than intron_length is
#           missing.
sub _parse_alignment_line {
    my $self   = shift;
    my $a_line = shift;
    $a_line = $_ unless defined $a_line;

    my @fields = qw(
      chromo_start chromo_end query_start query_end
      percent_identity align_length intron_length
    );

    # The pattern must stay multi-line: under /x every '#' comment runs to
    # the end of its line.  The edit script is skipped non-greedily; a
    # greedy '.*' would run up to the last tab and report the intron length
    # as the alignment length.
    # NOTE(review): assumes the edit script contains no tab followed by
    # digits - confirm against real GMAP output.
    my $align = {};
    @{$align}{@fields} = $a_line =~ m{
        [\t]
        ([\d]+)[ ]        # start in chromosome coord.
        ([\d]+)[ ]        # end in chromosome coord.
        ([\d]+)[ ]        # start in query coord.
        ([\d]+)[ ]        # end in query coord.
        ([\d]+)           # percent identity (as integer)
        [\t].*?[\t]       # skip the edit script
        ([\d]+)           # length of alignment block.
        [\t]*([\d]+)*     # length of following intron.
    }x;

    # Presence is tested with defined(), so a value of 0 (for example a
    # percent identity of 0) is not mistaken for a missing field.
    defined $align->{chromo_start}
      or $self->throw("chromo_start missing in alignment line.");
    for my $field (
        qw( chromo_end query_start query_end percent_identity align_length ))
    {
        defined $align->{$field}
          or $self->throw("$field was missing in alignment line.");
    }

    return $align;
}
1;} |
General documentation
User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list. Your participation is much appreciated.
bioperl-l@bioperl.org - General discussion
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via
the web:
https://redmine.open-bio.org/projects/bioperl/
| AUTHOR - George Hartzell | Top |
Additional contributors names and emails here
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with an underscore (_).