This object implements a parser for HMMER output.
BEGIN {
%MODEMAP = (
'HMMER_Output' => 'result',
'Hit' => 'hit',
'Hsp' => 'hsp'
);
%MAPPING = (
'Hsp_bit-score' => 'HSP-bits',
'Hsp_score' => 'HSP-score',
'Hsp_evalue' => 'HSP-evalue',
'Hsp_query-from' => 'HSP-query_start',
'Hsp_query-to' => 'HSP-query_end',
'Hsp_hit-from' => 'HSP-hit_start',
'Hsp_hit-to' => 'HSP-hit_end',
'Hsp_positive' => 'HSP-conserved',
'Hsp_identity' => 'HSP-identical',
'Hsp_gaps' => 'HSP-hsp_gaps',
'Hsp_hitgaps' => 'HSP-hit_gaps',
'Hsp_querygaps' => 'HSP-query_gaps',
'Hsp_qseq' => 'HSP-query_seq',
'Hsp_hseq' => 'HSP-hit_seq',
'Hsp_midline' => 'HSP-homology_seq',
'Hsp_align-len' => 'HSP-hsp_length',
'Hsp_query-frame' => 'HSP-query_frame',
'Hsp_hit-frame' => 'HSP-hit_frame',
'Hit_id' => 'HIT-name',
'Hit_len' => 'HIT-length',
'Hit_accession' => 'HIT-accession',
'Hit_desc' => 'HIT-description',
'Hit_signif' => 'HIT-significance',
'Hit_score' => 'HIT-score',
'HMMER_program' => 'RESULT-algorithm_name',
'HMMER_version' => 'RESULT-algorithm_version',
'HMMER_query-def' => 'RESULT-query_name',
'HMMER_query-len' => 'RESULT-query_length',
'HMMER_query-acc' => 'RESULT-query_accession',
'HMMER_querydesc' => 'RESULT-query_description',
'HMMER_hmm' => 'RESULT-hmm_name',
'HMMER_seqfile' => 'RESULT-sequence_file',
'HMMER_db' => 'RESULT-database_name',
);} |
sub next_result
{ my ($self) = @_;
my $seentop = 0;
my $reporttype;
my ( $last, @hitinfo, @hspinfo, %hspinfo, %hitinfo );
local $/ = "\n";
local $_;
my $verbose = $self->verbose; $self->start_document();
local ($_);
while ( defined( $_ = $self->_readline ) ) {
my $lineorig = $_;
chomp;
if (/^HMMER\s+(\S+)\s+\((.+)\)/o) {
my ( $prog, $version ) = split;
if ($seentop) {
$self->_pushback($_);
$self->end_element( { 'Name' => 'HMMER_Output' } );
return $self->end_document();
}
$self->{'_hmmidline'} = $_;
$self->start_element( { 'Name' => 'HMMER_Output' } );
$self->{'_result_count'}++;
$seentop = 1;
if ( defined $last ) {
($reporttype) = split( /\s+/, $last );
$self->element(
{
'Name' => 'HMMER_program',
'Data' => uc($reporttype)
}
);
}
$self->element(
{
'Name' => 'HMMER_version',
'Data' => $version
}
);
}
elsif (s/^HMM file:\s+//o) {
$self->{'_hmmfileline'} = $lineorig;
$self->element(
{
'Name' => 'HMMER_hmm',
'Data' => $_
}
);
}
elsif (s/^Sequence\s+(file|database):\s+//o) {
$self->{'_hmmseqline'} = $lineorig;
if ( $1 eq 'database' ) {
$self->element(
{
'Name' => 'HMMER_db',
'Data' => $_
}
);
}
$self->element(
{
'Name' => 'HMMER_seqfile',
'Data' => $_
}
);
}
elsif (s/^Query(\s+(sequence|HMM))?(?:\s+\d+)?:\s+//o) {
if ( !$seentop ) {
$self->_pushback( $self->{'_hmmidline'} );
$self->_pushback( $self->{'_hmmfileline'} );
$self->_pushback( $self->{'_hmmseqline'} );
$self->_pushback($lineorig);
next;
}
s/\s+$//;
$self->element(
{
'Name' => 'HMMER_query-def',
'Data' => $_
}
);
}
elsif (s/^Accession:\s+//o) {
s/\s+$//;
$self->element(
{
'Name' => 'HMMER_query-acc',
'Data' => $_
}
);
}
elsif (s/^Description:\s+//o) {
s/\s+$//;
$self->element(
{
'Name' => 'HMMER_querydesc',
'Data' => $_
}
);
}
elsif ( defined $self->{'_reporttype'}
&& $self->{'_reporttype'} eq 'HMMSEARCH' )
{
if (/^Scores for complete sequences/o) {
while ( defined( $_ = $self->_readline ) ) {
last if (/^\s+$/);
next if ( /^Sequence\s+Description/o || /^\-\-\-/o );
my @line = split;
my ( $name, $n, $evalue, $score ) =
( shift @line, pop @line, pop @line, pop @line );
my $desc = join( ' ', @line );
push @hitinfo, [ $name, $desc, $evalue, $score ];
$hitinfo{$name} = $#hitinfo;
}
}
elsif (/^Parsed for domains:/o) {
@hspinfo = ();
while ( defined( $_ = $self->_readline ) ) {
last if (/^\s+$/);
if (m!^//!) {
$self->_pushback($_);
last;
}
next if ( /^(Model|Sequence)\s+Domain/ || /^\-\-\-/ );
chomp;
if (
my ( $n, $domainnum, $domainct, @vals ) = (
m!^(\S+)\s+ # host name (\d+)/(\d+)\s+ # num/num (ie 1 of 2) (\d+)\s+(\d+).+? # sequence start and end (\d+)\s+(\d+)\s+ # hmm start and end \S+\s+ # [] (\S+)\s+ # score (\S+) # evalue \s*$!ox
)
)
{
my $info = $hitinfo[ $hitinfo{$n} ];
if ( !defined $info ) {
$self->warn(
"Incomplete Sequence information, can't find $n hitinfo says $hitinfo{$n}"
);
next;
}
push @hspinfo, [ $n, @vals ];
}
}
}
elsif (/^Alignments of top/o) {
my ( $prelength, $lastdomain, $count, $width );
$count = 0;
my %domaincounter;
my $second_tier = 0;
while ( defined( $_ = $self->_readline ) ) {
next if ( /^Align/o
|| /^\s+RF\s+[x\s]+$/o );
if ( /^Histogram/o || m!^//!o ) {
if ( $self->in_element('hsp') ) {
$self->end_element( { 'Name' => 'Hsp' } );
}
if ( $self->within_element('hit') ) {
$self->end_element( { 'Name' => 'Hit' } );
}
last;
}
chomp;
if (
m/^\s*(.+):\s+domain\s+(\d+)\s+of\s+(\d+)\,\s+ from\s+(\d+)\s+to\s+(\d+)/x
)
{
my ( $name, $domainct, $domaintotal, $from, $to ) =
( $1, $2, $3, $4, $5 );
$domaincounter{$name}++;
if ( $self->within_element('hit') ) {
if ( $self->within_element('hsp') ) {
$self->end_element( { 'Name' => 'Hsp' } );
}
$self->end_element( { 'Name' => 'Hit' } );
}
$self->start_element( { 'Name' => 'Hit' } );
my $info = [
@{
$hitinfo[ $hitinfo{$name} ] || $self->throw(
"Could not find hit info for $name: Insure that your database contains only unique sequence names"
)
}
];
if ( $info->[0] ne $name ) {
$self->throw(
"Somehow the Model table order does not match the order in the domains (got "
. $info->[0]
. ", expected $name)" );
}
$self->element(
{
'Name' => 'Hit_id',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_desc',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_signif',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_score',
'Data' => shift @{$info}
}
);
$self->start_element( { 'Name' => 'Hsp' } );
$self->element(
{
'Name' => 'Hsp_identity',
'Data' => 0
}
);
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => 0
}
);
my $HSPinfo = shift @hspinfo;
my $id = shift @$HSPinfo;
if ( $id ne $name ) {
$self->throw(
"Somehow the domain list details do not match the table (got $id, expected $name)"
);
}
if ( $domaincounter{$name} == $domaintotal ) {
$hitinfo[ $hitinfo{$name} ] = undef;
}
$self->element(
{
'Name' => 'Hsp_hit-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_query-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_query-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_score',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => shift @$HSPinfo
}
);
$lastdomain = $name;
}
else {
if (/^(\s+\*\-\>)(\S+)/o) { $prelength = CORE::length($1);
$width = 0;
$self->element(
{
'Name' => 'Hsp_qseq',
'Data' => $2
}
);
$count = 0;
$second_tier = 0;
}
elsif (/^(\s+)(\S+)\<\-\*\s*$/o) { $self->element(
{
'Name' => 'Hsp_qseq',
'Data' => $2
}
);
$width = CORE::length($2);
$count = 0;
}
elsif (( $count != 1 && /^\s+$/o )
|| CORE::length($_) == 0
|| /^\s+\-?\*\s*$/ )
{
next;
}
elsif ( $count == 0 ) {
$prelength -= 3 unless ( $second_tier++ );
unless ( defined $prelength ) {
next;
}
$self->element(
{
'Name' => 'Hsp_qseq',
'Data' => substr( $_, $prelength )
}
);
}
elsif ( $count == 1 ) {
if ( !defined $prelength ) {
$self->warn("prelength not set");
}
if ($width) {
$self->element(
{
'Name' => 'Hsp_midline',
'Data' =>
substr( $_, $prelength, $width )
}
);
}
else {
$self->debug("midline is $_\n")
if ( $verbose > 0
&& CORE::length($_) <= $prelength );
$self->element(
{
'Name' => 'Hsp_midline',
'Data' => substr( $_, $prelength )
}
);
}
}
elsif ( $count == 2 ) {
if (/^\s+(\S+)\s+(\d+|\-)\s+(\S*)\s+(\d+|\-)/o) {
$self->element(
{
'Name' => 'Hsp_hseq',
'Data' => $3
}
);
}
else {
$self->warn("unrecognized line: $_\n");
}
}
$count = 0 if $count++ >= 2;
}
}
}
elsif ( /^Histogram/o || m!^//!o ) {
while ( my $HSPinfo = shift @hspinfo ) {
my $id = shift @$HSPinfo;
my $info = [ @{ $hitinfo[ $hitinfo{$id} ] } ];
next unless defined $info;
$self->start_element( { 'Name' => 'Hit' } );
$self->element(
{
'Name' => 'Hit_id',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_desc',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_signif',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_score',
'Data' => shift @{$info}
}
);
$self->start_element( { 'Name' => 'Hsp' } );
$self->element(
{
'Name' => 'Hsp_query-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_query-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_score',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_identity',
'Data' => 0
}
);
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => 0
}
);
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => 0
}
);
$self->end_element( { 'Name' => 'Hsp' } );
$self->end_element( { 'Name' => 'Hit' } );
}
@hitinfo = ();
%hitinfo = ();
last;
}
}
elsif ( defined $self->{'_reporttype'}
&& $self->{'_reporttype'} eq 'HMMPFAM' )
{
if (/^Scores for sequence family/o) {
while ( defined( $_ = $self->_readline ) ) {
last if (/^\s+$/);
next if ( /^Model\s+Description/o || /^\-\-\-/o );
chomp;
my @line = split;
my ( $model, $n, $evalue, $score ) =
( shift @line, pop @line, pop @line, pop @line );
my $desc = join( ' ', @line );
push @hitinfo, [ $model, $desc, $score, $evalue, $n ];
$hitinfo{$model} = $#hitinfo;
}
}
elsif (/^Parsed for domains:/o) {
@hspinfo = ();
while ( defined( $_ = $self->_readline ) ) {
last if (/^\s+$/);
if (m!^//!) {
$self->_pushback($_);
last;
}
next if ( /^Model\s+Domain/o || /^\-\-\-/o );
chomp;
if (
my ( $n, $domainnum, $domainct, @vals ) = (
m!^(\S+)\s+ # domain name (\d+)/(\d+)\s+ # domain num out of num (\d+)\s+(\d+).+? # seq start, end (\d+)\s+(\d+)\s+ # hmm start, end \S+\s+ # [] (\S+)\s+ # score (\S+) # evalue \s*$!ox
)
)
{
my $hindex = $hitinfo{$n};
if ( !defined $hindex ) {
push @hitinfo,
[ $n, '', $vals[5], $vals[6], $domainct ];
$hitinfo{$n} = $#hitinfo;
$hindex = $#hitinfo;
}
my $info = $hitinfo[$hindex];
if ( !defined $info ) {
$self->warn(
"incomplete Domain information, can't find $n hitinfo says $hitinfo{$n}"
);
next;
}
push @hspinfo, [ $n, @vals ];
}
}
}
elsif (/^Alignments of top/o) {
my ( $prelength, $lastdomain, $count, $width );
$count = 0;
my $second_tier = 0;
while ( defined( $_ = $self->_readline ) ) {
next
if (
/^Align/o
|| ( $count != 1
&& /^\s+RF\s+[x\s]+$/o )
);
$self->debug("$count $_") if $verbose > 0;
if ( /^Histogram/o || m!^//!o || /^Query sequence/o ) {
if ( $self->in_element('hsp') ) {
$self->end_element( { 'Name' => 'Hsp' } );
}
if ( $self->in_element('hit') ) {
$self->end_element( { 'Name' => 'Hit' } );
}
$self->_pushback($_);
last;
}
chomp;
if (m/(\S+):.*from\s+(\d+)\s+to\s+(\d+)/o) {
my ( $name, $from, $to ) = ( $1, $2, $3 );
if ( $self->within_element('hit') ) {
if ( $self->in_element('hsp') ) {
$self->end_element( { 'Name' => 'Hsp' } );
}
$self->end_element( { 'Name' => 'Hit' } );
}
my $info = [ @{ $hitinfo[ $hitinfo{$name} ] } ];
if ( !defined $info
|| $info->[0] ne $name )
{
$self->warn(
"Somehow the Model table order does not match the order in the domains (got "
. $info->[0]
. ", expected $name). We're back loading this from the alignment information instead"
);
$info = [
$name, '',
/score\s+([^,\s]+),\s+E\s+=\s+(\S+)/ox
];
push @hitinfo, $info;
$hitinfo{$name} = $#hitinfo;
}
$self->start_element( { 'Name' => 'Hit' } );
$self->element(
{
'Name' => 'Hit_id',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_desc',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_score',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_signif',
'Data' => shift @{$info}
}
);
$self->start_element( { 'Name' => 'Hsp' } );
$self->element(
{
'Name' => 'Hsp_identity',
'Data' => 0
}
);
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => 0
}
);
my $HSPinfo = shift @hspinfo;
my $id = shift @$HSPinfo;
if ( $id ne $name ) {
$self->throw(
"Somehow the domain list details do not match the table (got $id, expected $name)"
);
}
$self->element(
{
'Name' => 'Hsp_query-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_query-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_score',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => shift @$HSPinfo
}
);
$lastdomain = $name;
}
else {
if (/^(\s+\*\-\>)(\S+)/o) {
$prelength = CORE::length($1);
$width = 0;
$self->element(
{
'Name' => 'Hsp_hseq',
'Data' => $2
}
);
$count = 0;
$second_tier = 0;
}
elsif (/^(\s+)(\S+)\<\-?\*?\s*$/o) {
$prelength -= 3 unless ( $second_tier++ );
$self->element(
{
'Name' => 'Hsp_hseq',
'Data' => $2
}
);
$width = CORE::length($2);
$count = 0;
}
elsif (CORE::length($_) == 0
|| ( $count != 1 && /^\s+$/o )
|| /^\s+\-?\*\s*$/ )
{
next;
}
elsif ( $count == 0 ) {
$prelength -= 3 unless ( $second_tier++ );
unless ( defined $prelength ) {
next;
}
$self->element(
{
'Name' => 'Hsp_hseq',
'Data' => substr( $_, $prelength )
}
);
}
elsif ( $count == 1 ) {
if ( !defined $prelength ) {
$self->warn("prelength not set");
}
if ($width) {
$self->element(
{
'Name' => 'Hsp_midline',
'Data' =>
substr( $_, $prelength, $width )
}
);
}
else {
$self->element(
{
'Name' => 'Hsp_midline',
'Data' => substr( $_, $prelength )
}
);
}
}
elsif ( $count == 2 ) {
if ( /^\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)/o
|| /^\s+(\S+)\s+(\-)\s+(\S*)\s+(\-)/o )
{
$self->element(
{
'Name' => 'Hsp_qseq',
'Data' => $3
}
);
}
else {
$self->throw(
"unrecognized line ($count): $_\n");
}
}
$count = 0 if $count++ >= 2;
}
}
}
elsif ( /^Histogram/o || m!^//!o ) {
while ( my $HSPinfo = shift @hspinfo ) {
my $id = shift @$HSPinfo;
my $info = [ @{ $hitinfo[ $hitinfo{$id} ] } ];
next unless defined $info;
$self->start_element( { 'Name' => 'Hit' } );
$self->element(
{
'Name' => 'Hit_id',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_desc',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_signif',
'Data' => shift @{$info}
}
);
$self->element(
{
'Name' => 'Hit_score',
'Data' => shift @{$info}
}
);
$self->start_element( { 'Name' => 'Hsp' } );
$self->element(
{
'Name' => 'Hsp_query-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_query-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-from',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_hit-to',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_score',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => shift @$HSPinfo
}
);
$self->element(
{
'Name' => 'Hsp_identity',
'Data' => 0
}
);
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => 0
}
);
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => 0
}
);
$self->end_element( { 'Name' => 'Hsp' } );
$self->end_element( { 'Name' => 'Hit' } );
}
@hitinfo = ();
%hitinfo = ();
last;
}
else {
$self->debug($_) if $verbose > 0;
}
}
$last = $_;
}
$self->end_element( { 'Name' => 'HMMER_Output' } ) unless !$seentop;
return $self->end_document();} |
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _