This is a driver for the SearchIO system for parsing Exonerate (Guy
Slater) output. You can get Exonerate at
http://www.ebi.ac.uk/~guy/exonerate/
[until Guy puts up a Web reference or publication for it].
An optional parameter -min_intron is supported by the
new() initialization method. It is there so that, if you run Exonerate
with a different minimum intron length (default is 30), the parser will
be able to detect the difference between standard deletions and an
intron. There is still some room to play with here that might cause
this to get misinterpreted; it has not been fully tested or explored.
The VULGAR and CIGAR formats should be parsed okay now creating HSPs
where appropriate (so merging match states where appropriate rather
than breaking an HSP at each indel as it may have done in the past).
The GFF that comes from exonerate is still probably a better way to go
if you are doing protein2genome or est2genome mapping.
For example you can see this script:
### TODO: Jason, this link is dead, do we have an updated one?
http://fungal.genome.duke.edu/~jes12/software/scripts/process_exonerate_gff3.perl.txt

If your report contains both CIGAR and VULGAR lines only the first one
will be processed for a given Query/Target pair. If you preferentially
want to use VULGAR or CIGAR add one of these options when initializing
the SearchIO object.
-cigar => 1
OR
-vulgar => 1
Or set them via these methods.
$parser->cigar(1)
OR
$parser->vulgar(1)
# next_result
#
# Reads lines from the stream via $self->_readline and converts one Exonerate
# report into parser events (start_element / element / end_element), then
# hands back whatever $self->end_document() returns.
#
# Args    : none
# Returns : the value of $self->end_document() when
#             - a "vulgar:" or "cigar:" line has been converted into HSPs, or
#             - a second "Query:" header is met (that line is pushed back with
#               _pushback so the next call starts on it), or
#             - input ends after a "Query:" header was seen.
#           A false value when input ends and no "Query:" header was seen.
#
# When a report carries both vulgar: and cigar: lines, only the first kind met
# after a Query:/Target: header is converted; the other is skipped.  The
# cigar()/vulgar() accessors are consulted to force one or the other.
sub next_result {
    my ($self) = @_;
    local $/ = "\n";
    local $_;

    $self->{'_last_data'} = '';
    $self->start_document();

    my $seentop;

    # Buffers for query / midline / hit alignment strings.  Nothing in this
    # method ever fills them, so the shift() calls below currently yield undef.
    my ( @q_ex, @m_ex, @h_ex );

    # Shorthand for the very repetitive leaf-element call.
    my $emit = sub {
        my ( $name, $data ) = @_;
        return $self->element( { 'Name' => $name, 'Data' => $data } );
    };

    while ( defined( $_ = $self->_readline ) ) {
        if (/^\s*Query:\s+(\S+)\s*(.+)?/) {
            if ($seentop) {

                # A second Query: header starts the next report; close this
                # one and leave the line for the following call.
                $self->end_element( { 'Name' => 'ExonerateOutput' } );
                $self->_pushback($_);
                return $self->end_document();
            }
            $seentop = 1;
            my ( $nm, $desc ) = ( $1, $2 );
            chomp($desc) if defined $desc;
            $self->{'_result_count'}++;
            $self->start_element( { 'Name' => 'ExonerateOutput' } );
            $emit->( 'ExonerateOutput_query-def',  $nm );
            $emit->( 'ExonerateOutput_query-desc', $desc );
            $emit->( 'ExonerateOutput_program',    'Exonerate' );

            # New query: neither coordinate line has been consumed yet.
            # Only the "seen" flags are reset here, so whatever state the
            # cigar()/vulgar() accessors keep is left alone.
            $self->{'_seencigar'}  = 0;
            $self->{'_seenvulgar'} = 0;
        } elsif (/^Target:\s+(\S+)\s*(.+)?/) {
            my ( $nm, $desc ) = ( $1, $2 );
            chomp($desc) if defined $desc;
            $self->start_element( { 'Name' => 'Hit' } );
            $emit->( 'Hit_id',   $nm );
            $emit->( 'Hit_desc', $desc );
            $self->{'_seencigar'}  = 0;
            $self->{'_seenvulgar'} = 0;
        } elsif (
            # The pattern MUST stay on separate lines: under /x a '#' comment
            # runs to the end of the line, so collapsing it onto one line
            # silently discards every group after the first.
            s/^vulgar:\s+(\S+)\s+             # query sequence id
              (\d+)\s+(\d+)\s+([\-\+\.])\s+   # query start-end-strand
              (\S+)\s+                        # target sequence id
              (\d+)\s+(\d+)\s+([\-\+])\s+     # target start-end-strand
              (\d+)\s+                        # score
             //ox
          )
        {
            # Copy the captures out at once; the end coordinates ($3, $7) are
            # not needed because HSP ends are derived from the triplets.
            my ( $qid, $qs, undef, $qstrand, $hid, $hs, undef, $hstrand,
                $score )
              = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 );

            next if ( $self->cigar || $self->{'_seencigar'} );
            $self->{'_seenvulgar'}++;

            if ( !$self->within_element('result') ) {
                $self->start_element( { 'Name' => 'ExonerateOutput' } );
                $emit->( 'ExonerateOutput_query-def', $qid );
            }
            if ( !$self->within_element('hit') ) {
                $self->start_element( { 'Name' => 'Hit' } );
                $emit->( 'Hit_id', $hid );
            }

            # What is left of the line is a run of "label len1 len2" triplets.
            my @rest = split;

            # Forward-strand starts are bumped by one; minus-strand starts are
            # used exactly as reported.
            if ( $qstrand eq '-' ) {
                $qstrand = -1;
            } else {
                $qstrand = 1;
                $qs++;
            }
            if ( $hstrand eq '-' ) {
                $hstrand = -1;
            } else {
                $hstrand = 1;
                $hs++;
            }

            my $laststate = '';
            my @events;
            my $gaps;
            while ( @rest >= 3 ) {
                my ( $state, $len1, $len2 ) = splice( @rest, 0, 3 );
                if ( $state eq 'M' ) {
                    my $q_last = $qs + $len1 * $qstrand - $qstrand;
                    my $h_last = $hs + $len2 * $hstrand - $hstrand;

                    # An M directly after a G extends the previous HSP rather
                    # than opening a new one.  The @events guard covers a
                    # string that opens with G, where there is nothing to
                    # extend ($events[-1] on an empty array is fatal).
                    # NOTE(review): on a merge 'gaps' is overwritten, not
                    # accumulated, and 'align-len' is not grown - confirm
                    # whether that is intended.
                    if ( $laststate eq 'G' && @events ) {
                        $events[-1]->{'query-to'} = $q_last;
                        $events[-1]->{'hit-to'}   = $h_last;
                        $events[-1]->{'gaps'}     = $gaps;
                    } else {
                        push @events,
                          {
                            'score'      => $score,
                            'align-len'  => $len1,
                            'query-from' => $qs,
                            'query-to'   => $q_last,
                            'hit-from'   => $hs,
                            'hit-to'     => $h_last,
                          };
                    }
                    $gaps = 0;
                } elsif ( $state eq 'G' ) {
                    $gaps = $len1 + $len2;
                }

                # Every triplet, whatever its label, advances both cursors.
                $qs += $len1 * $qstrand;
                $hs += $len2 * $hstrand;
                $laststate = $state;
            }

            for my $event (@events) {
                $self->start_element( { 'Name' => 'Hsp' } );
                for my $key ( sort keys %{$event} ) {
                    $emit->( "Hsp_$key", $event->{$key} );
                }
                $emit->( 'Hsp_identity', 0 );
                $self->end_element( { 'Name' => 'Hsp' } );
            }
            $emit->( 'Hit_score', $score );
            $self->end_element( { 'Name' => 'Hit' } );
            $self->end_element( { 'Name' => 'ExonerateOutput' } );
            return $self->end_document();
        } elsif (
            # Multi-line for the same /x reason as the vulgar pattern above.
            s/^cigar:\s+(\S+)\s+              # query sequence id
              (\d+)\s+(\d+)\s+([\-\+])\s+     # query start-end-strand
              (\S+)\s+                        # target sequence id
              (\d+)\s+(\d+)\s+([\-\+])\s+     # target start-end-strand
              (\d+)\s+                        # score
             //ox
          )
        {
            my ( $qid, $qs, $qe, $qstrand, $hid, $hs, $he, $hstrand, $score )
              = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 );

            next if ( $self->vulgar || $self->{'_seenvulgar'} );
            $self->{'_seencigar'}++;

            if ( !$self->within_element('result') ) {
                $self->start_element( { 'Name' => 'ExonerateOutput' } );
                $emit->( 'ExonerateOutput_query-def', $qid );
            }
            if ( !$self->within_element('hit') ) {
                $self->start_element( { 'Name' => 'Hit' } );
                $emit->( 'Hit_id', $hid );
            }

            # What is left of the line is a run of "label length" pairs.
            my @rest = split;

            # Forward strand: start + 1.  Minus strand: the reported end
            # value becomes the starting cursor.
            # NOTE(review): the vulgar branch keeps the reported start on the
            # minus strand instead - confirm which of the two is intended.
            if ( $qstrand eq '-' ) {
                $qstrand = -1;
                $qs      = $qe;
            } else {
                $qstrand = 1;
                $qs++;
            }
            if ( $hstrand eq '-' ) {
                $hstrand = -1;
                $hs      = $he;
            } else {
                $hstrand = 1;
                $hs++;
            }

            my ( $aln_len, $inserts, $deletes ) = ( 0, 0, 0 );
            while ( @rest >= 2 ) {
                my ( $state, $len ) = splice( @rest, 0, 2 );
                if ( $state eq 'I' ) {
                    $inserts += $len;
                } elsif ( $state eq 'D' ) {

                    # A deletion of at least $MIN_INTRON is treated as an
                    # intron: flush what has accumulated as one HSP.
                    if ( $len >= $MIN_INTRON ) {
                        $self->start_element( { 'Name' => 'Hsp' } );
                        $emit->( 'Hsp_score', $score );

                        # NOTE(review): align-len and identity are emitted
                        # twice in this HSP with different values (here and
                        # again below).  Kept as-is; which one is used depends
                        # on the event handler, which is not visible here.
                        $emit->( 'Hsp_align-len', $aln_len );
                        $emit->(
                            'Hsp_identity', $aln_len - ( $inserts + $deletes )
                        );
                        $emit->( 'Hsp_query-from', $qs );
                        $qs += $aln_len * $qstrand;
                        $emit->( 'Hsp_query-to', $qs - ( $qstrand * 1 ) );

                        # Skip the hit cursor over pending deletions (this
                        # includes the previous intron) before reporting.
                        $hs += $deletes * $hstrand;
                        $emit->( 'Hsp_hit-from', $hs );
                        $hs += $aln_len * $hstrand;
                        $emit->( 'Hsp_hit-to', $hs - ( $hstrand * 1 ) );
                        $emit->(
                            'Hsp_align-len', $aln_len + $inserts + $deletes
                        );
                        $emit->( 'Hsp_identity',  $aln_len );
                        $emit->( 'Hsp_gaps',      $inserts + $deletes );
                        $emit->( 'Hsp_querygaps', $inserts );
                        $emit->( 'Hsp_hitgaps',   $deletes );
                        $emit->( 'Hsp_qseq',      shift @q_ex );
                        $emit->( 'Hsp_hseq',      shift @h_ex );
                        $emit->( 'Hsp_midline',   shift @m_ex );
                        $self->end_element( { 'Name' => 'Hsp' } );
                        $aln_len = $inserts = $deletes = 0;
                    }

                    # Counted after the reset on purpose: the intron length
                    # is carried into the next HSP's hit offset.
                    $deletes += $len;
                } else {
                    $aln_len += $len;
                }
            }

            # Whatever is left after the last pair forms the final HSP.
            $self->start_element( { 'Name' => 'Hsp' } );
            $emit->( 'Hsp_qseq',       shift @q_ex );
            $emit->( 'Hsp_hseq',       shift @h_ex );
            $emit->( 'Hsp_midline',    shift @m_ex );
            $emit->( 'Hsp_score',      $score );
            $emit->( 'Hsp_query-from', $qs );
            $qs += $aln_len * $qstrand;
            $emit->( 'Hsp_query-to', $qs - ( $qstrand * 1 ) );
            $hs += $deletes * $hstrand;
            $emit->( 'Hsp_hit-from', $hs );
            $hs += $aln_len * $hstrand;
            $emit->( 'Hsp_hit-to',    $hs - ( $hstrand * 1 ) );
            $emit->( 'Hsp_align-len', $aln_len );
            $emit->( 'Hsp_identity',  $aln_len - ( $inserts + $deletes ) );
            $emit->( 'Hsp_gaps',      $inserts + $deletes );
            $emit->( 'Hsp_querygaps', $inserts );
            $emit->( 'Hsp_hitgaps',   $deletes );
            $self->end_element( { 'Name' => 'Hsp' } );
            $emit->( 'Hit_score', $score );
            $self->end_element( { 'Name' => 'Hit' } );
            $self->end_element( { 'Name' => 'ExonerateOutput' } );
            return $self->end_document();
        }

        # Any other line is ignored.
    }

    # End of input: only a report that actually started gets closed.
    return $self->end_document() if ($seentop);
}
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _