sub next_assembly
{ my $self = shift; my $lingering_read;
local $/="\n";
my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
my ($contigOBJ,$read_name);
my $read_data = {}; while ($_ = $self->_readline) { chomp;
(/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { my $contigID = $1;
$contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID);
my $ori = ($5 eq 'U' ? 1 : -1); $contigOBJ->strand($ori);
my $consensus_sequence = undef;
while ($_ = $self->_readline) { chomp; last if (/^$/); s/\*/-/g; $consensus_sequence .= $_;
}
my $consensus_length = length($consensus_sequence);
$consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence,
-start=>1,
-end=>$consensus_length,
-id=>"Consensus sequence for $contigID");
$contigOBJ->set_consensus_sequence($consensus_sequence);
$assembly->add_contig($contigOBJ);
};
/^BQ/ && do {
my $consensus = $contigOBJ->get_consensus_sequence()->seq();
my ($i,$j,@tmp);
my @quality = ();
$j = 0;
while ($_ = $self->_readline) {
chomp;
last if (/^$/);
@tmp = grep { /^\d+$/ } split(/\s+/);
$i = 0;
my $previous = 0;
my $next = 0;
while ($i<=$#tmp) {
if (substr($consensus,$j,1) eq '-') {
$previous = $tmp[$i-1] unless ($i == 0);
if ($i < $#tmp) {
$next = $tmp[$i+1];
} else {
$next = 0;
}
push(@quality,int(($previous+$next)/2)); } else {
push(@quality,$tmp[$i]);
$i++;
}
$j++;
}
}
my $qual = Bio::Seq::Quality->new(-qual=>join(" ",@quality),
-id=>$contigOBJ->id());
$contigOBJ->set_consensus_quality($qual);
};
/^AF (\S+) (C|U) (-*\d+)/ && do {
$read_name = $1; my $ori = $2;
$read_data->{$read_name}{'padded_start'} = $3; $ori = ( $ori eq 'U' ? 1 : -1);
$read_data->{$read_name}{'strand'} = $ori;
};
/^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
$read_name = $1;
$read_data->{$read_name}{'length'} = $2; $read_data->{$read_name}{'contig'} = $contigOBJ;
my $read_sequence;
while ($_ = $self->_readline) {
chomp;
last if (/^$/);
s/\*/-/g; $read_sequence .= $_; }
my $read = Bio::LocatableSeq->new(-seq=>$read_sequence,
-start=>1,
-end=>$read_data->{$read_name}{'length'},
-strand=>$read_data->{$read_name}{'strand'},
-id=>$read_name,
-primary_id=>$read_name,
-alphabet=>'dna');
$lingering_read = $read;
my $padded_start = $read_data->{$read_name}{'padded_start'};
my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1;
my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start,
-end=>$padded_end,
-strand=>$read_data->{$read_name}{'strand'},
-tag => { 'contig' => $contigOBJ->id }
);
$contigOBJ->set_seq_coord($coord,$read);
};
/^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
my $qual_start = $1; my $qual_end = $2;
my $align_start = $3; my $align_end = $4;
unless ($align_start == -1 && $align_end == -1) {
$align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start);
$align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end);
my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start,
-end=>$align_end,
-strand=>$read_data->{$read_name}{'strand'},
-primary=>"_align_clipping:$read_name");
$align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
$contigOBJ->add_features([ $align_feat ], 0);
}
unless ($qual_start == -1 && $qual_end == -1) {
$qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
$qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start,
-end=>$qual_end,
-strand=>$read_data->{$read_name}{'strand'},
-primary=>"_quality_clipping:$read_name");
$qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
$contigOBJ->add_features([ $qual_feat ], 0);
}
};
/^DS / && do {
/CHEM: (\S+)/ && do {
$lingering_read->{'chemistry'} = $1;
};
/CHROMAT_FILE: (\S+)/ && do {
$lingering_read->{'chromatfilename'} = $1;
};
/DIRECTION: (\w+)/ && do {
my $ori = $1;
if ($ori eq 'rev') { $ori = 'C' }
elsif ($ori eq 'fwd') { $ori = 'U' }
$lingering_read->{'strand'} = $ori;
};
/DYE: (\S+)/ && do {
$lingering_read->{'dye'} = $1;
};
/PHD_FILE: (\S+)/ && do {
$lingering_read->{'phdfilename'} = $1;
};
/TEMPLATE: (\S+)/ && do {
$lingering_read->{'template'} = $1;
};
/TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do {
$lingering_read->{'phd_time'} = $1;
};
};
/^CT\s*\{/ && do {
my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
my %tags = (source => $source, creation_date => $date);
$contigID =~ s/^Contig//i;
my $tag_type = 'extra_info';
while ($_ = $self->_readline) {
if (/COMMENT\s*\{/)
{
$tag_type = 'comment';
}
elsif (/C\}/)
{
$tag_type = 'extra_info';
}
elsif (/\}/)
{
last;
}
else
{
$tags{$tag_type} .= "$_";
}
}
my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start,
-end=>$end,
-primary=>$type,
-tag=>\%tags,
);
$assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1);
};
/^RT\s*\{/ && do {
my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
my $extra_info = undef;
while ($_ = $self->_readline) {
last if (/\}/);
$extra_info .= $_;
}
$start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start);
$end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end);
my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start,
-end=>$end,
-primary=>$type,
-tag=>{ 'source' => $source,
'creation_date' => $date,
'extra_info' => $extra_info
});
my $contig = $read_data->{$readID}{'contig'};
my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
$coord->add_sub_SeqFeature($read_tag);
};
/^WA\s*\{/ && do {
my ($type,$source,$date) = split(' ',$self->_readline);
my $extra_info = undef;
while ($_ = $self->_readline) {
last if (/\}/);
$extra_info = $_;
}
my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source,
"DATE:",$date,"DATA:",$extra_info);
$assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag);
$assembly->annotation->add_Annotation('phrap',$assembly_tag);
};
}
my $singletsfilename = $self->file();
$singletsfilename =~ s/\.ace.*$/.singlets/;
$singletsfilename =~ s/\<//;
if (!-f $singletsfilename) {
return $assembly;
}
my $singlets_fh = Bio::SeqIO->new(-file => "<$singletsfilename",
-format => 'fasta');
my $adder;
while (my $seq = $singlets_fh->next_seq()) {
my ($phdfilename,$chromatfilename) = qw(unset unset);
if ($seq->desc() =~ /PHD_FILE: (\S+)/) {
$phdfilename = $1;
}
if ($seq->desc() =~ /CHROMAT_FILE: (\S+)/) {
$chromatfilename = $1;
}
(my $phdfile = $singletsfilename) =~ s/edit_dir.*//;
$phdfile .= "phd_dir/$phdfilename";
my $singlet = new Bio::Assembly::Singlet();
if (-f $phdfile) {
my $phd_fh = new Bio::SeqIO( -file => "<$phdfile", -format => 'phd');
my $swq = $phd_fh->next_seq();
$adder = $swq;
}
else {
$adder = $seq;
}
$adder->{phdfilename} = $phdfilename;
$adder->{chromatfilename} = $chromatfilename;
$singlet->seq_to_singlet($adder);
$assembly->add_singlet($singlet);
}
$assembly->update_seq_list();
return $assembly; } |