#!/usr/local/bin/perl -w
# fgenesh_to_gff2.pl
# Given a FGENESH output for gene-calling, constructs a .gff2 file suitable for
# loading into Argo. The gff2 and FGENESH formats are shown at bottom.
# JC Nelson, 3.22.07
# Note: subroutine fgenes2gff2() contains literals whose validity depends on FGENESH
# report format, as do several of the named constants in this script. I don't know
# the extent of variation in this format, not having run the program on many sequences.
my $kMaxArgs = 1; # num of args required
my $kUsageMsg = "Usage: perl fgenesh_to_gff2.pl \n";
my $numArgs = @ARGV;
fetchParams($numArgs); # exit with usage prompt if input args are wrong
# Define some constants
my $kSeqNameLineIndex = 2;
my $kSeqLenLineIndex = 3;
my $kFeatureStartIndex = 8; # indices into rows of the FGENESH output
my $kFeatureEndIndex = 10;
my $sGFF2_suffix = ".gff2";
my $sSoftwareName = "FGENESH 2.5";
my $sLengthStr = " - Length = ";
my $kCallHeaderTag = 'G';
my $sPredictedTag = "Predicted";
my $kShortFGLineLen = 5;
my $kLongFGLineLen = 12;
my $kStrandIndex = 1;
my $sNoCallsFoundMsg = "No call-table header was found in the call file";
my $time_string = `TIME\/T`;
my $date_string = `DATE\/T`; # DATE\/T is required on Windows to prevent a prompt for the new date, which will hang this script!
chomp $time_string;
chomp $date_string;
my $date_time_string = $date_string . $time_string; # (just use `date` for Unix to get both date and time)
my $gff2_header =
"\#\# gff-version 2
\#\# date $date_time_string
\#\# source-version: $sSoftwareName http://sun1.softberry.com/berry.phtml?topic=fgenesh&group=programs&subgroup=gfind
\# Sequence ";
#### MAIN PROGRAM STARTS
my ($FGENESHcallsFileName) = @ARGV;
my $calls_ref = load_calls($FGENESHcallsFileName);
my $seq_name = fetch_header_value($calls_ref, $kSeqNameLineIndex);
my $seq_len = fetch_header_value($calls_ref, $kSeqLenLineIndex);
$gff2_header .= $seq_name . $sLengthStr . $seq_len . "\n";
my $outFileName = replace_suffix($FGENESHcallsFileName, $sGFF2_suffix);
parse_calls($calls_ref, $outFileName, $gff2_header, $seq_name);
#### MAIN PROGRAM ENDS
### SUBROUTINES ###
sub load_calls
{
my $filename = shift;
open (FH, $filename) || die ("Can't find $filename.\n");
my @arr = ;
close FH;
return \@arr;
}
sub parse_calls
{
my ($calls_ref, $outFileName, $loc_gff2_header, $loc_seq_name) = @_;
open (OUT, ">$outFileName") || die ("Can't find $outFileName.\n");
print OUT $loc_gff2_header;
my $num_call_lines = @$calls_ref;
my $row_num = $kSeqLenLineIndex + 1;
my $calls_found = 0;
while (!$calls_found)
{
my $line = strip_leading_whitespace($calls_ref->[$row_num]);
my @arr = split(/\s+/, $line);
my $first_token = $arr[0];
if ($first_token eq $kCallHeaderTag)
{ $calls_found = 1;}
elsif ($first_token eq $sPredictedTag || (++$row_num == $num_call_lines))
{ die $sNoCallsFoundMsg;}
}
++$row_num; # move to first line following call header
while ($calls_found)
{
my $line = strip_leading_whitespace($calls_ref->[$row_num++]);
next if ($line eq ""); # skip blank lines
my @arr = split(/\s+/, $line);
my $first_token = $arr[0];
if ($first_token eq $sPredictedTag || ($row_num == $num_call_lines))
{ $calls_found = 0;}
else
{ print OUT fgenes2gff2("$seq_name\t$sSoftwareName\t", \@arr);}
}
close OUT;
}
# Convert either of these lines (without the leading #):
# 1 + 1 CDSo 27 - 308 39.43 27 - 308 282
# 1 + PolA 479 1.87
# to this format (without the leading #):
#Contig2893_Contig183-40 geneid_v1.2 Start 36 38 -1.57 + . # CAACTGTGGCACCTGGATGG
sub fgenes2gff2
{
my ($leader, $ref) = @_;
my @arr = @$ref;
my $len = @arr;
my $strand = $arr[$kStrandIndex];
my ($feature_name_index, $feature_start_index, $feature_score_index, $feature_end_index);
if ($len == $kShortFGLineLen)
{
$feature_name_index = 2;
$feature_start_index = $feature_end_index = 3; # Argo, at least, requires both values
$feature_score_index = 4;
}
else
{
$feature_name_index = 3;
$feature_start_index = $kFeatureStartIndex;
$feature_score_index = 7;
$feature_end_index = $kFeatureEndIndex;
}
return "$leader\t$arr[$feature_name_index]\t$arr[$feature_start_index]\t$arr[$feature_end_index]\t$arr[$feature_score_index]\t$strand\t\.\n";
}
sub replace_suffix
{
my ($str, $new_suffix) = @_;
my $pos = rindex($str, '.');
if ($pos > 0)
{ $str = substr($str, 0, $pos);}
return $str . $new_suffix;
}
sub strip_leading_whitespace
{
my $str = shift;
while ($str =~ /^\s/)
{ $str =~ s/^\s//;}
return $str;
}
sub fetch_header_value
{
my ($calls_ref, $index) = @_;
my $loc_line = $calls_ref->[$index];
chomp $loc_line; # else can get a newline when retrieving the last entry
my @line_arr = split(/: /, $loc_line);
return $line_arr[1];
}
sub fetchParams
{
my $numArgs = shift;
if ($numArgs < $kMaxArgs)
{ print $kUsageMsg;}
}
## BELOW IS A MORE COMPLETE GFF2 EXAMPLE, FOLLOWED BY AN EXAMPLE OF FGENESH OUTPUT
## gff-version 2
## date Mon Mar 12 16:20:25 2007
## source-version: geneid v 1.2 -- geneid@imim.es
# Sequence Contig2893_Contig183-40 - Length = 7980 bps
# Starts(+) predicted in sequence Contig2893_Contig183-40: [0,7979]
#Contig2893_Contig183-40 geneid_v1.2 Start 36 38 -1.57 + . # CAACTGTGGCACCTGGATGG
#Contig2893_Contig183-40 geneid_v1.2 Start 45 47 0.32 + . # CACCTGGATGGACAAGATGG
#Contig2893_Contig183-40 geneid_v1.2 Acceptor 2219 2220 -6.12 + . # ATCGTTGGAAGGTTTGCAGAAGAA
#Contig2893_Contig183-40 geneid_v1.2 Acceptor 2252 2253 -5.87 + . # CAACAGGATTTATGAAACTGAGGA
#Contig2893_Contig183-40 geneid_v1.2 Donor 6269 6270 -4.19 + . # CCCGTGGACG
#Contig2893_Contig183-40 geneid_v1.2 Donor 6282 6283 -1.60 + . # CATGTCTGTA
#Contig2893_Contig183-40 geneid_v1.2 Donor 6286 6287 -2.57 + . # TCTGTAAAAC
#Contig2893_Contig183-40 geneid_v1.2 Stop 5030 5032 0.50 + . # GCAAATAAG
#Contig2893_Contig183-40 geneid_v1.2 Stop 5039 5041 0.61 + . # AGCCGTAAT
# Starts(-) predicted in sequence Contig2893_Contig183-40: [0,7979]
#Contig2893_Contig183-40 geneid_v1.2 Start 7977 7979 -0.27 - . # NNNNNNNNNNNNNNNAATGC
#Contig2893_Contig183-40 geneid_v1.2 Start 7907 7909 -3.54 - . # CCCATATTTTATTTGGATGA
# The "Contig" lines are data and in the original file are not preceded by #
# and the FGENESH output here (all # have been added):
# FGENESH 2.5 Prediction of potential genes in Tribolium_castaneum genomic DNA
# Time : Wed Mar 21 12:52:10 2007
# Seq name: Contig5724_Contig4284-20
# Length of sequence: 12780
# Number of predicted genes 2 in +chain 1 in -chain 1
# Number of predicted exons 7 in +chain 1 in -chain 6
# Positions of predicted genes and exons: Variant 1 from 1, Score:147.756439
# G Str Feature Start End Score ORF Len
#
# 1 + 1 CDSo 27 - 308 39.43 27 - 308 282
# 1 + PolA 479 1.87
#
# 2 - PolA 2425 1.87
# 2 - 1 CDSl 2860 - 3272 39.51 2860 - 3270 411
# 2 - 2 CDSi 5289 - 5529 23.65 5290 - 5529 240
# 2 - 3 CDSi 5583 - 5925 47.28 5583 - 5924 342
# 2 - 4 CDSi 6061 - 6283 11.52 6063 - 6281 219
# 2 - 5 CDSi 8157 - 8232 -2.91 8158 - 8232 75
# 2 - 6 CDSf 8836 - 8883 6.49 8836 - 8883 48
# 2 - TSS 8917 -5.74
#Predicted protein(s):
#(aa sequences)