#!/usr/local/bin/perl -w # fgenesh_to_gff2.pl # Given a FGENESH output for gene-calling, constructs a .gff2 file suitable for # loading into Argo. The gff2 and FGENESH formats are shown at bottom. # JC Nelson, 3.22.07 # Note: subroutine fgenes2gff2() contains literals whose validity depends on FGENESH # report format, as do several of the named constants in this script. I don't know # the extent of variation in this format, not having run the program on many sequences. my $kMaxArgs = 1; # num of args required my $kUsageMsg = "Usage: perl fgenesh_to_gff2.pl \n"; my $numArgs = @ARGV; fetchParams($numArgs); # exit with usage prompt if input args are wrong # Define some constants my $kSeqNameLineIndex = 2; my $kSeqLenLineIndex = 3; my $kFeatureStartIndex = 8; # indices into rows of the FGENESH output my $kFeatureEndIndex = 10; my $sGFF2_suffix = ".gff2"; my $sSoftwareName = "FGENESH 2.5"; my $sLengthStr = " - Length = "; my $kCallHeaderTag = 'G'; my $sPredictedTag = "Predicted"; my $kShortFGLineLen = 5; my $kLongFGLineLen = 12; my $kStrandIndex = 1; my $sNoCallsFoundMsg = "No call-table header was found in the call file"; my $time_string = `TIME\/T`; my $date_string = `DATE\/T`; # DATE\/T is required on Windows to prevent a prompt for the new date, which will hang this script! chomp $time_string; chomp $date_string; my $date_time_string = $date_string . $time_string; # (just use `date` for Unix to get both date and time) my $gff2_header = "\#\# gff-version 2 \#\# date $date_time_string \#\# source-version: $sSoftwareName http://sun1.softberry.com/berry.phtml?topic=fgenesh&group=programs&subgroup=gfind \# Sequence "; #### MAIN PROGRAM STARTS my ($FGENESHcallsFileName) = @ARGV; my $calls_ref = load_calls($FGENESHcallsFileName); my $seq_name = fetch_header_value($calls_ref, $kSeqNameLineIndex); my $seq_len = fetch_header_value($calls_ref, $kSeqLenLineIndex); $gff2_header .= $seq_name . $sLengthStr . $seq_len . "\n"; my $outFileName = replace_suffix($FGENESHcallsFileName, $sGFF2_suffix); parse_calls($calls_ref, $outFileName, $gff2_header, $seq_name); #### MAIN PROGRAM ENDS ### SUBROUTINES ### sub load_calls { my $filename = shift; open (FH, $filename) || die ("Can't find $filename.\n"); my @arr = ; close FH; return \@arr; } sub parse_calls { my ($calls_ref, $outFileName, $loc_gff2_header, $loc_seq_name) = @_; open (OUT, ">$outFileName") || die ("Can't find $outFileName.\n"); print OUT $loc_gff2_header; my $num_call_lines = @$calls_ref; my $row_num = $kSeqLenLineIndex + 1; my $calls_found = 0; while (!$calls_found) { my $line = strip_leading_whitespace($calls_ref->[$row_num]); my @arr = split(/\s+/, $line); my $first_token = $arr[0]; if ($first_token eq $kCallHeaderTag) { $calls_found = 1;} elsif ($first_token eq $sPredictedTag || (++$row_num == $num_call_lines)) { die $sNoCallsFoundMsg;} } ++$row_num; # move to first line following call header while ($calls_found) { my $line = strip_leading_whitespace($calls_ref->[$row_num++]); next if ($line eq ""); # skip blank lines my @arr = split(/\s+/, $line); my $first_token = $arr[0]; if ($first_token eq $sPredictedTag || ($row_num == $num_call_lines)) { $calls_found = 0;} else { print OUT fgenes2gff2("$seq_name\t$sSoftwareName\t", \@arr);} } close OUT; } # Convert either of these lines (without the leading #): # 1 + 1 CDSo 27 - 308 39.43 27 - 308 282 # 1 + PolA 479 1.87 # to this format (without the leading #): #Contig2893_Contig183-40 geneid_v1.2 Start 36 38 -1.57 + . # CAACTGTGGCACCTGGATGG sub fgenes2gff2 { my ($leader, $ref) = @_; my @arr = @$ref; my $len = @arr; my $strand = $arr[$kStrandIndex]; my ($feature_name_index, $feature_start_index, $feature_score_index, $feature_end_index); if ($len == $kShortFGLineLen) { $feature_name_index = 2; $feature_start_index = $feature_end_index = 3; # Argo, at least, requires both values $feature_score_index = 4; } else { $feature_name_index = 3; $feature_start_index = $kFeatureStartIndex; $feature_score_index = 7; $feature_end_index = $kFeatureEndIndex; } return "$leader\t$arr[$feature_name_index]\t$arr[$feature_start_index]\t$arr[$feature_end_index]\t$arr[$feature_score_index]\t$strand\t\.\n"; } sub replace_suffix { my ($str, $new_suffix) = @_; my $pos = rindex($str, '.'); if ($pos > 0) { $str = substr($str, 0, $pos);} return $str . $new_suffix; } sub strip_leading_whitespace { my $str = shift; while ($str =~ /^\s/) { $str =~ s/^\s//;} return $str; } sub fetch_header_value { my ($calls_ref, $index) = @_; my $loc_line = $calls_ref->[$index]; chomp $loc_line; # else can get a newline when retrieving the last entry my @line_arr = split(/: /, $loc_line); return $line_arr[1]; } sub fetchParams { my $numArgs = shift; if ($numArgs < $kMaxArgs) { print $kUsageMsg;} } ## BELOW IS A MORE COMPLETE GFF2 EXAMPLE, FOLLOWED BY AN EXAMPLE OF FGENESH OUTPUT ## gff-version 2 ## date Mon Mar 12 16:20:25 2007 ## source-version: geneid v 1.2 -- geneid@imim.es # Sequence Contig2893_Contig183-40 - Length = 7980 bps # Starts(+) predicted in sequence Contig2893_Contig183-40: [0,7979] #Contig2893_Contig183-40 geneid_v1.2 Start 36 38 -1.57 + . # CAACTGTGGCACCTGGATGG #Contig2893_Contig183-40 geneid_v1.2 Start 45 47 0.32 + . # CACCTGGATGGACAAGATGG #Contig2893_Contig183-40 geneid_v1.2 Acceptor 2219 2220 -6.12 + . # ATCGTTGGAAGGTTTGCAGAAGAA #Contig2893_Contig183-40 geneid_v1.2 Acceptor 2252 2253 -5.87 + . # CAACAGGATTTATGAAACTGAGGA #Contig2893_Contig183-40 geneid_v1.2 Donor 6269 6270 -4.19 + . # CCCGTGGACG #Contig2893_Contig183-40 geneid_v1.2 Donor 6282 6283 -1.60 + . # CATGTCTGTA #Contig2893_Contig183-40 geneid_v1.2 Donor 6286 6287 -2.57 + . # TCTGTAAAAC #Contig2893_Contig183-40 geneid_v1.2 Stop 5030 5032 0.50 + . # GCAAATAAG #Contig2893_Contig183-40 geneid_v1.2 Stop 5039 5041 0.61 + . # AGCCGTAAT # Starts(-) predicted in sequence Contig2893_Contig183-40: [0,7979] #Contig2893_Contig183-40 geneid_v1.2 Start 7977 7979 -0.27 - . # NNNNNNNNNNNNNNNAATGC #Contig2893_Contig183-40 geneid_v1.2 Start 7907 7909 -3.54 - . # CCCATATTTTATTTGGATGA # The "Contig" lines are data and in the original file are not preceded by # # and the FGENESH output here (all # have been added): # FGENESH 2.5 Prediction of potential genes in Tribolium_castaneum genomic DNA # Time : Wed Mar 21 12:52:10 2007 # Seq name: Contig5724_Contig4284-20 # Length of sequence: 12780 # Number of predicted genes 2 in +chain 1 in -chain 1 # Number of predicted exons 7 in +chain 1 in -chain 6 # Positions of predicted genes and exons: Variant 1 from 1, Score:147.756439 # G Str Feature Start End Score ORF Len # # 1 + 1 CDSo 27 - 308 39.43 27 - 308 282 # 1 + PolA 479 1.87 # # 2 - PolA 2425 1.87 # 2 - 1 CDSl 2860 - 3272 39.51 2860 - 3270 411 # 2 - 2 CDSi 5289 - 5529 23.65 5290 - 5529 240 # 2 - 3 CDSi 5583 - 5925 47.28 5583 - 5924 342 # 2 - 4 CDSi 6061 - 6283 11.52 6063 - 6281 219 # 2 - 5 CDSi 8157 - 8232 -2.91 8158 - 8232 75 # 2 - 6 CDSf 8836 - 8883 6.49 8836 - 8883 48 # 2 - TSS 8917 -5.74 #Predicted protein(s): #(aa sequences)