#!/usr/bin/perl
use warnings;
use strict;

#STANDARD FORMAT TOKENS:
my $PFRMAT      ="^PFRMAT";
my $TARGET      ="^TARGET";
my $AUTHOR      ="^AUTHOR";
my $MODEL       ="^MODEL";
my $METHOD      ="^METHOD";
my $SCORE       ="^SCORE";
my $REMARK      ="^REMARK";
my $QMODE       ="^QMODE";
my $Mrecord     ="^.*_(TS|AL)[1-5]";
my $END         ="^END";
my $HEADER_END  ="^\$";
my $NUMBER_IN_FIRST_COLL="^[0-9]";
my $PARENT      ="^PARENT";

#STATES OF THE LEXICAL ANALYZER
my $m_section=0; #1 - if in MODEL section

#OBLIGATORY FIELDS COUNTERS
my %fields_count=(
 $PFRMAT => 0,
 $AUTHOR => 0,
 $TARGET => 0,
 $METHOD => 0,
 $MODEL => 0,
 QMODE => 0,
 MRECORD => 0,
 $END => 0
);
#ADDITIONAL VARIABLES

my $status=0;
my $method=0;
my $num_line=0;
my $targ_val=0;
my $qmode_val=0;
my $error_num=0;
my $skip_lines=0;
# hash of released models: used to avoid duplicates
my %HASH_TS_MODELS; 

while (<>){ 
my $error=0;

# skip lines if model section consist with more than one line
if($skip_lines > 0) {$skip_lines--; next;}

$num_line++;
chomp;
$status=0;
 SWITCH: {
       /$PFRMAT/i && (!$m_section)  && do {
                                               $status=check_pfrmat($_);
                                               $fields_count{$PFRMAT}++;
                                               last SWITCH;
                                       };
       /$AUTHOR/i && (!$m_section)     && do {
                                               $status=check_author($_);
                                               $fields_count{$AUTHOR}++;
                                               last SWITCH;
                                       };
       /$TARGET/i && (!$m_section)     && do { $targ_val=check_target($_);
                                               if ($targ_val =~ /T\d{4}/i)
                                                       {$status=0;}
                                               $fields_count{$TARGET}++;
                                               last SWITCH;
                                       };
       /$METHOD/i && (!$m_section)     && do { $method++;
                                               $fields_count{$METHOD}++;
                                               last SWITCH;
                                       };
       /$MODEL/i  && (!$m_section)  && do { $m_section=1;
                                               $fields_count{$MODEL}++;
                                               $status=check_oheadf(%fields_count);
                                               if ($status eq 0) { $status=check_model($_);}
                                               last SWITCH;
                                       };
       /$QMODE/i  && ($m_section)      && do { $fields_count{QMODE}++;
                                               if ($fields_count{QMODE}>1)
                       {$status="ERROR!: Too many $QMODE records in the prediction!";}
                                               else {  $status=check_qmode($_);
                                                       $qmode_val=$status; }
                                               last SWITCH;
                                       };
       /$Mrecord/i  && ($m_section) && do {
            my @m_section_data = read_m_section(tell(ARGV) - length($_) - 1); 
            $fields_count{MRECORD}++;
#
# Commented printing of every line in QA prediction 
#            print strip_linebreaks($m_section_data[0]) . "\n";
            strip_linebreaks($m_section_data[0]);

	    $status=check_mrecord(strip_linebreaks($m_section_data[0]));
            $skip_lines = $m_section_data[1];
            last SWITCH;
      };
       /$REMARK/i                      && do { $status=check_remark($_);
                                               last SWITCH;
                                       };
       /$END/i    && ($m_section)      && do { $m_section=0;
                                               $fields_count{$END}++;
                                               last SWITCH;
                                       };
       /^\s*$/                         && do {
                                               last SWITCH;
                                       };
     $error=1;
  }
 if ($status !~ /^[012]$/) {
  #print STDERR "# ERROR! (Line no. $num_line) $status\n";
  print "# ERROR! (Line no. $num_line) $status\n";
  if ($error_num++ > 10){ print "# ERROR! Too many errors. Exiting...\n"; exit;}
 }
 if ($error) {
  print "# ERROR! (Line no. $num_line): Unknown or misplaced record in the following line: $_\n";
  if ($error_num++ > 10){ print "# ERROR! Too many errors. Exiting...\n"; exit;}
 }
}
check_eof(%fields_count);


#------------------------------------
#   Subroutines for parsing tokens:
#____________________________________


##
# read multiline m_section
# return array = (string m_section, integer skiplines_count)
##
sub read_m_section {
      my ($position) = @_;

      open(FH, $ARGV);
      seek(FH, $position, 0);
      my $m_section_string = '';
      $skip_lines = 0;
      my $find1st = 0;
      while(<FH>) {
            if($_ =~ /$Mrecord/) {
                 if ($find1st++) {return($m_section_string, $skip_lines);}
                  $m_section_string  = $_;
            } elsif ($_ =~ /$END/) {
#                  $skip_lines++;
                  return($m_section_string, $skip_lines);
            } else {
                  $m_section_string .= $_;
                  $skip_lines++;
            }
      }
      return ($m_section_string, $skip_lines);
}

sub check_pfrmat {
 my @params=@_;
 my @fields=split /\s+/, $params[0];
 my $length=scalar @fields;
 if ($length > 2){
 return "ERROR! Too many fields after PFRMAT";
 } elsif ($length < 2) { return "ERROR! Please specify type of prediction after PFRMAT";}
 if ($fields[1] =~ /^QA$/) {return 0;}
 else { return "ERROR! Check the type of prediction after PFRMAT - it must be QA";}
 return 0;
}

sub check_author {
 my @params=@_;
 my $param=$params[0];
 #This functionality movet to Submission.pm
 #if ($param =~ /$AUTHOR\s+\d{4}-\d{4}-\d{4}\s*$/i) { return 0;}
 #return "ERROR! Please check the line:  $param";
 return 0;
}

sub check_target {
 my @params=@_;
 my $param=$params[0];
 (my $keyword, my $value)= split (/\s+/, $param);
 if (($keyword eq 'TARGET') && ($value =~ /^T[0-9]{4}/i)) {
	if (! -f sprintf("./TARGETS/%s.seq.txt", $value)){
		return "ERROR! Check target name.\n";
	}
	return $value;
 }
 return " Please check the line:  $param";
}

sub check_model {
 my @params=@_;
 my $param=$params[0];
 if ($param =~ /$MODEL\s*[0-9]+\s*$/i) { return 0;}
 return "ERROR! Please check the line:  $param";
}

sub check_remark {
 return 0;
}

sub check_qmode {
 my @params=@_;
 my $param=$params[0];
 (my $keyword, my $value)= split (/\s+/, $param);
 if (($keyword eq 'QMODE') && ($value =~ /^[12]$/)) {return $value;}
 return " Please check the line:  $param";
}

sub check_mrecord {
 my @params=@_;
 my $param=$params[0];
 my @fields=split (/\s+/, $param) ;
 my $model = $fields[0];

 if (exists $HASH_TS_MODELS{$model}) {
    return "The duplicate record for model $model";
 }
 $HASH_TS_MODELS{$model} = 1;

#my $i=0;
#foreach my $www (@fields) {
#  $i++;
#  print "$i $www\n";
#}


# if ($model !~ /^$targ_val/){
#   return "Target number in the model identifier $model does not comply with the TARGET header value $targ_val";
# }

 if  ($#fields == 0){
 return " Quality score should occupy second column in the line and should be a real number between 0 and 1. Please check the line:  $param";
 } elsif ($#fields > 0){
 my $score = $fields[1];
 unless ($score =~ /^[01]$/i || $score =~ /^0\.\d+$/ || $score =~/^1\.0*$/){
  return " Quality score should be a real number between 0 and 1. Please check second column in this line: $score";
 }
 }

 if ($qmode_val eq 1){
  if ($#fields > 1){
    return " In QMODE 1 each record should contain two columns: model name and quality score. Check line: $param";
  }
 } elsif ($qmode_val eq 2) {
   (my $t1=$targ_val) =~ s/^t/T/;
   my $SeqFile="./TARGETS/".$t1.".seq.txt"; 
   my $SeqLen=0;
   #print $SeqFile;
   open TargSeq, $SeqFile || die "Could not open $SeqFile: $!";
   while (<TargSeq>){
     chomp;
     $_ =~ s/\s+$//;
     if ($_ =~ /^>.*\s+(\d+)\s+residues/) {
       #my @vv = split (/\s+/, $_);
       #$SeqLen=$vv[-2];
       $SeqLen = $1;
       last;
     }
   }
   close TargSeq;
   #print "\nseqlength: $SeqLen\n";
   my $columns = $#fields - 1;
   my $allcolumns = $#fields + 1;
   if ($SeqLen != $columns){
    return " Check data for $model *** Number of columns in the line: $allcolumns. In QMODE 2 each line should contain two first columns with model name and quality score and additionally one column for each of $SeqLen residues in the $targ_val sequence.";
   }

   my @res = @fields[2 .. $#fields];
   my $i=2;
   my $message=' Error estimate for residue location is a real number in format x.x . Please check: '; 
   my $nfe=0; 
   foreach my $record (@res) {
      $i++; 
     if (($record !~ /^\s*\d+\.\d+\s*$/) && ($record ne 'X')){
         $nfe++;
         $message.=" value $record in the column $i ** ";         
     }
#    unless ($record =~ /^\s*\d+\.\d+\s*$/ || $record eq 'X'){
#      return " Error estimate for residue location is a real number in format x.x . Please check the value: $record in the column $i";
#    }
   }
   if ($nfe>0){return "$message";}
 } else {
  return " Can not check the data if QMODE parameter is not defined or set to something different from 1 or 2.\n Please check the line:  $param";
 }
 return 0;
}

sub check_rr {
 return 0;
}

sub check_oheadf {
 my %fields_max_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 20000000, #some large number
 $MODEL => 1,
 MRECORD => 0,
 $END => 0,
 QMODE => 0
 );
 my %fields_min_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 1,
 $MODEL => 1,
 MRECORD => 0,
 $END => 0,
 QMODE => 0
 );
 my %params=@_;
 my $item;
 for  (keys %params) {
  if ($fields_max_count{$_} < $params{$_})
   {return "ERROR!: Too many $_ records in a prediction!";}
  elsif ($fields_min_count{$_} > $params{$_}) { return "ERROR! Record  $_ is missing - must be at least $fields_min_count{$_} present!";}
 }
 return 0;
}

sub check_eof {
 my %fields_max_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 2000000, #some large number
 $MODEL => 1,
 MRECORD => 1000, # 200groups x 5models
 $END => 1,
 QMODE => 1
 );
 my %fields_min_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 1,
 $MODEL => 1,
 MRECORD => 1,
 $END => 1,
 QMODE => 1
 );
 my %params=@_;
 my $item;
 foreach my $e (keys %params) {
  if ($fields_max_count{$e} < $params{$e})
   {print "# ERROR! Too many $e records in the prediction!\n";}
  elsif ($fields_min_count{$e} > $params{$e}) {
    if ($e eq 'MRECORD')
     {print "# ERROR! Your prediction is empty\n";}
    else
    {print "# ERROR! Record  $e is missing - at least $fields_min_count{$e} must be present!\n";}
  }
 }
 return 0;
}

##
# Remove line breaks characters from string
##
sub strip_linebreaks {
      my $string = shift;
      while($string =~ /\n+/m) {
            $string =~ s/\n+/ /m; #Changed to replace newline for whitespace
      }
      return $string;
}
