#!/usr/bin/perl
use warnings;
use strict;

#STANDARD FORMAT TOKENS:
my $PFRMAT      ="^PFRMAT";
my $TARGET      ="^TARGET";
my $AUTHOR      ="^AUTHOR";
my $MODEL       ="^MODEL";
my $METHOD      ="^METHOD";
my $SCORE       ="^SCORE";
my $REMARK      ="^REMARK";
my $QMODE       ="^QMODE";
my $Mrecord     ="^[THCR].*TS";
my $Mrecord1    ="pdb\\s*\$";
my $END         ="^END";
my $HEADER_END  ="^\$";
my $NUMBER_IN_FIRST_COLL="^[0-9]";
my $PARENT      ="^PARENT";

#STATES OF THE LEXICAL ANALYZER
my $m_section=0; #1 - if in MODEL section

#OBLIGATORY FIELDS COUNTERS
my %fields_count=(
 $PFRMAT => 0,
 $AUTHOR => 0,
 $TARGET => 0,
 $METHOD => 0,
 $MODEL => 0,
 QMODE => 0,
 MRECORD => 0,
 MRECORD1 => 0,
 QMODE3_mdls => 0,
 $END => 0
);
#ADDITIONAL VARIABLES

my $status=0;
my $method=0;
my $num_line=0;
my $targ_val=0;
my $qmode_val=0;
my $qmode3_rec=0;
my $model_val=0;
my $error_num=0;
my $skip_lines=0;
my $length4tell=0;
# hash of released models: used to avoid duplicates
my %HASH_TS_MODELS; 

while (<>){
my $error=0;

# skip lines if model section consist with more than one line


if($skip_lines > 0) {
  $skip_lines--; next;
}

$num_line++;

$length4tell=length($_);
$_ =~ s/^\s+//; 
$_ =~ s/\s+$//; #removal of spaces here changes the length of $_, 
                #consider that when using it in conjunction with tell(ARGV) 

#chomp;

$status=0;
 SWITCH: {
       /$PFRMAT/i && (!$m_section)  && do {
                                               $status=check_pfrmat($_);
                                               $fields_count{$PFRMAT}++;
                                               last SWITCH;
                                       };
       /$AUTHOR/i && (!$m_section)     && do {
                                               $status=check_author($_);
                                               $fields_count{$AUTHOR}++;
                                               last SWITCH;
                                       };
       /$TARGET/i && (!$m_section)     && do { $targ_val=check_target($_);
                                               if ($targ_val =~ /[TRH]\d{4}/i)
                                                       {$status=0;}
                                               $fields_count{$TARGET}++;
                                               last SWITCH;
                                       };
       /$METHOD/i && (!$m_section)     && do { $method++;
                                               $fields_count{$METHOD}++;
                                               last SWITCH;
                                       };
       /$MODEL/i  && (!$m_section)  && do { $m_section=1;
                                               $fields_count{$MODEL}++;
                                               $status=check_oheadf(%fields_count);
                                               if ($status eq 0) { $status=check_model($_);}
					       $model_val=$status;
					       if($model_val<0){ 
 			$status="Please check model number in the MODEL record";
						}
                                               last SWITCH;
                                       };
       /$QMODE/i  && ($m_section)      && do { $fields_count{QMODE}++;
                                               if ($fields_count{QMODE}>1)
                       {$status="ERROR!: Too many $QMODE records in the prediction!";}
                                               else {  $status=check_qmode($_);
                                                       $qmode_val=$status;
					               if($qmode_val<0){ 
 			$status="Please check the QMODE record, which should be set to 1, 2 or 3.";
				       }
						if ($qmode_val==3 && $model_val!=2){
			$status="MODEL number should be set to 2 in QMODE 3";
					} elsif (($qmode_val==1 || $qmode_val==2) && $model_val!=1){
			$status="MODEL number should be set to 1 in QMODE 1 or 2";
			        }
					       }
                                               last SWITCH;
                                       };
       (/$Mrecord/i  || /$Mrecord1/i) && ($m_section) && do {

            #print("Str: $_ Length: ".length($_)." length4tell: $length4tell tell(ARGV): ".tell(ARGV)."\n"); 
            my @m_section_data = read_m_section(tell(ARGV) - $length4tell ); 
            $fields_count{MRECORD}++;
	    $status=check_mrecord(strip_linebreaks($m_section_data[0]));

# Printing of every line in QA prediction 
            #print "AFTER READING DATA LINE: " . strip_linebreaks($m_section_data[0]) . " LINE#: " . $fields_count{MRECORD} . " COUNT REC: " . $fields_count{QMODE3_mdls} . "\n";


  	    $skip_lines = $m_section_data[1];
            if (/$Mrecord1/i) {$fields_count{MRECORD1}++;}
            last SWITCH;
      };
       /$REMARK/i                      && do { $status=check_remark($_);
                                               last SWITCH;
                                       };
       /$END/i    && ($m_section)      && do { $m_section=0;
                                               $fields_count{$END}++;
                                               last SWITCH;
                                       };
       /^\s*$/                         && do {
                                               last SWITCH;
                                       };
     $error=1;
  }
 if ($status !~ /^[0123]$/) {
  #print STDERR "# ERROR! (Line no. $num_line) $status\n";
  print "# ERROR! (Line no. $num_line) $status\n";
  if ($error_num++ > 10){ print "# ERROR! Too many errors. Exiting...\n"; exit;}
 }
 if ($error) {
  print "# ERROR! (Line no. $num_line): Unknown or misplaced record in the following line: $_\n";
  if ($error_num++ > 10){ print "# ERROR! Too many errors. Exiting...\n"; exit;}
 }
}
check_eof(%fields_count);


#------------------------------------
#   Subroutines for parsing tokens:
#____________________________________


##
# read multiline m_section
# return array = (string m_section, integer skiplines_count)
##
sub read_m_section {
      my ($position) = @_;

      open(FH, $ARGV);
      seek(FH, $position, 0);
      my $m_section_string = '';
      $skip_lines = 0;
      my $find1st = 0;
      my $first = 1;
      while(<FH>) {
            if($_ =~ /$Mrecord/){ # || $_ =~ /$Mrecord1/) {

                 if ($find1st++) { return($m_section_string, $skip_lines);}
                 $m_section_string  .= $_;
                 if ($first) {$first=0;}
            } elsif ($_ =~ /$Mrecord1/){
                   $m_section_string  .= $_;
                   if (!$first) {
			$skip_lines++;
			}
                   if ($first) {$first=0;}  
            }elsif ($_ =~ /$END/) {
             #     $skip_lines++;
                  return($m_section_string, $skip_lines);
            } else {
                  if ($_ !~ /^\s*$/){
                    $m_section_string .= $_;
                    $skip_lines++;
                  }
            }
      }
      return ($m_section_string, $skip_lines);
}

sub check_pfrmat {
 my @params=@_;
 my @fields=split /\s+/, $params[0];
 my $length=scalar @fields;
 if ($length > 2){
 return "ERROR! Too many fields after PFRMAT";
 } elsif ($length < 2) { return "ERROR! Please specify type of prediction after PFRMAT";}
 if ($fields[1] =~ /^QA$/) {return 0;}
 else { return "ERROR! Check the type of prediction after PFRMAT - it must be QA";}
 return 0;
}

sub check_author {
 my @params=@_;
 my $param=$params[0];
 #This functionality movet to Submission.pm
 #if ($param =~ /$AUTHOR\s+\d{4}-\d{4}-\d{4}\s*$/i) { return 0;}
 #return "ERROR! Please check the line:  $param";
 return 0;
}

sub check_target {
 my @params=@_;
 my $param=$params[0];
 (my $keyword, my $value)= split (/\s+/, $param);
#AK2024 - added H
 if (($keyword eq 'TARGET') && ($value =~ /^[RTHC][0-9]{4}/i)) {
	if (! -f sprintf("./TARGETS/%s.seq.txt", $value)){
		return "ERROR! Check target name.\n";
	}
	return $value;
 }
 return " Please check the line:  $param";
}

sub check_model {
 my @params=@_;
 my $param=$params[0];
 if ($param =~ /$MODEL\s*([0-9]+)\s*$/i) { return $1;}
 return -1;
}

sub check_remark {
 return 0;
}

sub check_qmode {
 my @params=@_;
 my $param=$params[0];
 (my $keyword, my $value)= split (/\s+/, $param);
 if (($keyword eq 'QMODE') && ($value =~ /^[1-3]$/)) {return $value;}
 return -1;
}

sub check_mrecord {
 my @params=@_;
 my $param=$params[0];
 my @fields=split (/\s+/, $param);
 my $model = $fields[0];

 if (exists $HASH_TS_MODELS{$model}) {
    return "ERROR! Duplicate record for model $model";
 }
 $HASH_TS_MODELS{$model} = 1;

# if ($model !~ /^$targ_val/){
#   return "Target number in the model identifier $model does not comply with the TARGET header value $targ_val";
# }

 if (($qmode_val eq 1) || ($qmode_val eq 2)){
 if  ($#fields < 2){
  return " $param: Each line in QA prediction should contain at least 3 columns: model name, overall multimer model accuracy score and interface accuracy score.";
 } elsif ($#fields > 1){
  my $score = $fields[1];
  unless ($score =~ /^[01]$/i || $score =~ /^0\.\d+$/ || $score =~/^1\.0*$/ || uc($score) eq 'X'){
   return " $model: Overall accuracy score should be real number between 0 and 1 or 'X' if you do not provide this score. Please check second column $score";
  }
  my $qscore = $fields[2];
  unless ($qscore =~ /^[01]$/i || $qscore =~ /^0\.\d+$/ || $qscore =~/^1\.0*$/ || uc($qscore) eq 'X'){
   return " $model: Interface accuracy score should be real number between 0 and 1 or 'X' if you do not provide this score. Please check third column $qscore";
  }
 }

 if ($qmode_val eq 1){
  if ($#fields != 2){
    return " $param: In QMODE 1 each record should contain 3 columns: TS model name, overall nultimer model accuracy score, and interface accuracy score.";
  }
 } elsif ($qmode_val eq 2) {
# NOt sure it is still needed
   (my $t1=$targ_val) =~ s/^t/T/;
   my $SeqFile="./TARGETS/".$t1.".seq.txt"; 
   my $nfe=0; 
   my $nfe1=0; 
   # hash of reidues in interfaces: used to avoid duplicates
   my %HASH_res=(); 
   my @res = @fields[3 .. $#fields];
   #if ($#res<2) {
#	$nfe++;
#        return " $model: Interface should contain at least 2 residues. Please put A0:0.0 B0:0.0 in columns 4 and 5 if a model has no interface contacts.";
#   }
   my $i=3;
   my $message=" $model: Reliability scores for interface residues should be in format An:x.xx, where A is chain ID, n - residue number, and x.xx - interface reliability score. Please check"; 
   my $message1=" $model: Duplicate interface residues: "; 
   foreach my $record (@res) {
     $i++; 
     if ($record =~ /^\s*(\S\d+):[01]\.*\d*\s*$/){
       if (exists $HASH_res{$1}) {
	$nfe1++;
        $message1.=" residue $1 ** ";
       }
       $HASH_res{$1} = 1;
     } else {
         $nfe++;
         $message.=" record $record in column $i ** ";         
     } 
#    unless ($record =~ /^\s*\d+\.\d+\s*$/ || $record eq 'X'){
#      return " Error estimate for residue location is a real number in format x.x . Please check the value: $record in the column $i";
#    }
   }
   my $messageL='';
   if ($nfe){$messageL.="$message";}
   if ($nfe1){$messageL.="\n$message1";}
   if ($nfe+$nfe1>0){return "$messageL";}
 } 
 } elsif ($qmode_val eq 3) {
    my %HASH_fields=();
    foreach my $value (@fields) {
      # print "===$value\n";
      if ($value =~ /\.pdb$/ ){
        if (exists $HASH_fields{$value}) {
           return "Duplicated model $value.\n";       
        }else{ 
            $HASH_fields{$value} = 1;
        }
        if (++$qmode3_rec >5){
            return " Only 5 records are expected in QMODE 3.\n";      
        };
	$fields_count{QMODE3_mdls}++;
      } else {
            return "Data record $value is not in the <MODEL>.pdb format.\n";      
      }
    }
    if ($fields_count{QMODE3_mdls}<5) {
       return "Five records are expected in QMODE 3.\n";      
    };
 } else {
  return " $param: ERROR! Can not check the data if QMODE parameter is not defined or set to something different from 1, 2 or 3.\n";
 }
 return 0;
}

sub check_rr {
 return 0;
}

sub check_oheadf {
 my %fields_max_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 20000000, #some large number
 $MODEL => 1,
 MRECORD => 0,
 MRECORD1 => 0,
 QMODE3_mdls => 0,
 $END => 0,
 QMODE => 0
 );
 my %fields_min_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 1,
 $MODEL => 1,
 MRECORD => 0,
 MRECORD1 => 0,
 QMODE3_mdls => 0,
 $END => 0,
 QMODE => 0
 );
 my %params=@_;
 my $item;
 for  (keys %params) {
  if ($fields_max_count{$_} < $params{$_})
   {return "ERROR!: Too many $_ records in a prediction!";}
  elsif ($fields_min_count{$_} > $params{$_}) { return "ERROR! Record  $_ is missing - must be at least $fields_min_count{$_} present!";}
 }
 return 0;
}

sub check_eof {
 my %fields_max_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 2000000, #some large number
 $MODEL => 1,
 MRECORD => 1000, # 200groups x 5models
 MRECORD1 => 5, # 
 QMODE3_mdls => 5,
 $END => 1,
 QMODE => 1
 );
 my %fields_min_count=(
 $PFRMAT => 1,
 $AUTHOR => 1,
 $TARGET => 1,
 $METHOD => 1,
 $MODEL => 1,
 MRECORD => 1,
 MRECORD1 => 0,
 QMODE3_mdls => 0,
 $END => 1,
 QMODE => 1
 );
 my %params=@_;
 my $item;
 foreach my $e (keys %params) {
  #print "e=$e, max=$fields_max_count{$e}, $params{$e}\n";
  if ($fields_max_count{$e} < $params{$e}) {
	if ($e eq 'MRECORD1') {
		print "# ERROR! A prediction in QMODE 3 should contain only one line with model rankings\n";
	} else {
		print "# ERROR! Too many $e records in the prediction!\n";
	}
  } elsif ($fields_min_count{$e} > $params{$e}) {
    if ($e eq 'MRECORD')
     {print "# ERROR! Your prediction is empty\n";}
    else
    {print "# ERROR! Record  $e is missing - at least $fields_min_count{$e} must be present!\n";}
  }
 }
 return 0;
}

##
# Remove line breaks characters from string
##
sub strip_linebreaks {
      my $string = shift;
    #  while($string =~ /\n+/m) {
            $string =~ s/\n+/ /g; #Changed to replace newline for whitespace
            $string =~ s/\s+$//g; #remove whitespaces at the end of the line
            $string =~ s/\s+/ /g;
     # }
      return $string;
}
