#!/usr/bin/perl
package SplitByParent;

#
# This module reads a _single_ e-mail message, containing a prediction
# with multiple PARENT sections, from the given file.  It writes every
# section to a separate file named "<input file>_<n>", repeating the
# header records at the top of each one and emitting them in the
# correct order: PFRMAT, TARGET, AUTHOR.
#

use strict;
use warnings;

use DBI;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Data::Dumper;

use lib qw(Core);
use Database;
use Configuration;

# Constructor.
#
#   $class   - class name to bless the new object into
#   $in_file - path of the prediction file; stored as _in_file
#
# Builds a Database handle from the connection settings found in $CONFIG
# and returns the blessed object.
sub new {
    my $class   = shift;
    my $in_file = shift;

    my %object = (
        _id       => undef,
        _database => Database->new(
            $CONFIG->{HOSTNAME},
            $CONFIG->{PORT},
            $CONFIG->{DATABASE},
            $CONFIG->{USERNAME},
            $CONFIG->{PASSWORD},
        ),
        _in_file  => $in_file,
    );

    # TODO: check the database connection and throw an exception when it
    # could not be established.

    return bless \%object, $class;
}

#
# process($in_file, $out_file)
#
# Splits one prediction file into one output file per PARENT section.
#
# Parameters:
#   $in_file  - path of the file to read.  Optional: when omitted, the
#               path that was passed to new() is used.
#   $out_file - accepted for interface compatibility only; not used.
#
# The input is consumed in three consecutive passes over the same handle:
#   1) mail header   - lines are skipped up to the first blank line or the
#                      first recognised prediction record;
#   2) record header - PFRMAT/FORMAT, TARGET and AUTHOR records are
#                      collected until a MODEL line (or, for DP
#                      predictions, the first line starting with a digit);
#   3) model data    - lines are accumulated and flushed to
#                      "<in_file>_<n>" at every TER line, every
#                      "REMARK Aligment" line and every END line that does
#                      not directly follow a TER line.
#
# Every output file has the form
#   REMARK PARENT number <n>
#   <PFRMAT record> <TARGET record> <AUTHOR record>   (one per line)
#   <model lines>
#   TER
#   END
#
# Clean-up: when more than one output file was written the input file is
# removed; otherwise "<in_file>_1" is removed, because a single section is
# the same as the prediction itself.
#
# Returns 0 (the value the shell-based clean-up of earlier versions
# normally yielded).  Dies when no input path is known, when the input
# cannot be opened, or when an output file cannot be written.
#
sub process {
    my ($self, $in_file, $out_file) = @_;

    # Keep the path given to new() when the caller does not pass one.
    $self->{_in_file} = $in_file if defined $in_file;
    my $path = $self->{_in_file};
    die "SplitByParent::process: no input file given"
        unless defined $path && length $path;

    # Record patterns.
    my $PFRMAT        = qr/^PFRMAT/;
    my $FORMAT        = qr/^FORMAT/;
    my $TARGET        = qr/^TARGET/;
    my $TARGET_QUERY  = qr/^TARGET\s+QUERY/;
    my $AUTHOR        = qr/^AUTHOR/;
    my $MODEL         = qr/^MODEL/;
    my $TER           = qr/^TER/;
    my $AUTCHK        = qr/^AUTCHK/;
    my $SCORE         = qr/^SCORE/;
    my $REMARK_AUTHOR = qr/^REMARK\s+AUTHOR/;
    # Needed for al2ts models.  The spelling is part of the matched text
    # and the pattern is deliberately left unanchored, as before.
    my $REMARK_ALIGN  = qr/REMARK Aligment/;
    my $END           = qr/^END/;
    my $HEADER_END    = qr/^$/;
    my $DIGIT_FIRST   = qr/^[0-9]/;
    my $PARENT        = qr/^PARENT/;
    # Records that are kept inside a model, but demoted to REMARK lines.
    my $AS_REMARK     = qr/^(?:SCORE|ORIG|SCALE|SEQRES|PARSRC)/;

    # Header records repeated at the top of every output file.
    my $pfrmat_record = '';
    my $pfrmat_id     = '';
    my $target_record = '';
    my $author_record = '';

    my @model_data = ();    # lines of the section currently being collected
    my $skip_pass2 = 0;     # set when pass 1 already ran into model data

    open(my $in, '<', $path)
        or die "SplitByParent::process: cannot open '$path' for reading: $!";

    # 1) Skip the mail header.  The pass ends at the first blank line or at
    #    the first prediction record, whichever comes first.  The order of
    #    the tests matters because $REMARK_ALIGN is not anchored.
    while (my $line = <$in>) {
        last if $line =~ $HEADER_END;
        next if $line =~ $PARENT;     # consumed, nothing to keep
        last if $line =~ $AUTCHK;

        if ($line =~ $PFRMAT || $line =~ $FORMAT) {
            ($pfrmat_record, $pfrmat_id) = _pfrmat_record($line);
            last;
        }
        if ($line =~ $TARGET_QUERY) {
            $target_record = _target_query_record($line);
            last;
        }
        if ($line =~ $TARGET) {
            $target_record = _target_record($line);
            last;
        }
        if ($line =~ $AUTHOR) {
            $author_record = $line;
            last;
        }
        if ($line =~ $REMARK_AUTHOR) {
            $author_record = 'AUTHOR ' . _field($line, 2) . "\n";
            last;
        }
        if ($line =~ $REMARK_ALIGN) {
            # The alignment remark opens the first section right away, so
            # there is no record header left to read in pass 2.
            push @model_data, 'PARENT ' . _field($line, 5) . "\n", $line;
            $skip_pass2 = 1;
            last;
        }
        last if $line =~ $SCORE;
        # Anything else is an ordinary mail header line: ignore it.
    }

    # 2) Collect the PFRMAT, TARGET and AUTHOR records of the prediction
    #    (unless pass 1 already reached the model data).  Every other line
    #    before the model is dropped.
    if (!$skip_pass2) {
        while (my $line = <$in>) {
            if ($line =~ $PFRMAT || $line =~ $FORMAT) {
                ($pfrmat_record, $pfrmat_id) = _pfrmat_record($line);
            }
            elsif ($line =~ $TARGET_QUERY) {
                $target_record = _target_query_record($line);
            }
            elsif ($line =~ $TARGET) {
                $target_record = _target_record($line);
            }
            elsif ($line =~ $REMARK_AUTHOR) {
                $author_record = 'AUTHOR ' . _field($line, 2) . "\n";
            }
            elsif ($line =~ $AUTHOR) {
                $author_record = $line;
            }
            elsif ($line =~ $DIGIT_FIRST && $pfrmat_id eq 'DP') {
                # DP predictions carry no MODEL line: the first numbered
                # line is already data; fields 2-4 of it are kept.
                push @model_data,
                    join(' ', map { _field($line, $_) } 1 .. 3) . "\n";
                last;
            }
            elsif ($line =~ $MODEL) {
                last;    # the MODEL line itself is not written out
            }
        }
    }

    # 3) Collect the model data and flush one file per section.
    my @header         = ($pfrmat_record, $target_record, $author_record);
    my $parent_counter = 0;
    my $prev_was_ter   = 0;
    while (my $line = <$in>) {
        my $is_ter   = $line =~ $TER          ? 1 : 0;
        my $is_align = $line =~ $REMARK_ALIGN ? 1 : 0;

        if ($line =~ $PARENT) {
            push @model_data, $line;
        }
        elsif ($line =~ $AS_REMARK) {
            push @model_data, "REMARK $line";
        }
        elsif ($is_ter || $is_align || ($line =~ $END && !$prev_was_ter)) {
            # End of a section: write it out and start the next one.
            $parent_counter++;
            _write_parent_file($path, $parent_counter, \@header, \@model_data);
            @model_data = ();
            if ($is_align) {
                push @model_data, 'PARENT ' . _field($line, 5) . "\n", $line;
            }
        }
        elsif ($line =~ $END) {
            last;    # END directly after TER closes the prediction
        }
        else {
            push @model_data, $line;
        }

        $prev_was_ter = $is_ter;
    }

    close($in);

    # Remove the redundant file.  unlink() is used instead of a shell
    # command so that unusual characters in the path are harmless.
    my $obsolete = $parent_counter > 1 ? $path : $path . '_1';
    if (!unlink $obsolete) {
        # A file that does not exist is fine (that is what "rm -f" did).
        warn "SplitByParent::process: cannot remove '$obsolete': $!\n"
            unless $!{ENOENT};
    }

    #TODO:
    #WRITE TO DB RESULTS

    return 0;
} #End of process

# Returns whitespace-separated field number $index (0-based) of $line, or
# the empty string when the line has fewer fields.
sub _field {
    my ($line, $index) = @_;
    my @fields = split ' ', $line;
    return defined $fields[$index] ? $fields[$index] : '';
}

# Returns (record, format id) for a PFRMAT or FORMAT line; a FORMAT line is
# rewritten into the equivalent PFRMAT record.
sub _pfrmat_record {
    my ($line) = @_;
    my $id     = _field($line, 1);
    my $record = $line =~ /^FORMAT/ ? "PFRMAT $id\n" : $line;
    return ($record, $id);
}

# Returns the TARGET record for a "TARGET QUERY <id> [<extra>]" line, i.e.
# the same line without the QUERY keyword.
sub _target_query_record {
    my ($line) = @_;
    my @fields = split ' ', $line;
    return "TARGET $fields[2] $fields[3]\n" if @fields > 3;
    return 'TARGET ' . (defined $fields[2] ? $fields[2] : '') . "\n";
}

# Returns the TARGET line unchanged when its id starts with one of the
# accepted letters, otherwise the line with " error" appended.
sub _target_record {
    my ($line) = @_;
    return $line if $line =~ /^TARGET\s+[HOTRSNFAXL]/;
    chomp $line;
    return "$line error\n";
}

# Writes section number $number to "<path>_<number>": a REMARK naming the
# section, the header records, the model lines and a closing TER/END pair.
# Dies when the file cannot be created or completely written.
sub _write_parent_file {
    my ($path, $number, $header, $model_data) = @_;
    my $out_path = $path . '_' . $number;

    open(my $out, '>', $out_path)
        or die "SplitByParent::process: cannot open '$out_path' for writing: $!";
    print {$out} "REMARK PARENT number $number\n",
                 @{$header}, @{$model_data}, "TER\nEND\n"
        or die "SplitByParent::process: cannot write '$out_path': $!";
    close($out)
        or die "SplitByParent::process: cannot close '$out_path': $!";
    return;
}

1;
