#!/usr/bin/perl
package Rearange;

#
# This script reads a _single_ e-mail message with multiple predictions
# from the given file, and makes a mailbox file with the headers
# repeated for each prediction, and corrects the order of the
# PFRMAT, TARGET and AUTHOR fields.
#

use strict;
use warnings;

use DBI;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Data::Dumper;

use lib qw(Core);
use Database;
use Configuration;

# Constructor.
#   $in_file  - path of the message file that will be read
#   $out_file - path prefix stored for the files produced later
# A Database object is created from the connection settings held in $CONFIG.
# Returns the new blessed object.
sub new {
    my ($class, $in_file, $out_file) = @_;

    # Connection settings, in the order Database->new is called with them.
    my @db_args = map { $CONFIG->{$_} } qw(HOSTNAME PORT DATABASE USERNAME PASSWORD);

    # check database connection if not connect throw an exception

    return bless {
        _id       => undef,
        _database => Database->new(@db_args),
        _in_file  => $in_file,
        _out_file => $out_file
    }, $class;
}

### Returns 1 if the models are numbered properly, no model number is
### repeated and all TARGET lines name the same target.
### Returns 0 otherwise: a MODEL line without a numeric index, a duplicated
### index, differing targets, or an input file that cannot be opened
### (a warning is printed in that last case).
#: BM
sub is_properly_divided {
    my ($self) = @_;

    my $in_path = $self->{_in_file};
    my $in_fh;
    # A file that cannot be read must not be reported as well-formed.
    if (!defined $in_path || !open($in_fh, '<', $in_path)) {
        my $shown = defined $in_path ? $in_path : '(undefined)';
        warn "Rearange: cannot read '$shown': $!\n";
        return 0;
    }

    my $result = 1;
    my $first_target;    # target named by the first TARGET line
    my %seen_model;      # model indexes met so far (compared as strings)

    LINE:
    while (my $line = <$in_fh>) {
        if ($line =~ /^MODEL/) {
            my ($model_index) = $line =~ /^MODEL\s+(\d+)/;

            # Either the MODEL line has no numeric index, or the index
            # has already been used by an earlier model.
            if (!defined $model_index || exists $seen_model{$model_index}) {
                $result = 0;
                last LINE;
            }
            $seen_model{$model_index} = $model_index;
        }

        if ($line =~ /^TARGET\s+(\S+)/) {
            my $target = $1;
            $first_target = $target if !defined $first_target;

            # different targets
            if ($first_target ne $target) {
                $result = 0;
                last LINE;
            }
        }
    }

    close $in_fh;
    return $result;
}

# Writes the given lines to $path and then makes the file readable and
# writable by everyone (the effect of "chmod a+rw", done without a shell).
# Returns 1 on success and 0, after a warning, when the file cannot be
# opened or completely written.
sub _write_prediction_file {
    my ($path, $lines) = @_;

    my $out_fh;
    if (!open($out_fh, '>', $path)) {
        warn "Rearange: cannot write '$path': $!\n";
        return 0;
    }
    print {$out_fh} @{$lines};
    if (!close($out_fh)) {
        warn "Rearange: cannot finish writing '$path': $!\n";
        return 0;
    }

    my $mode = (stat($path))[2];
    if (defined $mode) {
        chmod(($mode & 07777) | 0666, $path)
            or warn "Rearange: cannot change permissions of '$path': $!\n";
    }
    return 1;
}

# Splits the message in $self->{_in_file} into one file per model.
#
# Each model is written to "<_out_file>_<model number>" with its records in
# the order PFRMAT, TARGET, AUTHOR, REMARKs, METHODs, SCORE, IMODE, MODEL,
# PARENT, model data, END.  A model whose format is unknown or whose number
# is not 1-5 is written with the mail header and a
# TOO_MANY_MODELS_OR_WRONG_FORMAT marker in front of it.
#
# Returns the list of model numbers whose files were written, in file order.
# The list is empty when is_properly_divided() rejects the input or the input
# cannot be opened.
sub process {
    my ($self) = @_;

    my @models = ();
    if ($self->is_properly_divided() == 0) {
        return @models;
    }

    # The parser below works on $_; keep the caller's $_ intact.
    local $_;

    # Line classifiers.
    my $re_pfrmat        = qr/^PFRMAT/;
    my $re_format        = qr/^FORMAT/;
    my $re_target        = qr/^TARGET/;
    my $re_target_query  = qr/^TARGET\s+QUERY/;
    my $re_target_valid  = qr/^TARGET\s+[HOTRNSFAXL]/;
    my $re_author        = qr/^AUTHOR/;
    my $re_model         = qr/^MODEL/;
    my $re_model_number  = qr/^MODEL\s*(\S+)/;
    my $re_imode         = qr/^IMODE/;
    my $re_method        = qr/^METHOD/;
    my $re_autchk        = qr/^AUTCHK/;
    my $re_score         = qr/^SCORE/;
    my $re_remark_author = qr/^REMARK\s+AUTHOR/;
    my $re_remark        = qr/^REMARK/;
    my $re_end           = qr/^END\s*$/;
    my $re_header_end    = qr/^$/;
    my $re_leading_digit = qr/^[0-9]/;
    my $re_parent        = qr/^PARENT/;
    my $re_parsrc        = qr/^PARSRC/;
    my $re_orig          = qr/^ORIG/;
    my $re_scale         = qr/^SCALE/;
    my $re_seqres        = qr/^SEQRES/;

    # Records remembered from the input; they carry over from one model to
    # the next until a later line replaces them.
    my $pfrmat_value = "";
    my $pfrmat_id    = "";
    my $target_value = "";
    my $author_value = "";
    my $imode_value  = undef;
    my $model_value  = "";
    my $parent_value = "";

    my @rest_of_pred_head = ();    # unrecognised lines, stored as REMARKs
    my @header_lines      = ();    # text before the first keyword
    my @model_data        = ();
    my @score_record      = ();
    my @method_records    = ();
    my @remarks           = ();

    # Number of PFRMAT/MODEL lines seen per format, and the allowed maximum.
    my %count_models = (
        DP => 0,
        RR => 0,
        TS => 0,
        DR => 0,
        AL => 0,
        QA => 0,
        FN => 0,
        IA => 0
    );
    my %count_limit = (
        DP => 5,
        RR => 5,
        TS => 5,
        DR => 5,
        AL => 5,
        QA => 5,
        FN => 5,
        IA => 5
    );

    my $model_in_file   = 1;
    my $method_new      = 1;    # next METHOD line starts a fresh METHOD list
    my $renumber_models = 0;    # set once a MODEL line without a number is met
    my $model_num       = 0;    # running number used when renumbering
    my $model_number    = 0;

    # A TARGET line is kept as is when its value starts with one of the
    # accepted letters; otherwise " error" is appended to it.
    my $checked_target = sub {
        my ($line) = @_;
        return $line if $line =~ $re_target_valid;
        chomp $line;
        return "$line error\n";
    };

    my $in_path = $self->{_in_file};
    my $in_fh;
    if (!defined $in_path || !open($in_fh, '<', $in_path)) {
        my $shown = defined $in_path ? $in_path : '(undefined)';
        warn "Rearange: cannot read '$shown': $!\n";
        return @models;
    }
    my $out_file_name = $self->{_out_file};

    #1) Get and store the mail header: everything up to the first blank line
    #   or the first prediction keyword.
    while (<$in_fh>) {
        if (/$re_header_end/) {
            push(@header_lines, $_);
            last;
        }
        elsif (/$re_parent/) {
            $parent_value = $_;
        }
        elsif (/$re_autchk/) {
            last;
        }
        elsif (/$re_pfrmat/ || /$re_format/) {
            $pfrmat_value = $_;
            (undef, $pfrmat_id) = split;
            if (/$re_format/) {
                $pfrmat_value = "PFRMAT $pfrmat_id\n";
            }
            $count_models{$pfrmat_id}++;
            last;
        }
        elsif (/$re_target_query/) {
            my @fields = split;
            $target_value = "TARGET $fields[2] $fields[3]\n";
            last;
        }
        elsif (/$re_target/) {
            $target_value = $checked_target->($_);
            last;
        }
        elsif (/$re_author/) {
            $author_value = $_;
            last;
        }
        elsif (/$re_imode/) {
            $imode_value = $_;
            last;
        }
        elsif (/$re_remark_author/) {
            my @fields = split;
            $author_value = "AUTHOR $fields[2]\n";
            last;
        }
        elsif (/$re_score/) {
            push(@score_record, "REMARK $_");
            last;
        }
        else {
            push(@header_lines, $_);
        }
    }

    while ($model_in_file == 1) {

        #2) Get the PFRMAT, TARGET and AUTHOR records of the prediction
        #   (if not already received) up to and including its MODEL line.
        while (<$in_fh>) {
            # Step 5 already consumed the MODEL line of this prediction:
            # take exactly one more line here, then go on to the model data.
            if ($model_value ne "") {
                if (/$re_score/) {
                    @score_record = ("REMARK $_");
                }
                else {
                    push(@model_data, $_);
                }
                last;
            }

            if (/$re_autchk/) {
                # AUTCHK lines are dropped.
            }
            elsif (/$re_parent/) {
                $parent_value = $_;
            }
            elsif (/$re_score/) {
                @score_record = ("REMARK $_");
            }
            elsif (/$re_pfrmat/ || /$re_format/) {
                $pfrmat_value = $_;
                (undef, $pfrmat_id) = split;
                if (/$re_format/) {
                    $pfrmat_value = "PFRMAT $pfrmat_id\n";
                }
            }
            elsif (/$re_target_query/) {
                my @fields = split;
                if ((scalar @fields) > 3) {
                    $target_value = "TARGET $fields[2] $fields[3]\n";
                }
                else {
                    $target_value = "TARGET $fields[2]\n";
                }
            }
            elsif (/$re_target/) {
                $target_value = $checked_target->($_);
            }
            elsif (/$re_imode/) {
                $imode_value = $_;
            }
            elsif (/$re_author/ || /$re_remark_author/) {
                if (/$re_remark_author/) {
                    my @fields = split;
                    $author_value = "AUTHOR $fields[2]\n";
                }
                else {
                    $author_value = $_;
                }
            }
            elsif (/$re_method/) {
                if ($method_new == 1) {
                    $method_new     = 0;
                    @method_records = ();
                }
                push(@method_records, $_);
            }
            elsif (/$re_leading_digit/ && ($pfrmat_id eq "DP")) {
                # A DP prediction whose data starts without a MODEL line.
                # Only the MODEL record is synthesised; the model number used
                # by the limit check in step 4 is left unchanged.
                my @cols = split;
                push(@model_data, "$cols[1] $cols[2] $cols[3]\n");
                push(@method_records, "METHOD -------------\n");
                $model_value = "MODEL 1\n";
                last;
            }
            elsif (/$re_remark/) {
                push(@remarks, $_);
            }
            elsif (!(/$re_model/)) {
                push(@rest_of_pred_head, "REMARK $_");
            }
            else {
                # The MODEL line itself.
                $count_models{$pfrmat_id}++;
                if ($renumber_models == 0) {
                    $model_value = $_;
                    (undef, $model_number) = split;
                    # The first MODEL line without a number switches to
                    # sequential renumbering for the rest of the file.
                    if (!defined $model_number || $model_number eq "") {
                        $renumber_models = 1;
                    }
                }
                if ($renumber_models) {
                    $model_num++;
                    $model_number = $model_num;
                    $model_value  = "MODEL $model_num\n";
                }
                last;
            }
        }

        #3) Get MODEL data up to the END line
        my $parent_in_model = 0;
        while (<$in_fh>) {
            if (/$re_remark/) {
                push(@model_data, $_);
            }
            elsif (/$re_parent/) {
                push(@model_data, $_);
                $parent_in_model = 1;
            }
            elsif (/$re_score/ || /$re_orig/ || /$re_scale/ || /$re_seqres/ || /$re_parsrc/) {
                push(@model_data, "REMARK $_");
            }
            elsif (/$re_end/) {
                last;
            }
            else {
                push(@model_data, $_);
            }
        }

        #4) Save the model to its own file.  It is accepted when the format
        #   is known, the model number is 1-5 and the limit is not exceeded;
        #   otherwise it is saved with the rejection marker.
        my $file_path  = "";
        my $model_label;
        my @out_lines;
        if ((exists $count_limit{$pfrmat_id})
            && ($model_number =~ /^[1-5]$/)
            && ((($count_limit{$pfrmat_id} + 1) > $model_number)
                || (($count_limit{$pfrmat_id} + 1) > $count_models{$pfrmat_id})))
        {
            $model_label = '0';
            if ($model_value =~ $re_model_number) {
                $model_label = $1;
            }
            $file_path = sprintf("%s_%s", $out_file_name, $model_label);

            # The text above the first keyword is not copied into an
            # accepted prediction.
            @out_lines = (
                $pfrmat_value,
                $target_value,
                $author_value,
                @remarks,
                @method_records,
                "METHOD -------------\n",
                @score_record
            );
            push(@out_lines, $imode_value) if defined($imode_value);
            push(@out_lines, $model_value);
            # Repeat the remembered PARENT only if the model has none itself.
            push(@out_lines, $parent_value) if $parent_in_model == 0;
            push(@out_lines, @model_data, "END\n");
        }
        else {
            $model_label = $model_number;
            $file_path = sprintf("%s_%s", $out_file_name, $model_number);

            @out_lines = (
                @header_lines,
                "TOO_MANY_MODELS_OR_WRONG_FORMAT\n",
                $pfrmat_value,
                $target_value,
                $author_value,
                @rest_of_pred_head,
                @remarks,
                @method_records,
                "METHOD -------------\n",
                @score_record
            );
            push(@out_lines, $imode_value) if defined($imode_value);
            push(@out_lines, "MODEL $model_number\n", $parent_value, @model_data, "END\n");
        }
        push(@out_lines, "\n\n\n");

        # Report a model only when its file really exists.
        if (_write_prediction_file($file_path, \@out_lines)) {
            push(@models, $model_label);
        }

        # Forget the per-model data; PFRMAT, TARGET, AUTHOR, PARENT, METHOD
        # and SCORE records stay in force for the next model.
        @rest_of_pred_head = ();
        @model_data        = ();
        @remarks           = ();
        $model_value       = "";

        #5) Check if there is another model in the file
        $method_new    = 1;
        $model_in_file = 0;
        while (<$in_fh>) {
            if (/$re_parent/) {
                $parent_value = $_;
            }
            elsif (/$re_pfrmat/ || /$re_format/) {
                $pfrmat_value      = $_;
                @rest_of_pred_head = ();
                $model_in_file     = 1;
                (undef, $pfrmat_id) = split;
                if (/$re_format/) {
                    $pfrmat_value = "PFRMAT $pfrmat_id\n";
                }
                last;
            }
            elsif (/$re_target/) {
                $target_value  = $_;
                $model_in_file = 1;
                last;
            }
            elsif (/$re_author/) {
                $author_value  = $_;
                $model_in_file = 1;
                last;
            }
            elsif (/$re_method/) {
                if ($method_new == 1) {
                    $method_new     = 0;
                    @method_records = ();
                }
                push(@method_records, $_);
            }
            elsif (/$re_score/) {
                @score_record  = ("REMARK $_");
                $model_in_file = 1;
                last;
            }
            elsif (/$re_model/) {
                $count_models{$pfrmat_id}++;
                $model_value = $_;
                if ($model_value =~ $re_model_number) {
                    $model_number = $1;
                }
                $model_in_file = 1;
                last;
            }
        }
    } #end while $model_in_file

    close $in_fh;

    return @models;

} #End of process


1;
