package MultimerTargetVerificationManager;

# The class deals with verification of quaternary structure predictions
# targets in format Hxxxx or Oxxxx

use strict;
use warnings;
use Cwd 'abs_path';

use DBI; 
#use Digest::MD5 qw(md5 md5_hex md5_base64);
#use Data::Dumper;

use lib qw(Core);
use Database;
use Configuration;
use PDBUtils qw(%AMINOACIDS_LONG_TO_SHORT %AMINOACIDS_SHORT_TO_LONG %AMINOACIDS_ATOMS splitSequences clean_model);

use lib qw(Classes);
use TargetsManager;
use Submission;
use LocalConfiguration;

my $DEBUG = 0;


# constructor with single argument - file with prediction
sub new {
    my ($class, $target, $submission) = @_;
    my $self = {
        _id => undef,
        _database => Database->new($CONFIG->{HOSTNAME}, $CONFIG->{PORT}, $CONFIG->{DATABASE}, $CONFIG->{USERNAME}, $CONFIG->{PASSWORD}),
        #_in_file => $in_file,
	_target => $target,
	_submission => $submission,
    };
    bless $self, $class;
    return $self;
}

sub process{
   my ($self) = @_;
   foreach my $m (@{$self->{_submission}->{_models}}){
        if($m !~ m/^[1-5]$/){next;}
	$self->process_model($m);
   }
}

sub process_model{
  my ($self, $m) = @_;
  my $accepted_file = sprintf("%s/%s%s%03d_%1d ", $LOCAL_CONFIG->{ACCEPTED_PREDICTIONS_DIR}, $self->{_submission}->{_target}, $self->{_submission}->{_pfrmat}, $self->{_submission}->{_groups_code}, $m);
  if ($self->{_target} =~ m/^H/) {
    # find correspondent sequence for each chain found in file
    my @chains = $self->findCorrespondentSeqs($accepted_file);
    my %done; # subunit done;
    foreach my $ch (@chains) {
        if ($ch->{_len} < 20) { # too short chain
                next;
        }
        if (exists $done{$ch->{_su}}) {
                next;
        }
	my $accepted_file_monomeric = sprintf("%s/%s%s%s%03d_%1d ", $LOCAL_CONFIG->{ACCEPTED_PREDICTIONS_DIR}, 'T'.substr($self->{_submission}->{_target},1), 's'.$ch->{_su},$self->{_submission}->{_pfrmat}, $self->{_submission}->{_groups_code}, $m);
        $accepted_file_monomeric = $self->genPredictionPerSubunit($ch->{_id}, $ch->{_su}, $accepted_file,  $accepted_file_monomeric);
        $done{$ch->{_su}} = 1;
	if ( ! defined($accepted_file_monomeric)) {
		next; # do nothing
	} else {
		system("chgrp users $accepted_file_monomeric");
	        system("chmod g+wr $accepted_file_monomeric");
		my $monomeric_target = sprintf("T%ss%d", substr($self->{_submission}->{_target}, 1), $ch->{_su});
		$self->process_monomeric($accepted_file_monomeric, $monomeric_target, $m);
	}
    }
  } elsif ($self->{_target} =~ m/^O/) {
    # find correspondent sequence for each chain found in file
    my @chains = $self->findCorrespondentSeqs($accepted_file);
    my %done; # subunit done;
    foreach my $ch (@chains) {
        if ($ch->{_len} < 20) { # too short chain
                next;
        }
        if (exists $done{$ch->{_su}}) {
                next;
        }
	my $accepted_file_monomeric = sprintf("%s/%s%s%03d_%1d ", $LOCAL_CONFIG->{ACCEPTED_PREDICTIONS_DIR}, 'T'.substr($self->{_submission}->{_target},1), $self->{_submission}->{_pfrmat}, $self->{_submission}->{_groups_code}, $m);
        $accepted_file_monomeric = $self->genPredictionPerSubunit($ch->{_id}, $ch->{_su}, $accepted_file, $accepted_file_monomeric);
        $done{$ch->{_su}} = 1;
        if ( ! defined($accepted_file_monomeric)) {
                next; # do nothing
        } else {
                system("chgrp users $accepted_file_monomeric");
                system("chmod g+wr $accepted_file_monomeric");
                my $monomeric_target = sprintf("T%s", substr($self->{_submission}->{_target}, 1));
                $self->process_monomeric($accepted_file_monomeric, $monomeric_target, $m);
        }
    }
  } elsif ($self->{_target} =~ m/^[SNFAXL]/) {
    # find correspondent sequence for each chain found in file
    my @chains = $self->findCorrespondentSeqs($accepted_file);
    my %done; # subunit done;
    foreach my $ch (@chains) {
        if ($ch->{_len} < 20) { # too short chain
                next;
        }
        if (exists $done{$ch->{_su}}) {
                next;
        }
        my $accepted_file_monomeric = sprintf("%s/%s%s%s%03d_%1d ", $LOCAL_CONFIG->{ACCEPTED_PREDICTIONS_DIR}, $self->{_submission}->{_target}, 's'.$ch->{_su},$self->{_submission}->{_pfrmat}, $self->{_submission}->{_groups_code}, $m);
        $accepted_file_monomeric = $self->genPredictionPerSubunit($ch->{_id}, $ch->{_su}, $accepted_file,  $accepted_file_monomeric);
        $done{$ch->{_su}} = 1;
        if ( ! defined($accepted_file_monomeric)) {
                next; # do nothing
        } else {
                system("chgrp users $accepted_file_monomeric");
                system("chmod g+wr $accepted_file_monomeric");
                my $monomeric_target = sprintf("%ss%d", $self->{_submission}->{_target}, $ch->{_su});
                $self->process_monomeric($accepted_file_monomeric, $monomeric_target, $m);
        }
    }
  }

}

sub process_monomeric {
  my ($self, $accepted_file_monomeric, $monomeric_target, $m) = @_;
  # create clean predictions
  my $clean_dir = sprintf("%s/%s", $LOCAL_CONFIG->{CLEAN_PREDICTIONS_DIR}, $monomeric_target);
  if (! -d $clean_dir){
	system("mkdir -p $clean_dir");
        system("chmod 775 $clean_dir");
        system("chgrp users $clean_dir");
  }
  my $clean_file = sprintf("%s/%s/%s%s%03d_%s", $LOCAL_CONFIG->{CLEAN_PREDICTIONS_DIR}, $monomeric_target, $monomeric_target, $self->{_submission}->{_pfrmat}, $self->{_submission}->{_groups_code}, $m);
   clean_model($accepted_file_monomeric, $clean_file); # keep chain (third arg=1)
   system("chmod 664 $clean_file");
   system("chgrp users $clean_file");
   # register prediction in database
   if(0 == $self->add_predictions_log($monomeric_target, $m)){
            return 0;
   }

   if(0 == $self->add($self->{_submission}->{_email},$self->{_submission}->{_type},$self->{_submission}->{_pfrmat},$monomeric_target,$self->{_submission}->{_groups_id}, $m, $self->{_submission}->{_predictor_id} )){
            return 0;
   }
   return 1;
}


# fetch sequence(s) from database 
# and split into subunits 
# return: hash of hashes
sub fetchSequence{
  my ($self) = @_;
  my $and = '';
  if ($self->{_target} =~ m/^H/){
	$and = " AND t.subunit=0 ";
  }
  my $query = sprintf("SELECT t.subunit, ss.sequence FROM casp13.targets t JOIN casp13.submitted_sequences ss 
			ON t.submitted_sequences_id=ss.id WHERE t.name='%s' %s  ", $self->{_target}, $and) ;
#  if ($DEBUG) {print "===\n$query \n===\n";}
  my $sth = $self->{_database}->query($query);
  my $su; my $seq;
  if(defined($sth) && ($sth->rows() > 0)) {
        ($su, $seq) = $sth->fetchrow_array();
  }
  my @arrSeqs = splitSequences($seq);
  my %result; # hash of hashes: key - s_index, value - hash: key - res_no, value - 1letter AA
  my $s_index = 1; # subunit index
  for my $s (@arrSeqs) {
	$s =~ s/^>.*\n+//; # remove fasta header
	$s =~ s/\s//g; # remove all non-printable symbols
	$s =~ s/[^A-Z]//g;
	my @letters = split(//,$s);
	my $res = 1;
	my %hash;
	for my $letter (@letters){
		$hash{$res} = $letter;
		$res++;
	}
	$result{$s_index} = \%hash;
	$s_index++;
  }
  return %result;
}

# The method finds correspondence between sequences in chains (prediction) and subunits (target).
# list of checks performed:
#	- not empty chain id field - ERROR is reported
#	- sequence of each chain corresponds to ONE subunit sequence - ERROR is reported if failed
# Return array of elements in data structure:
# {
#   _id => $chain_id, # chain id 
#   _len => undef, # length of chain = number of AA
#   _su => undef # corresponding subunit
# };
# The order of elements is the order of appearence in prediction.
sub findCorrespondentSeqs{
   my ($self, $in_file) = @_;
   my %hash_chains; # multidimensional hash with two keys {chain_id}{subunit_i}: value 1 or 0 - 1 if chain matches subunit_i, 0 - otherwise
   my %hash_seqs = $self->fetchSequence();
   my $no_errors = 0;
   my $MAX_NO_ERRORS = 25;
   my @chains; # array of chain : in order of appearence {_id, _len, _su}
   my $chain_len = 0; # calculate the length of each chain
   open CAs, sprintf(" grep -P \"^ATOM\" %s | grep \" CA \" | ", $in_file) ;
   while(defined(my $l = <CAs>)){
	my $chain_id = substr($l, 20,  2); # position 22 and reserved position 21
	$chain_id =~ s/\s+//g;
	if ($chain_id =~ m/^$/) {
#		print "ERROR! In multimeric prediction the chain id field (position 22) can not be empty:\n $l";
		$no_errors++;
		if ($no_errors >= $MAX_NO_ERRORS) {
#                        print "Too many errors! Check the format of prediction.\n";
			return undef;
                }
		next;
	}
	if (scalar(@chains) == 0 || $chains[$#chains]->{_id} ne $chain_id) {
		if (scalar(@chains) != 0) {
			$chains[$#chains]->{_len} = $chain_len;
			$chain_len=0;
		}
		push @chains, {
			_id => $chain_id, # chain id 
			_len => undef, # length of chain = number of AA
			_su => undef # corresponding subunit
		};
	}
	my $aa = substr($l, 17, 3); # positions 18-20
	#printf "=$aa: +%s+\n", $AMINOACIDS_LONG_TO_SHORT{$aa};
	if (! exists $AMINOACIDS_LONG_TO_SHORT{$aa}){
		#print "ERROR! Non standard amino acid $aa at line:\n $l";
		$no_errors++;	
		if ($no_errors >= $MAX_NO_ERRORS) {
#			print "Too many errors! Check the format of prediction.\n";
			return undef;
		}
		next;
        }
	my $res = substr($l, 22, 4); # positions 23-26 
	$res =~ s/\s//g;
	$res = sprintf("%d", $res);
	foreach my $su (keys %hash_seqs){ # loop over subunits
		# first time
		if (! exists $hash_chains{$chain_id}{$su}) {
			$hash_chains{$chain_id}{$su} = 1; # hash of hashes
		}
		if (!defined($hash_seqs{$su}{$res}) || $AMINOACIDS_LONG_TO_SHORT{$aa} ne $hash_seqs{$su}{$res}) {
		    $hash_chains{$chain_id}{$su} = 0; # the chain in prediction doesn't match any subunit in the target
		}
	}
	$chain_len++;
   }
   close CAs;
   if ($no_errors > 0) {
	return undef;
   }
   if (scalar(@chains) == 0) { # empty prediction
	return undef;
   }
   $chains[$#chains]->{_len} = $chain_len;
   # check if each chain in prediction matches only one subunit sequence
   foreach my $ch (@chains){
	my $no_su_matched = 0; # number of subunits that match the sequence of chain
	my $su_matched = '';
	my $ch_id = $ch->{_id};
	foreach my $su (keys %hash_seqs){
		if ($hash_chains{$ch_id}{$su} == 1){
			$no_su_matched++;
			$su_matched .= "$su, ";
		}
	}
	if ($no_su_matched == 0) {
#		print "ERROR! Chain $ch_id doesn't match any subunit sequence.\n";
		return undef;
	}
	$su_matched =~ s/,\s+$//g; # remove tailing comma
	if ($no_su_matched > 1) {
#		print "ERROR! Ambiguity problem: sequence of the chain $ch_id matches more then one sequences of subunits: $su_matched\n";
		return undef; 
	}
	$ch->{_su} = $su_matched;
   }
   # check if there are at least two chains
   if (scalar(@chains) < 2) {
#	print "ERROR! For multimeric predictions at least two chains are expected.\n";
	return undef;
   }
   return @chains;
}

sub genMonomericPredictions {
   my ($self) = @_;
   my @chains = $self->findCorrespondentSeqs();
   my %done; # subunit done;
   foreach my $ch (@chains) {
	if ($ch->{_len} < 20) { # too short chain
		next;
	}
	if (exists $done{$ch->{_su}}) {
		next;
	}
	my $pred_file = $self->genPredictionPerSubunit($ch->{_id}, $ch->{_su});
	$done{$ch->{_su}} = 1;
   }
}

# The method generates file with monomeric prediction, i.e. for single chain that corresponds to one unit
sub genPredictionPerSubunit{
   my ($self, $chain_id, $su_index, $infile, $outfile) = @_;
   if (! defined($infile)) {
#	$infile = $self->{_in_file};
	return undef; # do nothing
   }
   my $target = sprintf ("T%ss%d", substr($self->{_target}, 1), $su_index); # generate regular target name
   if ($self->{_target} =~ m/^O/){
	$target = sprintf ("T%s", substr($self->{_target}, 1)); # generate regular target name
   } elsif ($self->{_target} =~ m/^H/){
        $target = sprintf ("T%ss%d", substr($self->{_target}, 1), $su_index); # generate regular target name
   } elsif ($self->{_target} =~ m/^[SNFAXL]/){
        $target = sprintf ("%ss%d", $self->{_target}, $su_index); # generate regular target name
   }
   my $targets_manager = new TargetsManager();
   my $target_id = $targets_manager->get_id_by_name($target);
   if ($target_id == 0){
	return undef; # do nothing - target wasn't released as a separate subunit
   }
   my $file_name = sprintf("%s_s%d", $infile, $su_index);
   if (defined($outfile)) {
	$file_name = $outfile;
   }
   open F, "> $file_name";
   open I, sprintf( "< %s", $infile);
   my $cur_parent = 'N/A'; my $flag = 0;
   while(defined(my $l=<I>)){
	if ($l=~ m/^TARGET/){
		print F "TARGET $target\n";
	} elsif ($l =~ m/^AUTHOR/ || $l =~ m/^PFRMAT/ || $l =~ m/^METHOD/ || $l =~ m/^REMARK/ || $l =~ m/^MODEL/) {
		print F $l;
	} elsif ($l =~ m/^PARENT\s+(\S+.*)/) {
		$cur_parent = $1;
	} elsif ($l =~ m/^ATOM/) {
		my $ch = substr($l, 21, 1);
		if ($ch eq $chain_id) {
			if ($flag == 0) {
				print F "PARENT $cur_parent\n";
			}
			print F $l;
			$flag = 1;
		}
	} elsif ($l =~ /^TER/ && $flag == 1) {
		print F "TER\n";
		print F "END\n";
		last;
	}
   }
   close I;
   close F;
   return $file_name;
}

# register prediction in database
sub add {
        my($self, $email, $type, $pfrmat, $target, $groups_id, $model_id, $predictor_id) = @_;

        my $result = 0;

        my $pid = $self->prediction_exist($target, $model_id, $pfrmat, $groups_id);

        if($pid > 0) {
                $result = $self->update($pid, $email, $type, $pfrmat, $target, $groups_id, $model_id, $predictor_id);
        } else {
                my $query = sprintf("INSERT INTO casp13.predictions (groups_id, type, email, pfrmat, target, model, predictors_id) VALUES (%d, '%s', '%s', '%s', '%s', %d, %d) RETURNING id", $groups_id, $type, $email, $pfrmat, $target, $model_id, $predictor_id);
                my $sth = $self->{_database}->query($query);

                if(defined($sth)) {
                        # add logger
                        ($result) = $sth->fetchrow_array();
                }
        }

        return $result;
}

# update prediction in database
sub update {
    my ($self, $id, $email, $type, $pfrmat, $target, $groups_id, $model_id, $predictor_id) = @_;

    my $result = 0;

    my $query = sprintf("UPDATE casp13.predictions SET target = '%s', type = '%s', pfrmat = '%s', email = '%s', model = %d, groups_id = %d, predictors_id = %d, date=now() WHERE (id = %d)", $target, $type, $pfrmat, $email, $model_id, $groups_id, $predictor_id, $id);
    my $sth = $self->{_database}->query($query);

    if(defined($sth)) {
        # add logger
        $result = 1;
    }

    return $result;
}

sub prediction_exist {
        my($self, $target, $model, $pfrmat, $groups_id) = @_;

        my $result = 0;

    my $query = sprintf("SELECT id FROM casp13.predictions WHERE (target = '%s') AND (model = %d) AND (pfrmat = '%s') AND (groups_id = %d)", $target, $model, $pfrmat, $groups_id);
    my $sth = $self->{_database}->query($query);

    if(defined($sth) && ($sth->rows() > 0)) {
        ($result) = $sth->fetchrow_array();
    }

    return $result;
}


sub add_predictions_log {
    my ($self, $monomeric_target, $m) = @_;
    my $targ_manager = new TargetsManager();
    my $monomeric_target_id = $targ_manager->get_id_by_name($monomeric_target);
    if (!defined($monomeric_target_id) || $monomeric_target_id == 0) {
	return 0;
    }
    my $file_name = sprintf("%s%s%03d_%1d ", $monomeric_target, $self->{_submission}->{_pfrmat}, $self->{_submission}->{_groups_code}, $m);

    my $query = sprintf("INSERT INTO casp13.predictions_log
        (date, group_id, group_name, target_id, target_name,
        model, pfrmat, status, file_name, case_id, recived_from, return, predictor_id, submission_type )
        VALUES ( now(), %s, '%s', %s, '%s', %s, '%s', '%s', '%s', '%s', '%s', '%s' , '%s', '%s' ) RETURNING id",
        $self->{_submission}->{_groups_id}, $self->{_submission}->{_author}, $monomeric_target_id, $monomeric_target, $m, $self->{_submission}->{_pfrmat}, "accepted", $file_name , $self->{_submission}->{_case_id}, $self->{_submission}->{_email}, "saved prediction", $self->{_submission}->{_predictor_id}, $self->{_submission}->{_type});

    my $sth = $self->{_database}->query($query);

    my $prediction_log_id = 0;

    if(defined($sth)) {
        # add logger
        ($prediction_log_id) = $sth->fetchrow_array();
    }

    return $prediction_log_id;

}


1;
