/*-----------------------------------------------------------
/
/   dp_ver.c 
/
/   Program checks the DP format for CASP3 submissions
/   (Domain prediction format verifier, derived from the Secondary Structure prediction verifier)
/   VERSION FOR FORMAT WITH NUMBERED LINES
/  Modified by MM in 2004 [Copyright by Adam Zemla (06/05/1998)]
/
/------------------------------------------------------------*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Capacity of every per-residue array in this file, and size of the
   line buffer used when reading a target sequence file. */
#define MAXRES           10000

/* Characters recognised by check_aa(); the position of a character in
   this string is the value check_aa() returns for it.  The callers in
   this file accept only positions 0..19, so '-' (position 20) is
   rejected both in target sequences and in model lines. */
char *letter="ARNDCQEGHILKMFPSTWYV-";

/* Per-residue prediction data, filled in by read_data_ssp() with one
   array slot per residue line of the MODEL section. */
typedef struct {
  float  confidence[MAXRES];  /* confidence value; stored only for lines with more than 3 columns */
  int    ss[MAXRES];          /* domain number; -1 when the domain field starts with '-' */
  int    aa[MAXRES];          /* first character of the residue-code field */
  int    n_aa;                /* number of residue lines stored so far */
  int    n_confidence;        /* residues whose confidence is > 0 and <= 1 */
  int    n_cols[MAXRES];      /* column count of the residue line, as returned by strcountcol() */
} data_ssp;

/* Everything known about one prediction file: header information, the
   target sequence it refers to, and the parsed model. */
typedef struct {
  char     target_name[10];    /* name taken from the TARGET record */
  char     target_aa[MAXRES];  /* target sequence loaded by read_seq(); blank-filled, not NUL-terminated */
  int      target_n_aa;        /* number of residues read_seq() stored in target_aa */
  int      model;              /* index from the MODEL record (1..5 once accepted) */
  int      end;                /* set to 1 when an END record is read; read_data_ssp() also
                                  sets it when it stops early on too many residues */
  int      n_method;           /* number of METHOD records seen */
  int      errors;             /* running count of non-fatal errors, checked by escape() */
  data_ssp ssp;                /* per-residue prediction data */
} ss_f;

/* Forward declarations for the functions defined below. */
int check_aa(char);                 /* position of a character in `letter`, 21 if absent */
void read_ss_f(ss_f *, char*);      /* parse the header records of a prediction file */
void read_data_ssp(ss_f *, FILE*);  /* parse the residue lines between MODEL and END */
void check_ss_f(ss_f *);            /* compare the parsed model with the target sequence */
void clean_ss_f(ss_f *);            /* reset an ss_f record to its empty state */
int read_seq(char*, char*);         /* load TARGETS/<name>.seq.txt, return residue count */
void escape(int);                   /* exit once the error count exceeds 25 */

/*-----------------------------------------------------------
/
/   main - verify the DP prediction file named in argv[1]
/
/   Resets the working record, parses the file, checks the
/   parsed model against the target and prints either a short
/   summary or the number of errors found.  Returns 0.
/
/------------------------------------------------------------*/
int main(int argc, char *argv[])
{
  ss_f ss_data;

  if(argc<2){
    printf(" Usage: ss_ver <ss_file>\n");
    exit(0);
  }

  clean_ss_f(&ss_data);

  /* argv[1] is passed straight through: copying it into a fixed-size
     local buffer would overflow on long paths */
  printf("# Reading prediction format DP\n\n");
  read_ss_f(&ss_data,argv[1]);
  printf("\n# Reading prediction format DP             (DONE)\n");

  check_ss_f(&ss_data);

  if(ss_data.errors==0) {
    printf("\n# MODEL index: %d \n",ss_data.model);
    printf("\n# Total number of residues in model:          %4d ",ss_data.ssp.n_aa);
    printf("\n# Number of residues with nonzero confidence: %4d ",ss_data.ssp.n_confidence);
    printf("\n# Number of METHOD records:                   %4d \n",ss_data.n_method);
    printf("\n# No errors.\n\n");
  }
  else {
    printf("\n# Number of errors = %d.\n\n",ss_data.errors);
  }

  return 0;
}

/*-----------------------------------------------------------
/
/   escape - stop the program once more than 25 errors
/            have been counted; otherwise do nothing
/
/------------------------------------------------------------*/
void escape(int error)
{
  const int tolerated = 25;

  if(error<=tolerated)
    return;

  fputs("\n# Too many ERRORS ...", stdout);
  fputs("\n#    Please check format for DP predictions.\n", stdout);
  exit(0);
}

/*-----------------------------------------------------------
/
/   clean_ss_f - reset an ss_f record to its empty state
/
/   All counters and flags become 0, target_name becomes three
/   blanks, every target_aa slot becomes ' ' and every
/   per-residue prediction slot becomes 0.
/
/------------------------------------------------------------*/
void clean_ss_f(ss_f *ss_data)
{
  int slot;

  /* per-residue storage */
  for(slot=MAXRES-1;slot>=0;slot--) {
    ss_data->ssp.n_cols[slot]=0;
    ss_data->ssp.aa[slot]=0;
    ss_data->ssp.ss[slot]=0;
    ss_data->ssp.confidence[slot]=0.0;
    ss_data->target_aa[slot]=' ';
  }

  /* prediction counters */
  ss_data->ssp.n_confidence=0;
  ss_data->ssp.n_aa=0;

  /* header fields */
  ss_data->errors=0;
  ss_data->n_method=0;
  ss_data->end=0;
  ss_data->model=0;
  ss_data->target_n_aa=0;
  strcpy(ss_data->target_name,"   ");
}

/*-----------------------------------------------------------
/
/   read_ss_f - read the DP predictions format
/
/   Opens fname and processes the header records line by line:
/   PFRMAT, TARGET, AUTHOR, REMARK and METHOD.  TARGET, AUTHOR,
/   REMARK and METHOD are rejected until a PFRMAT record has
/   been seen; MODEL is rejected until TARGET and AUTHOR have
/   been seen.  A MODEL record with index 1..5 hands the rest
/   of the file to read_data_ssp().  Records with an unknown
/   keyword are echoed and counted in ss_data->errors; every
/   other problem prints a message and calls exit(0).
/
/   ss_data - record to fill in (expected to be reset already)
/   fname   - path of the prediction file
/
/------------------------------------------------------------*/

/* Blank out the characters %, ", / and \ in `line`, then echo it to
   stdout.  The line is modified in place, so any parsing done on it
   afterwards sees the blanks as well. */
static void echo_record(char *line)
{
  char *p;

  for(p=line;*p!='\0';p++)
    if(*p=='%' || *p=='"' || *p=='/' || *p=='\\') *p=' ';
  fputs(line,stdout);
}

/* Report a record that appears before the records it depends on, and
   terminate the program. */
static void fail_record_order(void)
{
  printf("\n# ERROR! Unacceptable order of the DP prediction data records");
  printf("\nPFRMAT DP                # the first line");
  printf("\nTARGET Txxxx             # the second line");
  printf("\nAUTHOR xxxx-xxxx-xxxx    # the third line was expected\n\n");
  exit(0);
}

void read_ss_f(ss_f *ss_data, char* fname)
{
  int i, begflag, authflag;
  char keyword[500], line[500], name[30], model_nb[100];
  FILE *fp;

  if((fp = fopen(fname,"r"))==NULL) {
    printf("\n# error opening file %s for read\n\n",fname);
    exit(0);
  }

  /* Read in the ss_data.
     begflag: 0 = nothing seen, 1 = PFRMAT seen, 2 = TARGET seen,
              3 = MODEL section read.
     Every sscanf below carries a field width matching its buffer. */
  begflag=0;
  authflag=0;
  while(fgets(line,sizeof line,fp)!=NULL) {
    /* defaults that survive when a field is missing from the line */
    strcpy(keyword,"   ");
    strcpy(name,"   ");
    strcpy(model_nb,"#   ");
    sscanf(line,"%499s",keyword);

    if(!strcmp(keyword,"PFRMAT")) {
      echo_record(line);
      begflag=1;
      sscanf(line,"%499s %29s",keyword,name);
      if(strcmp(name,"DP")) {
        printf("\n# ERROR! Wrong specification of the DP format category");
        printf("\nPFRMAT DP      # was expected\n\n");
        exit(0);
      }
    }
    else if(!strcmp(keyword,"TARGET")) {
      echo_record(line);
      if(begflag<1) fail_record_order();
      begflag=2;
      sscanf(line,"%499s %29s",keyword,name);
      /* target_name is shorter than name: keep what fits for use in
         messages, and look the sequence up under the full name */
      snprintf(ss_data->target_name,sizeof ss_data->target_name,"%s",name);
      ss_data->target_n_aa=read_seq(ss_data->target_aa,name);
    }
    else if(!strcmp(keyword,"AUTHOR")) {
      if(begflag<1) fail_record_order();
      echo_record(line);
      authflag=1;
    }
    else if(!strcmp(keyword,"REMARK")) {
      if(begflag<1) fail_record_order();
      echo_record(line);
    }
    else if(!strcmp(keyword,"METHOD")) {
      if(begflag<1) fail_record_order();
      ss_data->n_method++;
    }
    else if(!strcmp(keyword,"MODEL")) {
      if(begflag<2 || authflag!=1) fail_record_order();
      i=0;
      sscanf(line,"%499s %99s",keyword,model_nb);
      sscanf(model_nb,"%d",&i);
      /* the index has to be a single character */
      if(model_nb[1]!=' ' && model_nb[1]!='\0' && model_nb[1]!='\n') {
        printf("\n# ERROR! Unacceptable index of the prediction MODEL\n\n");
        ss_data->errors++;
        escape(ss_data->errors);
      }
      if(i>0 && i<=5) {
        begflag=3;
        ss_data->model=i;
        printf("\n# Reading MODEL %2d\n",i);
        read_data_ssp(ss_data,fp);
        break;
      }
      else {
        printf("\n# ERROR! Unacceptable index of the DP prediction MODEL\n\n");
        exit(0);
      }
    }
    else if(!strcmp(keyword,"END")) {
      ss_data->end=1;
      break;
    }
    else if(keyword[0]!=' ') {
      /* keyword still holds its blank default only for empty lines;
         anything else here is an unrecognised record */
      echo_record(line);
      printf("\n# ERROR! Unknown record keyword in this section of the prediction.");
      printf("\n#        Check  format description.\n\n");
      ss_data->errors++;
      escape(ss_data->errors);
    }
  }
  fclose(fp);

  if(begflag!=3) {
    printf("\n# ERROR! There is no DP prediction data in this file\n\n");
    exit(0);
  }
  if(ss_data->end==0) {
    printf("\n# ERROR! There is no END record in this file\n\n");
    exit(0);
  }
  if(authflag!=1) {
    printf("\n# ERROR! Check AUTHOR record in this file\n\n");
    exit(0);
  }
  if(ss_data->n_method==0) {
    printf("\n# ERROR! There is no METHOD records in this file\n\n");
    exit(0);
  }

  return;
}
/*---------------------------------------------------------------
/
/  strcountcol - count number of columns in a line (by mm in 2004)
/
/  A column is a maximal run of non-whitespace characters.  Any
/  character for which isspace() is true separates columns, which
/  is the same splitting the sscanf("%s ...") calls apply to the
/  line.  Returns 0 for an empty or whitespace-only line.
/
/-----------------------------------------------------------------*/
int strcountcol(char* s)
{
  const unsigned char *p;
  int cols=0;
  int in_column=0;

  /* unsigned char keeps the isspace() argument in its valid range */
  for(p=(const unsigned char *)s;*p!='\0';p++) {
    if(isspace(*p)) {
      in_column=0;
    }
    else if(!in_column) {
      in_column=1;
      cols++;
    }
  }
  return cols;
}


/*-----------------------------------------------------------
/
/   read_seq - read a sequence file
/
/   Looks for TARGETS/<tname>.seq.txt, first with the name as
/   given, then with every 't' replaced by 'T', then with every
/   'T' replaced by 't'.  Lines starting with '>' are skipped;
/   on every other line blanks are ignored and each remaining
/   character must be one that check_aa() maps below 20.
/
/   seq   - receives the residues, without a terminating NUL;
/           assumed to have room for MAXRES characters, which
/           is what the caller in this file provides
/   tname - target name
/
/   Returns the number of residues stored.  Prints a message
/   and calls exit(0) when no file is found, when a character
/   is not accepted, or when the sequence exceeds MAXRES.
/
/------------------------------------------------------------*/

/* Open TARGETS/<name>.seq.txt for reading.  Returns NULL when the
   path does not fit the local buffer or the file cannot be opened. */
static FILE *open_target_seq(const char *name)
{
  char path[64];
  int len;

  len=snprintf(path,sizeof path,"TARGETS/%s.seq.txt",name);
  if(len<0 || (size_t)len>=sizeof path) return NULL;
  return fopen(path,"r");
}

int read_seq(char* seq, char* tname)
{
  int i, n_aa;
  FILE *fp;
  char line[MAXRES], lname[32];
  char *p;

  /* a name too long for lname is treated as "no such target" */
  fp=NULL;
  if(strlen(tname)<sizeof lname) {
    strcpy(lname,tname);
    fp=open_target_seq(lname);
    if(fp==NULL) {
      for(p=lname;*p!='\0';p++) if(*p=='t') *p='T';
      fp=open_target_seq(lname);
    }
    if(fp==NULL) {
      for(p=lname;*p!='\0';p++) if(*p=='T') *p='t';
      fp=open_target_seq(lname);
    }
  }
  if(fp==NULL) {
    printf("\n# ERROR! There is no target name:  %s",tname);
    printf("\n# TARGET Txxxx      # was expected\n\n");
    exit(0);
  }

  n_aa=0;
  while(fgets(line,sizeof line,fp)!=NULL) {
    if(line[0]=='>') continue;

    /* stop at the newline or at the end of the string, whichever
       comes first: the last line of a file may lack a newline */
    for(i=0;line[i]!='\0' && line[i]!='\n';i++) {
      if(line[i]==' ') continue;
      if(check_aa(line[i])>=20) {
        printf("\n# ERROR! Check file  %s  Wrong amino acid code  %c\n\n",tname,line[i]);
        fclose(fp);
        exit(0);
      }
      if(n_aa>=MAXRES) {
        printf("\n# ERROR! Check file  %s  Sequence is longer than %d residues\n\n",tname,MAXRES);
        fclose(fp);
        exit(0);
      }
      seq[n_aa]=line[i];
      n_aa++;
    }
  }
  fclose(fp);
  return n_aa;
}

/*-----------------------------------------------------------
/
/   check_ss_f - checks the DP prediction file format
/
/   Compares the parsed model with the target: residue count,
/   residue codes position by position, domain numbers (-1 or
/   0..100) and, for lines that had more than three columns,
/   the confidence value (0..1).  Each problem is printed,
/   counted in ss_data->errors and passed to escape().
/
/------------------------------------------------------------*/
void check_ss_f(ss_f *ss_data)
{
  int pos;

  printf("\n# Checking the DP prediction MODEL %2d\n",ss_data->model);

  /* the model must have as many residues as the target */
  if(ss_data->target_n_aa!=ss_data->ssp.n_aa) {
    printf("\n# ERROR! Check the number %d of residues in the model. In target: %d residues.",
                       ss_data->ssp.n_aa,ss_data->target_n_aa);
    printf("\n#        Check the TARGET %s specification (see Template Sequence file).\n",
                       ss_data->target_name);
    ss_data->errors++;
    escape(ss_data->errors);
  }

  /* nothing more to check in an empty model */
  if(ss_data->ssp.n_aa==0) {
    printf("# ERROR! The number of predicted residues in the model: 0\n");
    ss_data->errors++;
    escape(ss_data->errors);
    return;
  }

  pos=0;
  while(pos<ss_data->ssp.n_aa) {
    int   resno=pos+1;
    int   code=ss_data->ssp.aa[pos];
    int   domain=ss_data->ssp.ss[pos];
    float level=ss_data->ssp.confidence[pos];

    if(code!=ss_data->target_aa[pos]) {
      printf("# ERROR! Check residue %c number %d. (In TARGET: %c %d)\n",
                code,resno,ss_data->target_aa[pos],resno);
      ss_data->errors++;
      escape(ss_data->errors);
    }

    /* -1 is accepted in addition to 0..100 */
    if(domain< -1 || domain>100) {
      printf("# ERROR! Check the DP domain number %d for the residue number %d\n",
                domain,resno);
      ss_data->errors++;
      escape(ss_data->errors);
    }

    /* the confidence is checked only for lines with more than three columns */
    if(ss_data->ssp.n_cols[pos]>3 && (level<0.0 || level>1.0)) {
      printf("# ERROR! Check confidence level value = %5.2f for residue number %d\n",
                level,resno);
      ss_data->errors++;
      escape(ss_data->errors);
    }

    pos++;
  }

  printf("# Checking the DP prediction MODEL %2d      (DONE)\n",ss_data->model);
}

/*-----------------------------------------------------------
/
/   check_aa - look a character up in the `letter` table
/
/   Returns the position (0..20) of token among the first 21
/   characters of `letter`, or 21 when it is not there.
/
/------------------------------------------------------------*/
int check_aa(char token)
{
  int code=0;

  /* walk forward until the token is found or the table is exhausted */
  while(code<21 && letter[code]!=token)
    code++;

  return code;
}

/*-----------------------------------------------------------
/
/   read_data_ssp - read the DP predictions file format.
/                   Numbered-line DP format: each residue line
/                   holds a line number, a residue code, a
/                   domain number and an optional confidence.
/
/   Reads from fp until an END record or end of file.  Lines
/   whose first field starts with '#', and REMARK, PARSRC,
/   PARENT and TER records, are skipped; METHOD records are
/   counted.  Every other line is parsed as a residue line and
/   stored in ss_data->ssp.
/
/   The domain field must either start with '-' (stored as -1)
/   or be a whole number in 0..100.  A missing, non-numeric or
/   out-of-range domain is fatal: the line is echoed and the
/   program calls exit(0).  A bad residue code or confidence
/   value is counted in ss_data->errors and reading continues.
/
/------------------------------------------------------------*/
void read_data_ssp(ss_f *ss_data, FILE* fp)
{
  int n_aa, no_of_col, second, domain_ok;
  long domain_val;
  float conf;
  char line[500], keyword[500], domain_no[500], line_number[500];
  char *p, *endp;

  n_aa=0;
  ss_data->ssp.n_aa=n_aa;
  while(fgets(line,sizeof line,fp)!=NULL) {
    strcpy(keyword,"   ");
    sscanf(line,"%499s",keyword);

    if(!strcmp(keyword,"END")) {
      ss_data->end=1;
      return;
    }
    if(keyword[0]=='#' ||
       !strcmp(keyword,"REMARK") || !strcmp(keyword,"PARSRC") ||
       !strcmp(keyword,"PARENT") || !strcmp(keyword,"TER")) {
      continue;
    }
    if(!strcmp(keyword,"METHOD")) {
      ss_data->n_method++;
      continue;
    }

    /* From here on the line is a residue line.  Never index the
       per-residue arrays at or beyond MAXRES. */
    if(n_aa>=MAXRES) {
      printf("\n# ERROR! There are more than %d residues in the model.\n",MAXRES);
      ss_data->errors++;
      escape(ss_data->errors);
      ss_data->end=1;
      return;
    }

    /* defaults that survive when a field is missing from the line */
    strcpy(keyword,"   ");
    strcpy(domain_no,"   ");
    conf=-1.0;
    no_of_col=strcountcol(line);
    if(no_of_col>3)
      sscanf(line,"%499s %499s %499s %f",line_number,keyword,domain_no,&conf);
    else
      sscanf(line,"%499s %499s %499s",line_number,keyword,domain_no);

    /* strtol with an end pointer instead of atoi: atoi returns 0 for
       a missing or non-numeric field, which would pass as domain 0 */
    if(domain_no[0]=='-') {
      second=-1;
      domain_ok=1;
    }
    else {
      domain_val=strtol(domain_no,&endp,10);
      domain_ok=(endp!=domain_no && *endp=='\0' &&
                 domain_val>=0 && domain_val<=100);
      second=domain_ok ? (int)domain_val : 0;
    }
    if(!domain_ok) {
      /* blank out %, ", / and \ before echoing the offending line */
      for(p=line;*p!='\0';p++)
        if(*p=='%' || *p=='"' || *p=='/' || *p=='\\') *p=' ';
      fputs(line,stdout);

      printf("\n# ERROR! Check the line NB = %d in the MODEL section. ",n_aa+1);
      if(keyword[0]==' ') {
        printf("\n#        The blank records are not allowed.");
      }
      printf("\n#        Check the DP format description.\n\n");
      exit(0);
    }

    if(check_aa(keyword[0])>=20) {
      printf("\n# ERROR! Check the DP format. Wrong amino acid code  %c (NB = %d)\n\n",keyword[0],n_aa+1);
      ss_data->errors++;
      escape(ss_data->errors);
    }
    /* conf keeps its -1 default when the fourth column is not a number */
    if(no_of_col>3 && (conf<0.0 || conf>1.0)) {
      printf("\n# ERROR! Check the line NB = %d in the MODEL section. ",n_aa+1);
      printf("\n#        Check the value of the confidence level.\n\n");
      ss_data->errors++;
      escape(ss_data->errors);
    }

    ss_data->ssp.aa[n_aa]=keyword[0];
    ss_data->ssp.ss[n_aa]=second;
    ss_data->ssp.n_cols[n_aa]=no_of_col;
    if(no_of_col>3) {
      ss_data->ssp.confidence[n_aa]=conf;
      if(conf>0.0 && conf<=1.0) ss_data->ssp.n_confidence++;
    }
    n_aa++;
    ss_data->ssp.n_aa=n_aa;

    if(n_aa>ss_data->target_n_aa) {
      printf("\n# ERROR! There are more residues in the model than in the TARGET.");
      printf("\n#        Check the TARGET %s specification. In target: %d residues\n",
                         ss_data->target_name,ss_data->target_n_aa);
      ss_data->errors++;
      escape(ss_data->errors);
      /* end is set so that the caller does not also report a missing END record */
      ss_data->end=1;
      return;
    }
  }
  return;
}
