/*-----------------------------------------------------------
/
/   dr_ver.c 
/
/   Program checks the DR format for CASP4 submissions
/   (Disordered Regions prediction)
/
/   Modified version of the ss_ver program.
/
/   Copyright by Adam Zemla (07/28/2000)
/
/------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXRES           10000

/* One-letter residue codes searched by check_aa(): the first 20 entries
   (indices 0..19) are the codes callers accept as amino acids; index 20
   is '-', which callers reject together with unknown characters. */
char *letter="ARNDCQEGHILKMFPSTWYV-";

/* Per-residue prediction data read from the MODEL ... END section. */
typedef struct {
  float  confidence[MAXRES];  /* confidence value read for each residue */
  int    ss[MAXRES];          /* state character read for each residue ('D' or 'O' expected) */
  int    aa[MAXRES];          /* residue code character read for each residue */
  int    n_aa;                /* number of residue lines stored so far */
  int    n_confidence;        /* residues whose confidence lies in [0,1] and is not 0.5 */
} data_ssp;

/* Everything collected while reading one DR prediction file. */
typedef struct {
  char     target_name[10];   /* name taken from the TARGET record */
  char     target_aa[MAXRES]; /* target sequence filled in by read_seq() (not NUL-terminated) */
  int      target_n_aa;       /* number of residues read_seq() returned for the target */
  int      model;             /* MODEL index accepted by read_ss_f() (1..5) */
  int      end;               /* set to 1 once END is seen, or when reading stops on too many residues */
  int      n_method;          /* number of METHOD records counted */
  int      errors;            /* running count of reported format errors */
  data_ssp ssp;               /* per-residue prediction data */
} ss_f;

/* Forward declarations of the routines defined later in this file. */
int check_aa(char);                 /* index of a residue code in `letter`, 21 if absent */
void read_ss_f(ss_f *, char*);      /* read the header records and the MODEL section of a file */
void read_data_ssp(ss_f *, FILE*);  /* read the residue lines between MODEL and END */
void check_ss_f(ss_f *);            /* compare the stored model with the target sequence */
void clean_ss_f(ss_f *);            /* reset an ss_f record to its initial values */
int read_seq(char*, char*);         /* load TARGETS/<name>.seq.txt, return residue count */
void escape(int);                   /* terminate the program once more than 25 errors were counted */

/*
 * Entry point: dr_ver <dr_file>
 *
 * Resets the ss_f record, reads the prediction file named by argv[1],
 * checks the model against the target sequence and prints either a short
 * summary (no errors) or the number of errors found.
 *
 * The file name is handed to read_ss_f() directly; copying it into a
 * fixed-size local buffer would overflow on long paths.
 *
 * Returns 0 in every case, matching the exit(0) calls used by the rest
 * of the program.
 */
int main(int argc, char *argv[])
{
  ss_f ss_data;

  if(argc<2){
    printf(" Usage: dr_ver <dr_file>\n");
    exit(0);
  }

  clean_ss_f(&ss_data);

  printf("# Reading prediction format DR\n\n");
  read_ss_f(&ss_data,argv[1]);
  printf("\n# Reading prediction format DR             (DONE)\n");

  check_ss_f(&ss_data);

  if(ss_data.errors==0) {
    printf("\n# MODEL index: %d \n",ss_data.model);
    printf("\n# Total number of residues in model:          %4d ",ss_data.ssp.n_aa);
    printf("\n# Number of residues with non 0.5 confidence: %4d ",ss_data.ssp.n_confidence);
    printf("\n# Number of METHOD records:                   %4d \n",ss_data.n_method);
    printf("\n# No errors.\n\n");
  }
  else {
    printf("\n# Number of errors = %d.\n\n",ss_data.errors);
  }
  return 0;
}

/*
 * escape - give up once the error count passes the limit.
 *
 * error: number of errors counted so far.
 *
 * Returns normally while the count is 25 or less; otherwise prints a
 * "too many errors" notice and terminates the program with exit(0).
 */
void escape(int error)
{
  if(error<=25)
    return;

  printf("\n# Too many ERRORS ..."
         "\n#    Please check format for DR predictions.\n");
  exit(0);
}

/*-----------------------------------------------------------
/
/   clean_ss_f - reset an ss_f record to its initial values
/
/   The target name becomes three blanks, every counter and
/   flag becomes 0, every residue / state slot is filled with
/   a blank and every confidence slot with 0.5.
/
/------------------------------------------------------------*/
void clean_ss_f(ss_f *ss_data)
{
  int slot;

  /* per-residue tables, filled from the last slot down to the first */
  slot=MAXRES;
  while(slot-- > 0) {
    ss_data->ssp.aa[slot]=' ';
    ss_data->ssp.ss[slot]=' ';
    ss_data->ssp.confidence[slot]=0.5;
    ss_data->target_aa[slot]=' ';
  }

  /* prediction counters */
  ss_data->ssp.n_confidence=0;
  ss_data->ssp.n_aa=0;

  /* file-level bookkeeping */
  ss_data->errors=0;
  ss_data->n_method=0;
  ss_data->end=0;
  ss_data->model=0;
  ss_data->target_n_aa=0;
  strcpy(ss_data->target_name,"   ");
}

/*-----------------------------------------------------------
/
/   read_ss_f - read the DR predictions format
/
/   Reads the header records (PFRMAT, TARGET, AUTHOR, REMARK,
/   METHOD) of the file `fname`, then hands the stream to
/   read_data_ssp() when a MODEL record with index 1..5 is
/   found.  Results are stored in *ss_data.
/
/   Fatal problems (file cannot be opened, wrong record order,
/   wrong format category, bad MODEL index, missing data / END /
/   AUTHOR / METHOD) print a message and call exit(0).
/   Non-fatal problems increment ss_data->errors and go
/   through escape().
/
/------------------------------------------------------------*/

/* Blank out the characters % " / \ in `line` (up to its terminator only)
   and print the result verbatim. */
static void ss_f_echo_line(char *line)
{
  char *p;

  for(p=line;*p!='\0';p++)
    if(*p=='%' || *p=='"' || *p=='/' || *p=='\\') *p=' ';
  /* fputs, not printf: the text comes from the input file and must never
     be interpreted as a format string */
  fputs(line,stdout);
}

/* Print the "records out of order" diagnostic and terminate. */
static void ss_f_order_error(void)
{
  printf("\n# ERROR! Unacceptable order of the DR prediction data records");
  printf("\nPFRMAT DR                # the first line");
  printf("\nTARGET Txxxx             # the second line");
  printf("\nAUTHOR xxxx-xxxx-xxxx    # the third line was expected\n\n");
  exit(0);
}

void read_ss_f(ss_f *ss_data, char* fname)
{
  int i, begflag, authflag;
  char keyword[500], line[500], name[30], model_nb[100];
  FILE *fp;

  if((fp = fopen(fname,"r"))==NULL) {
    printf("\n# error opening file %s for read\n\n",fname);
    exit(0);
  }

  /* Read in the ss_data
  -------------------------------------------*/
  begflag=0;   /* 1 after PFRMAT, 2 after TARGET, 3 after an accepted MODEL */
  authflag=0;  /* 1 after AUTHOR */
  while(fgets(line,500,fp)!=NULL) {
    /* defaults that survive when sscanf finds no token */
    strcpy(keyword,"   ");
    strcpy(name,"   ");
    strcpy(model_nb,"#   ");
    /* every %s below carries a field width matching its buffer */
    sscanf(line,"%499s",keyword);
    if(!strcmp(keyword,"PFRMAT")) {
      ss_f_echo_line(line);

      begflag=1;
      sscanf(line,"%499s %29s",keyword,name);
      if(strcmp(name,"DR")) {
        printf("\n# ERROR! Wrong specification of the DR format category");
        printf("\nPFRMAT DR      # was expected\n\n");
        exit(0);
      }
    }
    else if(!strcmp(keyword,"TARGET")) {
      ss_f_echo_line(line);

      if(begflag<1) ss_f_order_error();
      begflag=2;
      /* parsed after sanitising, so the name can contain no '/' or '\' */
      sscanf(line,"%499s %29s",keyword,name);
      /* target_name is shorter than name: keep what fits, always terminated */
      strncpy(ss_data->target_name,name,sizeof(ss_data->target_name)-1);
      ss_data->target_name[sizeof(ss_data->target_name)-1]='\0';
      ss_data->target_n_aa=read_seq(ss_data->target_aa,name);
    }
    else if(!strcmp(keyword,"AUTHOR")) {
      if(begflag<1) ss_f_order_error();
      ss_f_echo_line(line);
      authflag=1;
    }
    else if(!strcmp(keyword,"REMARK")) {
      if(begflag<1) ss_f_order_error();
      ss_f_echo_line(line);
    }
    else if(!strcmp(keyword,"METHOD")) {
      if(begflag<1) ss_f_order_error();
      ss_data->n_method++;
    }
    else if(!strcmp(keyword,"MODEL")) {
      if(begflag<2 || authflag!=1) ss_f_order_error();
      i=0;
      sscanf(line,"%499s %99s",keyword,model_nb);
      sscanf(model_nb,"%d",&i);
      /* the index token must be a single character */
      if(model_nb[1]!=' ' && model_nb[1]!='\0' && model_nb[1]!='\n') {
        printf("\n# ERROR! Unacceptable index of the prediction MODEL\n\n");
        ss_data->errors++;
        escape(ss_data->errors);
      }
      if(i>0 && i<=5) {
        begflag=3;
        ss_data->model=i;
        printf("\n# Reading MODEL %2d\n",i);
        read_data_ssp(ss_data,fp);
        break;
      }
      else {
        printf("\n# ERROR! Unacceptable index of the DR prediction MODEL\n\n");
        exit(0);
      }
    }
    else if(!strcmp(keyword,"END")) {
      ss_data->end=1;
      break;
    }
    else if(keyword[0]!=' ') {
      /* a non-blank line whose first token is not a known keyword */
      ss_f_echo_line(line);

      printf("\n# ERROR! Unknown record keyword in this section of the prediction.");
      printf("\n#        Check DR format description.\n\n");
      ss_data->errors++;
      escape(ss_data->errors);
    }
  }
  if(begflag!=3) {
    printf("\n# ERROR! There is no DR prediction data in this file\n\n");
    exit(0);
  }
  if(ss_data->end==0) {
    printf("\n# ERROR! There is no END record in this file\n\n");
    exit(0);
  }
  if(authflag!=1) {
    printf("\n# ERROR! Check AUTHOR record in this file\n\n");
    exit(0);
  }
  if(ss_data->n_method==0) {
    printf("\n# ERROR! There is no METHOD records in this file\n\n");
    exit(0);
  }
  fclose(fp);

  return;
}

/*-----------------------------------------------------------
/
/   read_seq - read a sequence file
/
/   Opens TARGETS/<tname>.seq.txt, retrying with every 't' in
/   the name changed to 'T' and then with every 'T' changed to
/   't'.  Lines starting with '>' are skipped; on all other
/   lines blanks are ignored and every remaining character must
/   be one of the 20 residue codes accepted by check_aa().
/
/   seq   - receives the residues (not NUL-terminated); must
/           have room for MAXRES characters
/   tname - target name
/
/   Returns the number of residues stored.  A missing file, an
/   unknown residue code or a sequence longer than MAXRES
/   prints a message and calls exit(0).
/
/------------------------------------------------------------*/

/* Open TARGETS/<name>.seq.txt for reading; NULL when it cannot be opened
   or when the name would not fit into the path buffer. */
static FILE *open_target_seq(const char *name)
{
  char fname[64];

  if(strlen(name)>sizeof(fname)-sizeof("TARGETS/.seq.txt")) return NULL;
  strcpy(fname,"TARGETS/");
  strcat(fname,name);
  strcat(fname,".seq.txt");
  return fopen(fname,"r");
}

int read_seq(char* seq, char* tname)
{
  int i, n_aa;
  FILE *fp;
  char line[MAXRES], lname[32];

  fp=NULL;
  /* a name that does not fit cannot name an existing target file */
  if(strlen(tname)<sizeof(lname)) {
    strcpy(lname,tname);
    fp=open_target_seq(lname);
    if(fp==NULL) {
      /* case conversions stop at the terminator */
      for(i=0;lname[i]!='\0';i++) if(lname[i]=='t') lname[i]='T';
      fp=open_target_seq(lname);
    }
    if(fp==NULL) {
      for(i=0;lname[i]!='\0';i++) if(lname[i]=='T') lname[i]='t';
      fp=open_target_seq(lname);
    }
  }
  if(fp==NULL) {
    printf("\n# ERROR! There is no target name:  %s",tname);
    printf("\n# TARGET Txxxx      # was expected\n\n");
    exit(0);
  }

  n_aa=0;
  while(fgets(line,MAXRES,fp)!=NULL) {
    if(line[0]=='>') continue;   /* header line */
    /* stop at the newline OR the terminator: the last line of a file may
       lack a newline, and bytes after the terminator are stale */
    for(i=0;line[i]!='\0' && line[i]!='\n';i++) {
      if(line[i]==' ') continue;
      if(check_aa(line[i])>=20) {
        printf("\n# ERROR! Check file  %s  Wrong amino acid code  %c\n\n",tname,line[i]);
        fclose(fp);
        exit(0);
      }
      if(n_aa>=MAXRES) {
        /* the destination holds MAXRES residues at most */
        printf("\n# ERROR! Check file  %s  Sequence is longer than %d residues\n\n",tname,MAXRES);
        fclose(fp);
        exit(0);
      }
      seq[n_aa]=line[i];
      n_aa++;
    }
  }
  fclose(fp);
  return n_aa;
}

/*-----------------------------------------------------------
/
/   check_ss_f - checks the DR prediction file format
/
/   Compares the stored model with the target: residue count,
/   residue codes position by position, state character
/   ('D' or 'O') and confidence range (0.0 .. 1.0).  Each
/   problem is printed, counted in ss_data->errors and passed
/   to escape().  Returns early when the model holds no
/   residues or no residue with a counted confidence.
/
/------------------------------------------------------------*/
void check_ss_f(ss_f *ss_data)
{
  int pos, n_model, n_target;

  printf("\n# Checking the DR prediction MODEL %2d\n",ss_data->model);

  n_model=ss_data->ssp.n_aa;
  n_target=ss_data->target_n_aa;

  if(n_model!=n_target) {
    printf("\n# ERROR! Check the number %d of residues in the model. In target: %d residues.",
                       n_model,n_target);
    printf("\n#        Check the TARGET %s specification (see Template Sequence file).\n",
                       ss_data->target_name);
    ss_data->errors++;
    escape(ss_data->errors);
  }

  if(n_model==0 || ss_data->ssp.n_confidence==0) {
    printf("# ERROR! The number of predicted residues in the model: 0\n");
    printf("#        Check the confidence level of the residues in the model.\n");
    ss_data->errors++;
    escape(ss_data->errors);
    return;
  }

  pos=0;
  while(pos<ss_data->ssp.n_aa) {
    int   model_res  = ss_data->ssp.aa[pos];
    int   state      = ss_data->ssp.ss[pos];
    float confidence = ss_data->ssp.confidence[pos];
    char  target_res = ss_data->target_aa[pos];
    int   number     = pos+1;   /* residue numbers are reported 1-based */

    if(target_res!=model_res) {
      printf("# ERROR! Check residue %c number %d. (In TARGET: %c %d)\n",
                model_res,number,target_res,number);
      ss_data->errors++;
      escape(ss_data->errors);
    }
    if(!(state=='D' || state=='O')) {
      printf("# ERROR! Check the DR conformation %c for the residue number %d\n",
                state,number);
      ss_data->errors++;
      escape(ss_data->errors);
    }
    if(confidence<0.0 || confidence>1.0) {
      printf("# ERROR! Check confidence = %5.2f for residue number %d\n",
                confidence,number);
      ss_data->errors++;
      escape(ss_data->errors);
    }
    pos++;
  }

  printf("# Checking the DR prediction MODEL %2d      (DONE)\n",ss_data->model);
}

/*-----------------------------------------------------------
/
/   check_aa - checks an amino acid
/
/   Looks `token` up among the first 21 characters of the
/   global `letter` table.  Returns the index of the first
/   match (0..20), or 21 when there is none.
/
/------------------------------------------------------------*/
int check_aa(char token)
{
  int idx=0;

  /* advance until a match is found or all 21 entries were tried */
  while(idx<21 && letter[idx]!=token)
    idx++;
  return idx;
}

/*-----------------------------------------------------------
/
/   read_data_ssp - read the DR predictions file format.
/                   Three column DR CASP4 format.
/
/   Reads lines from `fp` until an END record or end of file.
/   Lines starting with '#' and REMARK records are ignored,
/   METHOD records are counted, PARENT and TER records are
/   reported as errors.  Every other line must read
/       <residue code> <D|O> <confidence>
/   and is stored in ss_data->ssp.
/
/   A line whose second token does not start with 'D' or 'O'
/   (including a blank line) prints a message and calls
/   exit(0).  Other problems increment ss_data->errors and go
/   through escape().  Reading stops, with ss_data->end set to
/   1, as soon as the model holds more residues than the
/   target or than the tables can store.
/
/------------------------------------------------------------*/

/* Blank out the characters % " / \ in `line` (up to its terminator only)
   and print the result verbatim. */
static void ssp_echo_line(char *line)
{
  char *p;

  for(p=line;*p!='\0';p++)
    if(*p=='%' || *p=='"' || *p=='/' || *p=='\\') *p=' ';
  /* fputs, not printf: the text comes from the input file and must never
     be interpreted as a format string */
  fputs(line,stdout);
}

void read_data_ssp(ss_f *ss_data, FILE* fp)
{
  int n_aa;
  float conf;
  char line[500],keyword[500],second[100];

  n_aa=0;
  ss_data->ssp.n_aa=n_aa;
  while (fgets(line, 500, fp) != NULL) {
    strcpy(keyword,"   ");   /* stays blank when the line holds no token */
    sscanf(line,"%499s",keyword);
    if(!strcmp(keyword,"END")) {
      ss_data->end=1;
      return;
    }
    else if(keyword[0]=='#') {}
    else if(!strcmp(keyword,"REMARK")) {}
    else if(!strcmp(keyword,"PARENT") ||
            !strcmp(keyword,"TER")) {
      ssp_echo_line(line);

      printf("\n# ERROR! Unknown record in the MODEL - END section.");
      printf("\n#        Check three column DR format.\n\n");
      ss_data->errors++;
      escape(ss_data->errors);
    }
    else if(!strcmp(keyword,"METHOD")) {
      ss_data->n_method++;
    }
    else {
      strcpy(keyword,"   ");
      strcpy(second,"   ");
      conf=-1.0;   /* out of range unless a value is actually read */
      /* field widths keep long tokens inside their buffers */
      sscanf(line,"%499s %99s %f",keyword,second,&conf);
      if(second[0]!='D' && second[0]!='O') {
        ssp_echo_line(line);

        printf("\n# ERROR! Check the line NB = %d in the MODEL section. ",n_aa+1);
        if(keyword[0]==' ') {
          printf("\n#        The blank records are not allowed.");
        }
        printf("\n#        Check the DR format description.\n\n");
        exit(0);
      }
      if(check_aa(keyword[0])>=20) {
        printf("\n# ERROR! Check the DR format. Wrong amino acid code  %c (NB = %d)\n\n",keyword[0],n_aa+1);
        ss_data->errors++;
        escape(ss_data->errors);
      }
      if(conf<0.0 || conf>1.0) {
        printf("\n# ERROR! Check the line NB = %d in the MODEL section. ",n_aa+1);
        printf("\n#        Check the value of the confidence level.\n\n");
        ss_data->errors++;
        escape(ss_data->errors);
      }
      if(n_aa>=MAXRES) {
        /* the per-residue tables hold MAXRES entries; storing one more
           would write past their end */
        printf("\n# ERROR! There are more than %d residues in the model.",MAXRES);
        printf("\n#        Check the TARGET %s specification. In target: %d residues\n",
                           ss_data->target_name,ss_data->target_n_aa);
        ss_data->errors++;
        escape(ss_data->errors);
        ss_data->end=1;
        return;
      }
      ss_data->ssp.aa[n_aa]=keyword[0];
      ss_data->ssp.ss[n_aa]=second[0];
      ss_data->ssp.confidence[n_aa]=conf;
      n_aa++;
      ss_data->ssp.n_aa=n_aa;
      if(conf>=0.0 && conf!=0.5 && conf<=1.0) ss_data->ssp.n_confidence++;
      if(n_aa>ss_data->target_n_aa) {
        printf("\n# ERROR! There are more residues in the model than in the TARGET.");
        printf("\n#        Check the TARGET %s specification. In target: %d residues\n",
                           ss_data->target_name,ss_data->target_n_aa);
        ss_data->errors++;
        escape(ss_data->errors);
        ss_data->end=1;   /* stop reading; the missing END is not reported on top */
        return;
      }
    }
  }
  return ;
}
