/******************************************************************************
 The computer software and associated documentation called DOMAK hereinafter
 referred to as the WORK which is more particularly identified and described in
 Appendix A of the file LICENSE.  Conditions and restrictions for use of
 this package are also in this file.

 This routine was developed by Asim S. Siddiqui


 All use of the WORK must cite:
 Siddiqui, A. S. and Barton, G. J., "Continuous and Discontinuous Domains: An
 Algorithm for the Automatic Generation of Reliable Protein Domain Definitions" 
 PROTEIN SCIENCE, 4:872-884 (1995).
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <glob_routs.h>
#include <include.h>


/* DOMAIN_SCREEN 
 * Reads in domains specifying STAMP format for DSSP or PDB files and filters
 *  them based on Radius of Gyration, percent of hydrogen bonds missing, etc. 
 *  See my thesis, Chapter 4 (end) 
 *
 * 12/11/93
 * Modification.  Istvan Simon has pointed out that Rg should increase as a function
 *  of L to the power 1/3 (i.e., due to the volume of a sphere, etc.).
 *  Using 66 domains (originally used to derive the straight line fit) I
 *  obtained a good fit to the data with:
 *   Rg = 2.357 x L ^ 0.366
 *  Most real compact domains seem to have Rg within 3 angstroms of this curve,
 *   though I would expect that the relationship will not hold for domains larger
 *   than about 200 residues.
 *  The program now uses this function.  To use the old function, one need only run
 *   the program using the option "-t" (for thesis).
 */
/* as_glob_alloc
 * malloc() wrapper used by as_globularity(): returns nbytes of storage or,
 * when the allocation fails, reports the error and ends the process with
 * exit(-1), matching how the other fatal errors in as_globularity() are
 * handled.
 */
static void *
as_glob_alloc(size_t nbytes)
{
	void *mem=malloc(nbytes);

	if(mem==NULL) {
		printf("error: out of memory\n");
		exit(-1);
	}
	return mem;
}

/* as_globularity
 * Scores how compact the domain described by domain_string is.
 *
 * The descriptor is parsed with rbrg_getdomain(), the C-alpha coordinates of
 * every object in the domain are read from the named file with RBR_igetca(),
 * and the radius of gyration (Rg) of the resulting set of L residues is
 * compared with the value expected for that length:
 *    fit = Rg - A * L^B                     (default)
 *    fit = Rg - (GRADIENT * L + INTERCEPT)  (old_method, L <= LARGE)
 *    fit = Rg - LARGER_LIMIT                (old_method, L >  LARGE)
 * Positive values probably indicate a non-compact domain.
 *
 * domain_string  domain descriptor, handed unchanged to rbrg_getdomain()
 * returns        the fit of the last domain processed, or 0.0 when the
 *                descriptor yields no domain at all
 *
 * The process is terminated with exit(-1) if the descriptor cannot be
 * parsed, the coordinate file cannot be opened or read, or memory runs out.
 */
float
as_globularity(char *domain_string)
{
	char *sec;
	char *buff;
	char Ftype;

	int i,j,k;
	int ndomain,total,add;
	int gottrans;
	int wseq,report_all;
	int start,end,n_hbonds,sflag,dssp_len;
	int n_missing_b_hbonds,n_b_hbonds;
	int include_comments,old_method;
	int *Ro;

	float Rg,fit;
	float longest,shortest;
	float mean,sd;
	float frac_b_miss;
	float dummy_float;

	FILE *PDB;

	struct domain_loc *domain;
	struct hbond *nho1,*nho2,*onh1,*onh2;
	struct brookn *brk;
	struct parameters parms;

	/* Defined results even when the descriptor yields no domain, in which
	 * case the loop below never runs. */
	fit=0.0;
	Ro=NULL;

	/* Comment buffer; nothing in this routine fills it, so keep it a valid
	 * empty string in case include_comments is ever switched on. */
	buff=as_glob_alloc(1000*sizeof(char));
	buff[0]='\0';

	/* Defaults */
	parms.PRECISION=1000;
	parms.MAX_SEQ_LEN=1000;
	parms.LARGER_LIMIT=18.0;
	parms.LARGE=150;
	parms.GRADIENT=0.0557092;
	parms.INTERCEPT=9.50;
	parms.MAX=2.0;
	parms.DIST_POWER=2;
	parms.HBOND_MIN_ENERGY=-0.5;
	parms.MAX_MISS_BHBOND=0.40;
	/* NOTE(review): the DOMAIN_SCREEN header comment in this file quotes the
	 * fitted curve as Rg = 2.357 x L ^ 0.366, while A is 2.257 here -
	 * confirm which value is intended. */
	parms.A=2.257;
	parms.B=0.3657;

	/* Hard-wired settings; they appear to correspond to the -w, -a, -c, -t
	 * and -p/-d switches listed by exit_error(). */
	wseq=1;
	report_all=1;
	include_comments=0;
	old_method=0;
	Ftype='p';

	/* Room for a single domain descriptor */
	ndomain=1;
	domain=as_glob_alloc(ndomain*sizeof(struct domain_loc));
	if(rbrg_getdomain(domain_string,domain,&ndomain,ndomain,&gottrans,stdout)==-1) exit(-1);

	for(i=0; i<ndomain; ++i) {
		if((PDB=fopen(domain[i].filename,"r"))==NULL) {
			printf("error: file %s does not exist\n",domain[i].filename);
			exit(-1);
		}
		domain[i].ncoords=0;
		domain[i].coords=as_glob_alloc(parms.MAX_SEQ_LEN*sizeof(int*));
		domain[i].aa=as_glob_alloc((parms.MAX_SEQ_LEN+1)*sizeof(char));
		domain[i].numb=as_glob_alloc((parms.MAX_SEQ_LEN)*sizeof(struct brookn));
		total=0;

		/* Append the CA atoms of every object of the domain; the file is
		 * rewound after each object so the next one is searched for from
		 * the beginning. */
		for(j=0; j<domain[i].nobj; ++j) {
			if(Ftype=='p') {
				if(RBR_igetca(PDB,&domain[i].coords[total],&domain[i].aa[total],&domain[i].numb[total],
					      &add,domain[i].start[j],domain[i].end[j],
					      domain[i].type[j],(parms.MAX_SEQ_LEN-total),0,parms.PRECISION,stdout)==-1) exit(-1);
			} else {
				/* DSSP coordinates would be read by RBR_igetcadssp(), but
				 * that function does not exist (calling it broke the build
				 * on OS X), so this path can only fail. */
				printf("error: reading CA coordinates from DSSP files is not supported\n");
				exit(-1);
			}
			total+=add;
			rewind(PDB);
		}
		fclose(PDB);
		domain[i].ncoords=total;

		/* Now we have CA information, lets look at the distance distribution */
		/* get center of mass */
		Ro=RBR_c_of_m_pep(domain[i].coords,domain[i].ncoords,domain[i].aa,wseq,&dummy_float);
		/* get radius of gyration */
		Rg=RBR_r_of_gyration_pep(Ro,domain[i].coords,domain[i].ncoords,domain[i].aa,wseq,parms.PRECISION,parms.DIST_POWER);
		RBR_dist_stats(domain[i].coords,domain[i].ncoords,&mean,&sd,&longest,&shortest,parms.PRECISION,parms.DIST_POWER);

		/* Decide whether this is a compact domain
		 *   positive values probably indicate a non-compact domain */
		if(old_method) {
			if(domain[i].ncoords<=parms.LARGE)
				fit=Rg-(parms.GRADIENT*domain[i].ncoords+parms.INTERCEPT);
			else
				fit=Rg-parms.LARGER_LIMIT;
		} else {
			fit=Rg-(parms.A*pow(domain[i].ncoords,parms.B));
		}

		/* If using DSSP files, count the number of Hydrogen bonds present in the region */
		n_hbonds=0; n_missing_b_hbonds=0; n_b_hbonds=0;
		if(Ftype=='d') {
			/* read the H-bonding schemes */
			nho1=get_hb(domain[i].filename,&j,"NHO1");
			nho2=get_hb(domain[i].filename,&j,"NHO2");
			onh1=get_hb(domain[i].filename,&j,"ONH1");
			onh2=get_hb(domain[i].filename,&j,"ONH2");
			sec=get_ss(domain[i].filename,&j);
			/* determine the start and end of the region within the DSSP file
			 *  this is pretty simplistic, since it only looks for the highest and
			 *  lowest DSSP numbers that occur within the domain (ie. if regions in
			 *  between have been removed, these will be included). */
			brk=get_bn(domain[i].filename,&dssp_len);
			start=10000; end=0;
			if(domain[i].type[0]==1) {
				start=1;
				end=dssp_len;
			} else {
				for(j=0; j<domain[i].nobj; ++j) {
					sflag=0;
					for(k=0; k<dssp_len; ++k) {
						/* first record belonging to object j */
						if(sflag==0 &&
						   ((domain[i].type[j]==2 && brk[k].cid==domain[i].start[j].cid) ||
						    (domain[i].type[j]==3 && brk[k].cid==domain[i].start[j].cid &&
						     brk[k].in==domain[i].start[j].in && brk[k].n==domain[i].start[j].n))) {
							sflag=1;
							if((k+1)<start) start=k+1;
						}
						/* last record of object j: a different chain (type 2),
						 * the named end residue (type 3), or the end of the file */
						if(sflag==1 &&
						   ((domain[i].type[j]==2 && brk[k].cid!=domain[i].start[j].cid && brk[k].cid!='!') ||
						    (domain[i].type[j]==3 && brk[k].cid==domain[i].end[j].cid &&
						     brk[k].in==domain[i].end[j].in && brk[k].n==domain[i].end[j].n) ||
						    k>=(dssp_len-1))) {
							if((k+1)>end) end=k+1-(domain[i].type[j]==2);
							break;
						}
					}
				}
			}
			/* count the number of hydrogen bonds
			 * two things must be satisfied:
			 *  1. the energy must be less than or equal to the minimum allowable H-bond energy
			 *  2. the residue to which the residue is hydrogen bonded must lie within the domain
			 *
			 * NOTE(review): j is a 0-based index while start/end are 1-based,
			 * yet j+pos is compared directly with start/end - confirm this is
			 * the intended range test. */
			for(j=0; j<dssp_len; ++j) {
				if(j>=(start-1) && j<=(end-1)) { /* first of all, are we in the range */
					if(nho1[j].energy<=parms.HBOND_MIN_ENERGY &&
					   (j+nho1[j].pos)>=start &&
					   (j+nho1[j].pos)<=end)
						n_hbonds++;
					if(nho2[j].energy<=parms.HBOND_MIN_ENERGY &&
					   (j+nho2[j].pos)>=start &&
					   (j+nho2[j].pos)<=end)
						n_hbonds++;
					if(onh1[j].energy<=parms.HBOND_MIN_ENERGY &&
					   (j+onh1[j].pos)>=start &&
					   (j+onh1[j].pos)<=end)
						n_hbonds++;
					if(onh2[j].energy<=parms.HBOND_MIN_ENERGY &&
					   (j+onh2[j].pos)>=start &&
					   (j+onh2[j].pos)<=end)
						n_hbonds++;
					/* check if sheet H-bonding is missing */
					if(sec[j]=='E' || sec[j]=='B') {
						if(nho1[j].energy<=parms.HBOND_MIN_ENERGY) {
							if((j+nho1[j].pos)<start || (j+nho1[j].pos)>end) n_missing_b_hbonds++;
							n_b_hbonds++;
						}
						if(nho2[j].energy<=parms.HBOND_MIN_ENERGY) {
							if((j+nho2[j].pos)<start || (j+nho2[j].pos)>end) n_missing_b_hbonds++;
							n_b_hbonds++;
						}
						if(onh1[j].energy<=parms.HBOND_MIN_ENERGY) {
							if((j+onh1[j].pos)<start || (j+onh1[j].pos)>end) n_missing_b_hbonds++;
							n_b_hbonds++;
						}
						if(onh2[j].energy<=parms.HBOND_MIN_ENERGY) {
							if((j+onh2[j].pos)<start || (j+onh2[j].pos)>end) n_missing_b_hbonds++;
							n_b_hbonds++;
						}
					}
				}
			}
			free(nho1); free(nho2); free(onh1); free(onh2);
			/* NOTE(review): sec and brk are not released; whether get_ss()
			 * and get_bn() hand ownership to the caller is not visible in
			 * this file. */
		}

		/* Fraction of sheet H-bonds whose partner lies outside the region;
		 * taken as zero when there are none, to avoid dividing by zero. */
		if(Ftype=='d' && n_b_hbonds>0) frac_b_miss=(float)n_missing_b_hbonds/(float)n_b_hbonds;
		else frac_b_miss=0.0;

		/* output findings if screening is satisfied */
		if(report_all || (frac_b_miss<parms.MAX_MISS_BHBOND && fit<=parms.MAX)) {
			if(include_comments) printf("%% %s\n",buff);
		}

		/* Free memory, including the centre of mass obtained for this domain */
		for(j=0; j<domain[i].ncoords; ++j)
			free(domain[i].coords[j]);
		free(domain[i].coords);
		free(domain[i].aa);
		free(domain[i].numb);
		free(Ro);
		Ro=NULL;
	}
	/* NOTE(review): anything rbrg_getdomain() may have allocated inside the
	 * descriptor (e.g. start/end/type) is not released here - confirm who
	 * owns it. */
	free(domain);
	free(buff);
	return(fit);
}
/* exit_error
 * Writes the domain_screen command-line usage summary to standard output
 * and then ends the process with exit(-1); control never returns to the
 * caller.
 */
int exit_error()
{
	static const char *const usage[] = {
		"format: domain_screen -f <domain file> -d -p -w -a -c -t -P <parameter file> \n",
		"          -d  => read DSSP files\n",
		"          -p  => read PDB files\n",
		"          -w  => weight according to sequence MW\n",
		"          -a  => report all (don't filter)\n",
		"          -c  => include comments\n",
		"          -t  => use thesis method (default is better)\n"
	};
	size_t line;

	/* Emit the usage text one table entry at a time */
	for(line=0; line<sizeof usage/sizeof usage[0]; ++line)
		fputs(usage[line],stdout);

	exit(-1);
}
