/******************************************************************************
 The computer software and associated documentation called DOMAK hereinafter
 referred to as the WORK which is more particularly identified and described in
 Appendix A of the file LICENSE.  Conditions and restrictions for use of
 this package are also in this file.

 This routine was developed by Robert B. Russell

 The WORK was developed by:
        Asim S. Siddiqui and Geoffrey J. Barton
        Laboratory of Molecular Biophysics
        University of Oxford
        Rex Richards Building
        South Parks Road
        Oxford OX1 3QU U.K.
        Tel:  (+44) 865-275379
        FAX:  (+44) 865-510454
        INTERNET: as@bioch.ox.ac.uk
        JANET:    as@uk.ac.ox.bioch

 The WORK is Copyright (1995) University of Oxford
        Administrative Offices
        Wellington Square
        Oxford OX1 2JD U.K.

 All use of the WORK must cite:
 Siddiqui, A. S. and Barton, G. J., "Continuous and Discontinuous Domains: An
 Algorithm for the Automatic Generation of Reliable Protein Domain Definitions" 
 PROTEIN SCIENCE, 4:872-884 (1995).
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "structs.h"

#define max_hetatm 1000
/* getdat: reads in appropriate information from a given PDB file */

struct residue *getdat(f,info,res,waters,acefor,force)
FILE *f;
struct protein *info;
struct residue *res;
int waters,acefor;
int force;
{
     int i,j,k,count,atcount,end,aorblank,not_hydrogen,n_hetatm;
     int not_protein,n_main_chain;
     int model_number;
     char c;

     /* count records the number of residues considered,
      *   atcount is a record of the number of atoms in each
      *   residue considered. */

     char *buff,*temp,*current,*last;

     buff=(char*)malloc(100*sizeof(char));
     temp=(char*)malloc(100*sizeof(char)); 
     current=(char*)malloc(10*sizeof(char));  
     last=(char*)malloc(10*sizeof(char)); 
     res=(struct residue *)malloc(2*sizeof(struct residue));
     /* The first residue entry is for heteroatoms */

     n_main_chain=0;
     printf("reading PDB file....\n");
     /* Heteratom stuff */
     n_hetatm=0; 
     strcpy(&res[0].resname[0],"HET"); res[0].resname[3]='\0';
     res[0].aa='?'; res[0].kssum='?'; 
     res[0].resnum.cid='?'; 
     res[0].resnum.in='?'; 
     res[0].resnum.n=0;
/*   strncpy(temp,&buff[22],4); temp[4]='\0'; 
     sscanf(temp,"%d",&res[0].resnum.n);
     if(buff[21]!=' ') res[0].resnum.cid=buff[21]; 
     else res[0].resnum.cid='_';
     if(buff[26]!=' ') res[0].resnum.in=buff[26];
     else res[0].resnum.in='_';  */
     j=max_hetatm;
     res[0].atoms=(float**)malloc(j*sizeof(float*));
     res[0].j=j;
     for(k=0; k<j; ++k) res[0].atoms[k]=(float*)malloc(3*sizeof(float));
     res[0].atname=(char**)malloc(j*sizeof(char*));
     for(k=0; k<j; ++k) res[0].atname[k]=(char*)malloc(6*sizeof(char));
     res[0].atnum=(int*)malloc(j*sizeof(int));
     res[0].occ=(float*)malloc(j*sizeof(float));
     res[0].B=(float*)malloc(j*sizeof(float));
     res[0].num=0;
     res[0].nrc=0;
     res[0].med_nrc=0;
     res[0].long_nrc=0;
     res[0].contacts=0;
     res[0].sidecont=0;
     res[0].elec=0; 
     res[0].sideelec=0; 
     res[0].acc=0; 
     res[0].s_s=0;
     res[0].hyd=0;
     res[0].sidehyd=0;
     res[0].polar=0;
     res[0].sidepolar=0;
     res[0].ksnum=0;

     /* read the file till 'ATOM' is encountered */
     i=0; count=0; last[0]='\0';
     end=0;
    /* printf("Marker 1\n"); */
     while(!end) {
	while(((c=getc(f)) != (char)EOF) && (c!='\n')) buff[i++]=c;
	if(c==(char)EOF) break;
	buff[i]='\0'; i=0; 

	aorblank=((buff[16]==' ') || (buff[16]=='A'));
		/* only reads the atom if it is labelled blank or A, this is (perhaps)
		 *  this simplest way of excluding multiple conformations */
	not_hydrogen=(buff[13]!='H'); /* ignores H's */
	if(testaa(&buff[17])) 
	   not_protein=0;
	else 
	   not_protein=1;

	if(not_protein && strncmp(buff,"ATOM  ",6)==0) { 
	  strncpy(temp,&buff[17],3); temp[3]='\0';
	  printf("warning: unknown amino acid type %3s classified as HETATM\n",temp);
	}
	if((strncmp(buff,"ATOM  ",6)==0) &&
	   ( (strncmp(&buff[17],"FOR",3)!=0 && strncmp(&buff[17],"ACE",3)!=0) || acefor)
	   && !not_protein) {
	strncpy(&current[0],&buff[21],7); current[7]='\0';
	if(strcmp(current,last)!=0) {
	   /* If we are at a new residue, read in residue info */
	   /* First we must see if the last residue had a complete main chain */
	   if(n_main_chain<4 && count!=0 && force==0) {  
	      printf("WARNING: main chain atoms missing for residue %s\n",last);
	      printf("  will ignore this residue\n");
	      /* we must free all the old memory */
	      for(k=0; k<res[count].j; ++k) {
		free(res[count].atoms[k]);
		free(res[count].atname[k]);
	      }
	      free(res[count].atoms); free(res[count].atname);
	      free(res[count].atnum); free(res[count].occ); 
	      free(res[count].B);
	   } else { 
	     count++; 
	     res=(struct residue *)realloc(res,(count+1)*sizeof(struct residue));
	   }
	   n_main_chain=0;
	   atcount=0;
	   strncpy(&res[count].resname[0],&buff[17],3);
	   res[count].resname[3]='\0';
	   res[count].aa=RBR_a3to1(res[count].resname);
	   strncpy(temp,&buff[22],4); temp[4]='\0';
	   sscanf(temp,"%d",&res[count].resnum.n);
	   if(buff[21]!=' ') res[count].resnum.cid=buff[21];
	   else res[count].resnum.cid='_'; /* underscore = no chain */
	   if(buff[26]!=' ') res[count].resnum.in=buff[26];
	   else res[count].resnum.in='_';  /* as above */
	  /* printf("%s\n",buff); */
	   /* allocate res for number of atoms in a given residue */
	   j=RBR_numberofats(res[count].resname)+20;
	   res[count].atoms=(float**)malloc(j*sizeof(float*));
	   res[count].j=j;
	   for(k=0; k<j; ++k)
	      res[count].atoms[k]=(float*)malloc(3*sizeof(float));
	   res[count].atname=(char**)malloc(j*sizeof(char*));
	   for(k=0; k<j; ++k)
	      res[count].atname[k]=(char*)malloc(6*sizeof(char));
	   res[count].atnum=(int*)malloc(j*sizeof(int));
	   res[count].occ=(float*)malloc(j*sizeof(float));
	   res[count].B=(float*)malloc(j*sizeof(float));
	   res[count].num=0;
	   res[count].nrc=0;
	   res[count].med_nrc=0;
	   res[count].long_nrc=0;
	   res[count].contacts=0;
	   res[count].sidecont=0;
	   res[count].hyd=0;
	   res[count].sidehyd=0;
	   res[count].elec=0;
	   res[count].sideelec=0;
	   res[count].polar=0;
	   res[count].sidepolar=0;
	   res[count].acc=0;
	   res[count].s_s=0;
        } /* End of if(strcmp(current,last);
	/* Read in atom data, if not_hydrogen and aorblank are != 0 */
	if(not_hydrogen && aorblank) {
	/* if atcount>=j return an error message */
	if(atcount>=j) {
	   printf("funny pdb file format... last line read:\n%s\n",buff);
	   strcpy(res[0].resname,"error");
	   return res;
	   }
	strncpy(res[count].atname[atcount],&buff[12],4);      /* name */
	if(strncmp(&buff[12]," CA ",4)==0 || strncmp(&buff[12]," N  ",4)==0 ||
	   strncmp(&buff[12]," C  ",4)==0 || strncmp(&buff[12]," O  ",4)==0 ) n_main_chain++;
	res[count].atname[atcount][4]='\0';
	sscanf(&buff[6],"%d",&res[count].atnum[atcount]);     /* number */
	sscanf(&buff[30],"%f",&res[count].atoms[atcount][0]); /* X */
	sscanf(&buff[38],"%f",&res[count].atoms[atcount][1]); /* Y */
	sscanf(&buff[46],"%f",&res[count].atoms[atcount][2]); /* Z */
	sscanf(&buff[54],"%f",&res[count].occ[atcount]);      /* Occ */
	sscanf(&buff[60],"%f",&res[count].B[atcount]);	      /* B */
	strcpy(last,current); atcount++; res[count].num++;
/*	printf("Read %d atom\n",atcount); */
	} /* End of if(not_hydrogen... */
	} else if((strncmp(buff,"ATOM  ",6)==0 && not_protein) ||  (strncmp(buff,"HETATM",6)==0 && (strncmp(&buff[17],"HOH",3)!=0 || waters==1)) ) {
	   /* if `HETATM' is encountered, then the coordinates are stored in one `res' (0)
	    * all atoms (except for HOH unless desired) are read in
	    * untill `HETATM' is no longer encountered */
	  if(not_hydrogen) {
	    if((n_hetatm)>=(max_hetatm)) { 
	      printf("funny pdb file format... last line read:\n%s\n",buff);
	      strcpy(res[0].resname,"error");
    	      return res;
	    }
	    if(strncmp(&buff[17],"HOH",3)==0) strcpy(res[0].atname[n_hetatm],"HOH ");
	    else strncpy(res[0].atname[n_hetatm],&buff[12],4); /* name */
	    res[0].atname[n_hetatm][4]='\0';
            sscanf(&buff[6],"%d",&res[0].atnum[n_hetatm]);     /* number */
            sscanf(&buff[30],"%f",&res[0].atoms[n_hetatm][0]); /* X */
            sscanf(&buff[38],"%f",&res[0].atoms[n_hetatm][1]); /* Y */
	    sscanf(&buff[46],"%f",&res[0].atoms[n_hetatm][2]); /* Z */
	    sscanf(&buff[54],"%f",&res[0].occ[n_hetatm]);      /* Occ */
	    sscanf(&buff[60],"%f",&res[0].B[n_hetatm]);        /* B */
            strcpy(last,current); n_hetatm++;
            res[0].num++;
          } /* End of if(not_hydrogen... */
     }/* End of if((strncmp(buff,... */

     /* check for multiple model structures (i.e., with NMR) */
     if(strncmp(buff,"ENDMDL",6)==0) end=1;
     if(strncmp(buff,"MODEL ",6)==0) {
	/* if we see the word model, give a warning, and end if the number is greater than 1 */
	printf("WARNING: MODEL keyword found.\n");
	printf("  Will ignore models greater than 1\n");
	sscanf(&buff[6],"%d",&model_number);
	if(model_number>1) end=1;
      }
   } /* End of while... */

   /* we must do one last check for main chain atoms */

   if(n_main_chain<4 && count!=0 && force==0) {  
      printf("WARNING: main chain atoms missing for residue %s\n",last);
      printf("  will ignore this residue\n");
      /* we must free all the old memory */
      for(k=0; k<res[count].j; ++k) {
	free(res[count].atoms[k]);
	free(res[count].atname[k]);
      }
      free(res[count].atoms); free(res[count].atname);
      free(res[count].atnum); free(res[count].occ); 
      free(res[count].B);
      count--;
   } 
   info[0].num=count+1;
   free(buff); free(temp); free(current); free(last);
   printf("               done.\n");
   return res;
}

/* printres: prints one residue to standard output.
 *   The first line gives the residue name, chain identifier, residue number
 *   and insertion code; each of the lr.num atoms then gets an "atom: <index>"
 *   line followed by its serial number, name, X, Y, Z, occupancy and B value.
 *   Always returns 0 (the function is declared int, so it must return
 *   something; previously the value was indeterminate). */
int printres(lr)
struct residue lr;
{
	int i,atcount;

	atcount=lr.num;
	printf("%s %c %d %c\n",
	   lr.resname,lr.resnum.cid,lr.resnum.n,
	   lr.resnum.in);
	for(i=0; i<atcount; ++i) {
	   printf("atom: %d\n",i);
	   printf("%d %s %f %f %f %f %f\n",
	      lr.atnum[i],lr.atname[i],
	      lr.atoms[i][0],lr.atoms[i][1],lr.atoms[i][2],
	      lr.occ[i],lr.B[i]);
	}
	return 0;
}
/* testaa: tells whether a three-letter residue code is one the program
 *   knows about.  Only the first three characters of taa are compared.
 *   Returns 1 for a known code and 0 otherwise; getdat files ATOM records
 *   with an unknown code under the heteroatom entry. */
int testaa(taa)
char *taa;
{
	static const char *const known_codes[] = {
	  "ALA", "ASX", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU", "MET",
	  "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "UNK", "TYR", "GLX", "CYH",
	  "ACD", "ALB", "ALI", "ABU", "ARO", "BAS", "BET", "HSE", "HYP", "HYL", "ORN", "PCA",
	  "SAR", "TAU", "THY", "UNK", "ACE", "FOR", "CYH", "CSH", "CSS", "CYX", "ILU", "PRZ",
	  "PR0", "CPR", "TRY", "HOH", "PLP"
	};
	const char *const *stop = known_codes + sizeof known_codes / sizeof known_codes[0];
	const char *const *code;

	/* walk the table until a code matches on its three letters */
	for(code=known_codes; code!=stop; ++code) {
	  if(strncmp(*code,taa,3)==0) return 1;
	}
	return 0;
}
