#!CHANGEJNET

# hack to pull the alignments and profiles from psiblast

# -m6 switch is needed to get the alignments
# in the BLAST report, also generates the
# psiblast frequency profiles, as they
# are no longer present in the output

use Getopt::Std;
getopt(hfgud);

if ($opt_f eq"req"){$freq=1}
elsif ($opt_g eq "ap"){$gap=1}
elsif ($opt_u eq "ngap"){$ungap=1}
elsif ($opt_d eq "egap"){$degap=1}
elsif ($opt_h eq "elp"){
    print " 
# Parse_PSI

This little script allows parsing of psiblast report files
to extract the embedded sequence alignment, and write out
in different formats.

parse_psi -help  : help
parse_psi -degap : remove all gaps in alignment and print as
                   a fasta format file
parse_psi -ungap : remove gaps in query sequence, and those
                   below the gap in the query sequence, print
                   as fasta format
parse_psi -gap   : print alignment with gaps in fasta format
parse_psi -freq  : print frequency profile needed for Jnet 

Usage:

parse_psi -freq file.blast > file.freq
\n";exit(0);
}

else {print "NO options given, try -help\n";exit}
if (!-e "$ARGV[0]"){
    print ("No such file as $ARGV[0] exists in this path...\n");
    exit (0);
}

if ($ARGV[0] eq ""){
    print "No filename given... \n";
    exit;
}

open IN, "$ARGV[0]";
$num=$thisone=$c=0;
while(<IN>){
    if (/^Sequences producing significant alignments/){
	$num++;
    }
}
print STDERR "$num alignments were found in PSIBlast file $ARGV[0]\n";
$num--;

close IN;
open IN, "$ARGV[0]";
while(<IN>){
    if (/^Sequences producing significant alignments/){
	if ($thisone == $num){
	    while (($ln2 = <IN>)){
		if ($ln2 =~ /^QUERY/){
		    $alignout[$c]=$ln2;
		    $c++;
		    while (($align = <IN>) !~ /^  Database/){
		       	$alignout[$c]=$align;
			$c++;
		    }
		    last;
		}
	    }
	}
	$thisone++;
    }
}

$c=$j=$z=$blocks=0;

# get number of blocks and number of sequences in a block

foreach (@alignout){    
    if ($_ =~ /^\n/){
	$blocks++;
    }
    if ($_ !~ /^\n/){
	$store[$blocks][$z]=$_;
	$z++;
    }
}
$npb=($z/($blocks+1));

for ($i=0; $i < ($blocks+1); $i++){
    for ($j=0; $j < $npb ; $j++){
	# get the alignment itself
	#print "$store[$i][$q]";
	(@tokens)=split(" ",$store[$i][$q]);
	#print "$tokens[0] $tokens[1] $tokens[2] $tokens[3]\n";
	if($#tokens == 3){
	    $id=$tokens[0];
	    $interest=$tokens[2];
	}
	if($#tokens == 1){
	    $id=$tokens[0];
	    $interest=$tokens[1];
	}
	if($#tokens == 2){
	    $id=$tokens[0];
	    $interest=$tokens[1];
	    if ($interest =~ /\d/g){
		# get the next token, number
		# must be at this end
		$interest=$tokens[2];
	    }
	}
	# remove any digits that may creep in with the 3 token,
	# this would mean the coord is on the end not in the
	# middle.
	$interest =~ s/\d//g;
	undef @tokens;
       
	$namekeep[$j]=$id;
	$seqkeep[$j]=$seqkeep[$j].$interest;
	$q++;
    }
}


if ($gap==1 || $degap==1){
    $c=0;
    foreach (@seqkeep){
	
	if ($degap==1){
	  $seqkeep[$c] =~ s/-//g;
	}
	print ">$namekeep[$c]\n$seqkeep[$c]\n";
	$c++;
    }
}

if ($ungap==1 || $freq == 1){
    $c=0;
    
    @sequence=split("",$seqkeep[0]);
    
    foreach (@sequence){
	if($sequence[$count] eq "\-"){
	    $printer[$count]=1;
	}
	else{$printer[$count]=0;}
	
	$count++;
    }
    $d=0;
    foreach $erm (@seqkeep){
	undef @sequence;
	@sequence=split("",$erm);
	$count=0;
	foreach (@sequence){
	    if ($printer[$count]==0){
		$ung[$d]=$ung[$d].$sequence[$count];
	    }
	    $count++;
	}
	$d++;
    }
    if ($ungap == 1){
	$c=0;
	foreach $re (@ung){
	    print ">$namekeep[$c]\n$re\n";
	    $c++;
	}
    }
    if ($freq == 1){
	# do the counting....
	$seqlen=length($ung[0]);
	
	#get a double array	
	for ($i =0 ; $i < $#ung ; $i++){
	    undef @sequence;
	    @sequence=split("",$ung[$i]);

	    # convert sequence to an integer value
	    @ints = seq2int(@sequence);
	    $count=0;
	    $j=0;
	    foreach (@sequence){
		$ar[$i][$j]=$ints[$j];
		$j++;
	    }
	}
	
	# zero countup
	for ($r=0; $r <20; $r++){ 
	    $countup[$r]=0;
	}
	
	for ($i =0 ; $i < ($seqlen);  $i++){ 
	    # skip the query sequence
	    for ($j=1; $j < $#ung ;$j++){
		# look at array
		$countup[$ar[$j][$i]]++;
	    }
	   
	    # get total counts for this column
	    $tot = 0;
	    for ($r=0; $r <20; $r++){
		$tot=$tot+$countup[$r];
	    }
	    for ($r=0; $r <20; $r++){
		# get percent and round up to 10
		# this seems to be slightly different to how 
		# psiblast works god knowns how.... 
                # works for now. :-(
		$outnum=(($countup[$r])/($tot))*10;
		printf ("%2.0f ",$outnum);
		$countup[$r]=0;
	    }
	    print "\n";
	}
    }
}

sub seq2int{
    my(@s)=@_;
    $g=0;
    for ($g=0; $g <= $#s; $g++){
	if ($s[$g] eq 'A'){ $intseq[$g] = "0"};
	if ($s[$g] eq 'R'){ $intseq[$g] = "1"};
	if ($s[$g] eq 'N'){ $intseq[$g] = "2"};
	if ($s[$g] eq 'D'){ $intseq[$g] = "3"};
	if ($s[$g] eq 'C'){ $intseq[$g] = "4"};
	if ($s[$g] eq 'Q'){ $intseq[$g] = "5"};
	if ($s[$g] eq 'E'){ $intseq[$g] = "6"};
	if ($s[$g] eq 'G'){ $intseq[$g] = "7"};
	if ($s[$g] eq 'H'){ $intseq[$g] = "8"};
	if ($s[$g] eq 'I'){ $intseq[$g] = "9"};
	if ($s[$g] eq 'L'){ $intseq[$g] = "10"};
	if ($s[$g] eq 'K'){ $intseq[$g] = "11"};
	if ($s[$g] eq 'M'){ $intseq[$g] = "12"};
	if ($s[$g] eq 'F'){ $intseq[$g] = "13"};
	if ($s[$g] eq 'P'){ $intseq[$g] = "14"};
	if ($s[$g] eq 'S'){ $intseq[$g] = "15"};
	if ($s[$g] eq 'T'){ $intseq[$g] = "16"};
	if ($s[$g] eq 'W'){ $intseq[$g] = "17"};
	if ($s[$g] eq 'Y'){ $intseq[$g] = "18"};
	if ($s[$g] eq 'V'){ $intseq[$g] = "19"};
        if ($s[$g] eq '-'){ $intseq[$g] = "25"};
    }
    return (@intseq);
}
