
/* 
  Copyright (c) 2002 Gavin E. Crooks <gec@compbio.berkeley.edu> and
                     Richard Green <ed@compbio.berkeley.edu>,
		     Univ. of California, Berkeley

  Permission is hereby granted, free of charge, to any person obtaining a 
  copy of this software and associated documentation files (the "Software"),
  to deal in the Software without restriction, including without limitation
  the rights to use, copy, modify, merge, publish, distribute, sublicense,
  and/or sell copies of the Software, and to permit persons to whom the
  Software is furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included
  in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
  THE SOFTWARE.

  (This is the MIT Open Source License, 
  http://www.opensource.org/licenses/mit-license.html)
*/


/* ********************************************************
  doublet_align : 
    Fast Protein Sequence Alignment using Doublet Scores 

  See associated documentation and embedded comments for
  more information about the dynamic programming
  sequence alignment algorithm.

  Try "doublet_align -h" for usage information.


******************************************************** */

/* THINGS TO DO
   o Reimplement res_stats, and improve output reporting
*/


#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <ctype.h>
#include <stdbool.h>

#include <unistd.h>

#include "cbt_util.h"
#include "cbt_seq.h"
#include "cbt_evd.h"

/* ********************************************************
   CONSTANTS
******************************************************** */

// Revision string; main() prints it in the output header.
#define VERSION   "$Id: doublet_align.c,v 1.29 2003/11/14 18:02:25 ed Exp $"

// Integer truth values used for the int-typed flags in this file
#define FALSE 0
#define TRUE 1



// A negative number whose magnitude is much larger than any
// score, but not so large that it might overflow
// if a small negative number were added to it
#define MINUS_ALOT (INT_MIN/2)

// Score stored in cells of the score array that must never be chosen
#define NULL_SCORE MINUS_ALOT

/* Flag the start of the alignment (stored in the traceback array,
 * so it cannot be confused with a state index) */
#define START_ALIGNMENT MINUS_ALOT

// Max distance for scoring doublet substitutions. This value
// must be at least 1 and not greater than the amount of
// data in the doublet substitution matrix. 
// A smaller max distance can be set using the command line
// parameter '-L max_score_distance'
#ifndef MAX_DOUBLET
#define MAX_DOUBLET  6
#endif

// For some reason, this code runs faster if
// MAX_SEQ_LENGTH is NOT a power of 2
// NOTE(review): neither MAX_SEQ_LENGTH nor BEGIN_SEQ_LENGTH is
// referenced elsewhere in this file -- confirm they are still needed.
#define MAX_SEQ_LENGTH  (1800)
#define BEGIN_SEQ_LENGTH (1024)

// Buffer sizes, each including room for the terminating '\0':
// one line of the matrix file, and one stored sequence id.
#define MAX_LINE_LENGTH (256+1)
#define MAX_ID_LENGTH (24+1)

/* maximum number of sequences in query database */
#define MAX_LIB_SIZE    (10000)


// Valid values of dist (the distance back to the last gap)
// are [0,MAX_DOUBLET], GAP_UP, GAP_LEFT
// DIST is the number of such values, i.e. the size of the first
// dimension of the score and traceback arrays.
#define GAP_UP (MAX_DOUBLET+1)
#define GAP_LEFT (MAX_DOUBLET+2)
#define DIST (GAP_LEFT+1)

#define AA_TYPES 20
#define AA_TYPES_EXT (AA_TYPES+3) /* Standard 20 plus B, Z, X */
#define GAP_CHAR ('-')
// One letter codes; the position of a letter is its amino_acid value
const char aaCodes[] = "ARNDCQEGHILKMFPSTWYVBZX";

// Gap penalties used when -f / -g are not given
#define DEFAULT_GAP_OPEN      16
#define DEFAULT_GAP_EXTEND    2

// Matrix file used when -s is not given, and the name of the
// statistics file written by doublet_align()/doublet_align_res_stats()
char DEFAULT_SUB_MATRIX_FILENAME[]= "data/doublet_blosum70-b13+.txt";
char STATS_FN[] = "stats.dat";


/*********************************************************
   TYPEDEFS AND STRUCTS
*********************************************************/

// Typedefs that are pointers to structures begin with 
// an upper case letter


// An amino acid, stored as its index into aaCodes[] (0 .. AA_TYPES_EXT-1)
typedef int amino_acid;


// Singlet and doublet substitution matrix, and gap scores
typedef struct sub_matrix_str {
  int gap_open;    // penalty subtracted when a gap is opened
  int gap_extend;  // penalty subtracted when a gap is extended
  int dist;        // largest doublet distance in use; set by
                   // sub_matrix_set_max_distance()

  /* The singlet substitution matrix */
  signed char singlet[AA_TYPES_EXT][AA_TYPES_EXT];
  
  /* The doublet substitution matrix 
   * doublet[l][i][j][k][m] == Score for substituting amino acids 'i' and 'j'
   * (separated by a distance l) with amino acids 'k' and 'm' 
   * (l==0 is meaningless)
   */
  signed char doublet[MAX_DOUBLET+1][AA_TYPES_EXT][AA_TYPES_EXT][AA_TYPES_EXT][AA_TYPES_EXT];

} * Sub_matrix;  


// One cell of the dynamic programming arrays: a state plus a
// position in each sequence.
typedef struct location_str {
  int dist;  // state: 0..MAX_DOUBLET, GAP_UP or GAP_LEFT
  int seq1;  // index into the first sequence
  int seq2;  // index into the second sequence
} location;

// The two sequences being aligned, the scratch arrays used by
// alignment_align(), and the result of the last alignment.
typedef struct alignment_str {
  Sub_matrix mat; // The scores used in this alignment
  cbt_seq * seq1;
  cbt_seq * seq2;
  // Scratch arrays indexed [state][seq1 position][seq2 position];
  // (re)allocated by alignment_align(), NULL until then.
  int*** score;
  int*** traceback;

  int max_score;   // best score found by the last alignment_align()
  int length;      // number of valid entries in trace[]
  location* trace; // path of the best alignment, stored end-first
} * Alignment;


/*********************************************************
   FUNCTION DECLARATIONS
*********************************************************/


// Substitution matrix: allocation, loading from a text file, and
// limiting the doublet distance that is scored.
Sub_matrix sub_matrix_create();
Sub_matrix sub_matrix_load( char * mat_name );
Sub_matrix sub_matrix_parse(FILE *stream);
void sub_matrix_set_max_distance(Sub_matrix mat, int max_score_distance);

// Alignment object: life cycle, the alignment itself, and reporting.
// alignment_free() always returns NULL.
Alignment  alignment_create();
Alignment  alignment_free(Alignment self);
void alignment_align(Alignment self);
void alignment_print(Alignment self, FILE *stream);
void alignment_print_stats(Alignment self, FILE *stream);
void alignment_print_res_stats(Alignment self, FILE *stream);
void alignment_print_res_stats_header(Alignment self, FILE *stream);

// Drivers: align every query sequence against every library sequence.
void doublet_align_res_stats( FILE * query_stream,
                              FILE * lib_stream,
                              Sub_matrix mat,
                              FILE * out_stream,
                              char * out_filename,
			      int make_stats_file );
void doublet_align(const Sub_matrix mat, 
                   FILE * query_stream,
                   FILE * lib_stream,
                   FILE * out_stream,
                   int display_alignments,
		   int stats_method,
		   int make_stats_file);

// Conversion between one letter codes and amino_acid indices
amino_acid char_to_aa(char one_letter_code);
char aa_to_char(amino_acid aa);

// Print usage information and exit
void help(void);


// Values accepted by the -S option (stats_method)
#define STATS_NONE (0)
#define STATS_BAILEY (1)
#define STATS_PEARSON (2)

/*********************************************************
   MAIN
 *********************************************************/

/*
 * Command line driver.
 *
 * Parses the options, opens the query, library and (optional) output
 * files, loads the substitution matrix, prints a header describing the
 * run, and hands everything to doublet_align().
 *
 * A usage error ends in help(), which exits.  An option value that
 * cannot be parsed, or a file that cannot be opened, is reported
 * through cbt_die().  Returns 0 when the run completes.
 */
int main(int argc, char *argv[]) {
  int c;
  Sub_matrix mat;  

  // Various parameters that can be set on the command line
  int gap_open = DEFAULT_GAP_OPEN;
  int gap_extend = DEFAULT_GAP_EXTEND;
  char * sub_matrix_filename = DEFAULT_SUB_MATRIX_FILENAME;
  int max_score_distance = MAX_DOUBLET;
  int display_alignments = FALSE;
  char * query_filename;
  FILE * query_stream = NULL;
  char * lib_filename;
  FILE * lib_stream = NULL;
  char * out_filename = NULL;
  FILE * out_stream = stdout;
  int stats_method = STATS_BAILEY;
  int make_stats_file = FALSE;

  // Options followed by ':' take an argument; D, F and h do not.
  while ((c = getopt (argc, argv, "g:f:L:s:S:O:DFh" ) ) != -1 ) {
    switch (c) {
    case 'D' : 
      display_alignments = TRUE;
      break;
    case 'g' : 
      // sscanf() returns 0, not EOF, when the text is not a number,
      // so insist on exactly one successful conversion.
      if(sscanf(optarg,"%d",&gap_extend) != 1)
        cbt_die( "Cannot parse gap extend penalty");
      break;
    case 'f' : 
      if(sscanf(optarg,"%d",&gap_open) != 1)
        cbt_die("Cannot parse gap open penalty");
      break;
    case 'L' :
      if(sscanf(optarg,"%d",&max_score_distance) != 1)
        cbt_die("Cannot parse max score distance");
      if(max_score_distance<0 || max_score_distance>MAX_DOUBLET)
        cbt_die("Invalid max score distance: %d", max_score_distance);
      break;
    case 's' : // Set score matrix filename
      sub_matrix_filename=optarg;
      break;
    case 'h' :
      help();
      break;
    case 'S' :
      if(sscanf( optarg, "%d", &stats_method ) != 1)
        cbt_die( "Cannot parse statistics method flag" );
      if(stats_method<STATS_NONE || stats_method>STATS_PEARSON) 
        cbt_die("Invalide Stats method");
      break;
    case 'F' :
      make_stats_file = TRUE;
      break;
    case 'O' :
      out_filename = optarg;
      out_stream = fopen(optarg, "a+");
      if(out_stream == NULL) cbt_die("Cannot open output file %s",optarg);
      break;
    default :
      help();
    }
  }

  // Only the two positional arguments should remain
  argc -= optind;
  argv += optind;

  if(argc !=2) help();
  
  // A query name starting with '@' means "read the query from stdin"
  query_filename = argv[0];
  if( query_filename[0] == '@') {
    query_stream = stdin;
  } else {
    query_stream = fopen(query_filename, "r");
    if(query_stream == NULL) cbt_die("Cannot open input file %s", query_filename);
  }

  lib_filename = argv[1];
  lib_stream = fopen(lib_filename, "r");
  if(lib_stream == NULL) cbt_die("Cannot open library file %s", lib_filename);
  

  // Load substitution matrix
  mat = sub_matrix_load(sub_matrix_filename);
  sub_matrix_set_max_distance(mat, max_score_distance);
  mat->gap_open = gap_open;
  mat->gap_extend = gap_extend;

  // Print header
  fprintf(out_stream, "# Doublet Alignment (%s)\n", VERSION);
  fprintf(out_stream, "#   Query:      %s\n", query_filename);
  fprintf(out_stream, "#   Library:    %s\n", lib_filename);
  fprintf(out_stream, "#   Matrix:     %s\n", sub_matrix_filename);
  fprintf(out_stream, "#   Lookback:   %d\n", max_score_distance);
  fprintf(out_stream, "#   Gap open:   %d\n", gap_open);
  fprintf(out_stream, "#   Gap extend: %d\n\n", gap_extend);
  fflush(out_stream);

  // Align all
  doublet_align( mat, query_stream, lib_stream, out_stream, 
                 display_alignments, stats_method, make_stats_file );
  fflush(out_stream);

  // TODO: the Pearson path (stats_method == STATS_PEARSON) used to
  // rewind both streams and call doublet_align_res_stats() with
  // out_filename here.  It is disabled; out_filename is kept for it.
  (void) out_filename;

  // Print footer
  fprintf( out_stream, "## FIN doublet_align FIN ##\n" );
  fflush( out_stream );

  // Release what this function acquired
  free(mat);
  fclose(lib_stream);
  if(query_stream != stdin) fclose(query_stream);
  if(out_stream != stdout) fclose(out_stream);

  return 0;
}

/*
 * Align every sequence of query_stream against every sequence of
 * lib_stream and report the results on out_stream.
 *
 *   mat                 scores and gap penalties to use
 *   query_stream        FASTA stream of query sequences
 *   lib_stream          FASTA stream of library sequences; rewound
 *                       once per query
 *   out_stream          where the report is written
 *   display_alignments  non-zero: also print each alignment
 *   stats_method        STATS_NONE, STATS_BAILEY or STATS_PEARSON
 *   make_stats_file     non-zero: also write the raw scores to STATS_FN
 *
 * At most MAX_LIB_SIZE library sequences are supported; a larger
 * library is reported through cbt_die().  STATS_PEARSON is not
 * implemented and also ends in cbt_die().
 */
void doublet_align(const Sub_matrix mat, 
                   FILE * query_stream,
                   FILE * lib_stream,
                   FILE * out_stream,
                   int display_alignments,
                   int stats_method,
                   int make_stats_file
                   ) {
  Alignment align;
  int error_flag;
  size_t lib_size, n, i;
  long int query_length;
  // Per library sequence results for the current query
  long int lengths[MAX_LIB_SIZE];
  long int scores[MAX_LIB_SIZE];
  double e_values[MAX_LIB_SIZE];
  double lambda, K, H;
  char ids[MAX_LIB_SIZE][MAX_ID_LENGTH];
  FILE* fstats;


  align = alignment_create();
  align->mat = mat;

  // Loop over query sequences
  while(cbt_seq_read_fasta(align->seq1, query_stream, aaCodes, true) !=EOF) {
    rewind(lib_stream);
    n=0;
    
    // Loop over database sequences
    query_length = align->seq1->len;
    while(cbt_seq_read_fasta(align->seq2, lib_stream, aaCodes, true) !=EOF) {

      // Check the table size BEFORE writing into the tables, and do
      // it at run time: an assert() disappears when NDEBUG is defined.
      if(n >= MAX_LIB_SIZE)
        cbt_die("Library holds more than %d sequences", MAX_LIB_SIZE);
    
      // Do the magic
      alignment_align(align);
      lengths[n]=align->seq2->len;
      scores[n]=align->max_score;
      // strncpy() does not terminate the copy when the id fills the
      // whole buffer, so reserve the last byte for the '\0'.
      strncpy(ids[n], align->seq2->id, MAX_ID_LENGTH-1); 
      ids[n][MAX_ID_LENGTH-1] = '\0';
      n +=1;

      if(display_alignments) alignment_print(align, out_stream);

    }
    lib_size = n;
    

    // Statistics

    // Output the stats file FIRST, in case data breaks cbt_evd()
    // Is this stats file compatable with pearson res-stats? GEC
    // The file is opened with "w" for every query, so after the run
    // it holds the scores of the last query only.
    if ( make_stats_file ) {
      if ( ( fstats = fopen( STATS_FN, "w" ) ) == NULL ) {
        cbt_die( "Cannot open %s", STATS_FN );
      }
      fprintf( fstats, "# P-VALUES\n" );
      fprintf( fstats, "# target        score           N       pv\n" );
      for ( i = 0; i < n; i++ ) {
        fprintf( fstats, "%s   %ld   %ld   0.0\n", ids[ i ],
                 scores[ i ], lengths[ i ] ); 
      }
      fprintf( fstats, "\n" );
      fclose( fstats );
    }

    if(stats_method == STATS_NONE) {
      // Dump raw scores
      fprintf(out_stream, "#Query: id length \n");
      fprintf(out_stream, "%-*s\t%d\n",
              MAX_ID_LENGTH, align->seq1->id, 
              align->seq1->len);

      fprintf(out_stream,"#id length score \n");
      
      for(n=0; n<lib_size; n++) {
          fprintf(out_stream, "%-*s\t%d\t%d\n", 
                  MAX_ID_LENGTH,ids[n], 
                  (int)lengths[n], (int)scores[n]);
      }
      fprintf(out_stream,"\n\n");

    } else if(stats_method == STATS_BAILEY) {
      // Bailey/Gribskov ML statistics

      error_flag = cbt_evd(lib_size, scores, lengths, query_length,
                                &lambda, &K, &H, e_values);
      if(error_flag) cbt_die("Failed to estimate extream value distribution");
      
      fprintf(out_stream, "#Query: id length lambda K H\n");
      fprintf(out_stream, "%-*s\t%d\t%g\t%g\t%g\n", 
              MAX_ID_LENGTH, align->seq1->id, 
              align->seq1->len,
              lambda, K, H);
      
      fprintf(out_stream,"#id length score e_value\n");
      
      // Only hits below the e-value cutoff are reported
      for(n=0; n<lib_size; n++) {
        if( e_values[n]<CBT_E_VALUE_CUTOFF) 
          fprintf(out_stream, "%-*s\t%d\t%d\t%g\n", 
                  MAX_ID_LENGTH,ids[n], 
                  (int)lengths[n], (int)scores[n], e_values[n]);
      }
      fprintf(out_stream,"\n\n");
    
    } else if(stats_method == STATS_PEARSON) {
      cbt_die("Not Implimented");
    } else {
      cbt_die("Unkown stats method");
    }
    
  } // end loop over query sequences

  align = alignment_free(align);
    
}


/*
 * Align every query sequence against every library sequence and run
 * the external "res_stats" program on the scores of each query.
 *
 *   query_stream     FASTA stream of query sequences
 *   lib_stream       FASTA stream of library sequences; rewound once
 *                    per query
 *   mat              scores and gap penalties to use
 *   out_stream       where the alignments are printed
 *   out_filename     if not NULL, the output of res_stats is appended
 *                    to this file through the shell
 *   make_stats_file  FALSE: remove STATS_FN when done
 *
 * A library sequence whose id equals the query id is skipped.
 *
 * NOTE(review): out_filename is pasted into a shell command unquoted;
 * it comes from the user's own command line, but a name containing
 * shell metacharacters will not behave as a plain file name.
 */
void doublet_align_res_stats( FILE * query_stream,
                              FILE * lib_stream,
                              Sub_matrix mat,
                              FILE * out_stream,
                              char * out_filename,
                              int make_stats_file) {
  Alignment align;
  FILE* fstats = NULL;
  char command[L_tmpnam+256];
  int written;

  align = alignment_create();
  align->mat = mat;

  /* Align every sequence in the query file against every
   *   sequence of the library 
   */

  while(cbt_seq_read_fasta(align->seq1, query_stream, aaCodes, false) !=EOF) {
    rewind(lib_stream);

    // The stats file is rewritten from scratch for each query
    if( (fstats = fopen(STATS_FN, "w"))==NULL) 
      cbt_die("Cannot open tmp file %s",STATS_FN);
    alignment_print_res_stats_header(align, fstats);
    
    while(cbt_seq_read_fasta(align->seq2, lib_stream, aaCodes, false) !=EOF) {
      if( strcmp(align->seq1->id, align->seq2->id) ==0) {
        // Same sequence. Skip, to avoid core dumping res_stats with
        // very large scores
        continue;
      }

      alignment_align(align);
      alignment_print_res_stats(align, fstats);
      alignment_print(align, out_stream);
    }
    
    fclose( fstats );

    // Build the command with a bounded write; the unbounded strcat()
    // chain used before could run past the end of command[] when
    // out_filename was long.
    if(out_filename != NULL) {
      written = snprintf(command, sizeof command, "res_stats %s >> %s",
                         STATS_FN, out_filename);
    } else {
      written = snprintf(command, sizeof command, "res_stats %s", STATS_FN);
    }
    if(written < 0 || (size_t) written >= sizeof command)
      cbt_die("res_stats command line is too long");

    // The child may append to the file behind out_stream, so push
    // out what we have buffered first to keep the output in order.
    fflush(out_stream);
    if(system(command) == -1)
      fprintf(stderr, "Warning: could not run \"%s\"\n", command);
  }
    
  if ( make_stats_file == FALSE ) {
    remove( STATS_FN );
  }

  alignment_free(align);
}


/*
 * Print the usage message to standard output and exit with status 0.
 * This function does not return.
 */
void help( void ) {
  printf( "\nFast Sequence Alignment using Doublet Scores\n\n");
  printf("  Usage: doublet_align query_filename library_filename\n\n");


  printf("  Options:\n");
  printf("   -h           Help: print this help message\n");
  printf("   -O filename  Output file. (Default: stdout)\n"); 
  printf("   -s filename  Doublet scoring matrix. (Default: %s)\n", 
         DEFAULT_SUB_MATRIX_FILENAME );
  printf("   -f int       Gap open penalty. (Default: %d)\n",DEFAULT_GAP_OPEN);
  printf("   -g int       Gap extend penalty. (Default: %d)\n", DEFAULT_GAP_EXTEND);
  printf("   -L int       Doublet scoring distance. (Default: %d)\n", MAX_DOUBLET);
  printf("   -S int       0 = None; 1 = ML; 2 = Pearson (Default: 1)\n" );
  // -F is a plain switch: it has no ':' in the getopt() string of
  // main(), so it must not be advertised as taking an int.
  printf("   -F           make statistics file\n" );
  printf("   -D           Display alignments, not just scores.\n\n");
  printf("  Note: If the query filename starts with @, then the query\n");
  printf("        is read from standard input.\n\n");
  exit ( 0 );
}


/*********************************************************
 * Sub_matrix: Singlet and doublet substitution matrixes
 *********************************************************/

Sub_matrix  sub_matrix_create() {
  Sub_matrix self;
  self = malloc(sizeof(struct sub_matrix_str));
  return self;
}

/*
 * Read a substitution matrix from the text file called mat_name.
 * A file that cannot be opened is reported through cbt_die().
 * Returns the matrix produced by sub_matrix_parse().
 */
Sub_matrix sub_matrix_load( char * mat_name ) {
  FILE * fp;
  Sub_matrix mat;

  fp = fopen(mat_name, "r" );
  if(fp==NULL) cbt_die("Cannot open substitution matrix file %s",mat_name);

  mat = sub_matrix_parse(fp);
  fclose(fp);  // the file is not needed once it has been parsed
  return mat;
}

/*
 * Read the integer that starts at column `offset` of `line`.
 * Returns 1 and stores the value in *score on success.  Returns 0 when
 * the line is shorter than `offset` (so nothing beyond the end of the
 * text is ever examined) or when no integer can be converted there.
 */
static int sub_matrix_scan_score(const char *line, size_t offset, int *score) {
  if(offset >= strlen(line)) return 0;
  return sscanf(line+offset, "%d", score) == 1;
}

/*
 * Parse a matrix text file.
 *
 * The stream holds, in order: optional '#' comment lines and blank
 * lines, one header line, AA_TYPES_EXT lines of singlet scores,
 * optional comment/blank lines, one header line, and then one line of
 * doublet scores for each combination of four amino acid codes.
 * Scores sit in fixed width columns: singlet score j of a line starts
 * at column 4*(j+1), the doublet score for distance len at column
 * 3+5*len.  Every score must lie in [-127,127].
 *
 * Any read or format error is reported through cbt_die().
 * Returns a newly allocated matrix; gap_open, gap_extend and dist are
 * left unset.
 */
Sub_matrix sub_matrix_parse(FILE *fp) {
  int i,j,k,m,len;
  char line[MAX_LINE_LENGTH]; 
  int line_num=0;
  int score;

  Sub_matrix mat = sub_matrix_create();
  if(mat==NULL) cbt_die("Cannot allocate substitution matrix");

  assert(fp!=NULL);

  // Skip comments and blank lines 
  do {
    if(fgets(line, MAX_LINE_LENGTH, fp)==0) 
      cbt_die("sub_matrix parse error line %d", line_num);
    line_num++;
  } while( line[0]=='#' || line[0]=='\n');


  // Make sure we are on the correct line
  // NOTE(review): strncmp() returns 0 on a match, so this test dies
  // when the line DOES start with the expected header.  That looks
  // inverted, but the matrix file is not visible from here; confirm
  // against the real data file before changing it.
  if(!strncmp(line, "    A   R   N   D",16)) 
      cbt_die("sub_matrix parse error line %d", line_num);

  //Read in the singlet matrix
  for(i=0; i<AA_TYPES_EXT; i++) {
    if(fgets(line, MAX_LINE_LENGTH, fp)==0) 
      cbt_die("sub_matrix parse error line %d", line_num);
    line_num++;

    for(j=0;j<AA_TYPES_EXT;j++) {
      // A failed conversion makes sscanf() return 0, not EOF, and
      // would leave `score` unset; demand one converted value.
      if(!sub_matrix_scan_score(line, (size_t)(4*(j+1)), &score)) 
        cbt_die("sub_matrix parse error line %d", line_num);

      if(score>127 || score<-127) 
        cbt_die("sub_matrix parse error line %d: score is out of bounds: %d "
            , line_num, score);

      // Store scores
      mat->singlet[i][j] = (signed char) score;
    }
  }

  // Skip comments and blank lines 
  do {
    if(fgets(line, MAX_LINE_LENGTH, fp)==0) 
      cbt_die("sub_matrix parse error line %d", line_num);
    line_num++;
  } while( line[0]=='#' || line[0]=='\n');


  // Make sure we are on the correct line
  // NOTE(review): same apparently inverted test as for the singlet
  // header above; left unchanged for the same reason.
  if(!strncmp(line, "            1    2    3",16)) 
      cbt_die("sub_matrix odd error line %d: %s", line_num, line);

  // Clear the doublet Matrix.
  // Distance 0 carries no data (doesn't make sense), so it stays zero.
  // NOTE(review): the original comment said the file has no data for
  // the optional AA codes 'B', 'Z' and 'X', yet the loops below expect
  // a line for every combination of all AA_TYPES_EXT codes -- confirm
  // which one describes the real file.
  memset(mat->doublet, 0, sizeof mat->doublet);
       

  //Read in the doublet matrix
  for(i=0; i<AA_TYPES_EXT; i++) {
    for(j=0; j<AA_TYPES_EXT; j++) {
      for(k=0; k<AA_TYPES_EXT; k++) {
        for(m=0; m<AA_TYPES_EXT; m++) {
          if(fgets(line, MAX_LINE_LENGTH, fp)==0) 
            cbt_die("sub_matrix parse error line %d:%s", line_num,line);
          line_num++;

          for(len=1;len<=MAX_DOUBLET;len++) {
            if(!sub_matrix_scan_score(line, (size_t)(3+(5*len)), &score)) 
              cbt_die("sub_matrix parse error line %d:%s", line_num, line);

            if(score>127 || score<-127) 
              cbt_die("sub_matrix parse error line %d: score is out of bounds: %d "
                  , line_num, score);

            mat->doublet[len][i][j][k][m] = (signed char) score;
          }
        }
      }
    }
  }

  return mat;
}


/*
 * Limit the distance over which doublet scores contribute.
 * With dist==0 only the singlet scores remain in effect.
 * The limit is imposed by wiping every doublet table that belongs to
 * a distance greater than dist, then recording dist in the matrix.
 * dist must lie in [0,MAX_DOUBLET] (checked with assert).
 */
void sub_matrix_set_max_distance(Sub_matrix mat, int dist) {
  int plane;

  assert( dist>=0 && dist<=MAX_DOUBLET);

  // Each doublet[plane] is one contiguous block of signed chars
  for(plane=dist+1; plane<=MAX_DOUBLET; plane++) {
    memset(mat->doublet[plane], 0, sizeof mat->doublet[plane]);
  }

  mat->dist = dist;
}



/*********************************************************
 * Alignment
 *********************************************************/

Alignment  alignment_create() {
  Alignment self;
  self = malloc(sizeof(struct alignment_str));
  self->seq1 = cbt_seq_alloc();
  self->seq2 = cbt_seq_alloc();

  // We dont initilize our scratch space until we know
  // how big it needs to be.
  self->score = NULL;
  self->traceback = NULL;
  self->trace = NULL;

  return self;
}

/*
 * Release an Alignment together with its sequences and scratch
 * arrays.  Members that were never allocated (NULL) are skipped.
 * The substitution matrix (mat) is not touched.
 * Always returns NULL, so callers can write  a = alignment_free(a);
 */
Alignment alignment_free(Alignment self) {
  // Sequences
  if(self->seq1) { cbt_seq_free(self->seq1); }
  if(self->seq2) { cbt_seq_free(self->seq2); }

  // Dynamic programming scratch space
  if(self->score)     { cbt_mat3d_free(self->score); }
  if(self->traceback) { cbt_mat3d_free(self->traceback); }

  // free() accepts NULL, so the trace needs no guard
  free(self->trace);

  free(self);
  return NULL;
}



/*
 * Write a one line, human readable summary of the last alignment to
 * stream: id and length of both sequences, and the raw score.
 */
void alignment_print_stats(Alignment self, FILE * stream) {
  fprintf(
    stream,
    "# Alignment of %s length %d to %s length %d -- Raw Score %d\n",
    self->seq1->id,
    self->seq1->len,
    self->seq2->id,
    self->seq2->len,
    self->max_score
  );
}

/*
 * Write one record for the external res_stats program to stream.
 * Only the id and length of the second sequence and the raw score
 * vary; the remaining fields of the record are fixed text.
 */
void alignment_print_res_stats(Alignment self, FILE * stream) {
  fprintf(
    stream,
    "%s 0 %d 0 -1.00000 -1.00000 %d 0 0 999\n",
    self->seq2->id,
    self->seq2->len,
    self->max_score
  );
}

/*
 * Write the first line of a res_stats input file to stream:
 * the length of the first (query) sequence, then '>' and its id.
 */
void alignment_print_res_stats_header(Alignment self, FILE * stream) {
  fprintf(
    stream,
    "%d >%s\n",
    self->seq1->len,
    self->seq1->id
  );
}


/*
 * Print the entire alignment found by the last alignment_align().
 *
 * The output is cut into chunks of 60 columns.  Each chunk shows the
 * two sequences ('-' marks a gap), one line of singlet score marks,
 * and one line of doublet score marks for each distance in use
 * (1 .. mat->dist).  Marks: '*' positive, '.' zero, ' ' negative
 * score or not applicable.
 */
void alignment_print(Alignment self, FILE * stream) {
  const char GAP = '-';
  const char NEG_SCORE = ' ';
  const char ZERO_SCORE = '.';
  const char POS_SCORE = '*';

  const int LINE_LENGTH=60;

  const int TOTAL_LINES = MAX_DOUBLET+3;
  const int SEQ1_LINE = 0;
  const int SEQ2_LINE = 1;
  const int SINGLET_LINE = 2;

  const int length = self->length;

  // One column per alignment position.  A gapped alignment can be
  // longer than either sequence, so the buffer has to be sized by the
  // alignment length, not by the longer sequence.  (An array
  // dimension must be positive, hence the floor of 1.)
  const int columns = (length > 0 ? length : 1);
  char lines[TOTAL_LINES][columns];

  location loc;
  int i,g,s,pos,len;
  amino_acid aa1a,aa1b,aa2a,aa2b;

  // Start from "gap everywhere, no score mark anywhere"
  for(i=0; i<length; i++) {
    lines[SEQ1_LINE][i] = GAP;
    lines[SEQ2_LINE][i] = GAP;
    for(g=0;g<=MAX_DOUBLET; g++) {
      lines[g+SINGLET_LINE][i] = NEG_SCORE;
    }
  }
  
  for(i=0; i<length; i++) {
    // trace[] is stored end-first; walk it backwards
    loc = self->trace[length-1-i];
    aa1b = self->seq1->sym[loc.seq1];
    aa2b = self->seq2->sym[loc.seq2];

    if(loc.dist != GAP_UP)
      lines[SEQ1_LINE][i] = aa_to_char(aa1b);

    if(loc.dist != GAP_LEFT) 
      lines[SEQ2_LINE][i] = aa_to_char(aa2b);

    if(loc.dist != GAP_UP && loc.dist != GAP_LEFT) {
     // Singlet score
      s = (int) self->mat->singlet[aa1b][aa2b];
      if(s>0) lines[SINGLET_LINE][i] = POS_SCORE;
      if(s==0) lines[SINGLET_LINE][i] = ZERO_SCORE;
      
      // Doublet scores, one for each aligned pair between here and
      // the last gap (loc.dist pairs back at most)
      for(g=1;g<=loc.dist; g++) {
        aa1a = self->seq1->sym[loc.seq1-g];
        aa2a = self->seq2->sym[loc.seq2-g];
    
        s = (int) self->mat->doublet[g][aa1a][aa1b][aa2a][aa2b];
        if(s>0) lines[g+SINGLET_LINE][i] = POS_SCORE;
        if(s==0) lines[g+SINGLET_LINE][i] = ZERO_SCORE;
      }
    }
  }

  fprintf(stream, "\n");      
  for(pos=0; pos<length; pos += LINE_LENGTH) {   
    len = (length-pos >LINE_LENGTH ? LINE_LENGTH : length-pos);
    // Two sequence lines, the singlet line, then mat->dist doublet lines
    for(g=0;g<(3+ self->mat->dist); g++) {
      for(i=pos;i<(pos+len); i++) {
        fprintf(stream, "%c", lines[g][i]);
      }
      fprintf(stream, "\n");      
    }
    fprintf(stream, "\n");      
  }
}


/*
 * Compute the best local alignment of self->seq1 and self->seq2
 * under the scores in self->mat.
 *
 * The dynamic programming arrays are indexed [state][i][j], where i
 * and j are positions in seq1 and seq2 and the state is one of
 *   0 .. MAX_DOUBLET  residues i and j are aligned, and this many
 *                     aligned pairs precede them since the last gap
 *                     or alignment start (capped at MAX_DOUBLET)
 *   GAP_UP            reached from cell [i][j-1]
 *   GAP_LEFT          reached from cell [i-1][j]
 * score[][][] holds the best score of an alignment ending in that
 * cell and state; traceback[][][] holds the state of the cell it was
 * reached from, or START_ALIGNMENT.
 *
 * On return self->max_score is the best score, and self->trace[0 ..
 * self->length-1] lists the cells of that alignment from its end back
 * to its start.  Scratch space from a previous call is released and
 * reallocated to fit the current sequences.
 */
void alignment_align(Alignment self) {
  int d,i,j;
  int score, max_score, length;
  location loc = {0,0,0};
  amino_acid aa1, aa2, aa1a, aa1b, aa2a, aa2b;
  int s[DIST]; //temp array of scores;
  int best;

  int seq1_length = self->seq1->len;
  int seq2_length = self->seq2->len;
  // Every backtrack step moves back in at least one sequence, so
  // the path can never be longer than this.
  int max_align_len = seq1_length + seq2_length;


  
  // Allocate memory for doing the alignment
  // But first free any memory left over from last alignment

  if(self->score != NULL) 
    cbt_mat3d_free(self->score);
  if(self->traceback != NULL) 
    cbt_mat3d_free(self->traceback);
  if(self->trace != NULL) 
    free(self->trace);
  
  self->score     = cbt_mat3d_alloc( DIST, 
                                     self->seq1->len, 
                                     self->seq2->len );
  self->traceback = cbt_mat3d_alloc( DIST, 
                                     self->seq1->len, 
                                     self->seq2->len );
  self->trace = calloc( max_align_len, sizeof( struct location_str ) );
  if(self->trace == NULL) {
    // trace[] is written during the backtrack below
    fprintf(stderr, "alignment_align: out of memory\n");
    exit(EXIT_FAILURE);
  }
  
  // Zero the score submatrix that we are about to use.
  for(d=0; d<DIST; d++) 
    for(i=0; i<seq1_length; i++) 
      for(j=0; j<seq2_length; j++) 
        self->score[d][i][j] =0;

  // Insert the singlet substitution scores
  for(i=0; i<seq1_length; i++) {
    for(j=0; j<seq2_length; j++) {
      aa1 = self->seq1->sym[i];
      aa2 = self->seq2->sym[j];
      self->score[0][i][j] = (int) self->mat->singlet[aa1][aa2];
    }
  }

  // Add the doublet substitution scores
  // After this loop score[d][i][j] is the LOCAL score of aligning
  // i with j given d aligned pairs before them: the singlet score
  // plus the doublet scores for distances 1..d.
  for(d=1;d<=MAX_DOUBLET;d++) {
    for(i=d; i<seq1_length; i++) {
      for(j=d; j<seq2_length; j++) {
        aa1a = self->seq1->sym[i-d];
        aa1b = self->seq1->sym[i];
        aa2a = self->seq2->sym[j-d];
        aa2b = self->seq2->sym[j];
        self->score[d][i][j] = self->score[d-1][i][j] +  
          (int) self->mat->doublet[d][aa1a][aa1b][aa2a][aa2b];
      }
    }
  }


  
  // Boundary values
  // Signal possible alignment starts
  // BUT Cannot start alignment except at d==0,
  // so put in a large negative score
  for(d=0;d<DIST;d++) {
    for(i=0; i<seq1_length; i++) {
      self->traceback[d][i][0] = START_ALIGNMENT;
      if(d!=0) self->score[d][i][0] = NULL_SCORE;
    }
    for(j=0; j<seq2_length; j++) {
      self->traceback[d][0][j] = START_ALIGNMENT; 
      if(d!=0) self->score[d][0][j] = NULL_SCORE;
    }
  }

  // Don't forget Gap boundary values.
  // Without these a boundary match cannot be followed by
  // a gap. Thus "WWWWWWWW" will not align correctly against "WDWWWWWW"
  // Doh. 
  // Kudos to Marcus Zacharich for finding this bug.
  
  // Gap boundary values
  j=0;
  for(i=1; i<seq1_length; i++) {
    // Gap left
    s[GAP_UP] = NULL_SCORE;
    s[GAP_LEFT] = self->score[GAP_LEFT][i-1][j] - self->mat->gap_extend;
    for(d=0;d<=MAX_DOUBLET; d++)
      s[d] = self->score[d][i-1][j] - self->mat->gap_open;
    best = cbt_argmax(DIST, s);
    self->score[GAP_LEFT][i][j] = s[best];
    self->traceback[GAP_LEFT][i][j] = best;
  }

  i=0;
  for(j=1; j<seq2_length; j++) {
    // Gap up
    s[GAP_LEFT] = NULL_SCORE;
    s[GAP_UP] = self->score[GAP_UP][i][j-1] - self->mat->gap_extend;
    for(d=0;d<=MAX_DOUBLET; d++)
      s[d] = self->score[d][i][j-1] - self->mat->gap_open;
    best = cbt_argmax(DIST,s);
    self->score[GAP_UP][i][j] = s[best];
    self->traceback[GAP_UP][i][j]  = best;
  }



  // Now we can do the actual dynamic programming.
  // We work through the score matrix, starting in the top left.
  // At each position we look back at the relevant previous positions,
  // add the relevant local scores, and decide which way the local 
  // optimal alignment goes.  We always know from the context which 
  // way to traceback in sequence space. traceback[][][] contains 
  // the DIST value of the previous element of the local optimal alignment.
  for(i=1; i<seq1_length; i++) {
    for(j=1; j<seq2_length; j++) {

      // Gap up: extend a gap-up, or open one after any match state.
      // A gap-up may not directly follow a gap-left.
      s[GAP_LEFT] = NULL_SCORE;
      s[GAP_UP] = self->score[GAP_UP][i][j-1] - self->mat->gap_extend;
      for(d=0;d<=MAX_DOUBLET; d++)
        s[d] = self->score[d][i][j-1] - self->mat->gap_open;
      best = cbt_argmax(DIST, s);
      self->score[GAP_UP][i][j] = s[best];
      self->traceback[GAP_UP][i][j]  = best;


      // Gap left: the mirror image, looking at [i-1][j]
      s[GAP_UP] = NULL_SCORE;
      s[GAP_LEFT] = self->score[GAP_LEFT][i-1][j] - self->mat->gap_extend;
      for(d=0;d<=MAX_DOUBLET; d++)
        s[d] = self->score[d][i-1][j] - self->mat->gap_open;
      best = cbt_argmax(DIST, s);
      self->score[GAP_LEFT][i][j] = s[best];
      self->traceback[GAP_LEFT][i][j] = best;


      // Just Gaped, or Just started a new alignment.
      s[0] = self->score[0][i][j]; //Start new alignment
      s[1] = s[0]+self->score[GAP_LEFT][i-1][j-1];
      s[2] = s[0]+self->score[GAP_UP][i-1][j-1];
      best = cbt_argmax(3, s);
      self->score[0][i][j] = s[best];
      if(best==0) {
        self->traceback[0][i][j] = START_ALIGNMENT;
      } else if(best==1) {
        self->traceback[0][i][j] = GAP_LEFT;
      } else {
        self->traceback[0][i][j] = GAP_UP;
      }

      // Gap an intermediate distance back
      // State d can only follow state d-1 of the diagonal neighbour.
      for(d=1; d<MAX_DOUBLET; d++ ) {
        self->score[d][i][j] += self->score[d-1][i-1][j-1]; 
        self->traceback[d][i][j] = d-1;
      }

      // Gap maximum distance back
      // The last state absorbs every longer run, so it can follow
      // either MAX_DOUBLET-1 or itself.
      s[0] = self->score[MAX_DOUBLET-1][i-1][j-1];
      s[1] = self->score[MAX_DOUBLET][i-1][j-1];
      if( s[0]>s[1] ) {
        self->score[MAX_DOUBLET][i][j] += s[0];
        self->traceback[MAX_DOUBLET][i][j] = MAX_DOUBLET-1;
      } else {
        self->score[MAX_DOUBLET][i][j] += s[1];
        self->traceback[MAX_DOUBLET][i][j] = MAX_DOUBLET;
      }      
    }
  }

  // Where did the best alignment end?
  max_score = NULL_SCORE;
  for(d=0;d<DIST;d++) {    
    for(i=0;i<seq1_length;i++) {
      for(j=0;j<seq2_length;j++) { 
        score = self->score[d][i][j];
        if(score > max_score) {
          max_score = score;
          loc.seq1 = i;
          loc.seq2 = j;
          loc.dist = d;
        }
      }
    }
  }
  self->max_score = max_score;

  // Now we can backtrack to the start of this alignment
  // The optimal alignment in stored in trace[] in reverse
  // order.
  length=0;
  while(TRUE) {
    assert(length < max_align_len);
    self->trace[length] = loc;
    d = self->traceback[loc.dist][loc.seq1][loc.seq2];
    length+=1;
    if(d==START_ALIGNMENT) break;

    // Step to the cell the CURRENT state was computed from: a gap-up
    // cell came from [i][j-1], a gap-left cell from [i-1][j], and an
    // aligned pair from [i-1][j-1].  (Choosing the step from the
    // previous state d instead lands gap states one cell off, and
    // their traceback entry is then read from the wrong cell.)
    if(loc.dist != GAP_UP)   loc.seq1 -= 1;
    if(loc.dist != GAP_LEFT) loc.seq2 -= 1;
    loc.dist = d;
  }
  self->length = length;

}

/*********************************************************
 * Misc. Utility routines 
 *********************************************************/

/* 
 * Convert an Amino acid one letter code into an integer
 * between 0 (A, Alanine) and 22 (X, Unknown).
 * Lower case letters are accepted.  A character that is not one of
 * the codes in aaCodes[] is reported through cbt_die().
 */
amino_acid char_to_aa(char one_letter_code) {
  int i;
  // toupper() is only defined for values representable as unsigned
  // char (and EOF); a plain char may be negative, so convert first.
  char c = (char) toupper((unsigned char) one_letter_code);
  for(i=0;i< AA_TYPES_EXT; i++)
    if(aaCodes[i]== c) return i;
  cbt_die("Unknown Amino Acid Code: %c", one_letter_code);
  return -1; /* can't happen */
}

/* 
 * Convert an amino_acid index, as produced by char_to_aa(), back
 * into its upper case one letter code.
 * The index must lie in [0, AA_TYPES_EXT); this is checked with assert.
 */
char aa_to_char(amino_acid aa) {
  assert(0<=aa && aa<AA_TYPES_EXT);
  {
    const char code = aaCodes[aa];
    return code;
  }
}




