/*
 * DomReduce.c -- select exemplars of clusters of similar sequences
 * from a multiple sequence alignment.
 *
 * Morgan N. Price, March-May 2008
 *
 *  Copyright (C) 2008 The Regents of the University of California
 *  All rights reserved.
 * 
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *  Disclaimer
 *
 *  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
 *  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
 *  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
 *  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
 *  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
 *  PRIVATELY OWNED RIGHTS.
 */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>

/* An older version of this program wrote the cluster file
   in CD-HIT format instead of tab-delimited, and also wrote the
   fasta sequences of the alignment of each domain.
   Change these to 1 if you want that behavior.
*/
#define WRITE_FASTA 0
#define WRITE_CDHIT 0

/* Help text; printed to stderr whenever the command line is rejected.
   Every piece ends in a newline so that the adjacent string literals
   concatenate into separate output lines.
*/
static const char *usage = "Usage: DomReduce Fasta HmmHits out Threshold EdgeThreshold\n"
"    Given a fasta file and a list of hmm hits, including alignment fields,\n"
"    DomReduce makes non-redundant clusters for each domain\n"
"    It writes to out and to out.clstr\n"
"    out is tab-delimited with fields domainId, name, begin, end\n"
"    out.clstr is tab-delimited with fields exemplar, representatives\n"
"    where exemplar and representatives are of the form exemplarId.begin.end\n"
"    and exemplar is included in the representatives\n"
"    The thresholds are percent identities, e.g. 35 and 30\n"
"\n"
"The hmmhits file should be tab-delimited with the following fields:\n"
"    DomainName, SequenceName, SeqBegin, SeqEnd, DomBegin, DomEnd,\n"
"    Score (ignored), Evalue (ignored), SeqExtents, DomExtents\n"
"All positions start at 1\n"
"Extents are comma-delimited lists of pairs, e.g. 2:10,12:15\n"
"and imply a sequence-domain alignment and (indirectly) imply\n"
"a multiple sequence alignment\n";

#include "Hash.h"

/* A contiguous run of positions, iFirst..iLast inclusive.
   In the input file extents are 1-based, but in this structure
   they are 0-based (ParseExtents() subtracts 1 from both ends).
*/
typedef struct {
  int iFirst;			/* first position of the run, 0-based */
  int iLast;			/* last position of the run, 0-based and inclusive */
} extent_t;

/* One hit of a domain to a sequence, i.e. one line of the hmmhits file.
   seqName, extentsSeq and extentsDom are owned by the member and are
   released by FreeDom().
*/
typedef struct {
  int iSeq;			/* index into the fasta sequences; ReadDom() leaves it at -1 and main() fills it in */
  char *seqName;		/* copy of the SequenceName field */
  int seqBeg, seqEnd, domBeg, domEnd; /* 0-based and inclusive (the input values minus 1) */
  double score;			/* parsed from the input; only echoed by PrintMember() */
  double evalue;		/* parsed from the input; only echoed by PrintMember() */
  int nExtentsSeq;		/* number of entries in extentsSeq */
  extent_t *extentsSeq;		/* aligned runs in sequence coordinates */
  int nExtentsDom;		/* number of entries in extentsDom */
  extent_t *extentsDom;		/* aligned runs in domain (alignment column) coordinates */
  int alignWidth;		/* total number of aligned positions, summed over extentsSeq */
  int iCluster;			/* which cluster in the ReduceDom list this is a member of, or -1 */
} dommember_t;

/* Reads the hits for the next domain, i.e. the next run of consecutive
   lines that share a DomainName, and stores their count in *nMembers.
   Returns NULL if nothing left.
   The returned members should be freed with FreeDom()
   The domname should not be freed: it points at a static buffer that
   the next call overwrites
*/
dommember_t *ReadDom(FILE *fp, /*OUT*/int *nMembers, /*OUT*/char **domname);
dommember_t *FreeDom(dommember_t *, int nMembers); /* returns NULL */

/* Sorts the members (widest alignment first), clusters them greedily and
   sets each member's iCluster.
   Returns an allocated array of *nReduce indices into the (re-sorted)
   members, one exemplar per cluster; entry k is the exemplar of cluster k.
   The caller frees the array with free().
*/
int *ReduceDom(/*IN/OUT*/dommember_t *members, int nMember, char **seqs, /*OUT*/int *nReduce);

/* Writes one member to fp as a tab-delimited line with 1-based positions */
void PrintMember(FILE *fp, dommember_t *member, char *domName);

/* Parses a comma-delimited list of 1-based begin:end pairs into an
   allocated array of 0-based extents and stores the count in *nExtent.
   The string is modified (it is tokenized with strtok) and the program
   exits if a pair cannot be parsed or the pairs are out of order.
*/
extent_t *ParseExtents(char *string, /*OUT*/int *nExtent);

/* Returns number of sequences read.
   *outSeqs and *outNames receive parallel arrays with one entry per record
*/
int LoadFasta(FILE *fp, /*OUT*/char ***outSeqs, /*OUT*/char ***outNames);

/* Each returns the fraction of compared positions at which s1 and s2 hold
   the same character; s1 and s2 are rows of the same alignment.
   Denominator is all non-gap positions in second sequence
   BeginSimilarity and EndSimilarity use only the nChar non-gap
   positions at the beginning or end of the second sequence
*/
double AlignSimilarity(char *s1, char *s2);
double BeginSimilarity(char *s1, char *s2, int nChar);
double EndSimilarity(char *s1, char *s2, int nChar);

/* global options
   threshold and edgethreshold are overwritten by main() from the
   Threshold and EdgeThreshold command-line arguments
*/
int threshold = 30; /* percent identity for clustering two sequences together */
int edgethreshold = 30; /* percent identity requirement at ends */
int nEdgeSize = 40; /* number of positions at end to check as an edge */
int debug = 0; /* stderr tracing: nonzero reports per-domain counts, >2 also dumps every pairwise comparison */

/*
 * DomReduce Fasta HmmHits out Threshold EdgeThreshold
 *
 * Loads the fasta file, then reads the hmm hits one domain at a time,
 * clusters each domain's members with ReduceDom(), and writes
 *   out        one line per exemplar: domain, sequence name, begin, end
 *   out.clstr  one line per cluster: exemplar, then every member of the
 *              cluster (the exemplar itself comes first)
 * The exemplar list is written to out.tmp and renamed to out once it is
 * complete. All positions written are 1-based.
 * Any error is reported on stderr and ends the program with status 1.
 */
int main(int argc, char *argv[]) {
  if (argc != 6) {
    fprintf(stderr, "%s", usage);
    exit(1);
  }

  char *fastafile = argv[1];
  char *hmmhits = argv[2];
  char *outprefix = argv[3];

  threshold = atoi(argv[4]);
  if (threshold < 1 || threshold > 100) {
    fprintf(stderr, "threshold must be between 1 and 100\n%s", usage);
    exit(1);
  }

  edgethreshold = atoi(argv[5]);
  if (edgethreshold < 1 || edgethreshold > 100) {
    fprintf(stderr, "edge threshold must be between 1 and 100\n%s", usage);
    exit(1);
  }

  FILE *fpfaa = fopen(fastafile, "r");
  if (fpfaa == NULL) {
    fprintf(stderr, "Cannot read from %s\n", fastafile);
    exit(1);
  }

  char **seqs = NULL;
  char **names = NULL;
  int nSeqs = LoadFasta(fpfaa, &seqs, &names);
  if (fclose(fpfaa) != 0) {
    fprintf(stderr, "Error reading %s\n", fastafile);
    exit(1);
  }
  fprintf(stderr,"Read %d sequences from fasta file %s\n", nSeqs, fastafile);

  /* Check for non-unique names and map hash iterator to seq index */
  hashstrings_t *hashnames = MakeHashtable(names, nSeqs);
  int hashSize = hashnames->nBuckets;
  int *hashToSeqI = (int*)mymalloc(sizeof(int)*hashSize);
  int i;
  for (i = 0; i < hashSize; i++)
    hashToSeqI[i] = -1;

  for (i = 0; i < nSeqs; i++) {
    hashiterator_t hi = FindMatch(hashnames,names[i]);
    assert(GetHashString(hashnames, hi) != NULL);
    if (HashCount(hashnames,hi) != 1) {
      fprintf(stderr,"Non-unique name %s in the fasta file\n",names[i]);
      exit(1);
    }
    assert(hi>=0 && hi < hashSize);
    hashToSeqI[hi] = i;
  }

  FILE *fpDom = fopen(hmmhits,"r");
  if (fpDom == NULL) {
    fprintf(stderr, "Cannot read from %s\n",hmmhits);
    exit(1);
  }

  /* Write domains to outprefix but write them to outprefix.tmp first
     and rename later. The temporary name is kept in its own buffer
     because it is needed again for the rename.
  */
  size_t nameSize = strlen(outprefix)+100;
  char *tmpout = (char*)mymalloc(nameSize);
  snprintf(tmpout, nameSize, "%s.tmp", outprefix);
  FILE *fpOutDom = fopen(tmpout,"w");
  if (fpOutDom == NULL) {
    fprintf(stderr, "Cannot write to %s\n", tmpout);
    exit(1);
  }

  /* scratch buffer for the other output file names */
  char *buf = (char*)mymalloc(nameSize);

#if WRITE_FASTA
  snprintf(buf, nameSize, "%s.faa", outprefix);
  FILE *fpOutFaa = fopen(buf,"w");
  if (fpOutFaa == NULL) {
    fprintf(stderr, "Cannot write to %s\n", buf);
    exit(1);
  }
#endif

  snprintf(buf, nameSize, "%s.clstr", outprefix);
  FILE *fpOutClstr = fopen(buf, "w");
  if (fpOutClstr == NULL) {
    fprintf(stderr, "Cannot write to %s\n", buf);
    exit(1);
  }

  free(buf);
  buf = NULL;

  int nDom = 0;
  int nAssign = 0;
  int nReduceTot = 0;
  int nMembers;
  dommember_t *members = NULL;
  char *domName = NULL;
  while ((members = ReadDom(fpDom, /*OUT*/&nMembers, /*OUT*/&domName)) != NULL) {
    nDom++;
    nAssign += nMembers;

    /* Fill in member->iSeq and make sure the span lies inside the sequence */
    for (i = 0; i < nMembers; i++) {
      dommember_t *m = &members[i];
      hashiterator_t hi = FindMatch(hashnames, m->seqName);
      if (GetHashString(hashnames, hi) == NULL) {
        fprintf(stderr, "Domain %s has unknown sequence %s\n", domName, m->seqName);
        exit(1);
      }
      m->iSeq = hashToSeqI[hi];
      assert(m->iSeq>=0 && m->iSeq < nSeqs);
      /* a fasta record without any residue lines has a NULL sequence;
         treat it as length 0 so that every span on it is rejected */
      int len = (seqs[m->iSeq] == NULL) ? 0 : (int)strlen(seqs[m->iSeq]);
      if (m->seqBeg < 0 || m->seqBeg > m->seqEnd || m->seqEnd >= len) {
        fprintf(stderr, "Domain %s has illegal span %d:%d for sequence %s of length %d\n",
                domName, m->seqBeg+1, m->seqEnd+1, m->seqName, len);
        exit(1);
      }
    }
    int nReduce;
    int *list = ReduceDom(/*IN/OUT*/members, nMembers, seqs, /*OUT*/&nReduce);
    if (debug)
      fprintf(stderr, "Reduce %s from %d to %d members\n", domName, nMembers, nReduce);

    /* Build the inverse mapping from an exemplar to a list
       next goes from a member to the next item in the list (or -1)
       tail remembers the current last item of each cluster's list, so
       appending is O(1) and members stay in increasing index order
     */
    int *next = (int*)mymalloc(sizeof(int)*nMembers);
    int *tail = (int*)mymalloc(sizeof(int)*nReduce);
    for (i = 0; i < nMembers; i++)
      next[i] = -1;
    for (i = 0; i < nReduce; i++)
      tail[i] = list[i];	/* each list starts with just its exemplar */
    for (i = 0; i < nMembers; i++) {
      int iCluster = members[i].iCluster;
      assert(iCluster >= 0 && iCluster < nReduce);
      if (list[iCluster] == i)
        continue;		/* the exemplar already heads its own list */
      assert(next[tail[iCluster]] == -1);
      next[tail[iCluster]] = i;
      tail[iCluster] = i;
    }
    free(tail);
    tail = NULL;

    for (i = 0; i < nReduce; i++) {
      int iExemplar = list[i];
      dommember_t *exemplar = &members[iExemplar];

      fprintf(fpOutDom, "%s\t%s\t%d\t%d\n",
              domName,
              exemplar->seqName,
              exemplar->seqBeg+1,
              exemplar->seqEnd+1);
      assert(exemplar->seqEnd >= exemplar->seqBeg);
#if WRITE_FASTA
      fprintf(fpOutFaa,">%s.%d.%d\n",
              exemplar->seqName,
              exemplar->seqBeg+1,
              exemplar->seqEnd+1);
      int iPos;			/* distinct name: must not shadow the cluster loop's i */
      for (iPos = exemplar->seqBeg; iPos <= exemplar->seqEnd; iPos++) {
        fputc(seqs[exemplar->iSeq][iPos], fpOutFaa);
      }
      fputc('\n', fpOutFaa);
#endif

      nReduceTot++;

#if WRITE_CDHIT
      fprintf(fpOutClstr, ">Cluster %d\n", nReduceTot);
      int index;
      int iClusterMember = 0;
      char label[40];
      for (index = iExemplar; index != -1; index = next[index]) {
        if (index == iExemplar)
          strcpy(/*TO*/label, /*FROM*/"*");
        else
          sprintf(label, "at %d%%", threshold); /* report minimum not actual %match */
        dommember_t *m = &members[index];
        fprintf(fpOutClstr, "%d\t%daa, >%s.%d.%d... %s\n",
                iClusterMember,
                m->seqEnd-m->seqBeg+1,
                m->seqName,
                m->seqBeg+1, m->seqEnd+1,
                label);
        iClusterMember++;
      }
#else
      /* first field is the exemplar; the member list that follows starts
         with the exemplar again, as described in the usage text */
      int index;
      fprintf(fpOutClstr, "%s.%d.%d",
              exemplar->seqName, exemplar->seqBeg+1, exemplar->seqEnd+1);
      for (index = iExemplar; index != -1; index = next[index]) {
        dommember_t *m = &members[index];
        fprintf(fpOutClstr, "\t%s.%d.%d",
                m->seqName, m->seqBeg+1, m->seqEnd+1);
      }
      fprintf(fpOutClstr, "\n");
#endif
    }
    FreeDom(members, nMembers);
    free(next);
    free(list);
  }
  if (fclose(fpDom) != 0) {
    fprintf(stderr, "Error reading %s\n",hmmhits);
    exit(1);
  }
  if (fclose(fpOutDom) != 0) {
    /* the data went to the temporary file, so that is the name to report */
    fprintf(stderr, "Error writing to %s\n", tmpout);
    exit(1);
  }
  if (rename(/*FROM*/tmpout,/*TO*/outprefix) != 0) {
    fprintf(stderr, "Error renaming %s to %s\n", tmpout, outprefix);
    exit(1);
  }
  free(tmpout);
  tmpout = NULL;
#if WRITE_FASTA
  if (fclose(fpOutFaa) != 0) {
    fprintf(stderr, "Error writing to %s.faa\n", outprefix);
    exit(1);
  }
#endif
  if (fclose(fpOutClstr) != 0) {
    fprintf(stderr, "Error writing to %s.clstr\n", outprefix);
    exit(1);
  }

  fprintf(stderr,"Wrote %d clusters from %d assignments to %s.clstr\n", nReduceTot, nAssign, outprefix);
  hashnames = DeleteHashtable(hashnames);
  free(hashToSeqI);
  return(0);
}

dommember_t *ReadDom(FILE *fp, /*OUT*/int *outMembers, /*OUT*/char **outDomname) {
  static char buf[100*1000] = ""; /* stores the last line */
  static char domainName[1000] = "";

  domainName[0] = '\0';

  int nMembers = 0;
  int nSaved = 10000;
  dommember_t *members = (dommember_t*)mymalloc(sizeof(dommember_t)*nSaved);

  bool domainLeft = false;
  do {
    if (buf[0] == '\0')
      continue;			/* no cached line to parse */
    char *tab = strchr(buf, '\t');
    if (tab == NULL) {
      fprintf(stderr,"Line without tabs in domains file:\n%s\n",buf);
      exit(1);
    }
    *tab = '\0';
    if (domainName[0] == '\0') {
      /* save the domain name */
      if (strlen(buf) > sizeof(domainName)-1) {
	fprintf(stderr,"domainName too long in domains file: %s\n",buf);
	exit(1);
      }
      strcpy(/*TO*/domainName,/*FROM*/buf);
    } else if (strcmp(domainName,buf) != 0) {
      /* start of a new domain: return and undo the change so we have a
         normal first line for the next call
      */
      domainLeft = true;
      *tab = '\t';
      break;
    }

    nMembers++;
    if (nMembers > nSaved) {
      nSaved *= 2;
      members = (dommember_t*)realloc(members, sizeof(dommember_t)*nSaved);
      assert(members != NULL);
    }

    dommember_t *member = &members[nMembers-1];
    member->iSeq = -1; /* need the name hashtable to set it*/
    member->seqName = NULL;
    member->seqBeg = -1;
    member->seqEnd = -1;
    member->domBeg = -1;
    member->domEnd = -1;
    member->score = -1.0;
    member->evalue = -1.0;
    member->nExtentsSeq = 0;
    member->extentsSeq = NULL;
    member->nExtentsDom = 0;
    member->extentsDom = NULL;
    member->alignWidth = 0;
    member->iCluster = -1;

    char *extentSeqString = NULL;
    char *extentDomString = NULL;

    /* Tokenize the line and make the extents */
    *tab = '\t';
    int iCol;
    char *field;
    for (iCol = 0, field = strtok(buf,"\t\r\n");
	 field != NULL; 
	 iCol++, field = strtok(NULL,"\t\r\n")) {
      switch(iCol) {
      case 1:
	member->seqName = strdup(field);
	assert(field != NULL);
	break;			/* out of switch */
      case 2:
	member->seqBeg = atoi(field)-1;
	break;
      case 3:
	member->seqEnd = atoi(field)-1;
	break;
      case 4:
	member->domBeg = atoi(field)-1;
	break;
      case 5:
	member->domEnd = atoi(field)-1;
	break;
      case 6:
	member->score = atof(field);
	break;
      case 7:
	member->evalue = atof(field);
	break;
      case 8:
	if (member->seqBeg < 0 || member->seqEnd < 0 || member->seqBeg > member->seqEnd) {
	  fprintf(stderr, "Illegal sequence begin and end for %s %s: %d to %d\n",
		  domainName, member->seqName, member->seqBeg+1, member->seqEnd+1);
	}
	extentSeqString = field;
	break;
      case 9:
	if (member->domBeg < 0 || member->domEnd < 0 || member->domBeg > member->domEnd) {
	  fprintf(stderr, "Illegal domain begin and end for %s %s: %d to %d\n",
		  domainName, member->seqName, member->domBeg+1, member->domEnd+1);
	}
	extentDomString = field;
	break;
      default:
	break;
      }
    }

    if (extentDomString == NULL) {
      fprintf(stderr,"Not enough fields (no domain extents) in entry for %s %s %d %d\n",
	      domainName, member->seqName, member->seqBeg+1,member->seqEnd+1);
    }

    /* Parse extent strings after we are done because cannot reenter strtok() */
    member->extentsSeq = ParseExtents(extentSeqString, /*OUT*/&member->nExtentsSeq);
    member->extentsDom = ParseExtents(extentDomString, /*OUT*/&member->nExtentsDom);
    
    if (member->extentsSeq[0].iFirst != member->seqBeg) {
      fprintf(stderr,"Sequence extent beginning %d does not match sequence beginning %d for %s %s\n",
	      member->extentsSeq[0].iFirst+1, member->seqBeg+1,
	      domainName, member->seqName);
      exit(1);
    }
    if (member->extentsSeq[member->nExtentsSeq-1].iLast != member->seqEnd) {
      fprintf(stderr,"Sequence extent end %d does not match sequence end %d for %s %s\n",
	      member->extentsSeq[member->nExtentsSeq-1].iLast+1, member->seqEnd+1,
	      domainName, member->seqName);
      exit(1);
    }
    /* Do not check that begin or end of alignment extents matches because of problems with fasthmm */
    member->alignWidth = 0;
    int iExt;
    for (iExt=0; iExt < member->nExtentsSeq; iExt++)
      member->alignWidth += member->extentsSeq[iExt].iLast - member->extentsSeq[iExt].iFirst + 1;
    /* and compute width the other way */
    int width = 0;
    for (iExt=0; iExt < member->nExtentsDom; iExt++)
      width += member->extentsDom[iExt].iLast - member->extentsDom[iExt].iFirst + 1;
    if (width != member->alignWidth) {
      fprintf(stderr, "extents for %s %s do not have equal numbers of positions\n",
	      domainName, member->seqName);
      PrintMember(stderr, member, domainName);
      exit(1);
    }
  } while(fgets(buf,sizeof(buf),fp) != NULL);

  if (!domainLeft)
    *buf = '\0'; 		/* don't reread the last line */
  if (nMembers == 0) {
    free(members);
    members = NULL;
  }
  if (debug && nMembers>0)
    fprintf(stderr, "Read %d members for %s\n", nMembers, domainName);
  *outDomname = domainName;
  *outMembers = nMembers;
  return(members);
}

/* Releases what each of the nMembers members owns (name and both extent
   lists) and then the array itself. Always returns NULL.
*/
dommember_t *FreeDom(dommember_t *members, int nMembers) {
  int iMember = 0;
  while (iMember < nMembers) {
    dommember_t *m = &members[iMember];
    free(m->seqName);
    free(m->extentsSeq);
    free(m->extentsDom);
    iMember++;
  }
  free(members);
  return NULL;
}

extent_t *ParseExtents(char *string, /*OUT*/int *nExtents) {
  /* First count the pairs */
  int nComma = 0;
  char *p;
  for (p = string; *p != '\0'; p++)
    if (*p == ',')
      nComma++;
  int nPairs = nComma+1;
  extent_t *extents = (extent_t*)mymalloc(sizeof(extent_t)*nPairs);
  char *pairs;
  int iPair;
  for (iPair = 0, pairs = strtok(string,","); 
       pairs != NULL;  
       iPair++, pairs = strtok(NULL,",")) {
    if (sscanf(pairs, "%d:%d", &extents[iPair].iFirst, &extents[iPair].iLast) != 2) {
      fprintf(stderr,"Cannot parse piece of extents: %s\n", pairs);
      exit(1);
    }
    if (extents[iPair].iFirst > extents[iPair].iLast) {
      fprintf(stderr, "end %d before begin %d in extents list!\n",
	      extents[iPair].iLast, extents[iPair].iFirst);
      exit(1);
    }

    /* correct to 0-based */
    extents[iPair].iFirst--;
    extents[iPair].iLast--;
  }
  assert(iPair == nPairs);

  /* Check the correctness of the extents */
  for (iPair = 1; iPair < nPairs; iPair++) {
    if (extents[iPair].iFirst <= extents[iPair-1].iLast) {
      fprintf(stderr, "Extent beginning %d is before end of previous one %d!\n",
	      extents[iPair].iFirst+1,
	      extents[iPair-1].iLast+1);
      exit(1);
    }
  }
  *nExtents = nPairs;
  return(extents);
}

/* qsort() comparator for dommember_t: widest alignment first.
   Ties are broken by sequence name and then by sequence begin, to try
   to ensure a consistent choice of exemplar (this aids merging).
*/
int CompareWidthReverse(const void *p1, const void *p2) {
  const dommember_t *a = (const dommember_t*)p1;
  const dommember_t *b = (const dommember_t*)p2;

  if (a->alignWidth != b->alignWidth)
    return b->alignWidth - a->alignWidth;	/* descending width */

  int byName = strcmp(a->seqName, b->seqName);
  if (byName != 0)
    return byName;
  return a->seqBeg - b->seqBeg;
}

int *ReduceDom(/*IN/OUT*/dommember_t *members, int nMember, char **seqs, /*OUT*/int *outReduce) {
  /* Use longest member first */
  qsort(/*IN/OUT*/members, nMember, sizeof(dommember_t), CompareWidthReverse);

  /* Compute the total alignment width (the maximum position in the domain alignment) */
  int maxWidth = 0;
  int iMember;
  for (iMember = 0; iMember < nMember; iMember++) {
    dommember_t *m = &members[iMember];
    assert(m->nExtentsDom >= 1);
    int width = m->extentsDom[m->nExtentsDom-1].iLast + 1;
    assert(width >= 1);
    if (width > maxWidth)
      maxWidth = width;
  }
  /* Compute alignment sequences */
  char **mseqs = (char**)mymalloc(sizeof(char*)*nMember);
  for (iMember=0; iMember < nMember; iMember++) {
    dommember_t *m = &members[iMember];
    assert(m->iSeq >= 0);
    char *seq = seqs[m->iSeq];
    int len = strlen(seq);

    char *s = (char*)mymalloc(maxWidth+1);
    mseqs[iMember] = s;
    int i;
    for (i=0;i<maxWidth;i++)
      s[i] = '-';
    s[maxWidth] = '\0';		/* for debugging ease */
    /* which extents we are in */
    int iExtDom = 0;
    int iExtSeq = 0;
    /* where we are within the current extent */
    int iOffDom = 0;
    int iOffSeq = 0;
    while (iExtDom < m->nExtentsDom) {
      assert(iExtSeq < m->nExtentsSeq);
      int offSeq = m->extentsSeq[iExtSeq].iFirst+iOffSeq;
      int offDom = m->extentsDom[iExtDom].iFirst+iOffDom;
      assert(offSeq >= 0 && offSeq < len);
      assert(offDom >= 0 && offDom < maxWidth);
      s[offDom] = seq[offSeq];
      iOffDom++;
      if (iOffDom > m->extentsDom[iExtDom].iLast - m->extentsDom[iExtDom].iFirst) {
	iExtDom++;
	iOffDom = 0;
      }
      iOffSeq++;
      if (iOffSeq > m->extentsSeq[iExtSeq].iLast - m->extentsSeq[iExtSeq].iFirst) {
	iExtSeq++;
	iOffSeq = 0;
      }
    }
  }

  /* Make a list of non-redundant sequences */
  double dThreshold = threshold/100.0;
  double dEdgeThreshold = edgethreshold/100.0;
  int nSaved = 0;
  int *saved = (int*)mymalloc(sizeof(int)*nMember);
  int iSeed;
  for (iSeed = 0; iSeed < nMember; iSeed++) {
    dommember_t *seed = &members[iSeed];
    if (seed->iCluster >= 0)
      continue;			/* this is already assigned to a cluster */
    seed->iCluster = nSaved;
    for (iMember = iSeed+1; iMember < nMember; iMember++) {
      dommember_t *m = &members[iMember];
      if (m->iCluster >= 0)
	continue;
      /* Check for similarity overall and also at ends, to
	 avoid errors in clustering due to domain shuffling */
      if (AlignSimilarity(mseqs[iSeed], mseqs[iMember]) >= dThreshold
	  && BeginSimilarity(mseqs[iSeed], mseqs[iMember], nEdgeSize) >= dEdgeThreshold
	  && EndSimilarity(mseqs[iSeed], mseqs[iMember], nEdgeSize) >= dEdgeThreshold)
	m->iCluster = seed->iCluster;
    }
    saved[nSaved] = iSeed;
    nSaved++;
  }

  /* Free alignment sequences */
  for (iMember = 0; iMember < nMember; iMember++)
    free(mseqs[iMember]);
  free(mseqs);
  *outReduce = nSaved;
  return(saved);
}

void PrintMember(FILE *fp, dommember_t *m, char *domName) {
  fprintf(fp, "%s\t%s\t%d\t%d\t%d\t%d\t%.1f\t%.2e\t",
	  domName, m->seqName,
	  m->seqBeg+1, m->seqEnd+1,
	  m->domBeg+1, m->domEnd+1,
	  m->score, m->evalue);
  int iExt = 0;
  for (iExt = 0; iExt < m->nExtentsSeq; iExt++) {
    if(iExt>0) fprintf(fp, ",");
    fprintf(fp,"%d:%d",
	    m->extentsSeq[iExt].iFirst+1,
	    m->extentsSeq[iExt].iLast+1);
  }
  fprintf(fp,"\t");
  for (iExt = 0; iExt < m->nExtentsDom; iExt++) {
    if(iExt>0) fprintf(fp, ",");
    fprintf(fp,"%d:%d",
	    m->extentsDom[iExt].iFirst+1,
	    m->extentsDom[iExt].iLast+1);
  }
  fprintf(fp,"\n");
}

/* Fraction of the non-gap positions of s2 at which s1 holds the same
   character. s1 and s2 are rows of the same alignment; the scan runs
   over the length of s1. Asserts that s2 has at least one non-gap position.
*/
double AlignSimilarity(char *s1, char *s2) {
  int nSame = 0;
  int nPos = 0;
  const char *a = s1;
  const char *b = s2;

  for (; *a != '\0'; a++, b++) {
    if (*b == '-')
      continue;			/* gaps in s2 are not compared */
    nPos++;
    if (*a == *b)
      nSame++;
  }
  assert(nPos > 0);
  if (debug > 2)
    fprintf(stderr, "Compare nDiff %d nPos %d:\n%s\n%s\n", nPos-nSame, nPos, s1, s2);
  return nSame / (double)nPos;
}

/* Like AlignSimilarity(), but compares only the first nBegin non-gap
   positions of s2 (or all of them if there are fewer).
*/
double BeginSimilarity(char *s1, char *s2, int nBegin) {
  int nSame = 0;
  int nPos = 0;
  int col = 0;

  while (s1[col] != '\0' && nPos < nBegin) {
    if (s2[col] != '-') {
      nPos++;
      if (s1[col] == s2[col])
        nSame++;
    }
    col++;
  }
  if (debug > 2)
    fprintf(stderr, "CompareBegin nDiff %d nPos %d:\n%s\n%s\n", nPos-nSame, nPos, s1, s2);
  assert(nPos > 0);
  return nSame / (double)nPos;
}

/* Like AlignSimilarity(), but compares only the last nEnd non-gap
   positions of s2 (or all of them if there are fewer), scanning
   backwards from the end of s1.
*/
double EndSimilarity(char *s1, char *s2, int nEnd) {
  int nSame = 0;
  int nPos = 0;
  int len = strlen(s1);
  int col = len - 1;

  while (col >= 0 && nPos < nEnd) {
    if (s2[col] != '-') {
      nPos++;
      if (s1[col] == s2[col])
        nSame++;
    }
    col--;
  }
  if (debug > 2)
    fprintf(stderr, "CompareEnd nDiff %d nPos %d:\n%s\n%s\n", nPos-nSame, nPos, s1, s2);
  assert(nPos > 0);
  return nSame / (double)nPos;
}

/* realloc() that reports failure and exits instead of returning NULL.
   Passing ptr == NULL allocates a new block.
*/
static void *FastaRealloc(void *ptr, size_t nBytes) {
  void *grown = realloc(ptr, nBytes);
  if (grown == NULL) {
    fprintf(stderr, "Out of memory in LoadFasta\n");
    exit(1);
  }
  return grown;
}

/*
 * Read every record of a fasta file.
 *
 * fp        open for reading
 * outSeqs   receives an allocated array with one sequence per record:
 *           upper-cased, with spaces, tabs, CRs and newlines removed.
 *           An entry is NULL if its record has no sequence lines at all.
 * outNames  receives the parallel array of names: the text after '>' up
 *           to the first of "(),: \t\r\n"
 *
 * Returns the number of records. Exits with status 1 if sequence data
 * precedes the first name line or if memory runs out.
 *
 * Lines of any length are accepted. The part of a name line that does
 * not fit the read buffer is discarded; without that, the tail of a long
 * description would be read as residues.
 */
int LoadFasta(FILE *fp, /*OUT*/char ***outSeqs, /*OUT*/char ***outNames) {
  static const char nameStop[] = "(),: \t\r\n";
  static const char seqSkip[] = " \t\r\n";

  int nSeqStore = 1000;
  int nSeqs = 0;
  char **seqs = (char**)FastaRealloc(NULL, sizeof(char*)*nSeqStore);
  char **names = (char**)FastaRealloc(NULL, sizeof(char*)*nSeqStore);
  size_t curLen = 0;		/* length so far of the sequence being read */

  char buf[1000];
  while (fgets(buf, sizeof(buf), fp) != NULL) {
    size_t nRead = strlen(buf);
    bool lineComplete = (nRead > 0 && buf[nRead-1] == '\n');

    if (buf[0] == '>') {	/* name line in fasta file */
      if (nSeqs == nSeqStore) {
        nSeqStore *= 2;
        seqs = (char**)FastaRealloc(seqs, sizeof(char*)*nSeqStore);
        names = (char**)FastaRealloc(names, sizeof(char*)*nSeqStore);
      }
      size_t nameLen = strcspn(buf+1, nameStop);
      char *name = (char*)FastaRealloc(NULL, nameLen+1);
      memcpy(name, buf+1, nameLen);
      name[nameLen] = '\0';
      names[nSeqs] = name;
      seqs[nSeqs] = NULL;
      nSeqs++;
      curLen = 0;

      if (!lineComplete) {
        /* skip the rest of an over-long description */
        int c;
        while ((c = getc(fp)) != EOF && c != '\n')
          ;
      }
    } else {
      if (nSeqs == 0) {
        fprintf(stderr, "Input not in fasta format in DomReduce\n");
        exit(1);
      }
      /* Append the non-space characters. nRead is an upper bound on how
         many are kept, so one allocation and one pass suffice; a long
         sequence line simply arrives as several chunks. */
      char *seq = (char*)FastaRealloc(seqs[nSeqs-1], curLen + nRead + 1);
      char *out = seq + curLen;
      const char *p;
      for (p = buf; *p != '\0'; p++) {
        if (strchr(seqSkip, *p) == NULL) {
          /* toupper() needs a value in the unsigned char range */
          *out = (char)toupper((unsigned char)*p);
          out++;
        }
      }
      *out = '\0';
      curLen = (size_t)(out - seq);
      seqs[nSeqs-1] = seq;
    }
  }

  *outSeqs = seqs;
  *outNames = names;
  return(nSeqs);
}

