#!/usr/bin/perl -w
#
#  $Id: fastHmm.pl,v 1.16 2008/08/07 18:53:16 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  fastHmm Alignment Tool
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use lib exists( $ENV{FASTHMM_DIR} ) ?
		"$ENV{FASTHMM_DIR}/lib" :
		"./lib";
use Args;
use File;
use Util;

sub dprint($$);
sub HMMLength($);

#
# USAGE
#
my $usage =
"Usage:
  fastHmm.pl <options>

Parameters:
  -i <queryFa>	Fasta file containing input sequences
  -t <db>	Specify target database:
		   gene3d
		   panther
		   pfam
		   pirsf
		   smart
		   superfam
		   tigrfam
		   ...

		You may specify 'all' to run the 7 included db's, or a
		comma-separated list of target databases.  Targets can
		optionally include the desired e-value by specifying
		'target1:evalue,target2:evalue,...'.  Targets that do
		not include e-values will use fastHmm default e-values.

Optional Parameters:
  -c <confFile>	Specify alternate configuration file
  -I		Ignore errors and continue running analysis
		   (Default: if a requisite tool terminates abnormally,
			     fastHmm will terminate.)
  -ha		Run full search on all HMMs
  -H		Do not run full search on 'hard' HMMs; faster, less complete
		   (Default: run full search on 'hard' HMMs)
  -hardpanther  Run full search on hard panther HMMs (default is off)
  -C		Make psiblast checkpoint files for selected database/targets
  -a <acc>	Specify accession or accessions, comma-separated, to process
		   (Default: Process all accessions for specified database)
  -A <accList>	Process all accesions in the specified accession list
		   (Default: Process all accessions for specified database)
  -E <e-value>	Specify e-value cutoff default for hmmsearch
		   (Default: smart:2.04e-5, gene3d:0.001, panther:1e-3, 
			default:0.02)
  -R		Provide only the raw HMM hits; do not run post-processing
  -M		In post-processing, do not map accessions to family names
		   (Default: map accession to family names)
  -f		Automatically run formatdb on <queryFa> as needed
		   (Default: Do not automatically format the input db)
  -h		Halt on incomplete target database
		   (Default: Ignore incomplete entries and proceed)
  -r		Rebuild .accList file for target database
  -d <dbDir>	Specify database base directory
		   (Default: \$ENV{FASTHMM_DIR}/db)
  -o <outDir>	Directory in which to generate output; required with -b
		   (Default: Use current working directory)
  -D		Debugging mode; do not remove intermediate files
  -q		Quiet execution; disable status updates to stdout
  -j <procs>	Parallelize analysis of targets on a single host

Distributed Computation Parameters:
  -b <size>	Specify batch size; each job will be run against <size> targets
		   Job list will be output to stdout; will not run jobs
		   This option also implies -q; -j option applies per job
		   Requires -o and the path must be absolute
  -m		Merge results of distributed computation
		   Run this only after all jobs are complete;
		   Must be run with exact options used to create job list
  -p		Run post-processing step (only after merging)

";

#
# Parse Command Line Options
#
my ($opts, $nonOpts) = Args::getArgs( 
	"+i:|+t:|a:|A:|E:|R|M;!R|f|h|r|d:|+;b;o:|D|q|j:|C|H|b:;o|m|J:|I|ha|p|c:|hardpanther",
	@ARGV, -1, $usage );

$opts->{c} = ( exists( $ENV{FASTHMM_DIR} ) ? $ENV{FASTHMM_DIR} : "." ) . "/conf/fastHmm.conf"
	if ( !exists( $opts->{c} ) );

my $conf = File::loadConfMacros( $opts->{c} );

my %eDefault = ();
foreach my $type ( qw/smart gene3d panther tigrfam pfam/ )
{
	$eDefault{$type} = $conf->{"E_$type"}
		if ( exists( $conf->{"E_$type"} ) );
}

$opts->{d} = ( exists( $ENV{FASTHMM_DIR} ) ? $ENV{FASTHMM_DIR} : "." ) . "/db"
	if ( !exists( $opts->{d} ) );

die "***FATAL*** Error: Specified input query file '", $opts->{i}, "' does not exist!\n"
	if ( !(-e $opts->{i}) );

if ( ( exists($opts->{b}) ) && ( ( $opts->{i} !~ /^\// ) ||
	( exists( $opts->{d} ) && ( $opts->{d} !~ /^\// ) ) ||
	( exists( $opts->{o} ) && ( $opts->{o} !~ /^\// ) ) ) )
{
	print STDERR "Error: You must specify absolute paths when using distributed computation mode (-b)\n";
	print STDERR "       -i: " . $opts->{i} . "\n"
		if ( $opts->{i} !~ /^\// );
	print STDERR "       -d: " . $opts->{d} . "\n"
		if ( exists( $opts->{d} ) && ( $opts->{d} !~ /^\// ) );
	print STDERR "       -o: " . $opts->{o} . "\n"
		if ( exists( $opts->{o} ) && ( $opts->{o} !~ /^\// ) );
	exit(1);
}

die "***FATAL*** Error: You must specify the original command-line to merge results\n"
	if ( exists( $opts->{m} ) && !exists( $opts->{o} ) );

$opts->{t} = lc( $opts->{t} );

$opts->{o} = "."
	if ( !exists( $opts->{o} ) );

$opts->{j} = $conf->{NUMTHREADS}
	if ( !exists( $opts->{j} ) );

die "***FATAL*** Error: Specified output directory '", $opts->{o}, "' does not exist!\n"
	if ( !(-d $opts->{o}) );

my %targets = ();
my $targetsFlat = $opts->{t};
foreach my $t ( split(/\s*,\s*/, $targetsFlat) )
{
	$t =~ s/^\s+|\s+$//g;
	if ( $t =~ /^all(?::([^,]+))?/ )
	{
		$targets{gene3d} = $1;
		$targets{panther} = $1;
		$targets{pfam} = $1;
		$targets{pirsf} = $1;
		$targets{smart} = $1;
		$targets{superfam} = $1;
		$targets{tigrfam} = $1;
	} elsif ( $t =~ /^(\w+)(?::([^,]+))?/ )
	{
		$targets{$1} = $2;
	}
}

my $edv = exists( $opts->{E} ) ?
		$opts->{E} :
		$conf->{E_default};

foreach my $target ( sort( keys( %targets ) ) )
{
	$opts->{t} = $target;
	$opts->{E} = defined( $targets{$target} ) ?
			$targets{$target} :
			undef;
	if ( !defined($opts->{E}) )
	{
		$opts->{E} = exists( $eDefault{$target} ) ?
				$eDefault{$target} :
				$edv;
	}

	my $dataDir = $opts->{d} . "/" . $opts->{t};
	my $hmmDir = $opts->{d} . "/" . $opts->{t} . "/hmm";
	if ( !(-d $dataDir) || !(-d $hmmDir) ) {
	    dprint( "WARNING -- Target database type '" . $opts->{t} . "' has no data in '" . $opts->{d} . "'; skipping ...\n", $opts );
	} else {
	    dprint( "Running fastHmm on '$opts->{t}' with E-value '$opts->{E}' ...\n", $opts );
	    fastHmm( $opts, $conf );
	}
}

exit(0);

# Runs PSI-BLAST to get candidates for the specified target accession
# If the HMM is >= threshold, uses a different e-value cutoff
# If target.hmm.psiblast file exists, runs a second job with the second
# checkpoint to try and get more candidates
sub runBlastPgp
{
	my $target = shift;
	my $baseDir = shift;
	my $opts = shift;
	my $conf = shift;
	my $dbDir = $opts->{d} . "/" . $opts->{t};

	my $cmd = $baseDir . "/bin/blastpgp -d " . $opts->{i};
	my $seqFile = $opts->{t} eq "superfam" ? "$dbDir/$target.fa" : "$dbDir/$target.seq";
	$cmd .= " -i $seqFile";

	my $psiFile = "$dbDir/$target.psiblast";
	if (-e $psiFile) {
	    $cmd .= " -R $psiFile";
	} else {
	    $cmd .= " -B $dbDir/$target.b";
	    $cmd .= " -C $psiFile" if exists $opts->{C};
	}
	my $outFile = $opts->{o} . "/blastpgp." . $opts->{t} . "." . $target . ".pgp";
	my $optParams = Util::configParams( $conf, "BLASTPGP" );
	$cmd .= " -o $outFile -m 8";
	$cmd .= " $optParams"
		if ( length($optParams) > 0 );

	# is this a short or long HMM?
	my $longparam = HMMLength($dbDir."/hmm/".$target.".hmm") >= $conf->{PSIBLAST_LONG_LENGTH} ?
	    "PSIBLAST_LONG_E" : "PSIBLAST_SHORT_E";
	my $eval = $conf->{$longparam};
	$cmd .= " -e $eval";

	# run the command
	dprint( "[status] blastpgp, target $target in " . $opts->{t} . "...\n", $opts );
	dprint( "[debug]  $cmd\n", $opts )
		if ( $opts->{D} );
	system( $cmd . " >/dev/null 2>&1" );
	checkReturnCode( $?, "blastpgp", $opts );

	# second round of searching for candidates if .hmm.psiblast file exists
	my @outFiles = $outFile;
	my $outFile2 = undef;
	my $psi2 = $dbDir . "/" . $target . ".hmm.psiblast";
	if (-e $psi2) {
	    $outFile2 = $outFile."2";
	    $cmd = "$baseDir/bin/blastpgp -d $opts->{i} -i $psi2.fa -R $psi2 -o $outFile2 -m 8 $optParams";
	    $cmd .= " -e $eval";
	    dprint( "[debug]  $cmd\n", $opts )
		if ( $opts->{D} );
	    system( $cmd . " >/dev/null 2>&1" );
	    checkReturnCode( $?, "blastpgp", $opts );
	    push @outFiles, $outFile2;
	}

	# extract relevant sequences
	dprint( "[status] fastacmd, target $target in " . $opts->{t} . "...\n", $opts );
	local *IN;
	local *OUT;
	my %hits = ();
	my $totalHits = 0;
	foreach my $file (@outFiles) {
	    open(IN, "<",$file) || die "**FATAL** cannot read blastpgp output file $file";
	    while(<IN>)
	    {
		my @data = split(/\t/, $_);
		die "***FATAL*** cannot parse $_ from blastpgp output $file" unless scalar(@data) >= 2;
		$totalHits++;
		$hits{$data[1]} = 1;
	    }
	    close(IN) || die "***FATAL*** Error reading blastpgp output file $file";
	}
	unlink($outFile2) if defined $outFile2 && !exists $opts->{D};

	my @hits = sort( keys( %hits ) );
	my $numHits = scalar( @hits );
	my $hitsFaFile = $outFile . ".fa";
	open(OUT, ">$hitsFaFile");
	close(OUT);
	if ( scalar(@hits) > 0 )
	{
	        my $tmpdir = exists $ENV{TMPDIR} && -d $ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp";
		my $hitsFile = "$tmpdir/fastHmm.$target." . $opts->{t} . ".hits.$$";
		open(OUT, ">$hitsFile");
		foreach my $hit ( @hits )
		{
			if ( $hit =~ /^\d+$/ )
			{
				print OUT "lcl|$hit\n";
			} else {
				print OUT $hit, "\n";
			}
		}
		close(OUT);

		$cmd = $baseDir . "/bin/fastacmd -d " . $opts->{i} . " -i $hitsFile";
		dprint( "[debug]  $cmd\n", $opts )
			if ( $opts->{D} );
		open(IN, "$cmd |");
		open(OUT, ">$hitsFaFile");
		while(<IN>)
		{
			s/lcl\|(\S+).+$/$1/g
				if ( /^>/ );
			print OUT;
		}
		close(OUT);
		checkReturnCode( $?, "fastacmd", $opts );

		close(IN);

		unlink( $hitsFile )
			if ( !exists( $opts->{D} ) );
	}

	return ( $outFile, $hitsFaFile, $numHits, $totalHits );
}

sub parseHmmAlignments
{
	my $target = shift;
	my $inFile = shift;
	my $outFile = shift;
	my $opts = shift;
	local *IN;
	local *OUT;

	my ($ACC,$SEQID,$SEQBEG,$SEQEND,$DOMBEG,$DOMEND,$SCORE,$E) = (0..7);

	open(IN, "<$inFile");

	while(<IN>)
	{
		last if /^Parsed for domains/;
	}

	my @hits = ();
	while(<IN>)
	{
		chomp;
		last if m/^Alignments/ || m/^[ \t]+.?no hits/;
		next if $_ eq "" || m/^Sequence +Domain/ || m/^[ -]+$/;
		my @F = split /[\t ]+/, $_;
		die "***FATAL*** Cannot parse $_" unless @F==10;
		push @hits, [$target,$F[0],$F[2],$F[3],$F[5],$F[6],$F[8],$F[9]];
	}

	my $hit = undef;

	# Lineoffset keeps track of where we are:
	# lineoffset = 1 means we expect the model alignment line [or an RF line]
	# lineoffset = 2 means we expect the match-quality line (pluses and spaces)
	# lineoffset = 3 means we expect the sequence alignment line
	# lineoffset = 4 means we expect the blank line that separates the alignment sections
	my $lineOffset = 0;
	my $modelAlign = undef; # the model alignment
	my $seqAlign = undef; # the sequence alignment

	my @lines = <IN>;
	close(IN);
	chomp(@lines);

	open(OUT, ">>$outFile");

while(@lines > 0)
{
    my $line = shift @lines;

    # example input line: "616154: domain 1 of 1, from 196 to 341: score 10.1, E = 0.0048"
    # First, save existing alignment if we're at the begining of another one, or at the end
    if (($line =~ m/^Histogram/ || $line =~ m/domain.*from/)
        && defined $hit) {
        $modelAlign =~ s/^[*]-[>]//;
        $modelAlign =~ s/[<]-[*]$//;
        # The modelALign and seqAlign should line up
        die "***FATAL*** Sequence and model for $hit->[$SEQID] beginning at $hit->[$SEQBEG]"
            . " are of different lengths:\n$modelAlign\n$seqAlign\nin ParseHmmAlignments.pl"
            if length($modelAlign) != length($seqAlign);

        my $seqOff = $hit->[$SEQBEG];
        my $modelOff = $hit->[$DOMBEG];

        my $range = []; # in sequence (list of begin-to)
        my $rangeHMM = [];
        for (my $i = 0; $i < length($seqAlign); $i++) {
            my $c = substr($seqAlign,$i,1);
            if (uc($c) eq $c && $c ne "-") { # upper-case characters are matches to the HMM
                if (@$range > 0 && $range->[-1][1] == $seqOff-1) {
                    $range->[-1][1] = $seqOff;
                } else {
                    push @$range, [$seqOff,$seqOff];
                }
                if (@$rangeHMM > 0 && $rangeHMM->[-1][1] == $modelOff-1) {
                    $rangeHMM->[-1][1] = $modelOff;
                } else {
                    push @$rangeHMM, [$modelOff,$modelOff];
                }
            }
            $modelOff++ unless substr($modelAlign,$i,1) eq ".";
            $seqOff++ unless $c eq "-";
        }
#       print STDERR "Parsing $subStart:$subEnd\nseq $subseq\nmod $modelAlign\nended at seq $seqOff model $modelOff\n";
#       print STDERR "Last model range " . join(" ",@{$rangeHMM->[-1]})."\n";

        print OUT join("\t",@$hit,
                   join(",", map {$_->[0].":".$_->[1]} @$range),
                   join(",", map {$_->[0].":".$_->[1]} @$rangeHMM))
            ."\n";
    }
    # Then process the line
    if ($line =~ m/^Histogram/) {
        last;
    } elsif ($line =~ m/^(.*): domain.*from ([0-9]+) to ([0-9]+):/) {
        my ($seqId, $begin, $end) = ($1,$2,$3);
        die "***FATAL*** Unexpected domain -- no hits left" if @hits==0;
        $hit = shift @hits;
        die "***FATAL*** Unexpected seqId $seqId $begin $end vs. $hit->[$SEQID] $hit->[$SEQBEG] $hit->[$SEQEND]" unless
            $hit->[$SEQID] eq $seqId && $hit->[$SEQBEG] == $begin && $hit->[$SEQEND] == $end;
        $lineOffset = 1;
        $modelAlign = "";
        $seqAlign = "";
    }  elsif (defined $hit && $lineOffset == 1) {
        next if $line eq ""; # There are extra blank lines between alignments and start of histogram section
        # I don't know what these RF lines are, but I need to check that the RF is aligned
        # to the left of the sequence. ("RF " could be the alignment of the model!)
        if ($line =~ m/^( +RF) +/) {
            my $RFLength = length($1);
            die "***FATAL*** Cannot parse RF -- no successor: $line" if @lines < 3;
            my $nextLine = $lines[2];
            # Example RF lines:
            #                  RF  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
            #                  RF
            # Example nextline:
            #        651984   121  vesgkkqfvgseiagkrlgviglgaigalvandalalgmdvvgydpyisv 170
            if ($nextLine =~ m/^( +\S+ +\S+) +\S/ && length($1) == $RFLength) {
                next;
            }
            # else keep the line, assume it is a bona fide alignment line
        }
        die "***FATAL*** Cannot parse $line" unless $line =~ m/^ +([*<>a-zA-Z.-]+)$/;
        $modelAlign .= $1;
        # e.g., $modelAlign = "*->LklldkhLipka...ttaeskvFylKmkGDYyRYlaEfatgeerkeaadk<-*"
        # (The ends are optional)
        # Capitalization indicates the strength of the match, not whether or not it is a match,
        # and ... indicates insert states
        $lineOffset = 2;
    } elsif (defined $hit && $lineOffset == 2) {
        $lineOffset = 3; # skip the alignment matches indicator
    } elsif (defined $hit && $lineOffset == 3) {
        # example line: "      616154   196    GRVFRRDF--EDATHAMMFHQVEGLVI--DKGITMASLKGALTEMAR 238 "
        # example line: "        15502     -     -    " #  Hangover line, note absence of right-end count...
        if ($line =~ m/^\s*(\S+)\s+[-]\s+[-]\s+$/) {
            my $seqId2 = $1;
            die "***FATAL*** Cannot parse $line -- expected sequence $seqId2" unless
                substr($hit->[$SEQID],0,length($seqId2)) eq $seqId2;
        } elsif ($line =~ m/^\s*(\S+)\s+([0-9-]+)\s+(\S+)\s+([0-9-]+) ?/) {
            my ($seqId2,$subStart,$subseq,$subEnd) = ($1,$2,$3,$4);
            # HMMer often truncates the seqid, but it should still match as far as it is present
            die "***FATAL*** Cannot parse $line -- expected sequence $seqId2"
                unless substr($hit->[$SEQID],0,length($seqId2)) eq $seqId2;
            my $subseqStrip = $subseq;
            $subseqStrip =~ s/[-]//g;
            if ($subStart ne "-") {
                die "***FATAL*** Cannot parse $line -- subseq $subseq is of wrong length"
                    unless length($subseqStrip) == $subEnd-$subStart+1;
            }
            $seqAlign .= $subseq;
        } else {
            die "***FATAL*** Cannot parse $line";
        }
        $lineOffset = 4;
    } elsif ($line eq "" && $lineOffset==4) {
        $lineOffset = 1;
    }
}

	close(OUT);

}

sub parseHmmDomains
{
	my $target = shift;
	my $inFile = shift;
	my $outFile = shift;
	my $opts = shift;
	local *IN;
	local *OUT;

	open(IN, "<$inFile");
	while(<IN>)
	{
		last if ( /^Parsed for domains/i );
	}

	open(OUT, ">>$outFile");
	while(<IN>)
	{
		chomp;
		last if ( (/^Alignments/i) || (/^[ \t]+.?no hits/i) );
		next if ( ($_ eq "") || (/^Sequence +Domain/i) || (/^[ -]+$/) );
		my @F = split( /[\t ]+/, $_ );
		if ( scalar(@F) == 10 )
		{
			print OUT join("\t", $target, $F[0], $F[2],
					$F[3], $F[5], $F[6], $F[8], $F[9]) . "\n";
		}
	}
	close(OUT);
}

sub parseHmmGenes
{
	my $target = shift;
	my $inFile = shift;
	my $outFile = shift;
	my $opts = shift;
	local *IN;
	local *OUT;

	open(IN, "<$inFile");
	while(<IN>)
	{
		last if /^Scores for complete/i;
	}

	open(OUT, ">>$outFile");
	while(<IN>)
	{
		chomp;
		last if ( (/^Parsed for domains/i) || (/^[ \t]+.?no hits/i) );
		next if ( ($_ eq "") || (/^Sequence +Description/i) || (/^[ -]+$/) );
		my @F = split( /[\t ]+/, $_ );
		if ( scalar(@F) == 4 )
		{
			print OUT join("\t", $target, $F[0], $F[1], $F[2]) . "\n";
		}
	}
	close(OUT);
}

sub runHmmSearch
{
	my $target = shift;
	my $numFams = shift;
	my $baseDir = shift;
	my $opts = shift;
	my $pgpFaOut = shift;
	my $conf = shift;
	my $dbDir = $opts->{d} . "/" . $opts->{t};

	$numFams = int($numFams / 2)
		if ( $opts->{t} eq 'pfam' );

	if ( !(-z $pgpFaOut) )
	{
		my $tcOut = $opts->{o} . "/hmmsearch.$target." . $opts->{t} . ".tc";
		my $cmd = $baseDir . "/bin/hmmsearch --informat FASTA -Z $numFams ";
		if ( $opts->{E} =~ /^trusted$/i )
		{
			$cmd .= "--cut_tc ";
		} elsif ( $opts->{E} =~ /^gather$/i )
		{
			$cmd .= "--cut_ga ";
		} else {
			$cmd .= "-E " . $opts->{E} . " ";
		}
		my $optParams = Util::configParams( $conf, "HMMSEARCH" );
		$cmd .= "$optParams "
			if ( length($optParams) > 0 );
		$cmd .= $dbDir . "/hmm/$target.hmm $pgpFaOut >$tcOut";
		dprint( "[status] hmmsearch, target $target in " . $opts->{t} . "...\n", $opts );
		dprint( "[debug]  $cmd\n", $opts )
			if ( $opts->{D} );
		system( $cmd . " 2>/dev/null" );
		checkReturnCode( $?, "hmmsearch", $opts );

		if ( !(-z $tcOut) )
		{
			dprint( "[status] parsing hmmsearch output for target $target in " . $opts->{t} . "...\n", $opts );
			my $baseName = ( split(/\//, $opts->{i}) )[-1];
			my $prefix = ( split(/\./, $baseName) )[0];
			my $tcOutParsed;
			if ( $opts->{j} > 1 )
			{
				# if more than one proc, we have to write separate
				# output domain files, then merge at the end
				# due to concurrent write issues
				$tcOutParsed = $opts->{o} . "/result." . $opts->{t} . ".$target.hmmhits";
			} else {
				# single analysis thread; okay to write to same output
				$tcOutParsed = $opts->{o} . "/result.$prefix." . $opts->{t} . ".hmmhits";
				$tcOutParsed .= ".part" . $opts->{J}
					if ( exists( $opts->{J} ) );
			}
			#parseHmmDomains( $target, $tcOut, $tcOutParsed, $opts );
			parseHmmAlignments( $target, $tcOut, $tcOutParsed, $opts );
		}
		if ( !exists( $opts->{D} ) )
		{
			unlink( $tcOut );
		}
	}
}

sub mergeDistributedResults
{
	my $baseDir = shift;
	my $opts = shift;
	local *IN;
	local *OUT;

	delete( $opts->{b} )
		if ( exists( $opts->{b} ) );

	my $pattern = "^result\\..+\\." . $opts->{t} . "\\.hmmhits\\.part\\d+";
	my $files = File::getFilteredFileList( $opts->{o}, [ $pattern ], -1 );

	my $mergedFile = $files->[0];
	$mergedFile =~ s/\.part\d+//g;

	open(OUT, ">$mergedFile");
	my $lastPart = -1;
	my $missing = 0;
	foreach my $file ( sort
				{
					my ( $pa ) = $a =~ /\.part(\d+)$/;
					my ( $pb ) = $b =~ /\.part(\d+)$/;
					$pa <=> $pb
				} @{$files} )
	{
		my ( $partNum ) = $file =~ /\.part(\d+)$/;
		if ( $partNum > $lastPart + 1 )
		{
			if ( $lastPart + 1 < $partNum - 1 )
			{
				print STDERR "Warning: Result parts " . ($lastPart + 1) . "-" . ($partNum - 1) . " are missing; possibly the jobs did not run!\n";
				$missing += ( $partNum - 1 - $lastPart );
			} else {
				print STDERR "Warning: Result part " . ($lastPart + 1) . " is missing; possibly the job did not run!\n";
				$missing++;
			}
		}

		open(IN, "<$file");
		while(<IN>)
		{
			print OUT;
		}
		close(IN);
		unlink( $file )
			if ( !exists( $opts->{D} ) );

		$lastPart = $partNum;
	}
	close(OUT);

	dprint( "Merged " . scalar(@{$files}) . " result parts; possibly $missing missing parts.\n", $opts );
	dprint( "Results written to $mergedFile\n", $opts );
}

sub postProcessResults
{
	my $opts = shift;
	my $conf = shift;

	dprint( "Running post-processing step for $opts->{t} ...\n", $opts );
	my $baseName = ( split(/\//, $opts->{i}) )[-1];
	my $prefix = ( split(/\./, $baseName) )[0];
	my $outPrefix = $opts->{o} . "/result.$prefix." . $opts->{t};
	my $hmmHitsFile = $outPrefix . ".hmmhits";
	my $domainsFile = $outPrefix . ".domains";
	local *HMM;
	local *DOM;

	if ( ($opts->{t} eq 'panther') && exists( $opts->{p} ) )
	{
		dprint( "Generating command lists to be run on the cluster for panther post-processing ...\n", $opts );
		my $cmd = "$ENV{FASTHMM_DIR}/bin/pantherSubdomain.pl -db $opts->{i} -hits $hmmHitsFile -dbdir $opts->{d} -numcpus $opts->{j} -cluster -outdir $opts->{o}";
		system( $cmd ) == 0 || die "***FATAL*** pantherSubdomain.pl failed $? -- $cmd";

		return;
	}

	open(DOM, ">$domainsFile");
	if ( $opts->{t} eq 'pfam' )
	{
		open(HMM, "$ENV{FASTHMM_DIR}/bin/combinePfam.pl $hmmHitsFile $opts->{d} |");
	} elsif ( $opts->{t} eq 'panther' )
	{
		open(HMM, "$ENV{FASTHMM_DIR}/bin/pantherSubdomain.pl -db $opts->{i} -hits $hmmHitsFile -dbdir $opts->{d} -numcpus $opts->{j} 2>/dev/null |");
	} elsif ( $opts->{t} eq 'pirsf' )
	{
		open(HMM, "$ENV{FASTHMM_DIR}/bin/pirsfAssign.pl $hmmHitsFile $opts->{d} 2>/dev/null |");
	} elsif ( $opts->{t} eq 'gene3d' )
	{
		my $cmd = "$ENV{FASTHMM_DIR}/bin/ssfAssignFast.pl -superfams $opts->{d}/$opts->{t}/.accList -db $opts->{i} -parsed $hmmHitsFile";
		my $optParams = Util::configParams( $conf, "SSFASSIGNFAST_gene3d" );
		$cmd .= " $optParams"
			if ( length($optParams) > 0 );
		$cmd .= " -map $opts->{d}/$opts->{t}/$opts->{t}.tab"
			if ( !exists( $opts->{M} ) );
		open(HMM, "$cmd 2>/dev/null |");
	} elsif ( $opts->{t} eq 'superfam' )
	{
		my $cmd = "$ENV{FASTHMM_DIR}/bin/ssfAssignFast.pl -superfams $opts->{d}/$opts->{t}/.accList -db $opts->{i} -parsed $hmmHitsFile";
		$cmd .= " -map $opts->{d}/$opts->{t}/$opts->{t}.tab"
			if ( !exists( $opts->{M} ) );
		open(HMM, "$cmd 2>/dev/null |") || die "**FATAL** Error running $cmd";
	} else
	{
		# default is no post-proc
		open(HMM, "<$hmmHitsFile") || die "**FATAL** Cannot read $hmmHitsFile";
	}

	while(<HMM>)
	{
		print DOM;
	}
	close(HMM);
	checkReturnCode( $?, "$opts->{t} post-processing", $opts )
		if ( $opts->{t} =~ /^(?:pfam|panther|pirsf|gene3d|superfam)$/ );
	close(DOM);
}

sub fastHmm
{
	my $opts = shift;
	my $conf = shift;

	my $baseDir = exists( $ENV{FASTHMM_DIR} ) ?
				$ENV{FASTHMM_DIR} :
				".";

	$ENV{'BLASTMAT'} = $baseDir . "/matrix";

	if ( exists( $opts->{p} ) )
	{
		postProcessResults( $opts, $conf );
		return;
	}

	if ( exists( $opts->{m} ) )
	{
		mergeDistributedResults( $baseDir, $opts );
		return;
	}

	#
	# Check if query database has been formatted for BLAST
	#
	foreach my $ext ( qw/phr pin psd psi psq/ )
	{
		my $file = $opts->{i} . "." . $ext;
		if ( !(-e $file) && !(-e $opts->{i} . ".00.$ext"))
		{
			# part or all of the BLASTable db is missing
			if ( exists( $opts->{f} ) )
			{
				my $cmd = $baseDir . "/bin/formatdb -i $opts->{i} -o T -p T 2>/dev/null";
				dprint( "The BLAST-compatible database for '$opts->{i}' does not exist - creating it ...\n", $opts );
				dprint( "[debug]  $cmd\n", $opts )
					if ( $opts->{D} );
				system( $cmd );
				checkReturnCode( $?, "formatdb", $opts );

				# check for presence again before continuing
				foreach my $ext ( qw/phr pin psd psi psq/ )
				{
					my $file = $opts->{i} . "." . $ext;
					if ( !(-e $file) )
					{
						print STDERR "Error: Unable to create BLAST-compatible database for '$opts->{i}'\n";
						print STDERR "       Please check the 'formatdb.log' for error messages.\n";
						exit(1);
					}
				}

				# Remove unless the user specifies -D
				unlink( "formatdb.log" )
					if ( !exists( $opts->{D} ) );
			} else {
				print STDERR "Error: The BLAST-compatible database for '$opts->{i}' does not exist\n";
				print STDERR "       Please create it using 'formatdb -i $opts->{i} -o T -p T', or\n";
				print STDERR "       rerun fastHmm.pl with the -f option.\n";
				exit(1);
			}
		}
	}


	#
	# Check that all necessary files for the specified input database exist
	# - $ENV{FASTHMM_DIR}/db/target/
	# - $ENV{FASTHMM_DIR}/db/target/hmm/
	# - $ENV{FASTHMM_DIR}/db/target/.accList (create if it doesn't exist or -r)
	# - foreach entry in .accList:
	#	- superfam: <entry>.fa, <entry>.psiblast, hmm/<entry>.hmm
	#	- rest: <entry>.seq, (<entry>.b or <entry>.psiblast), hmm/<entry>.hmm
	#
	my $targetSet = undef;
	if ( exists( $opts->{a} ) || exists( $opts->{A} ) )
	{
		dprint( "Loading user-supplied target set ...\n", $opts );
		$targetSet = loadUserTargetSet( $opts );
	}

	my %hardHmms = ();


	if (!exists $opts->{H} && ($opts->{t} ne "panther" || exists $opts->{hardpanther}))
	{
		local *IN;
		my $targetDir = $opts->{d} . "/" . $opts->{t};
		my $hardListFile = $targetDir . "/" . $opts->{t} . ".hard.list";

		if ( -e $hardListFile )
		{
			open(IN, "<$hardListFile");
			while(<IN>)
			{
				chomp;
				$hardHmms{$_} = 1;
			}
			close(IN);

			dprint( "Loaded " . scalar( keys( %hardHmms ) ) . " hard HMMs for complete search ...\n", $opts );
		}
	}

	dprint( "Checking completeness of target database ...\n", $opts );
	my ($targets, $numFams) = validateAndLoadTargets( $opts, $baseDir, $targetSet );
	my $numTargets = scalar( @{$targets} );
	if ( $numTargets > 0 )
	{
		# if cluster operation
		if ( exists( $opts->{b} ) )
		{
			$opts->{j} = $opts->{b}
				if ( exists( $opts->{j} ) &&
					( $opts->{j} > $opts->{b} ) );

			generateClusterJobList( $opts, $targets, \%hardHmms ); # to STDOUT
			return;
		}

		# run it now
		my $availSlots = $opts->{j};
		for ( my $t = 0 ; $t < $numTargets; $t++ )
		{
			if ( $availSlots > 0 )
			{
				my $childPid = fork();
				if ( defined($childPid) )
				{
					if ( $childPid == 0 )
					{
						my $target = $targets->[$t];
						my $pgpOut;
						my $faOut;

						if ( exists $hardHmms{$target} )
						{
							( $pgpOut, $faOut ) = ( undef, $opts->{i} );
						} else {
							my $numHits = 0;
							my $totalHits = 0;
							( $pgpOut, $faOut, $numHits, $totalHits ) = 
								runBlastPgp( $target, $baseDir, $opts, $conf );
							dprint( "[status] blastpgp found $totalHits hits to $numHits sequences for $target in $opts->{t}.\n", $opts );
						}
						runHmmSearch( $target, $numFams, $baseDir, $opts, $faOut, $conf );
						if ( !exists( $opts->{D} ) && defined( $pgpOut ) )
						{
							unlink( $pgpOut );
							unlink( $faOut );
						}
						exit(0);
					}
				} else {
					print STDERR "[error] unable to create child process to run analysis!\n";
					exit(1);
				}

				# one slot used
				$availSlots--;
			} else {
				# no more slots available; wait for a child to finish
				$t--;
				my $childPid = wait();
				die "***FATAL*** Child process died: $?" if ($? & 255) != 0 && $childPid > 0;
				if ( $childPid > 0 )
				{
					$availSlots++
				} elsif ( $childPid < 0 )
				{
					# should never get this, but just in case
					$availSlots = $opts->{j};
				}
			}
		}

		# wait for all child procs to finish
		dprint( "Waiting for child processes to complete...\n", $opts );
		my $childPid = 0;
		do
		{
			$childPid = wait();
			die "***FATAL*** Child process died: $?" if ($? & 255) != 0 && $childPid > 0;
		} until ( $childPid < 0 );

		# merge results if -j > 1
		if ( $opts->{j} > 1 )
		{
			dprint( "Merging output files...\n", $opts );
			my $baseName = ( split(/\//, $opts->{i}) )[-1];
			my $prefix = ( split(/\./, $baseName) )[0];
			my $tcOutParsed = $opts->{o} . "/result.$prefix." . $opts->{t} . ".hmmhits";
			$tcOutParsed .= ".part" . $opts->{J}
				if ( exists( $opts->{J} ) );
			local *IN;
			local *OUT;

			open(OUT, ">$tcOutParsed");
			for ( my $t = 0; $t < $numTargets; $t++ )
			{
				my $target = $targets->[$t];
				my $inFile = $opts->{o} . "/result." . $opts->{t} . ".$target.hmmhits";
				if ( -e $inFile )
				{
					open(IN, "<$inFile");
					while(<IN>)
					{
						print OUT;
					}
					close(IN);
					unlink($inFile)
						unless ( exists( $opts->{D} ) );
				}
			}
			close(OUT);
		}

		# post-processing unless not desired or running in cluster mode
		if ( !exists( $opts->{R} ) && !exists( $opts->{J} ) )
		{
			# run post-processing
			postProcessResults( $opts, $conf );
		}
	} else {
		dprint( "Nothing to do!  Check target databases and/or specified accessions.\n", $opts );
	}
}

sub checkReturnCode
{
	my $rv = shift;
	my $component = shift;
	my $opts = shift;
	my $code = $rv >> 8;

	if ( $code != 0 )
	{
		dprint( "**FATAL** $component exited abnormally with exit code $code\n", $opts );
		exit(1)
			if ( !exists( $opts->{I} ) );
	}
}

sub loadUserTargetSet
{
	my $opts = shift;
	my @set = ();
	my %setExists = ();

	if ( exists( $opts->{a} ) )
	{
		foreach my $target ( split(/\s*,\s*/, $opts->{a}) )
		{
			if ( !exists( $setExists{$target} ) )
			{
				push( @set, $target );
				$setExists{$target} = 1;
			}
		}
	}

	if ( exists( $opts->{A} ) )
	{
		local *IN;
		open(IN, "<$opts->{A}");
		while(<IN>)
		{
			s/^\s+|\s+$//g;
			my $target = $_;
			if ( !exists( $setExists{$target} ) )
			{
				push( @set, $target );
				$setExists{$target} = 1;
			}
		}
		close(IN);
	}

	return \@set;
}

sub buildAccList
{
	my $targetDir = shift;
	my $opts = shift;
	my $baseDir = shift;
	my @targets = ();

	my $accListFile = $targetDir . "/.accList";

	my $filters = [ '\.hmm$' ];
	my $seqExt = ( $opts->{t} eq 'superfam' ) ?
			".fa" :
			".seq";

	dprint( "Building target database accession cache; only performed first time ...\n", $opts );
	my $files = File::getFilteredFileList( $targetDir, $filters );
	if ( scalar(@{$files}) < 1 )
	{
		print STDERR "[error] Could not find source files for $opts->{t}.  Check database base directory\n";
		exit(1);
	}
	foreach my $file ( @{$files} )
	{
		my ( $baseFile ) = $file =~ /([^\/]+)$/;

		# skip panther subfamily hmm's
		next
			if ( $baseFile eq 'hmmer.hmm' );

		my $baseName = $baseFile;
		$baseName =~ s/\.[^.]+$//;

		my $alnFileB = $targetDir . "/" . $baseName . ".b";
		my $alnFilePsi = $targetDir . "/" . $baseName . ".psiblast";
		my $seqFile = $targetDir . "/" . $baseName . $seqExt;

		if ( (-e $seqFile) && ((-e $alnFileB) || (-e $alnFilePsi)) )
		{
			# all files present, add to list
			push( @targets, $baseName );
		} else {
			if ( exists( $opts->{h} ) )
			{
				print STDERR "Error: target [$opts->{t}]; accession [$baseName] is incomplete.\n";
				print STDERR "       Ignore this and continue by not specifying -h.\n";
				exit(1);
			} else {
				dprint( "Warning: target [$opts->{t}]; accession [$baseName] is incomplete.  Converting from HMM ...\n", $opts );
				# try to convert the existing hmm to corresponding psiblast and seq file
				my $cmd = $baseDir . "/bin/hmmToPsiBlast.pl -in $file -out $alnFilePsi -seed $seqFile";
				system( $cmd . " >/dev/null 2>&1" );
				checkReturnCode( $?, "hmmToPsiBlast.pl", $opts );
				if ( !(-z $alnFilePsi) && !(-z $seqFile) )
				{
					push( @targets, $baseName );
				} else {
					if ( !exists( $opts->{D} ) )
					{
						unlink( $alnFilePsi );
						unlink( $seqFile );
					}
					dprint( "Warning: could not convert target [$opts->{t}]; Skipping ...\n", $opts );
				}
			}
		}
	}

	local *OUT;
	open(OUT, ">$accListFile");
	print OUT join("\n", @targets), "\n";
	close(OUT);

	return \@targets;
}

sub loadAccList
{
	my $accListFile = shift;
	my $targetDir = shift;
	my $opts = shift;
	my @targets = ();

	local *IN;
	open(IN, "<$accListFile");
	while(<IN>)
	{
		chomp;
		push( @targets, $_ );
	}
	close(IN);

	return \@targets;
}

sub validateAndLoadTargets
{
	my $opts = shift;
	my $baseDir = shift;
	my $targetSet = shift;

	my $targetDir = $opts->{d} . "/" . $opts->{t};
	my $accListFile = $targetDir . "/.accList";
	my $targetHmmDir = $targetDir . "/hmm";

	my $targets = undef;
	if ( !(-e $accListFile) || exists( $opts->{r} ) )
	{
		$targets = buildAccList( $targetDir, $opts, $baseDir );
	} else {
		$targets = loadAccList( $accListFile, $targetDir, $opts );
	}

	my $numTargets = scalar( @{$targets} );

	if ( defined($targetSet) )
	{
		$targets = pruneTargetList( $targets, $numTargets, $targetSet, $opts->{h} );
	}

	dprint( "Loaded $numTargets complete accession targets in $opts->{t} ...\n", $opts );

	return ( $targets, $numTargets );
}

sub pruneTargetList
{
	my $targets = shift;
	my $numTargets = shift;
	my $targetSet = shift;
	my $haltOnError = shift;

	my @newTargets = ();

	$haltOnError = 0
		if ( !defined($haltOnError) );

	my $numSet = scalar( @{$targetSet} );

	return $targets
		if ( $numSet < 1 );

	my %filter = ();
	for ( my $i = 0; $i < $numSet; $i++ )
	{
		$filter{$targetSet->[$i]} = 1;
	}

	for ( my $i = 0; $i < $numTargets; $i++ )
	{
		push( @newTargets, $targets->[$i] )
			if ( exists( $filter{$targets->[$i]} ) );
	}

	if ( (scalar( @newTargets ) < $numSet) && $haltOnError )
	{
		print STDERR "Error: Found ", scalar(@newTargets), " of $numSet requested targets!\n";
		print STDERR "       Please check your list and try again or omit the -h option to ignore this error.\n";
		exit(1);
	}

	return \@newTargets;
}

sub generateClusterJobList
{
	my ($opts,$targets,$hardhmms) = @_;

	my $batchsize = $opts->{b};

	# If some are hard and some are easy, put the hard lists
	# in groups that are 20x smaller, and put them first
	my @hardlist = grep {exists $hardhmms->{$_}} @$targets;
	my @easylist = grep {!exists $hardhmms->{$_}} @$targets;
	my $hardsize = int($batchsize/20 + 0.5);
	$hardsize = 1 if $hardsize < 1;
	if (scalar(@hardlist) == scalar(@$targets)) {
	    # no special treatment if all hard
	    $hardsize = $batchsize;
	}

	my $jobNum = 0;
	my @batch = ();
	foreach my $target (@hardlist) {
	    push @batch, $target;
	    if (scalar(@batch) == $hardsize) {
		print generateFastHmmCmd( $opts, \@batch, $jobNum++ ), "\n";
		@batch = ();
	    }
	}
	print generateFastHmmCmd( $opts, \@batch, $jobNum++ ), "\n"
	    if @batch > 0;
	@batch = ();

	foreach my $target (@easylist) {
	    push @batch, $target;
	    if (scalar(@batch) == $batchsize) {
		print generateFastHmmCmd( $opts, \@batch, $jobNum++ ), "\n";
		@batch = ();
	    }
	}
	print generateFastHmmCmd( $opts, \@batch, $jobNum++ ), "\n"
	    if @batch > 0;
	@batch = ();
}

sub generateFastHmmCmd
{
	my $opts = shift;
	my $batch = shift;
	my $jobNum = shift;

	$jobNum = 0
		if ( !defined($jobNum) );

	my $baseDir = exists( $ENV{FASTHMM_DIR} ) ?
			$ENV{FASTHMM_DIR} :
			".";
	my $cmd = $baseDir . "/bin/fastHmm.pl";
	foreach my $key ( sort( keys( %{$opts} ) ) )
	{
		next	if ( $key eq 'b' );
		$cmd .= " -" . $key;
		next	if ( $key =~ /^[HCfhrDqm]$/ );
		if ( $key eq 'a' )
		{
			$cmd .= " " . join(",", @{$batch});
		} else {
			$cmd .= " " . $opts->{$key}
				if ( defined( $opts->{$key} ) );
		}
	}

	if ( !exists( $opts->{a} ) )
	{
		$cmd .= " -a " . join(",", @{$batch});
	}

	$cmd .= " -J $jobNum";

	return $cmd;
}

sub dprint($$)
{
	my $msg = shift;
	my $opts = shift;
	print "[" . localtime() . "] $msg"
		if ( !exists( $opts->{q} ) &&
			!exists( $opts->{b} ) );
}

# HMM file name to length
sub HMMLength($) {
    my $file = shift;
    my $len = undef;
    local *HMM;
    open(HMM,"<",$file) || die "***FATAL*** Cannot read $file";
    while(<HMM>) {
	chomp;
	if (m/^LENG\s+(\d+)$/) {
	    $len = $1;
	    last;
	}
    }
    close(HMM) || die "***FATAL*** Error reading $file";
    die "***FATAL*** No LENG entry for hmm $file" unless defined $len;
    return $len;
}
