#!/usr/bin/perl -w
#
#  $Id: pantherSubdomain.pl,v 1.4 2008/05/20 17:27:00 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Expand hits from panther families to include panther subfamilies
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;

use lib exists( $ENV{FASTHMM_DIR} ) ?
		"$ENV{FASTHMM_DIR}/lib" :
		"./lib";
use Args;
use File;
use Util;
use Vector;


#
# USAGE
#
my $usage =
"Usage:
  pantherSubdomain.pl <options>

Parameters:
  -db      <blastDb>	Specify FASTA database
  -hits    <inFile>	Specify FastHMM panther hits file

Optional Parameters:
  -e       <e-value>	Specify e-value cutoff (Default: 0.001)
  -fnf     <numFams>	Specify filtering number of families (Default: 10000)
  -dbdir   <dbDir>	Specify FastHMM base directory
  -tmp     <tmpDir>	Specify temporary directory (Default: /tmp)
  -numcpus <#>		Specify number of parallel processes to run

Cluster Parameters:

Cluster operation is meant for compute farms.  This script will generate two
command lists.  Each list contains commands which may be run in any order
or in parallel but no command from the second list may be run concurrently
or before any command in the first list.  You must schedule these jobs
and ensure jobs are run according to the above constraint.  This script will
not submit or execute any jobs when run in cluster mode.

The output directory <outdir> must be a directory accessible from every
compute node on which any job in either command list is to be run.  This
script has no way of verifying that the directory is accessible and
therefore will fail silently if the <outdir> is set improperly.

  -cluster		Enable cluster mode (generates command list)
  -cfpre   <prefix>	Cluster file prefix (Default: <hits>.cmds)
  -outdir  <dir>	Specify directory where all output is to be written
  -merge		Merge and post-process data

";

#
# GLOBALS
#
my ($FAMILY, $GENE, $BEG, $END, $DOMBEG, $DOMEND, $SCORE, $EVAL) = (0..7);

my $fullArgs = join( " ", @ARGV );

#
# Parse Command Line Options
#
my ($opts, $nonOpts) = Args::getArgs( 
	"+db:|+hits:|dbdir:|tmp:|numcpus:|cluster;outdir|+;cluster;outdir:;cluster|merge;cluster|e:|fnf:|cfpre:",
	@ARGV, -1, $usage );

die "Error: Specified FastHMM file '", $opts->{hits}, "' does not exist!\n"
	if ( !(-e $opts->{hits}) );

die "Error: Specified sequence database '", $opts->{db}, "' does not exist!\n"
	if ( !(-e $opts->{db}) );

my $baseDir = exists( $ENV{FASTHMM_DIR} ) ?
		$ENV{FASTHMM_DIR} :
		".";

$opts->{fullArgs} = $fullArgs;
$opts->{baseDir} = $baseDir;
$opts->{dbdir} = $baseDir . "/db"
	if ( !exists( $opts->{dbdir} ) );

die "Error: Cannot find FastHMM database directory '", $opts->{dbdir}, "'\n"
	if ( !(-d $opts->{dbdir}) );

die "Error: Please set the FASTHMM_DIR environment variable correctly\n"
	if ( !(-d "$baseDir/bin") );

die "Error: <outdir> must be an absolute path: '", $opts->{outdir}, "'\n"
	if ( exists( $opts->{outdir} ) &&
		( $opts->{outdir} !~ /^\// ) );

$opts->{e} = 0.001
	if ( !exists( $opts->{e} ) );

$opts->{fnf} = 10000
	if ( !exists( $opts->{fnf} ) );

$opts->{tmp} = "/tmp"
	if ( !exists( $opts->{tmp} ) );

$opts->{cfpre} = "$opts->{hits}.cmds"
	if ( !exists( $opts->{cfpre} ) );

pantherSubdomain( $opts );

exit(0);

sub expandSubFiles($$$)
{
	my $fams = shift;
	my $numFams = shift;
	my $opts = shift;

	my @files = ();
	foreach my $fam ( keys( %{$fams} ) )
	{
		foreach my $sf ( @{$fams->{$fam}} )
		{
			push( @files, "$opts->{outdir}/sub.$fam.sfhits.$sf" );
		}
	}

	return \@files;
}

sub clusterCleanup($$$)
{
	my $fams = shift;
	my $numFams = shift;
	my $opts = shift;

	foreach my $fam ( keys( %{$fams} ) )
	{
		unlink( "$opts->{outdir}/sub.$fam.faa" );
		foreach my $sf ( @{$fams->{$fam}} )
		{
			unlink( "$opts->{outdir}/sub.$fam.sfhits.$sf" );
		}
	}
}

sub pantherSubdomain
{
	my $opts = shift;

	my $pantherNames = $opts->{dbdir} . "/panther/panther.names";
	my ($fams, $numFams) = loadPantherFams( $pantherNames );

	if ( exists( $opts->{merge} ) )
	{
		# do clustered merge
		print STDERR "Merging output files ...\n";
		my $subFiles = expandSubFiles( $fams, $numFams, $opts );
		mergeFiles( "$opts->{hits}.withsf", $opts->{hits}, @{$subFiles} );

		print STDERR "Cleaning up clustered output ...\n";
		clusterCleanup( $fams, $numFams, $opts );

		print STDERR "Filtering results...\n";
		filterResults( "$opts->{hits}.withsf", $opts, $numFams );

		exit;
	}

	# check if command list files exist; if so, remove
	unlink( "$opts->{cfpre}.1" )
		if ( exists( $opts->{cluster} ) && (-e "$opts->{cfpre}.1") );

	#
	# This opens a file and returns groups of lines with the same "id"
	# as defined by the regular expression
	#
	local *HITS;
	my $cmd = "$opts->{baseDir}/bin/sort -k1 $opts->{hits}";
	my $fId = File::openPipeIterCache( \*HITS, $cmd, '^(PTHR\w+)' );
	my $slots = exists( $opts->{cluster} ) ?
			1 :
			$opts->{numcpus};
	my %pidToDomain = ();
	print STDERR "Extracting sequences for analysis ...\n"
		if ( exists( $opts->{cluster} ) );
	while ( my $group = File::readNextPipeIter( $fId ) )
	{
		my ( $panther ) = $group->[0] =~ /^(PTHR\w+)/;

		# no available execution slots; block until one frees up
		while ( $slots < 1 )
		{
			my $childPid = wait();
			die "Child process died: $?" if ($? & 255) != 0 && $childPid > 0;

			if ( $childPid > 0 )
			{
				if ( !exists( $pidToDomain{$childPid} ) )
				{
					#warn "Process id $childPid was reaped and not associated with a domain - ignoring results!\n";
				} else {
					synthesizeDomainHits( $pidToDomain{$childPid}, $childPid, $opts );
					delete( $pidToDomain{$childPid} );
					$slots++;
				}
			} elsif ( $childPid < 0 )
			{
				warn "wait() returned $childPid - error?\n";
				# no children to reap? some sort of problem
				$slots = exists( $opts->{cluster} ) ?
						1 :
						$opts->{numcpus};
			}
		}

		my $childPid = fork();
		if ( defined($childPid) )
		{
			if ( $childPid == 0 )
			{
				processDomainHits( $panther, $group, $opts, $fams, $numFams );
				exit(0);
			}

			$pidToDomain{$childPid} = $panther;
			my @tmp = ();
			foreach my $pid ( sort { $a <=> $b } keys( %pidToDomain ) )
			{
				push( @tmp, "$pid:$pidToDomain{$pid}" );
			}
			warn "Running processDomainHits for $panther pid $childPid... (running " . scalar( @tmp ) . " => [" . join(" ", @tmp) . "])\n";
		} else {
			die "Unable to fork analysis process!";
		}

		$slots--;
	}
	File::closeFileIterCache( $fId );

	# Wait for all processing jobs to finish
	print STDERR "Waiting for all child processes to terminate ...\n"
		if ( $opts->{numcpus} > 1 );
	my $childPid = undef;
	do
	{
		$childPid = wait();
		die "Child process died: $?" if ($? & 255) != 0 && $childPid > 0;
		if ( $childPid > 0 )
		{
			if ( !exists( $pidToDomain{$childPid} ) )
			{
				#warn "Process id $childPid was reaped and not associated with a domain - ignoring results!\n";
			} else {
				synthesizeDomainHits( $pidToDomain{$childPid}, $childPid, $opts );
				delete( $pidToDomain{$childPid} );
			}
		}
	} until ( $childPid < 0 );

	if ( !exists( $opts->{cluster} ) )
	{
		# Combine results and filter to build domains file
		print STDERR "Merging subfamily hits with family hits\n";
		mergeFiles( "$opts->{hits}.withsf", $opts->{hits}, "$opts->{hits}.sf" );
		unlink( "$opts->{hits}.sf" );
		print STDERR "Filtering results...\n";
		filterResults( "$opts->{hits}.withsf", $opts, $numFams );
	} elsif ( exists( $opts->{cluster} ) && !exists( $opts->{merge} ) )
	{
		my $domainsFile = $opts->{hits};
		$domainsFile =~ s/\.hmmhits$/\.domains/i;
		# output merging command list
		local *CMD;
		open(CMD, ">$opts->{cfpre}.2");
		print CMD "$baseDir/bin/pantherSubdomain.pl $opts->{fullArgs} -merge >$domainsFile\n";
		close( CMD );

		print STDERR "Wrote: $opts->{cfpre}.1\n";
		print STDERR "Wrote: $opts->{cfpre}.2\n";
	}
}

sub filterResults
{
	my $inFile = shift;
	my $opts = shift;
	my $numFams = shift;

	local *HITS;
	my $cmd = "$opts->{baseDir}/bin/sort -nk2 $inFile";
	my $fId = File::openPipeIterCache( \*HITS, $cmd, '^\S+\s+(\d+)' );
	while ( my $group = File::readNextPipeIter( $fId ) )
	{
		my ( $locusId ) = $group->[0] =~ /^\S+\s+(\d+)/;
		print STDERR "Filtering hits for locus $locusId...\n";
		my @hits = ();
		foreach my $line ( @{$group} )
		{
			my @data = split(/\t/, $line);
			push( @hits, \@data );
		}

		# Sort by e-value
		@hits = sort { $a->[$EVAL] <=> $b->[$EVAL] } @hits;
		my @kept = ();

		foreach my $hit ( @hits )
		{
			next
				unless ( ($hit->[$EVAL] * 1.0 / $numFams) <=
						($opts->{e} * 1.0 / $opts->{fnf}) );

			my $keep = 1;
			foreach my $k ( @kept )
			{
				# check for overlap with something already kept
				if ( ( $hit->[$BEG] < $k->[$END] ) &&
					( $k->[$BEG] < $hit->[$END] ) )
				{
					my $iBeg = ($k->[$BEG] > $hit->[$BEG]) ?
							$k->[$BEG] :
							$hit->[$BEG];
					my $iEnd = ($k->[$END] < $hit->[$END]) ?
							$k->[$END] :
							$hit->[$END];
					my $iLen = $iEnd - $iBeg + 1;
					my $hitLen = $hit->[$END] - $hit->[$BEG] + 1;
					if ( $iLen > ($hitLen / 2) )
					{
						$keep = 0
							unless ( ( $k->[$FAMILY] =~ /^(.*):(.*)$/ ) &&
									( $1 eq $hit->[$FAMILY] ) );
					}
				}
			}

			push( @kept, $hit )
				if ( $keep == 1 );
		}

		print STDERR " - ", scalar( @kept ), " hit(s) kept.\n";
		foreach my $hit ( @kept )
		{
			print join( "\t", @{$hit} ), "\n";
		}
	}
}

sub mergeFiles
{
	my $destFile = shift;
	local *IN;
	local *OUT;

	open(OUT, ">$destFile");

	foreach my $inFile ( @_ )
	{
		next
			if ( !(-e $inFile) );
		open(IN, "<$inFile");
		while(<IN>)
		{
			print OUT;
		}
		close(IN);
	}

	close(OUT);
}

sub synthesizeDomainHits($$$)
{
	my $panther = shift;
	my $pid = shift;
	my $opts = shift;

	return
		if ( exists( $opts->{cluster} ) );

	print STDERR "Adding results from $panther to global hits file $opts->{hits}.sf\n";
	my $sfHitsFile = $opts->{tmp} . "/sub.$panther.$pid.sfhits";
	if ( !(-e $sfHitsFile) )
	{
		warn "Missing subfamily hits file for $panther (pid $pid)!";
	} else {
		local *IN;
		local *OUT;

		open(OUT, ">>$opts->{hits}.sf");
		open(IN, "<$sfHitsFile");
		while(<IN>)
		{
			print OUT;
		}
		close(IN);
		close(OUT);

		unlink( $sfHitsFile );
	}
}

sub processDomainHits
{
	my $panther = shift;
	my $hits = shift;
	my $opts = shift;
	my $fams = shift;
	my $numFams = shift;
	my $cmdFh = shift;

	# file handle for command list (if cluster mode)
	local *CMD;
	open( CMD, ">>$opts->{cfpre}.1" )
		if ( exists( $opts->{cluster} ) );


	if ( !defined($fams) )
	{
		$fams = {};
		$numFams = 0;
	}

	my $subDir = "$opts->{dbdir}/panther/books/$panther";
	die "Cannot find $panther books directory '$subDir'"
		if ( !(-d $subDir) );

	if ( !exists( $fams->{$panther} ) )
	{
		warn "Skipping unknown panther '$panther'";
		return;
	}

	my $numSf = scalar( @{$fams->{$panther}} );

	my $tmpPrefix = ( !exists( $opts->{cluster} ) ) ?
				"$opts->{tmp}/sub.$panther.$$" :
				"$opts->{outdir}/sub.$panther";
	my ($numSeqs, $seqIds) = parseHits( $hits, $GENE, 1 );
	my $numExtracted = extractFasta( $opts->{db}, $seqIds, "$tmpPrefix.faa", $opts->{baseDir} );

	warn "Expected $numSeqs sequences but only extracted $numExtracted for domain $panther hits!"
		if ( $numSeqs != $numExtracted );

	for ( my $i = 0; $i < $numSf; $i++ )
	{
		my $sf = $fams->{$panther}->[$i];
		print STDERR "Computing for $panther:$sf ...\n"
			if ( !exists( $opts->{cluster} ) );
		my $hmmFile = "$subDir/$sf/hmmer.hmm";
		if ( -e $hmmFile )
		{
			my $cmd = "$opts->{baseDir}/bin/hmmsearch --informat FASTA -Z $numFams -E $opts->{e} $hmmFile $tmpPrefix.faa | $opts->{baseDir}/bin/parseHmmAlignments.pl $panther:$sf";
			if ( !exists( $opts->{cluster} ) )
			{
				system( $cmd . " >>$tmpPrefix.sfhits" ) == 0 || die "hmmsearch failed $? -- $cmd";
			} else {
				print CMD "$cmd >$tmpPrefix.sfhits.$sf\n";
			}
		} else {
			warn "Domain subfamily $panther:$sf is missing hmmer.hmm file (not in '$hmmFile') - skipping";
		}
	}

	if ( exists( $opts->{cluster} ) )
	{
		close( CMD );
	} else {
		unlink( "$tmpPrefix.faa" );
	}
}

sub extractFastaWithIdx($$$$)
{
	my $faFile = shift;
	my $seqIdsHash = shift;
	my $outFaFile = shift;
	my $baseDir = shift;

	my $numExtracted = 0;

	# create temporary hits file
	local *OUT;
	open( OUT, ">$outFaFile.hits" );
	foreach my $seqId ( keys( %{$seqIdsHash} ) )
	{
		if ( $seqId =~ /^\d+$/ )
		{
			print OUT "lcl|$seqId\n";
		} else {
			print OUT "$seqId\n";
		}
	}
	close( OUT );

	my $cmd = "$baseDir/bin/fastacmd -d $faFile -i $outFaFile.hits 2>/dev/null";
	local *PIPE;
	open( OUT, ">$outFaFile" );
	open( PIPE, "$cmd |" );

	while ( <PIPE> )
	{
		if ( /^>/ )
		{
			s/^>lcl\|(\S+).+$/>$1/;
			$numExtracted++
		}
		print OUT;
	}

	close( PIPE );
	close( OUT );

	unlink( "$outFaFile.hits" );

	return $numExtracted;
}

sub extractFasta($$$$)
{
	my $faFile = shift;
	my $seqIdsHash = shift;
	my $outFaFile = shift;
	my $baseDir = shift;

	# check for fastacmd-indexed file first
	return extractFastaWithIdx( $faFile, $seqIdsHash, $outFaFile, $baseDir )
		if ( -e "$faFile.pin" || -e "$faFile.00.pin" );

	my $numExtracted = 0;
	local *IN;
	local *OUT;
	open( OUT, ">$outFaFile" );

	if ( -e $faFile )
	{
		my $fId = File::openFastaCache( \*IN, $faFile );
		while ( 1 )
		{
			my $seq = readNextFastaSeq( $fId );
			last
				if ( !defined( $seq->{defline} ) );

			if ( exists( $seqIdsHash->{ $seq->{defline} } ) )
			{
				$numExtracted++;
				print OUT File::formatFasta( $seq->{defline},
						$seq->{seq} );
			}
		}
		File::closeFastaCache( $fId );
	}

	close( OUT );

	return $numExtracted;
}

sub parseHits($$;$)
{
	my $hits = shift;
	my $col = shift;
	my $hash = shift;
	$hash = 0
		if ( !defined( $hash ) );

	my %hret = ();
	my @ret = ();

	foreach my $hit ( @{$hits} )
	{
		my @data = split(/\t/, $hit);
		if ( $hash == 0 )
		{
			push( @ret, $data[$col] );
		} else {
			$hret{ $data[$col] } = 1;
		}
	}

	return ( $hash == 0 ) ?
			( scalar( @ret ), \@ret ) :
			( scalar( keys( %hret ) ), \%hret );
}

sub loadPantherFams
{
	my $file = shift;
	my %fams = ();
	my $numFams = 0;
	local *IN;

	open(IN, "<$file") ||
		die "Error: Could not open '$file' for reading";

	while(<IN>)
	{
		chomp;
		if ( /^(PTHR\w+)\.(SF\d+)/ )
		{
			if ( !exists( $fams{$1} ) )
			{
				$fams{$1} = [];
				$numFams++;
			}
			push( @{$fams{$1}}, $2 );
		}
	}

	close(IN);

	return ( \%fams, $numFams );
}
