#!/usr/bin/perl -w
#
#  $Id: combinePfam.pl,v 1.2 2007/10/16 23:20:20 whuang Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Combine ls/fs Pfam results
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;

use lib "$ENV{FASTHMM_DIR}/lib";
use Pfam;

my $debug = 0;
my ($DOM,$LOCUS,$BEG,$END,$DOMBEG,$DOMEND,$SCORE,$EVAL,$ALIGN) = (0..8);
sub overlap($$);

die "Usage: combinePfam.pl ResultsFile dbDir [-sorted] > CombinedFile\n"
    unless @ARGV>=2 && -e $ARGV[0] && -d $ARGV[1];

my $baseDir = exists( $ENV{FASTHMM_DIR} ) ?
		$ENV{FASTHMM_DIR} :
		".";

my $resultFile = shift;
my $dbDir = shift;
my $sorted = shift;
$sorted = ( defined($sorted) && ($sorted eq '-sorted') ) ?
		1 : 0;

my $clan = Pfam::loadAccToClanMap( $dbDir, 0 );	# 0 = strip ver from acc
my $amMode = Pfam::loadAMMap( $dbDir, 0 ); # 0 = strip ver from acc

local *HITS;
if ( $sorted )
{
	open(HITS, "<$resultFile") || die "Cannot read hits file '$resultFile'";
} else {
	open(HITS, "$baseDir/bin/sort -nk2 $resultFile |") ||
		die "Cannot sort and read hits file '$resultFile'";
}

my $thisLocus = undef;
my @lshits = (); # list of ls hits
my @fshits = (); # list of fs hits

while(<HITS>)
{
	chomp;
	my @F = split( /\t/, $_ );
	my $locus = $F[1];
	die $_
		if ( !defined( $locus ) );
	if ( !defined($thisLocus) )
	{
		$thisLocus = $locus;
		@lshits = ();
		@fshits = ();
	} elsif ( $thisLocus ne $locus )
	{
		# now we have $thisLocus, fshits/lshits for interactice proc
		processLociHits( $thisLocus, \@lshits, \@fshits, $clan, $amMode )
			if ( scalar(@lshits) + scalar(@fshits) > 0 );

		$thisLocus = $locus;
		@lshits = ();
		@fshits = ();
	}

	if ( $F[0] =~ /\.fs$/i )
	{
		$F[0] =~ s/\..+$//;
		push( @fshits, \@F );
	} else {
		$F[0] =~ s/\..+$//;
		push( @lshits, \@F );
	}
}

processLociHits( $thisLocus, \@lshits, \@fshits, $clan, $amMode )
	if ( scalar(@lshits) + scalar(@fshits) > 0 );

exit;



sub processLociHits
{
	my $locus = shift;
	my $lshits = shift;
	my $fshits = shift;
	my $clan = shift;
	my $amMode = shift;

	# first, eliminate redundant ls/fs hits, to create merged
	my @merged = ();
	my %lshitsdom = (); # dom -> list of hits
	my %fshitsdom = (); # dom -> list of hits
	my %dom = ();
	foreach my $hit ( @{$lshits} )
	{
		push @{ $lshitsdom{$hit->[$DOM]} }, $hit;
		$dom{$hit->[$DOM]} = 1;
	}
	foreach my $hit ( @{$fshits} )
	{
		push @{ $fshitsdom{$hit->[$DOM]} }, $hit;
		$dom{$hit->[$DOM]} = 1;
	}

	foreach my $dom (sort keys %dom)
	{
		my @ls = sort {$a->[$BEG] <=> $b->[$BEG]} @{ $lshitsdom{$dom} || []};
		my @fs = sort {$a->[$BEG] <=> $b->[$BEG]} @{ $fshitsdom{$dom} || []};
		print STDERR  "Warning: No am-mode for $dom\n" unless exists $amMode->{$dom};
		my $mode = $amMode->{$dom} || "globalfirst";

		while(@ls > 0 && @fs > 0)
		{
			if (overlap($ls[0],$fs[0]))
			{
				my $ls = shift @ls;
				my $fs = shift @fs;
				if ($mode eq "globalfirst")
				{
					push @merged, $ls;
				} elsif ($mode eq "localfirst")
				{
					push @merged, $fs;
				} elsif ($mode eq "byscore")
				{
					if ($ls->[$SCORE] > $fs->[$SCORE])
					{
						push @merged, $ls;
					} else {
						push @merged, $fs;
					}
				} else {
					print STDERR "Warning: am-mode $mode for $dom not recognized\n";
					push @merged, $ls; # treat globalfirst as default
				}
			} elsif ($ls[0][$BEG] < $fs[0][$BEG])
			{
				# no overlap? use the 1st one
				push @merged, (shift @ls);
			} else {
				push @merged, (shift @fs);
			}
		}

		push @merged, @ls;
		push @merged, @fs;
    	}

	# now eliminate contained-within hits and overlapping hits to the same clan
	my @kept = ();
	# sort by e-value
	@merged = sort {$a->[$EVAL] <=> $b->[$EVAL]} @merged;

	foreach my $hit (@merged)
	{
		my $keep = 1;
		foreach my $old (@kept)
		{
			if ($hit->[$BEG] >= $old->[$BEG] && $hit->[$END] <= $old->[$END])
			{
				$keep = 0; # is inside the older hit, ignore
				print STDERR join("\t","ContainedWithin",@$hit)."\n"
					if $debug;
			} elsif (overlap($old,$hit)
				&& exists $clan->{$hit->[$DOM]} && exists $clan->{$old->[$DOM]}
				&& $clan->{$hit->[$DOM]} eq $clan->{$old->[$DOM]})
			{
				print STDERR join("\t","OverlapsClann",@$hit)."\n" if $debug;
				$keep = 0; # overlaps older hit & in same clan, ignore
			}
		}

		push @kept, $hit
			if $keep;
	}

	foreach my $hit (@kept)
	{
		print join("\t", @$hit)."\n";
	}
}

sub overlap($$)
{
	my ($hit,$old) = @_;
	return ($hit->[$BEG] <= $old->[$END] && $hit->[$END] >= $old->[$BEG]);
}
