#!/usr/bin/perl -w
#
#  $Id: expandDomains.pl,v 1.9 2008/09/23 01:41:38 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Expand blast hits to domain families
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;

use lib exists( $ENV{FASTHMM_DIR} ) ?
		"$ENV{FASTHMM_DIR}/lib" :
		"./lib";
use Args;

#
# USAGE
#
my $usage =
"Usage:
  expandDomains.pl <options>

Parameters:
  -domains  <blastp.domains.sorted>	domains file
  -clusters <combined.clstr.parsed.sorted>	clusters file
  -out output file

   The domains file should contain tab-delimited lines of the form
	domainId locusId begin end
   The clusters file should contain tab-delimited lines of the form
	exemplar member1 ... memberN
   where both exemplars and members are of the from
	geneId.begin.end
   and the exemplar is included in the member list.

   All ids should be 1-based, the domains file should be sorted by geneId,
   and the clusters file should be sorted by geneId of the exemplar.

Optional Parameters:
  -o        <numAA>	Specify minimum overlap for expansion in aa
			   Default: 30aa
  -c        <%-cov>	Specify minimum %-coverage for expansion [0-1]
			   Default: 0.5
  Either criterion suffices.
  -debug		Enable additional debugging output

expandDomains.pl removes any lcl| prefix at the beginning of
names for the domains.
";

sub GetClustersForGene($$); # file handle, geneId -> hash of exemplar -> list of members
sub GetDomains($); # file handle -> (geneId, list of (domainId, begin, end))
sub mergeRegions($$); # ref to list of begin/end, minimum domain size

my ($opts, $nonOpts) = Args::getArgs( 
	"+domains:|+clusters:|+out:|debug|o:|c:|a",
	@ARGV, -1, $usage );

die "Error: Specified input file '", $opts->{domains}, "' does not exist!\n"
	if ( !(-e $opts->{domains}) );

die "Error: Specified clusters file '", $opts->{clusters}, "' does not exist!\n"
	if ( !(-e $opts->{clusters}) );

$opts->{o} = 30
	if ( !exists( $opts->{o} ) );

$opts->{c} = 0.5
	if ( !exists( $opts->{c} ) );

{
    local *CLUSTERS;
    open(CLUSTERS, "<", $opts->{clusters}) || die "Cannot read $opts->{clusters}";

    local *DOMAINS;
    open(DOMAINS, "<", $opts->{domains}) || die "Cannot read $opts->{domains}";

    my $tmpout = $opts->{out}.".tmp";
    local *OUT;
    open(OUT,  ">", $tmpout) || die "Cannot write to $tmpout";

    my $minsize = $opts->{o};
    my $minfraction = $opts->{c};

    my $nGeneDone = 0;
    while(1) {
	my ($geneId, $domList) = GetDomains(\*DOMAINS);
	if ($geneId eq "") {
	    last;
	} else {
	    die "Empty domains" if !defined $geneId || scalar(@$domList) == 0;
	    my $clusterHash = GetClustersForGene(\*CLUSTERS, $geneId);
	    die "No clusters for $geneId" if scalar(keys %$clusterHash) == 0; # should at least hit self

	    # Now, the exemplars can be highly redundant because they are unmerged
	    # To save time, I sort the cluster information by the contained genes
	    # We don't care about positions in the members so I forget that info
	    my %cmember = (); # memberId -> list of [$geneBeg,$geneEnd], sorted and merged

	    while (my ($exemplar,$members) = each %$clusterHash) {
		$exemplar =~ m/^(\S+)\.(\d+)\.(\d+)$/ || die "Error parsing exemplar $exemplar";
		die unless $1 eq $geneId;
		my ($gBeg,$gEnd) = ($2,$3);
		foreach my $member (@$members) {
		    die "Cannot parse member $member" unless $member =~ m/^(\S+)\.\d+\.\d+$/;
		    my $mId = $1;
		    push @{ $cmember{$mId} }, [$gBeg,$gEnd];
		}
	    }

	    while (my ($memberId,$regions) = each %cmember) {
		# 5 is to allow merges to save time below but without being aggressive about it
		$cmember{$memberId} = mergeRegions($regions, 5);
	    }

	    # Downstream analyses only use presence of absence of gene, so just take one region
	    # Also, a domain can show up multiple times with redundant begin/end
	    my %domNames = (); # domainId => list of begin, end
	    foreach my $domain (@$domList) {
		my ($domId,$geneBeg,$geneEnd) = @$domain;
		push @{ $domNames{$domId} }, [$geneBeg,$geneEnd];
	    }

	    while (my ($domId, $geneRangeList) = each %domNames) {
		my %hits = ($geneId => 1); # list of locusIds

		foreach my $geneRange (@$geneRangeList) {
		    # geneBeg:geneEnd is the region of the exemplar that was assigned to this domain
		    my ($geneBeg,$geneEnd) = @$geneRange;
		
		    while (my ($memberId,$regions) = each %cmember) {
			next if exists $hits{$memberId}; # no need to look
			foreach my $region (@$regions) {
			    # matchBeg:matchEnd is the region of the exemplar that clusters this gene
			    my ($matchBeg,$matchEnd) = @$region;
			    # intersection of the two regions
			    my $maxBeg = $matchBeg > $geneBeg ? $matchBeg : $geneBeg;
			    my $minEnd = $matchEnd < $geneEnd ? $matchEnd : $geneEnd;
			    if ($minEnd - $maxBeg + 1 >= $minsize
				|| $minEnd - $maxBeg + 1 >= $minfraction * ($geneEnd-$geneBeg+1)) {
				$hits{$memberId} = 1;
				last;
			    }
			}
		    }
		}
		foreach my $hit (keys %hits) {
		    print OUT "$domId\t$hit\n";
		}
	    } # end loop over domIds
	} # end if domain found
	$nGeneDone++;
	if ($nGeneDone % 1000 == 0) {
	    print STDERR "expandDomains.pl finished $nGeneDone genes\n";
	}
    } # end loop over domains

    close(CLUSTERS) || die "Error reading $opts->{clusters}";
    close(DOMAINS) || die "Error reading $opts->{domains}";
    close(OUT) || die "Error writing $tmpout";
    rename($tmpout,$opts->{out}) || die "Cannot rename $tmpout to $opts->{out}";
    print STDERR "Wrote $opts->{out}\n";
}

my %geneSeenCluster = ();
my %clustersForGene = ();
my $lastClustersLine = "";

sub GetClustersForGene($$) {
    my ($fId,$geneId) = @_;

    die "Second time asking for clusters for gene $geneId" if exists $geneSeenCluster{$geneId};

    my $nEntries = 0;
    %clustersForGene = ();
    while($lastClustersLine || ($lastClustersLine = <$fId>)) {
	die "Cannot parse clusters line $lastClustersLine"
	    unless $lastClustersLine =~ m/^(\S+)\.\d+\.\d+\t/;
	if($geneId eq $1) {
	    my @F = split /\t/, $lastClustersLine;
	    chomp $F[-1];
	    die "Invalid cluster $lastClustersLine" unless @F > 1;
	    my $exemplar = shift @F;
	    # Note, if the same region is chosen as an exemplar for two different families,
	    # then the exemplar can show up more than once
	    # Hence use push instead of assignment
	    # Deal with making it unique later on
	    push @{ $clustersForGene{$exemplar} }, @F;
	    $lastClustersLine = "";
	    $nEntries++;
	} elsif ($geneId ne $1 && $nEntries > 0) {
	    last;
	} else {
	    # cluster for gene that has no domains -- skip it
	    #print STDERR "Skipping clusters for $1\n" unless exists $geneSeenCluster{$1};
	    $geneSeenCluster{$1} = 1; # in case we somehow ask for it later
	    $lastClustersLine = "";
	}
    }
    #print STDERR "Finished GetClusters for gene $geneId with " . scalar(keys %clustersForGene) . " entries\n";
    return \%clustersForGene;
}

my $lastDomainsLine = "";

sub GetDomains($) {
    my ($fId) = @_;
    my @domains = ();

    my $lastGeneId = "";

    while($lastDomainsLine || ($lastDomainsLine = <$fId>)) {
	my ($domId,$regionId,$beg,$end) = split /\t/, $lastDomainsLine;

        # Remove lcl| resulting from using fastacmd to using fastacmd
	# to make the masked query file for running against
	$domId =~ s/^lcl\|//;

	die "Cannot parse domains line $lastDomainsLine"
	    unless defined $end && $regionId =~ m/^(\S+)\.(\d+)\.(\d+)$/;

	chomp $end;
	my ($geneId,$regionBeg,$regionEnd) = ($1,$2,$3);
	if ($geneId ne $lastGeneId && $lastGeneId ne "") {
	    last;
	} else {
	    $lastGeneId = $geneId;
	    push @domains, [$domId, $regionBeg+$beg-1, $regionEnd+$end-1];
	    $lastDomainsLine = "";
	}
    }
    #print STDERR "Finished GetDomains for $lastGeneId with " . scalar(@domains) . " entries\n";
    return ($lastGeneId,\@domains);
}

sub mergeRegions($$)
{
	my $list = shift;
	my $minDom = shift;
	my ($BEG, $END) = (0, 1);
	my @mask = sort { $a->[$BEG] <=> $b->[$BEG] } @{$list};
	my @merged = ();
	my $last = shift( @mask );
	foreach my $domain ( @mask )
	{
		if ( $domain->[$BEG] - $last->[$END] < $minDom )
		{
			$last->[$END] = $domain->[$END]
				if ( $domain->[$END] > $last->[$END] );
		} else {
			push( @merged, $last );
			$last = $domain;
		}
	}
	push( @merged, $last );
	return \@merged;
}
