#!/usr/bin/perl -w
#
#  $Id: mergeDomains.pl,v 1.5 2008/05/23 23:48:40 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Merge overlapping exemplars from reducing the domain families
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use Getopt::Long;

my $usage =
"Usage:
  mergeDomains.pl -db database -o filename input_domains

Parameters:
  -db      <fastafile>	The full sequences (not subsequences)
  -o	   outfile      mergeDomains.pl writes the merged fasta file here
  input_domains         The reduced domains files, e.g. from DomReduce
";

# Given a reference to a list of begin,end and a new begin,end,
# return a reference to the merged result.
# The merged list will be sorted by begin (and assumes that the
# input is sorted as well)
sub Merge($$$);
sub OutDom($$$); # name, sequence, list of begin/end -> string to print

{
    my $db = undef;
    my $out = undef;

    GetOptions('db=s' => \$db,
	       'out=s' => \$out)
	|| die $usage;
    die "Must specify -db:\n$usage\n" unless defined $db;
    die "Must specify -out:\n$usage\n" unless defined $out;
    die "Must specify input domains:$usage\n" unless @ARGV>0;
    my @domfiles = @ARGV;

    die "Input fasta file $db does not exist" unless -e $db;

    # By merging the domains as we read them, we minimize memory utilization
    my %dom = (); # locusId => list of merged domains. The list is stored flattened as begin1, end1, ...

    foreach my $domfile (@domfiles) {
	local *DOM;
	open(DOM,"<",$domfile) || die "Error reading $domfile";
	while(my $line = <DOM>) {
	    chomp $line;
	    my @F = split /\t/, $line;
	    my ($domId,$locusId,$begin,$end) = @F;
	    die "No begin/end in line $line"
		unless defined $end && $begin>0 && $end>0;
	    $dom{$locusId} = Merge($dom{$locusId},$begin,$end);
	}
	close(DOM) || die "Error reading $domfile";
    }
    print STDERR "Parsed domains for " . scalar(keys %dom) . " exemplars\n";

    # Now read the fasta file
    my $name = "";
    my $seq = "";
    open(FAA,"<",$db) || die "Cannot read $db";
    open(OUT,">",$out) || die "Cannot write to $out";
    while(<FAA>) {
	chomp;
	if (m/^>(\S+)/) {
	    my $newname = $1;
	    print OUT OutDom($name,$seq,$dom{$name}) if (exists $dom{$name});
	    $name = $newname;
	    $seq = "";
	} else {
	    $seq .= $_;
	}
    }
    print OUT OutDom($name,$seq,$dom{$name}) if (exists $dom{$name});
    close(FAA) || die "Error reading $db";
    close(OUT) || die "Error writing to $out";
    print STDERR "mergeDomains.pl wrote $out\n";
}

sub Merge($$$) {
    my ($list,$beg,$end) = @_;
    return [$beg,$end] if !defined $list;

    my @out = ();
    my $size = scalar(@$list);
    for (my $i = 0; $i < $size; $i += 2) {
	my $oldbeg = $list->[$i];
	my $oldend = $list->[$i+1];
	# overlap if beg <= end and vice versa
	if ($oldbeg <= $end && $beg <= $oldend) {
	    # If overlap, fold this region into the 
	    # This way we handle two old regions being joined by a new one
	    $beg = $oldbeg if $oldbeg < $beg;
	    $end = $oldend if $oldend > $end;
	} elsif ($oldbeg > $end) {
	    push @out, ($beg,$end);
	    push @out, @{$list}[$i .. ($size-1)];
	    return \@out;
	} else {
	    push @out, ($oldbeg,$oldend);
	}
    }
    push @out, ($beg,$end);
    return \@out;
}

sub OutDom($$$) {
    my ($locusId,$seq,$ranges) = @_;
    my $size = scalar(@$ranges);
    my $out = "";
    for (my $i = 0; $i < $size; $i += 2) {
	my $beg = $ranges->[$i];
	my $end = $ranges->[$i+1];
	if ($end > length($seq)) {
	    die "Illegal ending position $end for sequence $locusId of length " . length($seq);
	}
	$out .= ">$locusId.$beg.$end\n" . substr($seq,$beg-1,$end-$beg+1) . "\n";
    }
    return $out;
}
