#!/usr/bin/perl -w
#
#  $Id: reduceOther.pl,v 1.4 2008/05/29 00:37:07 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for reducing non-domain regions using cd-hit
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

use strict;
use lib exists( $ENV{FASTHMM_DIR} ) ?
		"$ENV{FASTHMM_DIR}/lib" :
		"./lib";

use Args;

my $defaults = {
		'cdhit'		=> "$ENV{FASTHMM_DIR}/bin/cd-hit",
		'n'		=> 3,
		'c'		=> 0.55,
		'M'		=> 1000,
		'minDomain'	=> 30,
	       };

my $usage =
"Usage:
  reduceDomains.pl <options>

Parameters:
  -domains    <inFile>	Specify combined hmmhits file
  -db         <blastDb>	Specify source FASTA file

Optional Parameters:
  -debug      Enable debugging output; do not delete intermediate files
  -n	      <$defaults->{n}>	Specify cd-hit word length
  -c	      <$defaults->{c}>	Specify cd-hit sequence identity threshold
  -M	      <$defaults->{M}>	Specify cd-hit maximum available memory (MB)
  -minDomain  <$defaults->{minDomain}>	Specify minimum domain overlap amount
  -prefix     <out>	Specify output file prefix
  -cdhit      <binary>	Specify cd-hit binary to use;
			$defaults->{cdhit}
  -nocdhit    Writes to the output fasta file; doesn't call cdhit
";

sub MaskRegions($$); # domains file and minDomain -> hash of masked regions

# Produces output for printing to a fasta file
sub MaskSequence($$$$); # Arguments are hash of masked regions, name, sequence, minDomain


{
    my ($opts, $nonOpts) = Args::getArgs( "+domains:|+db:|n:|c:|M:|minDomain:|prefix:|debug|cdhit:|nocdhit",
					  @ARGV, -1, $usage );

    die "Error: Specified combined hmmhits file '", $opts->{domains}, "' does not exist!\n"
	if ( !(-e $opts->{domains}) );

    die "Error: Specified source FASTA file '", $opts->{db}, "' does not exist!\n"
	if ( !(-e $opts->{db}) );

    # Set defaults
    foreach my $p ( keys( %{$defaults} ) )
    {
	$opts->{$p} = $defaults->{$p}
	if ( !exists( $opts->{$p} ) );
    }

    $opts->{prefix} = $opts->{domains} . ".other"
    if ( !exists( $opts->{prefix} ) );

    # masked is a hash of sequence -> merged list of masked regions
    # (regions that overlap or are sperated by < minDomain  are merged)
    # the list is stored flattned (e.g., as begin1, end1, begin2, end2, ...)
    my $masked = MaskRegions($opts->{domains},$opts->{minDomain});

    my %handled = (); # sequences handled

    # make the fasta file with the unassigned regions ($faaFile)
    local *IN;
    open(IN, "<", $opts->{db}) || die "Cannot read $opts->{db}";
    my $faaFile = exists $opts->{nocdhit} ? "$opts->{prefix}.faa" : "$opts->{prefix}.unreduced.faa";
    local *OUT;
    open(OUT, ">", $faaFile) || die "Cannot write to $faaFile";
    my $lastname = "";
    my $sequence = "";
    while(<IN>) {
	chomp;
	if (m/^>(\S+)/) {
	    my $newname = $1;
	    print OUT MaskSequence($masked,$lastname,$sequence,$opts->{minDomain}) if $lastname ne "";
	    $handled{$lastname} = 1;

	    die "Duplicate fasta sequence $newname" if exists $handled{$newname};
	    $lastname = $newname;
	    $sequence = "";
	} else {
	    $sequence .= $_;
	}
    }
    print OUT MaskSequence($masked,$lastname,$sequence,$opts->{minDomain}) if $lastname ne "";
    $handled{$lastname} = 1;
    close(OUT) || die "Error writing to $faaFile";
    print STDERR "Wrote non-domain regions to temporary file $faaFile\n"
	if $opts->{debug} && !exists $opts->{nocdhit};

    while (my ($gene,$list) = each %$masked) {
	if (!exists $handled{$gene}) {
	    die "Sequence $gene is in domains file $opts->{domains} but not in fasta file $opts->{db}";
	}
    }
    if (exists $opts->{nocdhit}) {
	print STDERR "reduceOther.pl completed, wrote to $faaFile\n";
	exit(0);
    }
    my $cdhitOut = $opts->{prefix} . ".reduce.other.faa";
    my $cFile = $cdhitOut . ".clstr";
    my $cFile2 = $opts->{prefix} . ".reduce.other.clstr";
    my $cmd = $opts->{cdhit} . " -d 0 -M $opts->{M} -n $opts->{n} -c $opts->{c} -i $faaFile -o $cdhitOut";
    print STDERR "Running: $cmd\n" if $opts->{debug};
    system( $cmd )==0
	or die "system $cmd failed: $?";
    unlink($faaFile) unless $opts->{debug};

    unlink( "$cdhitOut.bak.clstr" );
    rename($cFile,$cFile2);
    print STDERR "reduceOther.pl completed, wrote to $cdhitOut and $cFile\n";
}

sub MaskRegions($$) {
    my ($domainsFile, $minDomain) = @_;
    my $masked = {}; # the output hash
    my $nDomRead = 0;
    local *DOM;
    open(DOM, "<", $domainsFile) || die "Cannot read $domainsFile";
    while(<DOM>) {
	chomp;
	my ($domId,$geneId,$beg,$end) = split /\t/, $_;
	die "Cannot parse $_" unless defined $end && $beg =~ m/^\d+$/ && $end =~ m/^\d+$/;
	die "Illegal end or begin" unless $beg >= 1 && $end >= 1 && $beg <= $end;
	$nDomRead++;
	if (exists $masked->{$geneId}) {
	    my $oldlist = $masked->{$geneId};
	    my $newlist = [];
	    while (@$oldlist > 0 && $oldlist->[0] <= $end + $minDomain) {
		my $oldbeg = shift @$oldlist;
		my $oldend = shift @$oldlist;

		# handle overlaps -- strict overlap is
		# begin <= end && vice versa
		if ($oldbeg <= $end + $minDomain && $beg <= $oldend + $minDomain) {
		    $beg = ($oldbeg < $beg) ? $oldbeg : $beg;
		    $end = ($oldend > $end) ? $oldend : $end;
		} else {
		    push @$newlist, ($oldbeg,$oldend);
		}
	    }
	    push @$newlist, ($beg,$end);
	    push @$newlist, @$oldlist;
	    $masked->{$geneId} = $newlist;
	} else {
	    $masked->{$geneId} = [$beg,$end];
	}
    }	    
    close(DOM) || die "Error reading $domainsFile";
    my $nPosMasked = 0;
    my $nGeneMasked = 0;
    while (my ($geneId,$list) = each %$masked) {
	$nGeneMasked++;
	for (my $i = 0; $i < scalar(@$list); $i += 2) {
	    my $beg = $list->[$i];
	    my $end = $list->[$i+1];
	    $nPosMasked += $end-$beg+1;
	}
	#print STDERR join("\t", $geneId, "Mask", @$list)."\n";
    }
    print STDERR "Read $nDomRead domains assignments for $nGeneMasked genes; masked out $nPosMasked positions\n";
    return $masked;
}

sub MaskSequence($$$$) {
    my ($masked,$name,$seq,$minDomain) = @_;
    my $length = length($seq);
    die "Empty sequence for $name" if $length < 1;

    my @unmasked = ();
    if (exists $masked->{$name}) {
	my $list = $masked->{$name};
	my $lastend = 0;
	for (my $i = 0; $i < scalar(@$list); $i+= 2) {
	    my $beg = $list->[$i];
	    my $end = $list->[$i+1];
	    die "Illegal end $end past end of sequence $name" if $end > $length;
	    if ($beg >= $lastend + $minDomain) {
		push @unmasked, [$lastend+1,$beg-1];
	    }
	    $lastend = $end;
	}
	push @unmasked, [$lastend+1,$length]
	    if $lastend + $minDomain < $length;
    } else {
	@unmasked = [1,length($seq)];
    }
    #print STDERR join("\t",$name,"Unmasked",map {$_->[0].":".$_->[1]} @unmasked)."\n";
    my $out = "";
    foreach my $region (@unmasked) {
	my ($beg,$end) = @$region;
	$out .= ">$name.$beg.$end\n";
	my $subseq = substr($seq,$beg-1,$end-$beg+1);
	die "Too short $name $beg $end" if $subseq eq "";
	my @lines = $subseq =~ m/.{1,60}/g;
	$out .= join("\n", @lines);
	$out .= "\n";
    }
    return $out;
}
