#!/usr/bin/perl -w
#
#  $Id: hitsToAlign.pl,v 1.8 2008/06/11 22:06:20 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Converts raw HMM hits into alignments
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use lib "$ENV{FASTHMM_DIR}/lib";
use Args;

#
# USAGE
#
my $usage =
"Usage:
  hitsToAlign.pl -db <faFile> [<options>] < <hmmHitsFile>
  * Outputs to STDOUT

Required Options:
  -db <faFile>		Input seed sequences in FASTA format
  <hmmHitsFile>		Concatenated raw HMM hits

Options:
  -piece <n/m>		Performs analysis on input subset n of m, n <= m
			This option allows the analysis to be split into m
			chunks, all of which can be run in parallel
			Default: Run all pieces serially (no parallelization)

  -writeRegion <0|1>	Output entire matched region (for debugging)

";

die "Error: Please set your FASTHMM_DIR environment variable!\n"
	if ( !exists( $ENV{FASTHMM_DIR} ) );
die "Error: The specified FASTHMM_DIR '$ENV{FASTHMM_DIR}' does not exist!\n"
	if ( !(-e $ENV{FASTHMM_DIR}) );

#
# Parse Command Line Options
#
my ($opts, $nonOpts) = Args::getArgs(
	"+db:|piece:|writeRegion:",
	@ARGV, -1, $usage );

# Set defaults
$opts->{writeRegion} = 0
	if ( !exists( $opts->{writeRegion} ) ||
		( $opts->{writeRegion} ne '1' ) );

my $binDir = $ENV{FASTHMM_DIR} . "/bin";
my $piece = exists( $opts->{piece} ) ?
		$opts->{piece} :
		undef;
my $iPart = undef;
my $nParts = undef;

# Prototypes
sub HandleDomain($);

# Parameter checking
die "$opts->{db} not a database - must be formatted with formatdb and with option -o T"
	unless -e "$opts->{db}.pin" || -e "$opts->{db}.00.pin";
if ( defined($piece) )
{
	die "Piece specification should be like 23/345" unless $piece =~ m!^([0-9]+)/([0-9]+)$!
		&& $1 >= 1 && $2 >= $1;
	$iPart = $1 - 1;
	$nParts = $2;
}

#print STDERR "Using seed sequences database '$opts->{db}'\n";
print STDERR "Analyzing piece " . ($iPart + 1) . " of $nParts ...\n"
	if ( defined($iPart) );

my %domSeen = ();

my @rows = ();
my $domId = undef;
my $domCount = 0;
while(<STDIN>) {
    chomp;
    my @F = split /\t/, $_;
    die "Cannot parse $_" if @F < 1;
    if (!defined $domId || $domId ne $F[0]) {
	if ( scalar(@rows) > 0 )
	{
		if ( !defined($iPart) ||
			(defined($iPart) && ($domCount % $nParts == $iPart)) )
		{
			HandleDomain(\@rows);
		}
		$domCount++;
	}
	$domId = $F[0];
	die "Have seen domain $domId before" if exists $domSeen{$domId};
	$domSeen{$domId} = 1;
	@rows = ();
    }
    push @rows, \@F;
}
if ( scalar(@rows) > 0 )
{
	if ( !defined($iPart) ||
		(defined($iPart) && ($domCount % $nParts == $iPart)) )
	{
		HandleDomain(\@rows);
	}
}

sub HandleDomain($) {
    my ($rows) = @_;
    my %loci = (); # locusId -> list of hits
    my %domEnd = (); # domId -> right-most aligned position seen (in domain coordinates not sequence coord.)

    foreach my $row (@$rows) {
	my ($domId, $locusId,$beg,$end,$domBeg,$domEnd,$score,$eval,$seqRangeSpec,$domRangeSpec) = @$row;
	die "Cannot parse " . join("\t",@$row) . " -- not enough fields" unless defined $domRangeSpec;
	
	my @seqRanges = map { m/^([0-9]+):([0-9]+)$/ || die $_; [$1,$2] } split /,/, $seqRangeSpec;
	my @domRanges = map { m/^([0-9]+):([0-9]+)$/ || die $_; [$1,$2] } split /,/, $domRangeSpec;
	$domEnd{$domId} = $domEnd unless exists $domEnd{$domId} && $domEnd{$domId} > $domEnd;
	
	push @{ $loci{$locusId} }, [$locusId, $domId, \@seqRanges, \@domRanges, $beg, $end, $score, $eval];
    }

    die "No loci in input\n" if keys(%loci) == 0;
    
    my %AASeqs = (); # filled by using fastacmd

    my $tmpdir = exists $ENV{TMPDIR} && -d $ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp";
    my $pre = "$tmpdir/HitsToAlign$$";
    my $listfile = "$pre.list";
    my $faafile = "$pre.faa";
    
    open(LIST,">",$listfile) || die "Cannot write to $listfile";
    foreach my $locus (keys %loci) {
	if ($locus =~ m/^\d+$/) {
	    $locus = "lcl|".$locus;
	}
	print LIST $locus."\n";
    }
    close(LIST) || die "Error writing $listfile\n";
    
    my $fastacmd = "$binDir/fastacmd -i $listfile -d $opts->{db} > $faafile";
    system($fastacmd) == 0 || die "fastacmd failed $? -- $fastacmd";
    die "Cannot run $fastacmd" unless -e $faafile;
    open(FASTA,"<", $faafile) || die "Cannot open $faafile";
    my $tmpname = "";
    while(<FASTA>) {
	chomp;
	if (m/^>/) {
	    die "Bad output from $fastacmd\n$_" unless m/>(\S+)/;
	    $tmpname = $1;
	    $tmpname =~ s/^lcl[|]//;
	} else {
	    die "Bad output from $fastacmd\n$_" if !defined $tmpname || m/^>/;
	    $AASeqs{$tmpname} .= $_;
	}
    }
    close(FASTA) || die "Error reading $faafile";
    if (keys(%AASeqs) == 0) {
	print STDERR "Warning: no output from $fastacmd\n";
    }
    unlink($faafile);
    unlink($listfile);
    
    while (my ($locusId,$hits) = each %loci) {
	if (!exists $AASeqs{$locusId}) {
	    print STDERR "Warning: skipping locus $locusId\n";
	    next;
	}
	my $locusseq = $AASeqs{$locusId};
	foreach my $hit (@$hits) {
	    my ($locusId,$domId,$seqRanges,$domRanges,$seqBeg,$seqEnd,$score,$eval) = @$hit;

	    my $domseq = ""; # sequence as aligned to domain
	    my $end = $domEnd{$domId};
	    die unless defined $end;
	    
	    my @seqAt = ();
	    foreach my $pair (@$seqRanges) {
		my ($left,$right) = @$pair;
		push @seqAt, $left .. $right;
	    }
	    my $seqFirst = $seqAt[0];
	    my $seqLast = $seqAt[-1];

	    my @domAt = ();
	    foreach my $pair (@$domRanges) {
		my ($left,$right) = @$pair;
		push @domAt, $left .. $right;
	    }
	    die unless @domAt == @seqAt;
	    
	    my $fail = 0;
	    for (my $domAt = 1; $domAt <= $end; $domAt++) {
		if (@domAt > 0 && $domAt[0] == $domAt) {
		    shift @domAt;
		    my $pos = shift @seqAt;
		    if ($pos > length($locusseq)) {
			print STDERR "Failed on locus $locusId domain $domId: DomAt $domAt Pos $pos length "
			    . length($locusseq) . " -- continuing\n";
			$fail = 1;
			last;
		    }
		    $domseq .= substr($locusseq, $pos-1, 1);
		} else {
		    $domseq .= "-";
		}
	    }
	    if (! $fail) {
		my @out = ($domId, $locusId, $domseq,$seqBeg,$seqEnd,$score,$eval);
		if ($opts->{writeRegion}) {
		    push @out, substr($locusseq, $seqFirst-1, $seqLast-$seqFirst+1);
		}
		print join("\t",@out)."\n";
	    }
	}
    }
}

