#!/usr/bin/perl -w
#
#  $Id: fastBlastAlignment.pl,v 1.14 2009/02/04 20:57:54 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for realigning expanded blastp domains back to seed sequences
#  Also renames the expanded domains as "fb.sequence.begin.end"
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use lib "$ENV{FASTHMM_DIR}/lib";
use Args;
use File;

#
# USAGE
#
my $usage =
"Usage:
  fastBlastAlignment.pl -db <faFile> [<options>] < <domainsFile>
  * Outputs to STDOUT

Required Options:
  -db <faFile>		Input seed sequences in FASTA format
  -o <name>		Send output to name.align and name.hits
  <domainsFile>		BLAST-expanded members of ad-hoc families/domains

Options:
  -F <filterStr>	BLAST filter string
			Default: 'm S' - For more information, please see:
			http://www.ncbi.nlm.nih.gov/staff/tao/URLAPI/new/node80.html

  -z <dbSize>		BLAST effective length/size of input database
			Default: 100000000

  -e <E-value>		BLAST E-value cutoff
			Default: 1e-3

  -b <dbseqs>           Size of output.
                        Default: 100000000

  -v <dbseqs>           Size of output
                        Default: 100000000

  -T tmpdir             Directory for temporary files
                        Default: /tmp

  -debug <0|1>		Enable debugging output with -debug 1

  -anchordb <file>      A second database that contains the seed sequences for the domains
                        (the full-length sequences, not the domain pieces) (optional)
";

die "Error: Please set your FASTHMM_DIR environment variable!\n"
	if ( !exists( $ENV{FASTHMM_DIR} ) );
die "Error: The specified FASTHMM_DIR '$ENV{FASTHMM_DIR}' does not exist!\n"
	if ( !(-e $ENV{FASTHMM_DIR}) );

#
# Parse Command Line Options
#
my ($opts, $nonOpts) = Args::getArgs(
	"+o:|+db:|F:|z:|e:|b:|v:|debug:|T:|anchordb:",
	@ARGV, -1, $usage );

# Set defaults
$opts->{F} = "m S"
	if ( !exists( $opts->{F} ) );
#$opts->{F} =~ s/[^\sa-zA-Z]+//g; # not sure why this was in here
$opts->{z} = 100000000 if ( !exists( $opts->{z} ) );
$opts->{e} = 1e-3 if ( !exists( $opts->{e} ) );
$opts->{b} = 100000000 if ( !exists( $opts->{b} ) );
$opts->{v} = 100000000 if ( !exists( $opts->{v} ) );

$opts->{debug} = 0
	if ( !exists( $opts->{debug} ) ||
		($opts->{debug} ne '1') );

my $binDir = $ENV{FASTHMM_DIR} . "/bin";

#my $piece = exists( $opts->{piece} ) ?
#		$opts->{piece} :
#		undef;
#my $iPart = undef;
#my $nParts = undef;
my $nDomPrint = 0;
my $nDomAnalyze = 0;

# Prototypes
# This uses the handles ALIGN and ALLHITS created by the caller
sub HandleDomain($);

# Main code
{
#    if (defined $piece) {
#	die "Piece specification should be like 23/345" unless $piece =~ m!^([0-9]+)/([0-9]+)$!
#	    && $1 >= 1 && $2 >= $1;
#	$iPart = $1 - 1;
#	$nParts = $2;
#    }
#    print STDERR "Analyzing domains against $opts->{db}\n";
#    print STDERR "Analyzing piece " . ($iPart+1) . " of $nParts\n" if defined $iPart;

    my $hitsfile = $opts->{o} . ".hits";
    my $hitsfile2 = "$hitsfile.tmp";
    open(ALLHITS,">",$hitsfile2) || die "Cannot write to $hitsfile2";

    my $alignfile = $opts->{o} . ".align";
    my $alignfile2 = "$alignfile.tmp";
    open(ALIGN,">",$alignfile2) || die "Cannot write to $alignfile2";

    my $dom = undef;
    my @rows = ();
    my %seenDom = ();
    my $domCount = 0;
    while(my $line = <STDIN>) {
	chomp $line;
	my @F = split /\t/, $line;
	die "Cannot parse $line" unless @F >= 2;
	if (defined $dom && $F[0] ne $dom) {
	    if (@rows > 1 || (@rows > 0 && exists $opts->{anchordb})) {
		# run if not using -piece, or this entry is part of the selected chunk
#		if ( !defined($iPart) ||
#			( defined($iPart) && ($domCount % $nParts == $iPart) ) )

		HandleDomain(\@rows);
		$seenDom{$dom} = 1;
		$domCount++;
	    }
	    @rows = ();
	    die "Duplicate domain: $F[0]" if exists $seenDom{$F[0]};
	}
	$dom = $F[0];
	push @rows, \@F;
    }
    if (@rows > 1 || (@rows > 0 && exists $opts->{anchordb})) {
#	if ( !defined($iPart) ||
#		( defined($iPart) && ($domCount % $nParts == $iPart) ) )
	HandleDomain(\@rows);
    }
    close(ALLHITS) || die "Error writing to $hitsfile2";
    close(ALIGN) || die "Error writing to $alignfile2";

    # Now check that the files are OK and rename them if they are
    die "Wrong number of lines in $hitsfile2 -- I/O error?"
	unless File::checkFileLinesColumns($hitsfile2,$nDomPrint,10);
    die "Wrong number of lines in $alignfile2 -- I/O error?"
	unless File::checkFileLinesColumns($alignfile2,$nDomPrint,7);
    rename($hitsfile2,$hitsfile) || die "Cannot rename $hitsfile2 to $hitsfile";
    rename($alignfile2,$alignfile) || die "Cannot rename $alignfile2 to $alignfile";
    print STDERR "Analyzed $nDomAnalyze ad-hoc domains and printed $nDomPrint entries to $hitsfile and $alignfile\n";
}

sub HandleDomain($) {
    my ($list) = @_;

    my $domId = $list->[0][0];
    my ( $anchorId, $aBeg, $aEnd ) = $domId =~ /^(.+?)\.(\d+)\.(\d+)$/;
    die "Cannot parse $domId" unless defined($anchorId) &&
		$aBeg =~ m/^[0-9]+$/ && $aEnd =~ m/^[0-9]+$/;

    # Only do some of the work
    # this is broken for non-numeric ids - handling it on a line by line basis in the main loop
    #return() if (defined $iPart && ($anchorId % $nParts) != $iPart);

    # Ignore begin and end positions if they are present -- just blast against full loci
    my %locus = map {$_->[1] => 1} @$list;

    # Anchor may not be found because of sorting issues -- it should actually be there
    if (!exists $locus{$anchorId} && !exists $opts->{anchordb}) {
	print STDERR "Anchor $domId does not hit itself -- adding it to ad-hoc domain\n";
	$locus{$anchorId} = 1;
    }

    my $tmpdir = "/tmp";
    $tmpdir = $ENV{TMPDIR} if exists $ENV{TMPDIR} && -d $ENV{TMPDIR};
    $tmpdir = $opts->{T} if exists $opts->{T} && -d $opts->{T};
    my $prefix = "$tmpdir/fba.$domId.$$";

    # Fetch sequences of members of domain -- run fastacmd and parse output
    my $listfile = "$prefix.list";
    my $tmpdb = "$prefix.faa1"; # full-length sequences
    open(LIST,">",$listfile) || die "Cannot write to $listfile";
    foreach my $locusId (keys %locus) {
	if ( $locusId =~ /^\d+$/ )
	{
		print LIST "lcl|"."$locusId\n";
	} else {
		print LIST $locusId, "\n";
	}
    }
    close(LIST) || die "Error writing $listfile";

    my $fastacmd = "$binDir/fastacmd -i $listfile -d $opts->{db} -p T > $tmpdb";
    print STDERR "$domId: Running $fastacmd\n" if $opts->{debug};
    (system($fastacmd) == 0) || die "Cannot run $fastacmd";

    open(FAA1,"<",$tmpdb) || die "Cannot run $fastacmd";
    my $tmpname = "";
    my %seqs = ();
    while(<FAA1>) {
	chomp;
	if ( /^>(?:lcl\|)?(\S+)/i ) {
	    $tmpname = $1;
	} else {
	    die "Bad output from $fastacmd\n$_" if (!defined $tmpname) || m/^>/;
	    $seqs{$tmpname} .= $_;
	}
    }
    close(FAA1) || die "Error reading $tmpdb";
    die "Empty output from $fastacmd" if (scalar(keys %seqs) == 0);

    # Fetch the anchor sequence, if necessary, and write the anchor sequence as a faa file
    my $queryFile = "$prefix.query";
    my $anchorSeq = undef;
    if (exists $opts->{anchordb}) {
	my $name = $anchorId;
	$name = "lcl|".$name unless $name =~ m/^lcl[|]/;
	my $fastacmd2 = "$binDir/fastacmd -s \"$name\" -d $opts->{anchordb} -p T";
	print STDERR "Running $fastacmd2\n" if $opts->{debug};
	open(FAA2,"$fastacmd2 |") || die "Error running $fastacmd2";
	my $header = <FAA2>;
	die "$fastacmd2 failed with\n$header\ninstead of >$name" 
	    unless $header =~ m/>(\S+)/ || $1 eq $name;
	$anchorSeq = "";
	while(my $line = <FAA2>) {
	    die "Unexpected line $line\ninstead of sequence for $name from $fastacmd2"
		if $line =~ m/^>/;
	    chomp $line;
	    $anchorSeq .= $line;
	}
	close(FAA2) || die "Error reading output from $fastacmd2";
	die "Empty sequence for $anchorId fetched from $opts->{anchordb}" unless length($anchorSeq) > 0;
	die "Sequence for $anchorId fetched from $opts->{anchordb} is too short given end $aEnd:\n$anchorSeq"
	    unless length($anchorSeq) >= $aEnd;
	$seqs{$anchorId} = $anchorSeq; # for use while parsing alignments
    } else {
	die "No sequence for $anchorId from $fastacmd -- is it in domain $domId?"
	    unless exists $seqs{$anchorId};
	$anchorSeq = $seqs{$anchorId};
    }
    open(QUERY,">",$queryFile) || die "Cannot write to $queryFile";
    my $queryseq = substr($anchorSeq, $aBeg-1, $aEnd-$aBeg+1)."\n";
    print QUERY ">$domId\n$queryseq\n";
    close(QUERY) || die "Error writing to $queryFile";

    # Add the query to the database
    if (exists $opts->{anchordb}) {
	open(QUERY,">>",$tmpdb) || die "Cannot append to $tmpdb";
	print QUERY ">$anchorId\n$anchorSeq\n";
	close(QUERY) || die "Error appending anchor to $tmpdb";
    }

    # Format the database
    my $formatcmd = "$binDir/formatdb -i $tmpdb -p T";
    # Do not trust exit codes from formatdb
    system($formatcmd) == 0 || print STDERR "Warning: formatdb may have failed $? -- $formatcmd\n";

    # Run BLAST
    my $blastFile = "$prefix.out";
    # by default, turn filtering off so I get a full alignment
    # and turn the parsed data from parseBlast.pl into an alignment
    my $blastcmd = "$binDir/blastall -p blastp -z $opts->{z} -e $opts->{e} -F '$opts->{F}' -b $opts->{b} -v $opts->{v} -i $queryFile -d $tmpdb"
	. " | $binDir/parseBlast.pl > $blastFile";
    print STDERR "Running $blastcmd\n" if $opts->{debug};
    system($blastcmd) == 0 || die "Cannot run $blastcmd -- $?";
    print STDERR "Reading $blastFile\n" if $opts->{debug};
    # Note -- below, I should be using only the top hit for each gene region --
    # is filtering that way necessary? Yes, it is.
    # Assume earlier hits in output are better

    my %regionsSeen = (); # id -> list of [begin,end]
    # any hit that overlaps >50% with an already-seen range is ignored
    
    my @out = ();
    open(BLAST,"<",$blastFile) || die "Cannot read $blastFile";
    while(my $line = <BLAST>) {
	chomp $line;
	my ($query,$subject,
	    $qBeg,$qEnd,$sBeg,$sEnd,
	    $score,$eval,
	    $qRangeStr,$sRangeStr) = split /\t/, $line;
	die "Unexpected query $query in line\n$line\nfor query $domId" unless $query eq $domId;
	die "Error in data from $binDir/parseBlast.pl: $line"
	    unless defined $sRangeStr
	    && $qBeg =~ m/^[0-9]+$/
	    && $qEnd =~ m/^[0-9]+$/
	    && $sBeg =~ m/^[0-9]+$/
	    && $sEnd =~ m/^[0-9]+$/
	    && $qRangeStr =~ m/^[0-9:,]+$/
	    && $sRangeStr =~ m/^[0-9:,]+$/;
	$subject =~ s/^lcl\|//;
	die "Unknown subject $subject" unless exists $seqs{$subject};

	my $keep = 1;
	if (exists $regionsSeen{$subject}) {
	    foreach my $seen (@{ $regionsSeen{$subject} }) {
		my ($oldBeg,$oldEnd) = @$seen;
		my $intersectBeg = $oldBeg < $sBeg ? $sBeg : $oldBeg;
		my $intersectEnd = $oldEnd > $sEnd ? $sEnd : $oldEnd;
		if ($intersectEnd > $intersectBeg
		    && ($intersectEnd-$intersectBeg+1) > 0.5 * ($sEnd-$sBeg+1)) {
		    $keep = 0;
		    last;
		}
	    }
	}
	if ($keep) {
	    $nDomPrint++;
	    # not the same as line because of removing lcl|
	    print ALLHITS join("\t","fb.$domId",$subject,$sBeg,$sEnd,$qBeg,$qEnd,$score,$eval,$sRangeStr,$qRangeStr)."\n";
	    push @{ $regionsSeen{$subject} }, [$sBeg,$sEnd];

	    # Now find the aligned positions and print those out too
	    my @qAlignedPos = ();
	    foreach my $range (split /,/, $qRangeStr) {
		my @pos = split /:/, $range;
		die "Cannot parse range $range" unless @pos == 2;
		push @qAlignedPos, map {$_ + ($aBeg-1)} $pos[0]..$pos[1];
	    }
	    
	    my @sAlignedPos = ();
	    foreach my $range (split /,/, $sRangeStr) {
		my @pos = split /:/, $range;
		die "Cannot parse range $range" unless @pos == 2;
		push @sAlignedPos, $pos[0]..$pos[1];
	    }
	    die "Alignments from $qRangeStr and $sRangeStr different number of pos or empty"
		unless scalar(@sAlignedPos) == scalar(@qAlignedPos) && @sAlignedPos>0;
	    die "Alignments from $qRangeStr out of range"
		unless $qAlignedPos[0] >= $aBeg && $qAlignedPos[-1] <= $aEnd;
	    
	    # Make alignment so that all positions from aBeg:aEnd in query are represented
	    my $subjectAlign = "";
	    my $subjectSeq = $seqs{$subject};
	    
	    for (my $at = $aBeg; $at <= $aEnd; $at++) {
		if (@qAlignedPos > 0 && $at == $qAlignedPos[0]) {
		    shift @qAlignedPos;
		    my $sPos = shift @sAlignedPos;
		    $subjectAlign .= substr($subjectSeq,$sPos-1,1);
		} else {
		    $subjectAlign .= "-";
		}
	    }

	    print ALIGN join("\t", "fb.$domId", $subject, $subjectAlign, $sBeg, $sEnd, $score, $eval)."\n";
	} # end if keep
    }
    close(BLAST) || die "Error reading $blastFile";

    if (! $opts->{debug}) {
	unlink($queryFile);
	unlink($tmpdb);
	foreach my $suffix (qw(phr pin psd psi psq)) {
	    unlink("$tmpdb.$suffix");
	}
	unlink($listfile);
	unlink($blastFile);
    }

    $nDomAnalyze++;
}
