#!/usr/bin/perl -w
#
#  $Id: topHomologs.pl,v 1.14 2009/01/28 01:37:17 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for finding the top homologs of query genes using fastBlast output
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

use strict;
use lib exists( $ENV{FASTHMM_DIR} ) ?
		"$ENV{FASTHMM_DIR}/lib" :
		"./lib";
use Args;
use FastBLAST;

# Default settings, as a hash reference returned by FastBLAST::Options().
# Several of them are interpolated into the usage text below so that the
# help output always shows the defaults actually in effect.
my $fbOptions = FastBLAST::Options();

# Usage/help text; passed to Args::getArgs and also shown when neither
# -l nor -L is given.  Written as an interpolating here-document, so
# $fbOptions->{...} is expanded and \$TMPDIR is printed literally.
my $usage = <<"END_USAGE";
Usage:
   topHomologs.pl <options>

Parameters:
  -i <dir>	fastBlast output directory
  -f <fasta>    The fasta file that was used for the 1st stage of fastBlast
                This should also be a blast database
  -l <list>     Comma-delimited list of gene ids to get homologs for
  -L <file>     Read the gene ids from a file instead (1 per line)

Optional Parameters:
  -m            What blast format to output -- defaults to tab-delimited (-m 8)
  -F            BLAST filtering, defaults to '$fbOptions->{F}'. For more information, please see:
			http://www.ncbi.nlm.nih.gov/staff/tao/URLAPI/new/node80.html
  -z            Effective database size for reporting evalues (default: $fbOptions->{z})
  -e            E-value limit (default: $fbOptions->{e}). Weaker hits are often
			missed by FastBLAST anyway.
  -D            Debugging mode; summarize intermediate results to standard error
  -D2           Debugging mode and do not delete intermediate files
  -n            Number of top homologs (defaults to 1/$fbOptions->{nthTopHits} of fasta file)
  -T            temporary directory to use (else uses \$TMPDIR or /tmp)
  -missok       If an id is not in the blast database, issue a warning instead of
                exiting with an error.

Heuristics:
  -nth		#homologs defaults to 1/nth of sequences in fasta file (default: $fbOptions->{nthTopHits})
  -nDom         #overlapping redundant to consider (default: $fbOptions->{nOverlappingDom})
  -minBits      minimum bit score from domain alignment (default: $fbOptions->{minBits})
  -fPerDom      fudge factor for #homologs per domain (default: $fbOptions->{multPerDomain})
  -fCand        fudge factor for total candidates (default $fbOptions->{multTot})

Environment variables used: FASTHMM_DIR and TMPDIR
END_USAGE

# Global options
#
# Parse the command line, then overlay every value the user supplied on top
# of the defaults held in $fbOptions.  $opts stays in scope for the main
# block below; $nonOpts is not used further in this script.
my ($opts,$nonOpts) = Args::getArgs(
    "+i:|+f:|l:|L:|F:|D|D2|t|n:|z:|e:|m:|T:|nofb|nth:|nDom:|minBits:|fPerDom:|fCand:|missok",
    @ARGV, -1, $usage);

# Command-line switch => key of $fbOptions that it overrides.  The entries
# are independent of one another, so the iteration order does not matter.
my %fbKeyFor = (
    'F'       => 'F',
    'z'       => 'z',
    'm'       => 'm',
    'e'       => 'e',
    'nth'     => 'nthTopHits',
    'nDom'    => 'nOverlappingDom',
    'minBits' => 'minBits',
    'fPerDom' => 'multPerDomain',
    'fCand'   => 'multTot',
);
foreach my $switch (keys %fbKeyFor) {
    next unless exists $opts->{$switch};
    $fbOptions->{ $fbKeyFor{$switch} } = $opts->{$switch};
}

# -missok is a plain flag: its presence alone turns the setting on.
$fbOptions->{missok} = 1 if exists $opts->{missok};

# -D2 takes precedence over -D; with neither, the debug setting is untouched.
my $debugLevel = $opts->{D2} ? 2
               : $opts->{D}  ? 1
               :               0;
$fbOptions->{debug} = $debugLevel if $debugLevel;

# Temporary directory: an explicit -T wins; otherwise $TMPDIR is used when it
# names an existing directory; otherwise SetTmpDir is not called at all.
my @tmpDirChoice = ();
if (exists $opts->{T}) {
    @tmpDirChoice = ( $opts->{T} );
}
elsif (exists $ENV{TMPDIR} && -d $ENV{TMPDIR}) {
    @tmpDirChoice = ( $ENV{TMPDIR} );
}
SetTmpDir( $tmpDirChoice[0] ) if @tmpDirChoice;

# Main function
#
# 1. Collect the query ids from -l (comma-delimited) or -L (one per line).
# 2. Check that -f looks like a formatted BLAST protein database.
# 3. Work out how many top hits to request (-n, or database size / nth,
#    with a floor of 10).
# 4. Call FastBLAST() once per query, then CleanupDomains().
# 5. Print the timings returned by GetTimes() to standard error.
#
# Dies with a message if no ids are given, the id file cannot be read,
# the database files are missing, or -nth is not a positive number.
{
    # Gather the ids exactly as the user supplied them.
    my @rawIds = ();
    if (exists $opts->{l}) {
        @rawIds = split /,/, $opts->{l};
    } elsif (exists $opts->{L}) {
        # Lexical filehandle instead of a bareword global; report the OS error.
        open(my $listFh, "<", $opts->{L}) || die "Cannot read $opts->{L}: $!";
        while (my $line = <$listFh>) {
            chomp $line;
            push @rawIds, $line;
        }
        close($listFh) || die "Error reading $opts->{L}: $!";
    } else {
        die "Please specify -l or -L\n$usage\n";
    }

    # Normalize the ids: trim surrounding whitespace (this also removes the
    # carriage return of CRLF files), drop a leading "lcl|" prefix, and skip
    # anything that ends up empty (blank lines, "a,,b", a trailing comma) so
    # that no empty query is ever handed to FastBLAST().
    my @list = ();
    foreach my $id (@rawIds) {
        $id =~ s/^\s+//;
        $id =~ s/\s+$//;
        $id =~ s/^lcl[|]//;
        push @list, $id if $id ne "";
    }
    if (scalar(@list) == 0) {
        die "Please specify at least one locus\n";
    }

    # A formatted database has a .pin index, or .00.pin when split in volumes.
    die "Not a BLAST database: $opts->{f}" unless -e "$opts->{f}.pin" || -e "$opts->{f}.00.pin";

    # Without -n the hit count is derived by dividing by nthTopHits, so reject
    # a zero/negative/missing value up front instead of failing later with
    # "Illegal division by zero" after the domain data has been initialized.
    my $nTopHits = $opts->{n};
    if (!defined $nTopHits) {
        my $nth = $fbOptions->{nthTopHits};
        die "-nth must be a positive number\n" unless defined $nth && $nth > 0;
    }

    InitDomains($opts->{i});

    if (!defined $nTopHits) {
        my $dbSize = FetchNSequences($opts->{i});
        $nTopHits = int($dbSize/$fbOptions->{nthTopHits});
        $nTopHits = 10 if $nTopHits < 10;    # never ask for fewer than 10 hits
    }

    if ($fbOptions->{debug}) {
        print STDERR "topHomologs.pl: finding top $nTopHits hits to $opts->{f} using data directory $opts->{i}\n";
        print STDERR "Heuristics: nDom $fbOptions->{nOverlappingDom} minBits $fbOptions->{minBits}"
            . " fPerDom $fbOptions->{multPerDomain} fCand $fbOptions->{multTot}\n";
    }

    # One FastBLAST() call per query.  The trailing undef is passed through
    # unchanged from the original call; its meaning is defined in FastBLAST.pm.
    foreach my $query (@list) {
        print STDERR "Query:\t$query " . localtime() . "\n" if $fbOptions->{debug};
        FastBLAST($query, $opts->{f}, $nTopHits, undef);
        print STDERR "Finished query $query " . localtime() . "\n\n" if $fbOptions->{debug};
    }
    CleanupDomains();

    # GetTimes() returns a hash reference of name => number; print each entry
    # and, when there is at least one, their sum.
    my $times = GetTimes();
    my $tot = 0;
    foreach my $key (sort keys %$times) {
        printf STDERR "Time\t%s\t%.3f\n", $key, $times->{$key};
        $tot += $times->{$key};
    }
    printf STDERR "Time\tTotal\t%.3f\n", $tot if scalar(keys %$times) > 0;
}


