#!/usr/bin/perl -w

#  $Id: fastBlastPrepare.pl,v 1.14 2008/08/07 18:53:16 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Sets up a fastBlast directory and Makefile
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use Getopt::Long;
use Cwd;

# Default number of pieces to split the work into; the -pieces option
# overwrites this value.
my $nPieces = 200;

# Usage message shown when the command line cannot be parsed or required
# arguments are missing. The default piece count is interpolated into it.
my $usage = <<"END_USAGE";
Usage: fastBlastPrepare.pl -faa input.faa [ -o fastblast_directory ]
                     [-pieces nPieces] [-c configurationFile]
		     [-T tmpdir]
 		hmmhits files
The domain files should be the result.*.hmmhits from FastHMM and from COG.
input.faa should be a BLAST protein database formatted with -o T
fastblast_directory is where all the FastBlast files will be made.
It defaults to the current directory
nPieces defaults to $nPieces
configurationFile defaults to FASTHMM_DIR/conf/fastBlastMake.conf
FASTHMM_DIR should be set unless you are in that directory
END_USAGE

{
    # Main program.
    #
    # 1. Locate the FastHMM installation and parse the command line.
    # 2. Validate the input fasta file and confirm, via fastacmd, that it
    #    is a BLAST database whose last sequence has the expected length.
    # 3. Validate every hmmhits file and concatenate them into
    #    <outDir>/allhmm.
    # 4. Create <outDir>/fb and write <outDir>/Makefile.fastblast.
    #
    # Any validation failure terminates the script with die.

    my $fasthmmDir = exists $ENV{FASTHMM_DIR} ? $ENV{FASTHMM_DIR} : Cwd::cwd();
    die "FASTHMM_DIR is set to non-existent directory $fasthmmDir" unless -d $fasthmmDir;
    die "FASTHMM_DIR environment variable not set or incorrect:\nCannot find $fasthmmDir/bin/fastacmd"
        unless -e "$fasthmmDir/bin/fastacmd";

    my $configFile = undef;
    my $faaFile = undef;
    my $outDir = ".";
    my $tmpdir = undef;

    GetOptions('-faa=s'=>\$faaFile,
               '-o=s'=>\$outDir,
               '-pieces=i' => \$nPieces,
               '-c=s' => \$configFile,
               '-T=s' => \$tmpdir)
        || die $usage;
    my @hmmfiles = @ARGV;

    die "No fasta file specified:\n$usage" unless defined $faaFile;
    die "No such file $faaFile" unless -e $faaFile;
    if (!defined $configFile) {
        $configFile = "$fasthmmDir/conf/fastBlastMake.conf";
        if (! -e $configFile && !exists $ENV{FASTHMM_DIR}) {
            die "Set the FASTHMM_DIR environment variable or specify the configuration file:\n$usage";
        }
    } elsif (! -e $configFile) {
        die "Configuration file $configFile not found\n";
    }
    die "pieces argument $nPieces should be at least 1" if $nPieces < 1;
    if ($nPieces >= 2000) {
        print STDERR "Warning: pieces is over 2000, command lines could be too long\n";
    }
    die "No hmmhits files specified:\n$usage" if scalar(@hmmfiles) == 0;
    die "No such directory $outDir" unless -d $outDir;
    print STDERR "Warning: No such directory $tmpdir (specified with -T)\n"
        if defined $tmpdir && ! -d $tmpdir;

    # Verify the input fasta file and record the length of every sequence
    my %faaLen = (); # id in fasta file to length
    my $lastName = "";
    open(my $faa, "<", $faaFile) || die "Cannot read $faaFile: $!";
    while (my $line = <$faa>) {
        if ($line =~ m/^>/) {
            die "Cannot parse $line in $faaFile" unless $line =~ m/^>(\S+)/;
            my $name = $1;
            die "Duplicate entry $name in $faaFile" if exists $faaLen{$name};
            $faaLen{$name} = 0;
            $lastName = $name;
            if ($lastName =~ m/[|]/) {
                die "FastBLAST does not allow sequence identifiers that contain '|' characters: $lastName\n";
            }
        } else {
            chomp $line;
            die "$faaFile is not a fasta file" unless $lastName ne "";
            die "Invalid line $line in $faaFile" unless $line =~ m/^[A-Za-z*-]+$/ || $line eq "";
            $faaLen{$lastName} += length($line);
        }
    }
    die "$faaFile is empty" unless $lastName ne "";
    close($faa) || die "Error reading $faaFile";
    print STDERR "Checked $faaFile -- OK -- last name is $lastName\n";

    # Check for the commands that fast-blast needs (warn only)
    foreach my $tool (qw{cat cp cut mv rm sort formatdb blastall cd-hit clstr_rev.pl
                         DomReduce DomSearch}) {
        my $exe = "$fasthmmDir/bin/$tool";
        print STDERR "Warning: executable $exe required by FastBLAST is missing\n" unless -x $exe;
    }

    # Use fastacmd to verify that this is a database: fetch the last
    # sequence and compare its length with what we read from the fasta file.
    my $fastaSpec = $lastName;
    $fastaSpec = "lcl|".$lastName if $fastaSpec =~ m/^\d+$/;
    # Use $fasthmmDir rather than $ENV{FASTHMM_DIR}: the environment variable
    # may be unset when we fell back to the current directory above.
    my @fastacmd = ("$fasthmmDir/bin/fastacmd", "-s", $fastaSpec, "-d", $faaFile, "-p", "T");
    # Human-readable form of the command, used only in error messages
    my $cmd = "$fasthmmDir/bin/fastacmd -s '$fastaSpec' -d $faaFile -p T";
    # List-form pipe open bypasses the shell, so file names containing
    # spaces or shell metacharacters are passed through unchanged.
    open(my $fastacmd, "-|", @fastacmd)
        || die "Cannot run $cmd: $!";
    my $len = 0;
    while (my $line = <$fastacmd>) {
        next if $line =~ m/^>/;
        chomp $line;
        $len += length($line);
    }
    close($fastacmd) || die "Error running\n$cmd\n -- is $faaFile formatted with -o T ?";
    die "Inconsistent lengths for $lastName -- $len vs. $faaLen{$lastName} -- in $faaFile\n"
        . " You should rerun formatdb?\n"
        unless $len == $faaLen{$lastName};

    # Validate the hmmhits files and concatenate them into allhmm
    my $allhmm = "$outDir/allhmm";

    open(my $allhmmFh, ">", $allhmm) || die "Cannot write to $allhmm: $!";
    my %domSeen = (); # domains whose run of lines has already started
    foreach my $file (@hmmfiles) {
        die "File $file not found\n" unless -e $file;
    }
    foreach my $file (@hmmfiles) {
        open(my $hits, "<", $file) || die "Cannot read $file: $!";
        print STDERR "Checking hmmhits input file $file\n";
        my $lastDom = "";
        while (my $line = <$hits>) {
            print {$allhmmFh} $line;
            my @F = split /\t/, $line;
            die "Cannot parse domain line $line in $file" unless @F >= 10;
            if ($F[0] ne $lastDom) {
                # All lines for a domain must be contiguous; record each
                # domain when its run starts so a later reappearance
                # (in this or another file) is detected.
                die "Domain files not sorted? Domain $F[0] seen more than once in $file"
                    if exists $domSeen{$F[0]};
                $domSeen{$F[0]} = 1;
                $lastDom = $F[0];
            }
            if (!exists $faaLen{$F[1]}) {
                die "Unknown gene $F[1] appears in domain $lastDom in  file $file";
            }
            my $seqLen = $faaLen{$F[1]};
            my $beg = $F[2];
            my $end = $F[3];
            chomp $end;
            die "Illegal begin $beg for line\n$line\nin file $file" unless $beg =~ m/^\d+$/ && $beg >= 1;
            die "Illegal end $end for line\n$line\nin file $file" unless $end =~ m/^\d+$/ && $end >= 1;
            die "Illegal end for line $line\nin file $file for sequence of length $seqLen"
                unless $beg <= $end && $end <= $seqLen;
        }
        close($hits) || die "Error reading $file";
        if ($lastDom eq "") {
            print STDERR "Warning: $file is empty\n";
        }
    }
    close($allhmmFh) || die "Error writing to $allhmm";
    print STDERR "Finished checking the hmmhits files and wrote $allhmm\n";

    # Working directory used by the FastBLAST makefile
    my $fbDir = "$outDir/fb";
    if (! -d $fbDir) {
        mkdir($fbDir) || die "Cannot make directory $fbDir: $!";
    }

    # Write the makefile: header and settings, then the configuration file
    # verbatim, then an optional TMPDIR override, then the include of the
    # shared rules.
    my $makefile = "$outDir/Makefile.fastblast";
    open(my $make, ">", $makefile) || die "Cannot write $makefile: $!";
    my $pieceSpec = join(" ", (1..$nPieces));

    print {$make} <<END;

# FastBlast control file to work with GNU make and parallel execution
# (e.g., make or qmake with the -j option)
#
# On a machine with 4 CPUs, we recommend
#	cd $outDir; make -f Makefile.fastblast -j 4 all

# To run on a cluster with sun grid engine, with up to 100
# jobs at once, use
#	export FASTHMM_DIR=$fasthmmDir
#	cd $outDir; qmake -f Makefile.fastblast -cwd -v PATH,FASTHMM_DIR -- -j 100 all

# To delete all FastBLAST files and start over, use
#	make -f Makefile.fastblast clean

# To run on a cluster without sun grid engine, we recommend
#	cd $outDir
#	make -f Makefile.fastblast -j 4 reduce1
#	make -f Makefile.fastblast -n blast1 > cmdlist1
#	cmdlist1: run these commands on your cluster and wait for them to finish
#	make -f Makefile.fastblast -j 4 reduce2
#	make -f Makefile.fastblast -n blast2 >  cmdlist2
#	cmdlist2: run these commands on your cluster and wait for them to finish
#	make -f Makefile.fastblast -j 4 expand
#	make -f Makefile.fastblast -n blast3 >  cmdlist3
#	cmdlist3: run these commands on your cluster and wait for them to finish
#	make -f Makefile.fastblast all
#
# When you run the commands in cmdlist1, cmdlist2, or cmdlist3 on your cluster,
# remember to set the FASTHMM_DIR and the working directory.


# The fasthmm directory
FASTHMM_DIR=$fasthmmDir

# The input fasta file
FAA=$faaFile

# How finely to split up the work
NPIECES = $nPieces
PIECES = $pieceSpec

END

    open(my $conf, "<", $configFile) || die "Cannot read $configFile: $!";
    while (my $line = <$conf>) {
        print {$make} $line;
    }
    close($conf) || die "Error reading $configFile";

    if (defined $tmpdir) {
        print {$make} <<END;

# Override default temporary directory
TMPDIR = $tmpdir

END
    }

    print {$make} <<END;

# The actual specifications for what to do
export FASTHMM_DIR
include \$(FASTHMM_DIR)/lib/fastblast.make
END

    close($make) || die "Error writing $makefile";

    # After "cd $outDir" the makefile is reachable by its bare name;
    # $makefile itself already carries the $outDir/ prefix.
    print STDERR <<END;
Wrote $makefile
fastBlastPrepare.pl finished -- to actually run FastBlast, use
	cd $outDir; make -f Makefile.fastblast
Use make -j 4 to run in parallel or qmake to run on a SunGridEngine cluster
For other job schedulers, see $makefile for advice
END
}
    
