#!/usr/bin/perl -w

#  $Id: COGHits.pl,v 1.4 2008/08/02 01:06:15 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Create COG hits, formatted like FastHMM hits, for input to FastBLAST
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use Getopt::Long;
use Cwd;

# Defaults for -e and -dir.  Both are file-scoped because the usage text
# below interpolates them and the main block overrides them via GetOptions.
my $eval = 1e-5;
my $piece_dir = "pieces";

# Help text, printed via die() whenever the command line is unusable.
# It is built once, here, so it always shows the built-in defaults.
my $usage = <<"END_USAGE";
Usage: COGHits.pl -faa input.faa [ -o output ]
                     [-e $eval] [-rpsdb FASTHMM_DIR/db/cog/Cog]
                     [-pieces nPieces] [-CPU 1]
                     [-dir $piece_dir]
input.faa should be a protein FASTA file
Use -rpsdb to specify the rps database you want to use
FASTHMM_DIR should be set unless you are in that directory
Use -pieces N to generate N parallel commands instead of running
\trpsblast directly -- writes to ./cogrps.cmds and cogrps.finish.cmd
Use -dir to store the piece files in the specified subdirectory
(defaults to "pieces" -- will make it if need be).
END_USAGE

# Main program.  Runs in one of three modes, chosen by -pieces / -merge:
#   direct (neither option): run rpsblast on the whole input into
#                            $outfile.raw, then convert that file;
#   split  (-pieces N):      split the input into N pieces and write the
#                            per-piece commands to cogrps.cmds and a finish-up
#                            command to cogrps.finish.cmd (rpsblast is not run);
#   merge  (-merge N):       convert the N per-piece result files into $outfile
#                            (this is the command cogrps.finish.cmd contains).
# "Convert" means: swap the first two tab-separated columns of every hit line
# and pipe the result through sort into $outfile.
# Dies with the usage text on a bad command line, and with a specific message
# on any missing file, directory, executable, or failed command.
{
    my $nPieces = 0;
    my $faa = undef;
    my $outfile = undef;
    my $CPU = 1;
    my $basedir = exists $ENV{FASTHMM_DIR} ? $ENV{FASTHMM_DIR} : Cwd::cwd();
    my $rpsdb = "$basedir/db/cog/Cog";
    my $merge = 0;

    (GetOptions('faa=s' => \$faa,
                'o=s' => \$outfile,
                'pieces=i' => \$nPieces,
                'e=f' => \$eval,
                'rpsdb=s' => \$rpsdb,
                'CPU=i' => \$CPU,
                'merge=i' => \$merge,
                'dir=s' => \$piece_dir)
     && @ARGV == 0)
        || die $usage;
    die $usage unless defined $faa;
    # Reject combinations that previously fell through every branch and did
    # nothing (or, for a negative -merge, wrote an empty output file).
    die "-pieces and -merge must not be negative\n" if $nPieces < 0 || $merge < 0;
    die "Cannot use -pieces together with -merge\n" if $nPieces > 0 && $merge > 0;
    die "No such file: $faa" unless -e $faa;
    # The database is only needed when rpsblast will be run, not when merging.
    die "No such database: $rpsdb.rps" unless -e "$rpsdb.rps" || $merge;
    die "No such directory: $basedir -- did you set FASTHMM_DIR" unless -d $basedir;
    my $bin = "$basedir/bin";
    die "No such executable: $bin/rpsblast" unless -x "$bin/rpsblast";
    die "No such executable: $bin/parseBlast.pl" unless -x "$bin/parseBlast.pl";
    die "No such executable: $bin/sort" unless -x "$bin/sort";

    if (!defined $outfile) {
        # Default output name: input file name without directory or extension.
        my $base = $faa;
        $base = $1 if $base =~ m!/([^/]+)$!;
        $base = $1 if $base =~ m!^(.*)[.][^.]+$!;
        $outfile = "result.$base.cogrps";
    }
    print STDERR "COG hits will be written to $outfile\n";

    if ($nPieces == 0 && $merge == 0) {
        # Direct mode: one rpsblast run over the whole input.
        # NOTE: system() only sees the exit status of the last pipeline stage
        # (sort), so a failing rpsblast is not necessarily detected here.
        my $blastcmd = "$bin/rpsblast -a $CPU -i $faa -d $rpsdb -e $eval";
        my $parsecmd = "$bin/parseBlast.pl | LC_ALL=C sort -k 2";
        my $cmd = "$blastcmd | $parsecmd > $outfile.raw";
        print STDERR "Running: $cmd\n";
        system("$cmd") == 0 || die "Error running $cmd";
    } elsif ($nPieces > 0) {
        # Split mode: prepare the pieces and the command files.
        if (! -d $piece_dir) {
            mkdir($piece_dir) || die "Cannot make pieces subdirectory $piece_dir: $!\n";
        }
        die "No such executable: $bin/splitFasta.pl" unless -x "$bin/splitFasta.pl";
        # create pieces/cogrps.*.faa for 1 to nPieces
        my $cmd = "$bin/splitFasta.pl $faa $nPieces $piece_dir/cogrps faa";
        system("$cmd") == 0 || die "Error running $cmd";

        # The generated commands use absolute piece paths.  Only prefix the
        # current directory when -dir was relative, so that an absolute -dir
        # still names the directory splitFasta.pl just wrote to.
        my $dir = Cwd::cwd();
        my $abs_piece_dir = $piece_dir =~ m!^/! ? $piece_dir : "$dir/$piece_dir";

        my $cmdfile = "cogrps.cmds";
        open(my $cmd_fh, ">", $cmdfile) || die "Cannot write to $cmdfile: $!";
        foreach my $n (1..$nPieces) {
            print $cmd_fh "$bin/rpsblast -a $CPU -i $abs_piece_dir/cogrps.$n.faa -d $rpsdb -e $eval"
                . " | $bin/parseBlast.pl > $abs_piece_dir/cogrps.$n.rpsblast\n";
        }
        close($cmd_fh) || die "Error writing to $cmdfile: $!";
        print STDERR "Wrote $nPieces parallel commands to $cmdfile\n";

        # The finish-up command must name the piece directory explicitly;
        # otherwise the merge run falls back to the default "pieces" and
        # cannot find the per-piece results when -dir was given.
        my $cmdfile2 = "cogrps.finish.cmd";
        open(my $finish_fh, ">", $cmdfile2) || die "Cannot write to $cmdfile2: $!";
        print $finish_fh "$bin/COGHits.pl -faa $faa -merge $nPieces -dir $abs_piece_dir -o $outfile\n";
        close($finish_fh) || die "Error writing to $cmdfile2: $!";
        print STDERR "Wrote finish-up commands to $cmdfile2\n";
    }

    if ($nPieces == 0) {
        # convert the $outfile.raw file or the cogrps.*.rpsblast files
        my @files = ();
        if ($merge == 0) {
            push @files, "$outfile.raw";
        } else {
            foreach my $n (1..$merge) {
                push @files, "$piece_dir/cogrps.$n.rpsblast";
            }
        }
        # Check for files before we overwrite outfile!
        foreach my $file (@files) {
            die "No such file: $file" unless -e $file;
        }
        open(my $out_fh, "|-", "LC_ALL=C $bin/sort > $outfile")
            || die "Cannot run $bin/sort to $outfile: $!";
        foreach my $file (@files) {
            open(my $in_fh, "<", $file) || die "Cannot read $file: $!";
            my $lines = 0;
            while (my $line = <$in_fh>) {
                chomp $line;
                my @F = split /\t/, $line;
                die "Wrong number of columns in $file:\n$line\nExpect 10" unless @F >= 10;
                # Swap the first two columns; keep columns 3-10 as they are.
                print $out_fh join("\t",$F[1],$F[0],@F[2..9])."\n";
                $lines++;
            }
            close($in_fh) || die "Error reading $file";
            print STDERR "Warning: hits file $file is empty\n" if $lines == 0;
        }
        # close() on a pipe also reports a non-zero exit from sort.
        close($out_fh) || die "Error running $bin/sort into $outfile";
        print STDERR "Wrote $outfile\n";
    }
}
