#!/usr/bin/perl -w

#  $Id: cd-hit-para-make.pl,v 1.1 2008/05/20 17:18:28 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Creates a makefile for running cd-hit in parallel
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

use strict;
use Getopt::Long;
use Cwd;

# Defaults for the tunable options; each can be overridden on the command line.
my ($nPieces, $n, $c, $M) = (30, 4, 0.65, 2000);

# Help text printed when the command line cannot be parsed.  It is assembled
# one line at a time so that the leading tabs/spaces of the option summary are
# explicit; the current defaults are interpolated into that summary.
my $usage = join("\n",
    'Usage: cd-hit-para-make.pl -in input.faa -out output.faa',
    '        [ -d work_directory ]',
    '        [-pieces nPieces]',
    "\t[-n $n] [-c $c] [-M $M]",
    "\t[-bin cd_hit_binaries_directory]",
    '',
    "nPieces defaults to $nPieces -- the number of temporary files grows as",
    'the square of nPieces, so much larger values are not recommended',
    '',
    'work_directory defaults to the current directory',
    '',
    'If -bin is not used, looks in $FASTHMM_DIR/bin and ~/bin',
    '',
    '-n, -c, and -M are passed to cd-hit and cd-hit-2d',
    '',
    'The makefile is written to work_directory/Makefile.cdhit',
    '',
    'Running the makefile, e.g. with make -f Makefile.cdhit, will',
    'create output.faa and output.faa.clstr',
    '',    # trailing empty element makes the text end with a newline
);

# Main program: parse the command line, sanity-check the binaries directory,
# the output path and the scratch directory, then write
# work_directory/Makefile.cdhit.
#
# Reads the file-level defaults $nPieces, $n, $c and $M (overridable through
# the options of the same names) and $usage (printed when parsing fails).
# Dies on any unusable argument or unwritable location; a missing executable
# only produces a warning on STDERR.
{

    my $faaFile = undef;
    my $out = undef;
    my $bin = undef;
    my $dir = ".";

    my $parsedOk = GetOptions('-in=s' => \$faaFile,
                              '-out=s' => \$out,
                              '-dir=s' => \$dir,
                              '-n=i' => \$n,
                              '-c=f' => \$c,
                              '-M=i' => \$M,
                              '-bin=s' => \$bin,
                              '-pieces=i' => \$nPieces);
    # Leftover positional arguments are treated as a usage error too.
    die $usage unless $parsedOk && @ARGV == 0;
    die "Must specify input fasta file\n" unless defined $faaFile;
    die "Must specify output file\n" unless defined $out;

    die "-pieces must be at least 2" if ($nPieces <= 1);

    # Without -bin, fall back to $FASTHMM_DIR/bin, or ~/bin when FASTHMM_DIR
    # is not set.
    if (!defined $bin) {
        if (exists $ENV{FASTHMM_DIR}) {
            $bin = "$ENV{FASTHMM_DIR}/bin";
        }
        elsif (defined $ENV{HOME}) {
            $bin = "$ENV{HOME}/bin";
        }
        else {
            # An unset HOME would otherwise interpolate to "/bin", which
            # exists and would be accepted silently.
            die "Neither FASTHMM_DIR nor HOME is set -- please use the -bin option\n";
        }
        if (! -d $bin) {
            die "Looking for executables in $bin and failed -- please use the -bin option";
        }
    }
    # Only warn: the makefile can still be written and the tools installed later.
    foreach my $exe ("cd-hit-div", "cd-hit", "cd-hit-2d", "clstr_merge.pl") {
        print STDERR "Warning: required executable $bin/$exe does not exist -- use the -bin option?\n"
            unless -x "$bin/$exe";
    }
    if (! -d $dir) {
        die "Specified working directory $dir is not a directory";
    }

    # Verify $out is OK by creating it and removing it again.
    # Note that this also removes any pre-existing file of that name.
    open(my $outFh, ">", $out)
        or die "Illegal -out option -- cannot write to $out: $!";
    close($outFh);
    unlink($out);

    # Verify $subDir is OK.  mkdir's result is deliberately ignored (the
    # directory may already exist); the write test below catches real problems.
    # NOTE(review): DIR in the makefile is the bare name "cdhitpara" while the
    # directory is created under $dir, so the makefile presumably has to be
    # run from inside $dir -- confirm.
    my $subDir = "cdhitpara";
    mkdir("$dir/$subDir");
    my $probe = "$dir/$subDir/testfile";
    open(my $probeFh, ">", $probe) or die "Cannot write to $probe: $!";
    close($probeFh);
    unlink($probe);

    my $piecesList = join(" ", 0 .. ($nPieces-1));
    my $makefile = "$dir/Makefile.cdhit";
    open(my $makeFh, ">", $makefile) or die "Cannot write to $makefile: $!";

    # Fixed preamble of the makefile.  Every make variable/automatic variable
    # is escaped (\$) so that only the Perl values are interpolated here.
    # Recipe lines must begin with a literal tab.
    print {$makeFh} <<"END";
BIN = $bin
IN = $faaFile
OUT = $out
DIR = $subDir

ARGS = -n $n -c $c -M $M

# Split into NSEG segments, and then for rounds 0 to NSEG-1, we
#	compare segment i to itself (seg-\$i.self)
#	for j>=i, compare seg-\$i.self to seg-\$j or seg-\$j.(\$i-1) to make seg-\$j.vs\$i
#	At end of round i, everything that can be clustered with a sequence in seg-\$i has been
#	We use clstr_merge.pl to combine the cluster membership of the pieces

NSEG = $nPieces
SEGS = $piecesList
SEGS_PRE = \$(addprefix \$(DIR)/seg-, \$(SEGS))

.PHONY: all
all: \$(OUT) \$(OUT).clstr

.PHONY: cdhit-clean
cdhit-clean:
	rm -f \$(DIR)/*

\$(OUT): \$(addsuffix .self, \$(SEGS_PRE))
	cat \$^ > \$@

\$(OUT).clstr: \$(addsuffix -clustr, \$(SEGS_PRE))
	cat \$^ > \$@

\$(SEGS_PRE): \$(DIR)/seg-div

\$(DIR)/seg-div: \$(IN)
	\$(BIN)/cd-hit-div -i \$(IN) -div \$(NSEG) -o \$(DIR)/seg && touch \$@

END

    # One group of rules per round $i.
    for my $i (0 .. $nPieces-1) {
        # suffix from the previous round (round 0 works on the raw segment)
        my $suffix = ($i==0) ? "" : ".vs".($i-1);

        # Cluster segment $i against itself.
        print {$makeFh} "\$(DIR)/seg-$i.self: \$(DIR)/seg-$i$suffix\n"
            . "\t\$(BIN)/cd-hit \$(ARGS) -i \$< -o \$@\n";

        if ($i < $nPieces-1) {
            # Compare every later segment (in its state after the previous
            # round) against seg-$i.self via a static pattern rule; \$< is
            # the first prerequisite, i.e. the later segment's input file.
            my $vsList = join(" ",($i+1)..($nPieces-1));
            print {$makeFh} "VS$i = \$(addprefix \$(DIR)/seg-, \$(addsuffix .vs$i, $vsList))\n";
            print {$makeFh} "\$(VS$i) : \%.vs$i : \%$suffix \$(DIR)/seg-$i.self\n"
                . "\t\$(BIN)/cd-hit-2d \$(ARGS) -i \$(DIR)/seg-$i.self -i2 \$< -o \$@\n";
            # Merge the cluster listings produced in this round.
            print {$makeFh} "\$(DIR)/seg-$i-clustr: \$(DIR)/seg-$i.self \$(VS$i)\n";
            print {$makeFh} "\t\$(BIN)/clstr_merge.pl \$(DIR)/seg-$i.self.clstr \$(addsuffix .clstr, \$(VS$i)) > \$@\n";
        } else {
            # Last segment: nothing left to compare against, so its cluster
            # listing is just a copy of the self-clustering output.
            print {$makeFh} "\$(DIR)/seg-$i-clustr: \$(DIR)/seg-$i.self\n";
            print {$makeFh} "\tcp \$<.clstr \$@\n";
        }
        print {$makeFh} "\n";
    }
    # Buffered write errors only surface at close, so check it.
    close($makeFh) or die "Error writing to $makefile: $!";
    print STDERR "Wrote $makefile -- use parallel make such as GNU make or qmake\n";
}
