#!/usr/bin/perl -w
#
#  $Id: cdhitReduce.pl,v 1.2 2007/10/16 23:20:20 whuang Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for reducing domain families using cd-hit
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use lib "$ENV{FASTHMM_DIR}/lib";
use File;

use strict;

my $bPrefix = shift;
my $thresh = shift;

die "usage: cdhitReduce [b file prefix] [seq threshold]"
	if ( !defined($bPrefix) );

$thresh = int($thresh)
	if ( defined($thresh) );
$thresh = 1000
	if ( !defined($thresh) || ($thresh < 10) );

my $bFile = $bPrefix . ".b";
my $seqFile = $bPrefix . ".seq";

my $cdhit = $ENV{FASTHMM_DIR} . "/bin/cd-hit";
my $mcdhit = $ENV{FASTHMM_DIR} . "/bin/mcd-hit";
my $getBSubset = $ENV{FASTHMM_DIR} . "/bin/getBSubset.pl";

if ( -e $bFile )
{
	local *IN;
	local *OUT;
	my $tmpPrefix = "/tmp/cdhit.$$";

	my $faStripped = $tmpPrefix . ".fa";
	open(IN, "<$bFile");
	my %seqs = ();
	my @seqNames = ();
	while(<IN>)
	{
		chomp;
		next if ( /^\s*$/ );
		my ( $id, $seq ) = split(/\s+/, $_, 2);
		if ( !exists( $seqs{$id} ) )
		{
			$seqs{$id} = $seq;
			push( @seqNames, $id );
		} else {
			$seqs{$id} .= $seq;
		}
	}
	close(IN);

	if ( scalar(@seqNames) < $thresh )
	{
		unlink( $faStripped );
		# do not cdhit reduce this db
		exit(0);
	}

	open(OUT, ">$faStripped");
	foreach my $id ( @seqNames )
	{
		my $seq = $seqs{$id};

		$seq =~ tr/-xX//d;
		print OUT File::formatFasta( $id, $seq, 60 );
	}
	close(OUT);

	# run cd-hit
	my $cd1 = $tmpPrefix . ".cd90";
	my $cmd = $cdhit . " -i $faStripped -o $cd1";
	system( $cmd );
	my $cd2 = $tmpPrefix . ".cd60";
	$cmd = $cdhit . " -n 4 -c 0.6 -i $cd1 -o $cd2";
	system( $cmd );
	my $cd3 = $tmpPrefix . ".cd45";
	$cmd = $cdhit . " -n 2 -c 0.45 -i $cd2 -o $cd3";
	system( $cmd );

	# filter b seqs
	$cmd = $getBSubset . " $bFile $cd3 >$bFile.new";
	system( $cmd );

	unlink( $bFile );
	rename( $bFile . ".new", $bFile );
	unlink( $faStripped );
	unlink( $cd1 );
	unlink( $cd1 . ".clstr" );
	unlink( $cd1 . ".bak.clstr" );
	unlink( $cd2 );
	unlink( $cd2 . ".clstr" );
	unlink( $cd2 . ".bak.clstr" );
	unlink( $cd3 );
	unlink( $cd3 . ".clstr" );
	unlink( $cd3 . ".bak.clstr" );

	my @seqEnts;
	my $numSeqs = 0;
	%seqs = ();
	@seqNames = ();
	open(IN, "<$bFile");
	while(<IN>)
	{
		chomp;
		next if ( /^\s*$/ );
		my ( $id, $seq ) = split(/\s+/, $_, 2);
		if ( exists( $seqs{$id} ) )
		{
			$seqs{$id} .= $seq;
		} else {
			$seqs{$id} = $seq;
			push( @seqNames, $id );
		}
	}
	close(IN);
	foreach my $id ( @seqNames )
	{
		my %seq = ( 'id' => $id, 'seq' => $seqs{$id} );
		push( @seqEnts, \%seq );
		$numSeqs++;
	}

	my $best = getMorganBest( \@seqEnts, $numSeqs );
	#create seed
	open(OUT, ">$seqFile");
	my $seq = $best->{seq};
	$seq =~ tr/-.//d;

	print OUT File::formatFasta( $best->{id}, $seq, 60 );
	close(OUT);
}


# This function adapted from Fasta2B.pl selects from a list of
# sequences the one with the least amount of gap columns from
# all columns having >= 60% non-gap characters
sub getMorganBest
{
        my $seqs = shift;
        my $numSeqs = shift;
        my $seqLen = length( $seqs->[0]->{seq} );

        $numSeqs = scalar( @{$seqs} )
                if ( !defined($numSeqs) || ( $numSeqs < 1 ) );
        return undef
                if ( $numSeqs < 1 );

        my %scores = ();
        for ( my $i = 0; $i < $seqLen; $i++ )
        {
                my $col = "";
                for ( my $j = 0; $j < $numSeqs; $j++ )
                {
                        $col .= substr( $seqs->[$j]->{seq}, $i, 1 );
                }

                my $gaps = $col =~ tr/-xX/-xX/;
                if ( $gaps < 0.6*$numSeqs )
                {
                        for ( my $j = 0; $j < $numSeqs; $j++ )
                        {
                                $scores{$j}++
                                        if ( substr( $col, $j, 1 ) !~ /^[-xX]$/ );
                        }
                }
        }

        my $bestIdx = 0;
        my $bestScore = 0;
        foreach my $i ( sort { $a <=> $b } keys( %scores ) )
        {
                if ( $scores{$i} > $bestScore )
                {
                        $bestIdx = $i;
                        $bestScore = $scores{$i};
                }
        }

        return $seqs->[$bestIdx];
}

