#!/usr/bin/perl -w
#
#  $Id: listBadHMMs.pl,v 1.4 2008/05/13 00:13:12 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Tool for identifying difficult HMMs
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use lib "$ENV{FASTHMM_DIR}/lib";
use File;
use Getopt::Long;

{
    my $datadir = $ENV{FASTHMM_DIR} . "/db";
    my $blastbin = $ENV{FASTHMM_DIR} . "/bin";
    my $hmmsearch = $ENV{FASTHMM_DIR} . "/bin/hmmsearch";

    $ENV{BLASTMAT} = $ENV{FASTHMM_DIR} . "/matrix"
	if ( !exists( $ENV{BLASTMAT} ) );

    my $tmpdir = "/tmp";
    my $nmodels = 0;
    my $limit = 0.01;
    my $numthreads = 1;
    my $db = undef;

    die "Usage: ListBadHMMs.pl -db db [-nmodels 0] [-limit $limit]\n"
	. "   [ -blastbin $blastbin ]\n"
	. "   [ -datadir $datadir ]\n"
	. "   [ -hmmsearch $hmmsearch ]\n"
	. "   [ -tmpdir $tmpdir ]\n"
	. "   [ -numthreads $numthreads ]\n" 
	. "   Note: uses --cut_tc if nomodels is 0, and -Z nmodels -e 0.02 otherwise\n"
	. "   Limit should be between 0.01 and 0.05. 1 miss on the seed sequences is always allowed.\n"
        . "   db should be a subdirectory of the data directory\n"
	unless GetOptions('db=s' => \$db,
			  'nmodels=i' => \$nmodels,
			  'limit=f' => \$limit,
			  'blastbin=s' => \$blastbin,
			  'datadir=s' => \$datadir,
			  'hmmsearch=s' => \$hmmsearch,
			  'tmpdir=s' => \$tmpdir,
			  'numthreads=i' => \$numthreads)
	&& @ARGV == 0
	&& defined $db;

    die "No tmp directory" unless -d $tmpdir;
    
    die "Invalid $limit" unless $limit > 0 && $limit < 0.5;
    die "Cannot find directory $datadir/$db" unless -e "$datadir/$db";
    die "Invalid hmmsearch argument" unless -e $hmmsearch;
    die "Invalid blastbin argument" unless -e "$blastbin/blastpgp";
    my $cutoff = $nmodels > 0 ? "-E 0.02 -Z $nmodels" : "--cut_tc";
    
    my @candidates = ();
    my $accList = $datadir . "/" . $db . "/.accList";
    local *IN;

    die "Could not find accession cache for $db; run fastHmm to build it first!"
	if ( !(-e $accList) );

    open(IN, "<$accList");
    print STDERR "Checking dataset for completeness ...\n";
    my $c = 0;
    while (<IN>) {
	chomp;
	my $acc = $_;
	#die "Cannot find .seq file for $acc" unless -e "$datadir/$db/$acc.seq";
	#die "Cannot find .hmm file for $acc" unless -e "$datadir/$db/hmm/$acc.hmm";
	#die "Cannot find .b file for $acc" unless -e "$datadir/$db/$acc.b";
	push @candidates, $acc
	    if ( ( -e "$datadir/$db/$acc.seq" ) &&
		( -e "$datadir/$db/hmm/$acc.hmm" ) &&
		( -e "$datadir/$db/$acc.b" ) );
        $c++;
        print STDERR " ... $c done\n"
	    if ( $c % 500 == 0 );
    }
    close(IN);
    print STDERR "Read " . scalar(@candidates) . " accessions and checked for files\n";
    
    my $slots = $numthreads;
    my $numAcc = scalar(@candidates);
    my $parentPid = $$;
    for ( my $i = 0; $i < $numAcc; $i++ ) {
	my $acc = $candidates[$i];
	if ( $slots > 0 )
        {
	    my $childPid = fork();
	    if ( defined($childPid) )
	    {
		if ( $childPid == 0 )
		{
		    # do analysis (child proc)

		    # First, make .faa file for hmmsearch
		    my $tmp = "$tmpdir/$parentPid.ListBadHMMs.$$";
		    my $allfile = "$tmp.all";
		    my %seqs = ();
	
		    open(SEQ,"<","$datadir/$db/$acc.b") || die "Error reading .b file";
		    while(<SEQ>) {
			chomp;
			next if $_ eq "";
			die $_ unless m/^(\S+) +([^ ]+)$/;
			my ($name,$subseq) = ($1,$2);
			$seqs{$name} = "" unless exists $seqs{$name};
			$seqs{$name} .= $subseq;
		    }
		    close(SEQ) || die;
		    if (keys(%seqs) == 0) {
			print STDERR "Sequence file for $acc is empty! Assigning it to the problematic list\n";
			open(OUT, ">$tmpdir/$parentPid.badhmm.$acc");
			close(OUT);
			exit(0);
		    }
		    my $alnlen = length((values %seqs)[0]);
		    while (my ($name,$seq) = each %seqs) {
			if (length($seq) != $alnlen) {
			    print STDERR "Warning: seed $name has length ".length($seq)." not $alnlen for acc $acc -- skipping this HMM\n";
			    last;
			    $alnlen = 0;
			}
		    }
		    next if $alnlen == 0;

		    open(OUT,">",$allfile) || die "Error writing $allfile";
		    while (my ($name,$seq) = each %seqs) {
			$seq =~ s/[-]//g;
			print OUT File::formatFasta( $name, uc($seq) );
		    }
		    close(OUT) || die "Error writing $allfile";

		    my @tc = ();
		    open TC, "$hmmsearch --informat FASTA $cutoff $datadir/$db/hmm/$acc.hmm $allfile | $ENV{FASTHMM_DIR}/bin/parseHmmGenes.pl $acc |";
		    while(<TC>) {
			chomp;
			die $_ unless m/^(\S+)\t(\S+)\t/ && $1 eq $acc;
			die "Unknown match $2" unless exists $seqs{$2};
			push @tc, $2;
		    }
		    close(TC);

		    # skip if there are no hits to the hmm
		    if ( scalar(@tc) == 0 )
		    {
			unlink( $allfile );
			exit(0); # from fork
		    }
	
		    # Rewrite all file with just the matching ones
		    open(OUT,">",$allfile) || die "Error writing $allfile";
		    foreach my $name (@tc) {
			my $seq = $seqs{$name};
			$seq =~ s/[-]//g;
			print OUT File::formatFasta( $name, uc($seq) );
		    }
		    close(OUT) || die "Error writing $allfile";

		    system("$blastbin/formatdb -i $allfile -p T"); # ignore errors
		    open PGP, "$blastbin/blastpgp -d $allfile -i $datadir/$db/$acc.seq -B $datadir/$db/$acc.b"
			. " -e 10 -m 8 -v 1000000 -b 1000000 -z 1e8 -Y 1e8 2>/dev/null |";
		    my %hits = (); # note name of hit sequence may be garbled; just count # of them
		    while(<PGP>) {
			chomp;
			my @F = split /\t/, $_;
			$hits{$F[1]} = 1;
		    }
		    print STDERR "acc $acc nSeed " . scalar(keys %seqs) . " nAboveTC " . scalar(@tc)
			. " nPGP " . scalar(keys %hits) . " seed-alignment-length $alnlen\n";
		    # For small seed alignments, >1 missing sequence is trouble
		    # For large seed alignments, up to 5% FP is acceptable

		    if ( (scalar(keys %hits) < scalar(@tc) - 1) &&
			(scalar(keys %hits) < (1-$limit) * scalar(@tc)) )
		    {
			open(OUT, ">$tmpdir/$parentPid.badhmm.$acc");
			close(OUT);
		    }

		    unlink( $allfile );
		    unlink( $allfile . ".phr" );
		    unlink( $allfile . ".pin" );
		    unlink( $allfile . ".psq" );
		    exit(0);
		}
	    } else {
		die "[error] unable to create child process to run analysis!";
	    }

	    # one slot used
	    $slots--;
	} else {
	    # no slots available; wait for a child proc to die
	    $i--;
	    my $childPid = wait();
	    die "Child process died: $?" if ($? & 255) != 0 && $childPid > 0;
	    if ( $childPid > 0 )
	    {
		$slots++;
	    } elsif ( $childPid < 0 )
	    {
		$slots = $numthreads;
	    }
	}
    }

    # wait for all child procs
    my $childPid = 0;
    do
    {
	$childPid = wait();
	die "Child process died: $?" if ($? & 255) != 0 && $childPid > 0;
    } until ( $childPid < 0 );

    # now merge
    my $files = File::getFilteredFileList( $tmpdir, [ "^$parentPid\\.badhmm\\..+\$" ] );
    my $hardFile = $datadir . "/" . $db . "/$db.hard.list";
    open(OUT, ">$hardFile");
    foreach my $file ( @{$files} )
    {
	my $baseFile = (split(/\//, $file))[-1];
	if ( $baseFile =~ /^\d+\.badhmm\.(.+)$/ )
	{
	    print OUT $1, "\n";
	    unlink( $file );
	}
    }
    #foreach (@bad) { print OUT "$_\n"; }
    close(OUT);
    print STDERR "$hardFile written; " . scalar(@${files}) . " hard HMMs added\n";
}
