#!/usr/bin/perl -w
#
#  $Id: FastBLAST.pm,v 1.10 2009/01/28 01:37:18 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Perl module for accessing the FastBLAST output and searching
#  for top homologs
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

package FastBLAST;
require Exporter;
use strict;
use DB_File; # BerkeleyDB lightweight database
my $useHiRes;

BEGIN {
    eval { require Time::HiRes; };
    $useHiRes = 1 if $@ eq ""; # $@ is EVAL_ERROR
}

our @ISA = qw(Exporter);

our @EXPORT = qw(FetchSequences FetchSequencesFile InitDomains
		 CleanupDomains FetchDomains FetchAlignment
		 HasAlignment FetchNSequences SetTmpDir TmpFile
		 AlignmentToTopHits StartTimer EndTimer GetTimes
		 Options TopDomains FastBLAST);

sub Options(); # returns a reference to the FastBLAST options hash

# FastBLAST(locusId, database, nTopHits, outputfile)
# you must call InitDomains() first, and you
# should call CleanupDomains() when done to avoid very slow berkeley-DB cleanup
# If the output file is not specified, writes to STDOUT instead
sub FastBLAST($$$$);

sub FetchSequences($$); # fasta file and reference to list of names to hash
sub FetchSequencesFile($$); # fasta file and reference to list of names to hash -> temporary file
sub InitDomains($); # the fasthmm directory
sub CleanupDomains(); # the fasthmm directory

sub FetchDomains($); # sequence name -> reference to list of domains
# Each domain is [$domId,$geneId,$seqBeg,$seqEnd,$domBeg,$domEnd,$score]
# All positions are 1-based

# in: a reference to a list of domains for a gene
# out: a reduced subset that should be good enough to get all homologs
sub TopDomains($);

# in: domain name and optional file handle to write to
# out: reference to list of alignment lines, or, if writing to a filehandle, the number of lines written
sub FetchAlignment;

sub FetchNSequences($); # number of sequences in the database
sub TmpFile($); # suffix to name of the form /tmp/topHomologs$$.$nTmpFile.suffix
sub AlignmentToTopHits($$$$); # domainId, query, minScore, number of homologs
# returns a reference to a hash of subject -> score

# global performance counters
sub StartTimer($);
sub EndTimer($);
sub GetTimes(); # returns a reference to a hash

my $fbOptions = {
    # heuristics
    nthTopHits => 2000, # fraction of top hits
    nOverlappingDom => 2, # number of overlapping domains to use for each region (beyond COG/PFam)
    minBits => 20.0, # minimum bits to accept from alignment matches (DomSearch)
    multPerDomain => 2.0, # multiple desired candidates by this fudge factor for each domain
    multTot => 2.5, # multiple desired candidates by this fudge factor for total list sent to blast

    # blast settings
    z => "1e8",
    e => "1e-3",
    m => 8,
    F => "m S",

    # reporting/debugging settings
    missok => 0, # print error to stderr instead of failing if gene is unknown
    nofb => 0, # set this to turn off use of ad-hoc domains (for testing)
    debug => 0,
    
    };

sub Options() {
    return $fbOptions;
}

sub TopDomains($) {
    my ($domains) = @_;
    my ($DOM,$SEQ,$SEQBEG,$SEQEND,$DOMBEG,$DOMEND,$SCORE) = (0..6);

    # check that sequence values are consistent
    my %seq = map {$_->[$SEQ] => 1} @$domains;
    die join(" ", "Inconsistent sequences input to TopDomains:", keys %seq)
	if scalar(keys %seq) > 1;

    # Sort domains for query by score
    my @domSorted = sort {$b->[$SCORE] <=> $a->[$SCORE]} @$domains;
    if ($fbOptions->{debug}) {
	foreach (@domSorted) { print STDERR join("\t",@$_)."\n"; }
    }

    # Process in order to get a subset of domains (in %domUsed):
    # If overlaps >= $fbOptions->{nOverlappingDom} so far, may be redudant:
    # superfamily or gene3d or panther or ad-hoc fast-blast domains can always be redundant
    # COG can be redundant, but not the best COG
    # PFam_ls/Pfam_fs/TIGRFam/SMART/PIRSF are never redundant

    my @domUse = (); # All domains (the objects) except FastBLAST domains
    my @domUseAdhoc = (); # FastBlast domains (the objects)
    my @domIgnore = (); # domains ignored because of overlap rule (the names)
    my %domUsed = (); # All domains byname, only once
    my $COGseen = 0;

    foreach my $row (@domSorted) {
	my ($domId, $seqId, $seqBeg, $seqEnd, $domBeg, $domEnd, $score) = @$row;
	die unless defined $score;
	my $IsCOG = $domId =~ m/^(COG|gnl.CDD.)[0-9]+$/ ? 1 : 0;
	my $IsAdhoc = $domId =~ m/^fb\./ ? 1 : 0;
	next if $IsAdhoc && $fbOptions->{nofb}; # skip ad-hoc domains if using -nofb
	
	# Gene3D, Superfam, and Panther
	my $IsDisposable = $domId =~ m/^[0-9][a-z0-9][0-9a-zA-Z]+$/ || $domId =~ m/^PTHR/;

	my $ignore = 0;
	my $list = $IsAdhoc ? \@domUseAdhoc : \@domUse;
	if ($IsAdhoc || $IsDisposable || ($IsCOG && $COGseen)) {
	    # Ignore if has too many overlaps
	    my $nOverlap = 0;
	    foreach my $old (@$list) {
		my ($oldId,$oldGene,$oldBeg,$oldEnd,$oldDBeg,$oldDEnd,$oldScore) = @$old;
		my $maxBeg = $oldBeg < $seqBeg ? $seqBeg : $oldBeg;
		my $minEnd = $oldEnd > $seqEnd ? $seqEnd : $oldEnd;
		$nOverlap++ if $minEnd - $maxBeg + 1 > 0.5 * ($seqEnd-$seqBeg+1); # over half covered
	    }
	    $ignore = 1 if $nOverlap > $fbOptions->{nOverlappingDom};
	}
	if ($ignore) {
	    push @domIgnore, $domId;
	} else {
	    push @$list, $row;
	    $domUsed{$domId} = 1;
	    $COGseen = 1 if $IsCOG;
	}
    }

    if ($fbOptions->{debug}) {
	print STDERR "\tDomains:\t" . join("\t",map {$_->[$DOM]} @domUse) . "\n";
	print STDERR "\tAdHoc:\t" . join("\t",map {$_->[$DOM]} @domUseAdhoc) . "\n";
	print STDERR "\tIgnore:\t" . join("\t", @domIgnore)."\n";
    }

    my @out = @domUse;
    push @out, @domUseAdhoc;
    return \@out;
}

sub FastBLAST($$$$) {
    my ($query,$db,$nTopHits,$outfile) = @_;
    my $domains = FetchDomains($query);
    my $filtered = TopDomains($domains);

    # Given subset of domains, we choose nCandidatesPerDomain candidates for each
    my $nCandidatesPerDomain = int(0.5 + $fbOptions->{multPerDomain} * $nTopHits);
    my $nTotalCandidates = int(0.5 + $fbOptions->{multTot} * $nTopHits); # ditto

    my %maybeHits = ($query => 1e20); # candidateId => best score
    my %domUsed = map {$_->[0] => 1} @$filtered;
    foreach my $domId (keys %domUsed) {
	my $tophits = AlignmentToTopHits($domId,$query,$fbOptions->{minBits},$nCandidatesPerDomain);
	while (my ($candidate,$score) = each %$tophits) {
	    $maybeHits{$candidate} = $score
		unless exists $maybeHits{$candidate} && $maybeHits{$candidate} > $score;
	}
    }
    
    # Then we sort by best score and select the best $nTotalCandidates
    my @candidates = sort {$maybeHits{$b} <=> $maybeHits{$a}} (keys %maybeHits);
    if (@candidates > $nTotalCandidates) {
	$#candidates = $nTotalCandidates; # don't subtract one because of self
    }
    
    if ($fbOptions->{debug}) {
	print STDERR "From " . scalar(keys %maybeHits) . " total candidates to " . scalar(@candidates) . " " . localtime() . "\n";
	if ($fbOptions->{debug}>1) {
	    print STDERR join("\t","Total:",keys %maybeHits)."\n";
	    print STDERR join("\t","Filtered:",@candidates)."\n";
	}
    }
    
    # Then we run blast
    my $faafile = FetchSequencesFile($db, \@candidates);
    return if !defined $faafile; # it failed
    my $formatCmd = "$ENV{FASTHMM_DIR}/bin/formatdb -p T -i $faafile";
    StartTimer("formatdb");
    system($formatCmd) == 0 || print STDERR "formatdb may have failed: $? -- $formatCmd";
    EndTimer("formatdb");
    die "$formatCmd failed" unless -e "$faafile.pin";
    print STDERR "Running $formatCmd " . localtime() . "\n" if $fbOptions->{debug};
    
    my $infile = FetchSequencesFile($db, [$query]);
    my $opts = {F=>"m S",m=>0,z=>1e8,e=>1e-3};
    my @blastCmd = ("$ENV{FASTHMM_DIR}/bin/blastall",
		    "-F", $opts->{F},
		    "-p", "blastp",
		    "-d", $faafile,
		    "-m", $fbOptions->{m},
		    "-z", $fbOptions->{z},
		    "-e", $fbOptions->{e},
		    "-b", 100000000,
		    "-v", 100000000,
		    "-i", $infile);
    push @blastCmd, ("-o",$outfile) if defined $outfile;

    print STDERR "Running " . join(" ",@blastCmd) . " " . localtime() . "\n" if $fbOptions->{debug};
    StartTimer("blastp");
    system(@blastCmd) == 0 || print STDERR "blast may have failed for query $query: $? -- " . join(" ",@blastCmd) . "\n";
    EndTimer("blastp");

    if ($fbOptions->{debug} <= 1) {
	unlink($infile);
	unlink($faafile);
	foreach my $suffix (qw(phr pin psd psi psq)) {
	    unlink("$faafile.$suffix");
	}
    }
}


sub FetchSequencesFile($$) {
    my ($db,$list) = @_;

    # fetch loci and load their sequences
    my $listfile = TmpFile("list");
    local *LIST;
    open(LIST,">",$listfile) || die "Cannot write to $listfile";
    foreach my $name (@$list) {
	if ($name =~ m/^\d+$/) {
	    print LIST "lcl|".$name."\n";
	} else {
	    print LIST "$name\n";
	}
    }
    close(LIST) || die "Error writing to $listfile";

    my $binDir = $ENV{FASTHMM_DIR} . "/bin";
    my $seqsfile = TmpFile("faa");
    StartTimer("fastacmd");
    my $fastacmd = "$binDir/fastacmd -i $listfile -d $db -p T > $seqsfile";
    print STDERR "Running $fastacmd " . localtime() . "\n" if $fbOptions->{debug} > 0;
    if (system($fastacmd) != 0) {
	unlink($seqsfile);
	return undef if $fbOptions->{missok};
	die "Cannot run $fastacmd -- $?";
    }
    EndTimer("fastacmd");
    unlink($listfile) unless $fbOptions->{debug} > 1;

    return($seqsfile);
}

sub FetchSequences($$) {
    my ($db,$list) = @_;
    my $seqsfile = FetchSequencesFile($db,$list);

    my %seqs = (); # name -> value
    my $lastseq = "";
    local *FAA;
    open(FAA,"<",$seqsfile) || die "Cannot read $seqsfile: $!";
    while(<FAA>) {
	chomp;
	if (m/^>lcl\|(\S+)/ || m/>(\S+)/) {
	    $lastseq = $1;
	    die if $lastseq eq "";
	    die "Duplicate data for $lastseq from fastacmd" if exists $seqs{$lastseq};
	    $seqs{$lastseq} = "";
	} elsif (m/^>/) {
	    die "Cannot parse fastacmd output $_";
	} else {
	    die if $lastseq eq "";
	    $seqs{$lastseq} .= $_;
	}
    }
    close(FAA) || die "Error reading $seqsfile";

    # verify that every gene is included
    foreach my $name (@$list) {
	if (!exists $seqs{$name}) {
	    die "fastacmd failed for input name $name -- missing from database $db";
	}
    }
    unlink($seqsfile) unless $fbOptions->{debug} > 1;
    return \%seqs;
}

my %geneSeek = (); # gene -> seek position of its domains
my $geneSeekTable = undef;
my %alignSeek = (); # domain -> seek position of its alignment
my $alignSeekTable = undef;
local *BYGENE; # reading the actual domains file
local *ALIGN; # reading the actual align file
my $nSeqs = undef;
my $domainDir = undef;

sub InitDomains($) {
    my ($dir) = @_;
    if (defined($domainDir)) {
	if ($domainDir eq $dir) {
	    return();
	} else {
	    CleanupDomains();
	    $domainDir = undef;
	}
    }
    die "No such directory $dir" unless -d $dir;

    ($alignSeekTable = tie %alignSeek, 'DB_File', "$dir/fb.all.align.seek.db", O_RDONLY, 0666, $DB_HASH)
	|| die "Cannot read $dir/fb.all.align.seek.db: $!\n";

    ($geneSeekTable = tie %geneSeek, 'DB_File', "$dir/fb.all.domains.bygene.seek.db", O_RDONLY, 0666, $DB_HASH)
	|| die "Cannot read $dir/fb.all.domains.bygene.seek.db: $!\n";

    open(ALIGN, "<", "$dir/fb.all.align") || die "Cannot read $dir/fb.all.align";
    open(BYGENE, "<", "$dir/fb.all.domains.bygene") || die "Cannot read $dir/fb.all.domains.bygene";
    $domainDir = $dir;
}

sub CleanupDomains() {
    if (defined $domainDir) {
	close(ALIGN);
	close(BYGENE);
    }
    undef $geneSeekTable;
    untie %geneSeek;
    undef $alignSeekTable;
    untie %alignSeek;
    undef $domainDir;
}

sub FetchDomains($) {
    my ($name) = @_;
    StartTimer("FetchDomains");
    die "InitDomains() not called" unless defined $domainDir;
    my $seek = $geneSeek{$name};
    # it is legal for a gene to not have domains
    if (!defined $seek || $seek eq "") {
	EndTimer("FetchDomains");
	return [];
    }
    seek(BYGENE,$seek,0)
	|| die "Failed to seek to position $seek for gene $name in domains file";
    my @rows = ();
    while(my $line = <BYGENE>) {
	chomp $line;
	my @F = split /\t/, $line;
	# expect at least domId, name, beg, end, domBeg, domEnd, score
	die "Bad line $line in domains file" unless @F >= 7;
	if ($F[1] eq $name) {
	    push @rows, \@F;
	} else {
	    last; # no longer hits for this gene;
	}
    }
    EndTimer("FetchDomains");
    return(\@rows);
}

# returns 1 if this domain has an alignment
sub HasAlignment($) {
    my ($domId) = @_;
    return exists $alignSeek{$domId} ? 1 : 0;
}

# returns list of lines if $fh is not specified
# else writes lines to $fh and returns the number of lines
# fails if alignment does not exist
sub FetchAlignment {
    my ($domId,$fh) = @_;
    die "InitDomains() not called" unless defined $domainDir;
    StartTimer("FetchAlignment");
    my $seek = $alignSeek{$domId};
    die "No alignment for domain $domId" if !defined $seek || $seek eq "";
    # 0 means absolute offset
    seek(ALIGN,$seek,0)
	|| die "Failed to seek to position $seek for domain $domId in alignment file";
    my @lines = ();
    my $nLines = 0;
    my $line;
    while ($line = <ALIGN>) {
	$line =~ m/^(\S+)\t/ || die "Cannot parse alignment line $_ from seek $seek for domain $domId";
	if ($1 ne $domId) {
	    last;
	} else {
	    if (defined $fh) {
		print $fh $line;
	    } else {
		push @lines, $line;
	    }
	    $nLines++;
	}
    }
    EndTimer("FetchAlignment");
    die "Alignment for $domId not found at seek $seek -- last line read:\n$line"
	unless $nLines>0;
    return defined $fh ? $nLines : \@lines;
}

sub FetchNSequences($) {
    my ($dir) = @_;
    my $nfile = "$dir/fb.all.nseq";
    local *NSEQ;
    open(NSEQ,"<",$nfile) || die "Cannot read $nfile";
    my $nseq = <NSEQ>;
    die "Error in $nfile" unless defined $nseq;
    chomp $nseq;
    die "Error in $nfile" unless $nseq =~ m/^\d+$/;
    close(NSEQ) || die "Error reading $nfile";
    return $nseq;
}

sub AlignmentToTopHits($$$$) {
    my ($domId,$query,$minScore,$nHits) = @_;
    my $alnfile = TmpFile("aln");
    local *ALN;
    open(ALN,">",$alnfile) || die "Cannot write to $alnfile";
    my $nlines = FetchAlignment($domId,\*ALN);
    close(ALN) || die "Error writing to $alnfile";

    my %scores = ();

    my $domsearchFile = TmpFile("domsearch");
    my $domsearchCmd = "$ENV{FASTHMM_DIR}/bin/DomSearch $query $alnfile $minScore $nHits > $domsearchFile";
    my $nCand = $nlines;
    print STDERR "Running $domsearchCmd # $domId ($nCand candidates) " . localtime() . "\n" if $fbOptions->{debug} > 0;

    StartTimer("DomSearch");
    system($domsearchCmd) == 0 || die "Cannot run $domsearchCmd -- $?";
    EndTimer("DomSearch");

    open(DOMSEARCH,"<",$domsearchFile) || die "Cannot read $domsearchFile";
    while(<DOMSEARCH>) {
	chomp;
	my ($candidate,$score) = split /\t/, $_;
	die "Cannot parse $_ from $domsearchCmd: $!" unless defined $score;
	$scores{$candidate} = $score
	    unless exists $scores{$candidate} && $scores{$candidate} > $score;
    }
    close(DOMSEARCH) || die "Error reading $domsearchFile";
    unlink($alnfile) unless $fbOptions->{debug} > 1;
    unlink($domsearchFile) unless $fbOptions->{debug} > 1;
    return \%scores;
}

my $nTmpFile = 0; # used to avoid clobbering temporary files
my $tmpdir = "/tmp";

sub SetTmpDir($) {
    my ($tmpdirNew) = @_;
    die "No such temporary directory: $tmpdirNew" unless -d $tmpdirNew;
    $tmpdir = $tmpdirNew;
}

sub TmpFile($) {
    my ($suffix) = @_;
    $nTmpFile++;
    return join("", $tmpdir, "/topHomologs", $$, ".", $nTmpFile, ".", $suffix);
}

my %timerStarts = ();
my %timerTotals = ();

sub StartTimer($) {
    my ($name) = @_;
    return() unless $useHiRes;
    $timerStarts{$name} = [ Time::HiRes::gettimeofday( ) ];
}

sub EndTimer($) {
    my ($name) = @_;
    return() unless $useHiRes;
    die "Unknown timer $name" unless exists $timerStarts{$name};
    $timerTotals{$name} = 0 unless exists $timerTotals{$name};
    $timerTotals{$name} += Time::HiRes::tv_interval( $timerStarts{$name} );
    delete $timerStarts{$name};
}

sub GetTimes() {
    return() unless $useHiRes;
    return \%timerTotals;
}

return 1;
