#!/usr/bin/perl -w
#
#  $Id: makeDBTables.pl,v 1.3 2008/08/05 20:31:01 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Use berkeley DB (DB_File module) to index the domains for each gene
#  (including ad-hoc domains) and the alignments of all domains
#  from FastBLAST
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

# This makes the tables expected by topHomologs.pl
#
# Expects fb.all.align to have all the alignments, as a tab-delimited file
# with columns domId, geneId, alignedSequence, begin, end, score, evalue,
# where all alignments for a domain are in a single block.
#
# Expects fb.all.domains.bygene to have all the gene-domain associations, as
# a tab-delimited file with columns domId, geneId, seqBeg, seqEnd, domBeg, domEnd, score, evalue
# (and perhaps additional alignment-specification columns -- those are ignored).
# All domains for a gene must be in a single block.
#
# All positions are 1-based.
#
# The files it creates are (all in the fastBlast output directory):
# fb.all.nseq -- the number of sequences in the input fasta file
# fb.all.domains.bygene.seek.db -- a Berkeley database of domainId -> seek position
# fb.all.align.seek.db -- a Berkeley database of geneId -> seek position
#
# The Berkeley databases are stored as hashes of strings with default settings
# according to the DB_File perl module.

use strict;
use DB_File;
die "Usage: makeDBTables.pl InFaa FastBLASTDirectory\n" unless @ARGV==2;

my ($faa,$dir) = @ARGV;
die "No such file: $faa" unless -e $faa;
die "No such directory: $dir" unless -d $dir;

my %geneNDomains = (); # gene to number of domains
open(FAA,"<", $faa) || die "Cannot read $faa";
while(<FAA>) {
    if (m/^>(\S+)/) {
	my $gene = $1;
	die "Duplicate gene $gene in $faa" if exists $geneNDomains{$gene};
	$geneNDomains{$gene} = 0;
    }
}
close(FAA) || die "Error reading $faa";

my $nSeqFile = "$dir/fb.all.nseq";
open(NSEQ,">","$nSeqFile.tmp") || die "Cannot write to $nSeqFile.tmp";
print NSEQ scalar(keys %geneNDomains) . "\n";
close(NSEQ) || die "Error writing to $nSeqFile.tmp";
rename("$nSeqFile.tmp",$nSeqFile) || die "Cannot rename $nSeqFile.tmp to $nSeqFile";
print STDERR "Wrote $nSeqFile\n";

my $alnFile = "$dir/fb.all.align";
open(ALN,"<",$alnFile) || die "Cannot read $alnFile";
my %domSeek = ();
my $seekTable = tie %domSeek, 'DB_File', "$alnFile.seek.db", O_CREAT|O_TRUNC, 0666, $DB_HASH;
$seekTable || die "Cannot write to $alnFile.seek.db: $!\n";
my $lastDomain = "";

my $nDomain = 0;
for (my $at = tell(ALN); <ALN>; $at = tell(ALN)) {
    die "Cannot parse $_" unless m/^(\S+)\t/;
    my $domain = $1;
    if ($domain ne $lastDomain) {
	die "Duplicate domain $domain at both $at and $domSeek{$domain}"
	    if (exists $domSeek{$domain});
	$domSeek{$domain} = $at;
	$lastDomain = $domain;
	$nDomain++;
    }
}
close(ALN) || die "Error reading $alnFile";
undef $seekTable;
untie %domSeek;

print STDERR "Wrote DB table $alnFile.seek.db with indexes to alignments for $nDomain domains\n";

# And make the domain table

my $bygeneFile = "$dir/fb.all.domains.bygene";
open(GENEDOM,"<",$bygeneFile) || die "Cannot read $bygeneFile";

my $bygeneDBFile = "$bygeneFile.seek.db";
my %geneSeek = (); # gene id -> seek
my $bygeneTable = tie %geneSeek, "DB_File", $bygeneDBFile, O_CREAT|O_TRUNC, 0666, $DB_HASH;
$bygeneTable || die "Cannot write to $bygeneDBFile: $!";
my $nGeneAssign = 0; # genes with any domains at all

my $lastgene = "";
for (my $at = tell(GENEDOM); <GENEDOM>; $at = tell(GENEDOM)) {
    chomp;
    my ($domId,$geneId,$beg,$end,$domBeg,$domEnd,$score,$eval) = split /\t/, $_;
    die $_ unless defined $eval;
    if ($geneId ne $lastgene) {
	$geneSeek{$geneId} = $at;
	die "Domain entry for gene $geneId in $bygeneFile but not in sequence file $faa\n"
	    if !exists $geneNDomains{$geneId};
	die "Domain entries for gene $geneId do not form a single block in $bygeneFile"
	    if $geneNDomains{$geneId} > 0;
	$lastgene = $geneId;
	$nGeneAssign++;
    }
    $geneNDomains{$geneId}++;
}
close(GENEDOM) || die "Error reading $bygeneFile";

undef $bygeneTable;
untie %geneSeek;

print STDERR "Wrote DB table $bygeneDBFile with indexes to domains for $nGeneAssign genes\n";
