#!/usr/bin/perl -w
#  $Id: hmmToPsiBlast.pl,v 1.3 2008/07/31 21:03:28 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
# Converts an HMM input file to a psiblast file (with a seed sequence)
# Based on Martin Madera's model_convert.pl and HiddenMarkovModel.pm
# Morgan N. Price, July 2008
#
#  Copyright (C) 2008 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

# Changes by MNP:
#
# Madera's original code did not handle different null emission probabilities
# for some families (e.g., panther)
#
# hmmToPsiBlast.pl ignores gaps
# when producing a psi-blast profile (more precisely, it divides by
# the non-gap frequency at each position to raise the profile probabilities).
# We hope that this will make the psi-blast profile more sensitive
# to weak matches.

use strict;
use Getopt::Long;

{
    my ($infile, $outfile, $seedfile);
    my $debug = 0;
    my $usage = "hmmToPsiBlast.pl -in hmmer [-out hmmer.psiblast] [-seed hmmer.psiblast.fa]\n"
	. "    Writes a binary psi-blast checkpoint and an artificial seed sequence\n"
	. "    The psi-blast checkpoint may not be portable to other machines\n";
    die $usage unless
	GetOptions('in=s' => \$infile,
		   'out=s' => \$outfile,
		   'seed=s' => \$seedfile,
		   'debug=i' => \$debug)
	&& @ARGV == 0
	&& defined $infile;
    $outfile = $infile . ".psiblast" unless defined $outfile;
    $seedfile = $outfile . ".fa" unless defined $seedfile;

    my $acc = undef;
    my $name = undef;
    my @weights = (); # list of hashes of character -> match score
    my @characters = ();
    my @back = (); # background frequencies
    my $len = undef;
    open(HMM,"<",$infile) || die "Cannot read $infile";
    my $inHMMBlock = 0;
    while(<HMM>) {
	chomp;
	if (m/^ACC +([^ ]+)$/) {
	    die "Redundant accession line: $_" if defined $acc;
	    $acc = $1;
	} elsif (m/^NAME +([^ ]+)$/) {
	    die "Redundant name line: $_" if defined $name;
	    $name = $1;
	} elsif (m/^NULE/) {
	    die "Duplicate NULE line: $_" if @back > 0;
	    my @F = split / +/, $_;
	    pop @F if $F[-1] eq "";
	    shift @F;
	    @back = @F;
	    foreach my $score (@back) {
		die "Illegal background score $score in $infile" unless $score =~ m/^-?\d+$/;
	    }
	} elsif (m/^LENG +(\d+)/) {
	    $len = $1;
	} elsif (m/^HMM /) {
	    die "Redundant characters line: $_" if @characters > 0;
	    my @F = split / +/, $_;
	    pop @F if $F[-1] eq "";
	    shift @F; # the "HMM";
	    die "Empty characters line: $_" if @F == 0;
	    @characters = @F;
	    $inHMMBlock = 1;
	} elsif (m!^//!) {
	    last;
	} elsif ($inHMMBlock && m/^ *\d/) {
	    # note only consider block lines that start with a number, for the position in the HMM
	    # (other lines are for insert and delete states)
	    s/ +$//;
	    s/^ +//;
	    my @F = split / +/, $_;
	    # match scores lines look like
	    #      1  -2212  -6366   -564  -3871  -1440   3569  -5039  -7593  -1811  -7539  -6856      6  -5916  -4750  -6275   -627  -5367  -6794  -7727  -6804     1
	    # or
	    #     1   -625  -1481   2624    655  -1758   -262   -127  -1955   -121  -1871  -1522    573   -484    -35   -420   -244   -460  -1604  -1283  -1048
	    next unless $F[0] =~ m/^[0-9]+$/ && $F[-1] =~ m/^[0-9-]+$/ && $F[1] =~ m/^[0-9-]+$/;
	    shift @F; # line number at beginning
	    die "No characters yet for line $_" unless @characters > 0;
	    pop @F if @F > @characters; # alignment position is at end
	    die "Wrong number of characters on line $_\n vs " . join("",@characters) unless @characters == @F;
	    my %map = ();
	    foreach my $i (0..$#F) {
		$F[$i] = -100000 if $F[$i] eq "*"; # * means 0 emission probability
		die "Invalid score $F[$i] in $infile" unless $F[$i] =~ m/^-?\d+$/;
		$map{$characters[$i]} = $F[$i];
	    }
	    push @weights, \%map;
	}
    }
    close(HMM) || die "Error reading $infile";
    die "No emission scores in $infile" if @weights == 0;
    if (defined $len) {
	die "Wrong number of positions: " . scalar(@weights) . " versus LENG $len in $infile"
	    unless scalar(@weights) == $len;
    } else {
	print STDERR "Warning: no length for $infile\n";
    }
    die "No background frequencies (NULE line) in $infile" if @back == 0;
    die "Wrong number of background frequencies (NULE line) in $infile" if @back != @characters;
    die "No accession or name in $infile" if !defined($name) && !defined($acc);
    $acc = $name if !defined $acc;
    my %back = ();
    foreach (@characters) {
	$back{$_} = shift @back;
    }

    if ($debug) {
	print STDERR "Read " . scalar(@weights) . " columns from $infile\n";
	print "Match weights\n";
	my $line = 1;
	foreach my $map (@weights) {
	    print "$line\t";
	    $line++;
	    foreach my $c (@characters) {
		print " ".$c.":".$map->{$c};
	    }
	    print "\n";
	}
    }

    # Compute seed sequence
    my $seed = "";
    foreach my $map (@weights) {
	my $best = $characters[0];
	foreach my $c(@characters) {
	    $best = $c if $map->{$c} > $map->{$best};
	}
	$seed .= $best;
    }

    print STDERR "Seed: $seed\n" if $debug;
    open(SEED,">",$seedfile) || die "Cannot write to $seedfile";
    print SEED ">$acc\n";
    my @s = $seed =~ m/.{1,60}/g;
    print SEED join("\n",@s)."\n";
    close(SEED) || die "Error writing to $seedfile";
    print STDERR "Wrote to $seedfile\n" if $debug;

    # adjusted emission rates (ignoring gaps)
    my @emitRate = ();
    foreach my $map (@weights) {
	# per-position score plus background score goes to overall score in bits
	# then divide by 20
	my %rate = map { $_ => 2**($map->{$_}/1000.0 + $back{$_}/1000.0)/20 } (keys %$map);
	my $tot = 0;
	foreach (values %rate) { $tot += $_; }
	foreach (keys %rate) { $rate{$_} /= $tot; }
	push @emitRate, \%rate;
    }
	
    my @psiorder = split //, "ARNDCQEGHILKMFPSTWYV";

    open(PSI, ">", $outfile) || die "Cannot write to $outfile";
    binmode PSI;
    syswrite PSI, pack("I", scalar(@emitRate)), 4;
    syswrite PSI, uc($seed), scalar(@emitRate);
    foreach my $emit (@emitRate) {
	foreach my $c (@psiorder) {
	    die "No emit score for character $c" unless exists $emit->{$c};
	    syswrite PSI, pack("d", $emit->{$c}), 8;
	}
    }
    close(PSI);
    print STDERR "Wrote $outfile\n" if $debug;
}

