#!/usr/bin/perl -w
#
#  $Id: parseHmmAlignments.pl,v 1.2 2007/10/16 23:20:20 whuang Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Parse hmmsearch output into tab-delimited domains including hit coords
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#


use strict;

die "Usage: ParseHmmAlignments.pl HmmName < hmmsearch >> domainsfile\n"
    . "   Returns domain, sequence id, sequence begin to end, domain to end, score, E-value,\n"
    . "   and the list of matching positions in the sequence, as e.g. 5:48,50:100,110:200,\n"
    . "   and then similarly the list of matching positions in the model\n"
    if @ARGV != 1;

my ($ACC,$SEQID,$SEQBEG,$SEQEND,$DOMBEG,$DOMEND,$SCORE,$E) = (0..7);

my $acc = shift @ARGV;

while(<STDIN>) {
    last if m/^Parsed for domains/;
}

my @hits = ();
while(<STDIN>) {
    chomp;
    last if m/^Alignments/ || m/^[ \t]+.?no hits/;
    next if $_ eq "" || m/^Sequence +Domain/ || m/^[ -]+$/;
    my @F = split /[\t ]+/, $_;
    die "Cannot parse $_" unless @F==10;
    push @hits, [$acc,$F[0],$F[2],$F[3],$F[5],$F[6],$F[8],$F[9]];
}
my $hit = undef; # hit object corresponding to alignment we are processing

# Lineoffset keeps track of where we are:
# lineoffset = 1 means we expect the model alignment line [or an RF line]
# lineoffset = 2 means we expect the match-quality line (pluses and spaces)
# lineoffset = 3 means we expect the sequence alignment line
# lineoffset = 4 means we expect the blank line that separates the alignment sections
my $lineOffset = 0;
my $modelAlign = undef; # the model alignment
my $seqAlign = undef; # the sequence alignment

my @lines = ();
while(<STDIN>) {
    chomp;
    push @lines, $_;
}

while(@lines > 0) {
    my $line = shift @lines;

    # example input line: "616154: domain 1 of 1, from 196 to 341: score 10.1, E = 0.0048"
    # First, save existing alignment if we're at the begining of another one, or at the end
    if (($line =~ m/^Histogram/ || $line =~ m/domain.*from/)
	&& defined $hit) {
	$modelAlign =~ s/^[*]-[>]//;
	$modelAlign =~ s/[<]-[*]$//;
	# The modelALign and seqAlign should line up
	die "Sequence and model for $hit->[$SEQID] beginning at $hit->[$SEQBEG]"
	    . " are of different lengths:\n$modelAlign\n$seqAlign\nin ParseHmmAlignments.pl"
	    if length($modelAlign) != length($seqAlign);

	my $seqOff = $hit->[$SEQBEG];
	my $modelOff = $hit->[$DOMBEG];

	my $range = []; # in sequence (list of begin-to)
	my $rangeHMM = [];
	for (my $i = 0; $i < length($seqAlign); $i++) {
	    my $c = substr($seqAlign,$i,1);
	    if (uc($c) eq $c && $c ne "-") { # upper-case characters are matches to the HMM
		if (@$range > 0 && $range->[-1][1] == $seqOff-1) {
		    $range->[-1][1] = $seqOff;
		} else {
		    push @$range, [$seqOff,$seqOff];
		}
		if (@$rangeHMM > 0 && $rangeHMM->[-1][1] == $modelOff-1) {
		    $rangeHMM->[-1][1] = $modelOff;
		} else {
		    push @$rangeHMM, [$modelOff,$modelOff];
		}
	    }
	    $modelOff++ unless substr($modelAlign,$i,1) eq ".";
	    $seqOff++ unless $c eq "-";
	}
#	print STDERR "Parsing $subStart:$subEnd\nseq $subseq\nmod $modelAlign\nended at seq $seqOff model $modelOff\n";
#	print STDERR "Last model range " . join(" ",@{$rangeHMM->[-1]})."\n";

	print join("\t",@$hit,
		   join(",", map {$_->[0].":".$_->[1]} @$range),
		   join(",", map {$_->[0].":".$_->[1]} @$rangeHMM))
	    ."\n";
    }
    # Then process the line
    if ($line =~ m/^Histogram/) {
	last;
    } elsif ($line =~ m/^(.*): domain.*from ([0-9]+) to ([0-9]+):/) {
	my ($seqId, $begin, $end) = ($1,$2,$3);
	die "Unexpected domain -- no hits left" if @hits==0;
	$hit = shift @hits;
	die "Unexpected seqId $seqId $begin $end vs. $hit->[$SEQID] $hit->[$SEQBEG] $hit->[$SEQEND]" unless
	    $hit->[$SEQID] eq $seqId && $hit->[$SEQBEG] == $begin && $hit->[$SEQEND] == $end;
	$lineOffset = 1;
	$modelAlign = "";
	$seqAlign = "";
    }  elsif (defined $hit && $lineOffset == 1) {
	next if $line eq ""; # There are extra blank lines between alignments and start of histogram section
	# I don't know what these RF lines are, but I need to check that the RF is aligned
	# to the left of the sequence. ("RF " could be the alignment of the model!)
	if ($line =~ m/^( +RF) +/) {
	    my $RFLength = length($1);
	    die "Cannot parse RF -- no successor: $line" if @lines < 3;
	    my $nextLine = $lines[2];
	    # Example RF lines:
	    #                  RF  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
	    #                  RF                                                    
	    # Example nextline:
	    #        651984   121  vesgkkqfvgseiagkrlgviglgaigalvandalalgmdvvgydpyisv 170  
	    if ($nextLine =~ m/^( +\S+ +\S+) +\S/ && length($1) == $RFLength) {
		next;
	    }
	    # else keep the line, assume it is a bona fide alignment line
	}	    
	die "Cannot parse $line" unless $line =~ m/^ +([*<>a-zA-Z.-]+)$/;
	$modelAlign .= $1;
	# e.g., $modelAlign = "*->LklldkhLipka...ttaeskvFylKmkGDYyRYlaEfatgeerkeaadk<-*"
	# (The ends are optional)
	# Capitalization indicates the strength of the match, not whether or not it is a match,
	# and ... indicates insert states
	$lineOffset = 2;
    } elsif (defined $hit && $lineOffset == 2) {
	$lineOffset = 3; # skip the alignment matches indicator
    } elsif (defined $hit && $lineOffset == 3) {
	# example line: "      616154   196    GRVFRRDF--EDATHAMMFHQVEGLVI--DKGITMASLKGALTEMAR 238 "
	# example line: "        15502     -     -    " #  Hangover line, note absence of right-end count...
	if ($line =~ m/^\s*(\S+)\s+[-]\s+[-]\s+$/) {
	    my $seqId2 = $1;
	    die "Cannot parse $line -- expected sequence $seqId2" unless 
		substr($hit->[$SEQID],0,length($seqId2)) eq $seqId2;
	} elsif ($line =~ m/^\s*(\S+)\s+([0-9-]+)\s+(\S+)\s+([0-9-]+) ?/) {
	    my ($seqId2,$subStart,$subseq,$subEnd) = ($1,$2,$3,$4);
	    # HMMer often truncates the seqid, but it should still match as far as it is present
	    die "Cannot parse $line -- expected sequence $seqId2"
		unless substr($hit->[$SEQID],0,length($seqId2)) eq $seqId2;
	    my $subseqStrip = $subseq;
	    $subseqStrip =~ s/[-]//g;
	    if ($subStart ne "-") {
		die "Cannot parse $line -- subseq $subseq is of wrong length"
		    unless length($subseqStrip) == $subEnd-$subStart+1;
	    }
	    $seqAlign .= $subseq;
	} else {
	    die "Cannot parse $line";
	}
	$lineOffset = 4;
    } elsif ($line eq "" && $lineOffset==4) {
	$lineOffset = 1;
    }
}
