#!/usr/bin/perl -w
#
#  $Id: parseBlast.pl,v 1.2 2008/06/17 01:30:24 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  General-purpose blastp parser
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;

sub ReadBlock($); # returns a reference to a list of lines ending in up to n consecutive empty lines
sub ParseAlignment($$$); # writes to stdout

# This should be run as a pipe, on raw BLASTp output
{
    my $usage = "Run as a filter, writes tab-delimited output\n"
	. " Columns are query, subject, query begin, query end, subject begin, subject end,\n"
	. " score in bits, expectation-value,\n"
	. " list of query aligned ranges, list of subject aligned ranges,\n"
	. " alignment length, number of matches, number of gaps, and percent identity\n";
    die $usage if @ARGV>0;
    
    my $queryName;
    my $subjectName;
    my $subjectLen;
    
    while(my $line = <STDIN>) {
	chomp $line;
	if ($line =~ m/^Query= (.*)$/) { # e.g. Query= 64576_4
	    $queryName = $1;
	    $queryName =~ s/,? .*//;
	    $subjectName = undef;
	    $subjectLen = undef;
	}
	if ($line =~ m/^[>](.*)$/) { # e.g. >64576_4
	    $subjectName = $1;
	    $subjectName =~ s/,? .*//;
	    my $lines = ReadBlock(1);
	    die "No length line for subject $subjectName: $lines->[-2]" unless $lines->[-2] =~ m/^ +Length = ([0-9]+)$/;
	    $subjectLen = $1;
	    my $alignment = ReadBlock(2);
	    ParseAlignment($queryName, $subjectName, $alignment);
	} elsif ($line =~ m/^ Score = .*Expect =/) {
	    my $lines = ReadBlock(2);
	    unshift @$lines, $line;
	    ParseAlignment($queryName, $subjectName, $lines);
	} else {
	    #print STDERR "Ignoring: $line\n";
	}
    }
}

sub ReadBlock($) {
    my ($n) = @_;
    my $nEmpty = 0;
    my @lines = ();
    while(<STDIN>) {
	chomp;
	push @lines, $_;
	if ($_ eq "") {	$nEmpty++ } else { $nEmpty = 0; }
	last if $nEmpty >= $n;
    }
    die "Failed parsing at @lines\nProbably an incomplete job at " . scalar localtime()."\n"
	    unless @lines >= $n;
    return \@lines;
}

sub ParseAlignment($$$) {
    my ($queryName, $subjectName, $lines) = @_;

    my $line = shift @$lines;
    # e.g. " Score =  211 bits (536), Expect = 9e-54"
    $line =~ m/^ Score = +([0-9.e+-]+) [ bits()0-9]+, Expect = ([0-9e.-]+)/
	|| die "No score/expect in: $line";
    my ($score, $expect) = ($1, $2);

    $line = shift @$lines;
    # e.g. " Identities = 107/178 (60%), Positives = 136/178 (76%)"
    my @pieces = split /,/, $line;
    die "No identities in: $line" unless @pieces == 2 || @pieces == 3;
    $pieces[0] =~ m!^ +Identities = ([0-9]+)/([0-9]+) [(]([0-9.]+)[%][)]$!
	|| die "No identities in piece: $line";
    my ($match, $alignLen, $percentId) = ($1,$2,$3);
    my $gaps = 0;
    if (@pieces == 3) {
	$pieces[2] =~ m!^ Gaps = ([0-9]+)/! || die "No gaps in piece: $line";
	$gaps = $1;
    }

    $line = shift @$lines;
    die "Missing alignment block for $subjectName" if !defined $line || $line ne "";

    my @qRanges = ();
    my @sRanges = ();
    while(@$lines > 0) {
	if (@$lines == 1 && $lines->[0] eq "") {
	    shift @$lines;
	    next;
	}
	my $query = shift @$lines;
	my $ignore = shift @$lines;
	my $subject = shift @$lines;
	my $empty = shift @$lines;
	if (!defined $empty || $empty ne "" || $query !~ m/^Query:/ || $subject !~ m/^Sbjct:/) {
	    die "Missing alignment block for $subjectName: query $query ignore $ignore subject $subject empty $empty";
	}
	# Example block ($query,$ignore,$subject,$empty):
	#Query: 182 VGVVGAGTMGSGIANLAAMSGXXXXXXXXXXXXXXIAWQKINTFMEKSVAKGKMSEAEKE 241
	#           VGVVGAG MGSGIA++ +++G               A   I   ME+ V++GK+S  +K 
	#Sbjct: 6   VGVVGAGQMGSGIAHVFSLAGYEVLLNDISAEGLNKALSTIERNMERQVSRGKVSAEDKA 65
	#

	# Not sure why * characters appear in alignments sometimes
	$query =~ m/^Query: +([0-9]+) +([A-Za-z*-]+) +([0-9]+) *$/ || die "Cannot parse query alignment line: $query";
	my ($qBeg, $qSeq, $qEnd) = ($1,$2,$3);

	$subject =~ m/^Sbjct: +([0-9]+) +([A-Za-z*-]+) +([0-9]+) *$/ || die "Cannot parse subject alignment line: $query";
	my ($sBeg, $sSeq, $sEnd) = ($1,$2,$3);

	$ignore =~ m/^ +([A-Za-z+* ]+)$/ || die "Invalid comparison line in alignment status block: $ignore";

	die "Alignment lengths do not match" unless length($qSeq) == length($sSeq);
	my $len = length($qSeq);
	my $sAt = $sBeg;
	my $qAt = $qBeg;

	for (my $i=0; $i < $len; $i++) {
	    my $qChar = substr($qSeq, $i, 1);
	    my $sChar = substr($sSeq, $i, 1);
	    if ($qChar ne "-" && $sChar ne "-") {
		if (@qRanges > 0 && @sRanges > 0
		    && $qRanges[-1][1] == $qAt-1
		    && $sRanges[-1][1] == $sAt-1) {
		    $qRanges[-1][1]++;
		    $sRanges[-1][1]++;
		} else {
		    push @qRanges, [$qAt,$qAt];
		    push @sRanges, [$sAt,$sAt];
		}
		$qAt++;
		$sAt++;
	    } elsif ($qChar ne "-") {
		$qAt++;
	    } elsif ($sChar ne "-") {
		$sAt++;
	    }
	}
	# We're 1 past the end unless the alignment is all gaps
	die "Error parsing alignment of\n$qSeq\n$sSeq\nof len $len with qAt $qAt qEnd $qEnd sAt $sAt sEnd $sEnd" unless ($qAt == $qEnd+1 || $qAt == $qBeg) && ($sAt == $sEnd+1 || $sAt == $sBeg);
    }
    # Convert qPos and sPos to ranges
    
    print join("\t", $queryName, $subjectName,
	       $qRanges[0][0], $qRanges[-1][1],
	       $sRanges[0][0], $sRanges[-1][1],
	       $score, $expect,
	       join(",", map {$_->[0] . ":" . $_->[1]} @qRanges),
	       join(",", map {$_->[0] . ":" . $_->[1]} @sRanges),
	       $alignLen, $match, $gaps, $percentId)."\n";
}
