#!/usr/bin/perl -w
#
#  $Id: pirsfAssign.pl,v 1.2 2007/10/16 23:20:20 whuang Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script to eliminate weak hits to pirsf
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;

my $usage = "PirsfAssign.pl hmmHitsFile dbDir [-sorted] > output\n"
    . "   Given assignments of regions of genes to pirsf familes and subfamilies,\n"
    . "   PirsfAssign.pl uses the thresholds in the pirsf file to eliminate\n"
    . "   weak hits\n"
    . "   Because InterPro shows the hit to the family model and to the\n"
    . "   subfamily model, PirsfAssign.pl does so as well\n"
    . "   The rules for using the thresholds are:\n"
    . "   bit score >= mininum score and abs(length - mean(length)) < 3.5 * Std(length)\n"
    . "   Finally, PirsfAssign.pl keeps only the best-hitting family for any region\n"
    . "   (again choosing by bit score)\n";

die $usage unless @ARGV == 2;
my ($infile, $dbDir, $sorted) = @ARGV;
$sorted = ( defined($sorted) && ($sorted eq '-sorted') ) ?
		1 : 0;

my $baseDir = exists( $ENV{FASTHMM_DIR} ) ?
		$ENV{FASTHMM_DIR} :
		".";

my $pirsf_dat = $dbDir . "/pirsf/pirsf.dat";
die "missing hmmHits file '$infile'"
	if ( !(-e $infile) );
die "invalid database dir '$dbDir'; could not find pirsf/pirsf.dat"
	if ( !(-d $dbDir) || !(-e $pirsf_dat) );

my %pirsf = (); # pirsf id -> hash of minscore, meanlen, sdlen, children
open(PIRSF,"<",$pirsf_dat) || die "Cannot read $pirsf_dat";
while(<PIRSF>) {
    # e.g.,
    #>PIRSF000001 child: PIRSF500151 PIRSF500152
    #Cytochrome c/cytochrome c2
    #112.723214285714 12.7298313681837 88 168.897321428571 25.7630317403111
    #BLAST: Yes
    chomp;
    die "Expecting a line like\n>PIRSF000001 child: PIRSF500151 PIRSF500152\nbut found\n$_\ninstead"
	unless m/^>(PIRSF[0-9]+)( child: )?(PIRSF[PIRSF0-9 ]+)?$/;
    my ($fam,$childspec) = ($1,$3);
    $childspec = "" if !defined $childspec; # allowed to be missing
    my @children = split / /, $childspec;
    die "Family $fam is present twice in $pirsf_dat" if exists $pirsf{$fam};

    my $name = <PIRSF>;
    die "Cannot read name for $fam from $pirsf_dat" unless $name;
    chomp $name;

    my $line = <PIRSF>;
    chomp;
    my ($meanlen, $sdlen, $minscore, $meanscore, $sdscore) = split / /, $line;
    # They always have the first three, but not necessarily meanscore or sdscore
    die "Cannot parse thresholds for $fam: $line" unless defined $minscore
	&& $minscore =~ m/^[0-9.-]+$/ 
	&& $meanlen =~ m/^[0-9.-]+$/
	&& $sdlen =~ m/^[0-9.-]+$/;

    my $blastline = <PIRSF>; # ignored

    my %children = map {$_=>1} @children;
    $pirsf{$fam} = { name => $name, 
		     children => \%children,
		     minscore => $minscore,
		     meanlen => $meanlen,
		     sdlen => $sdlen
		 };
}
close(PIRSF) || die "Error reading $pirsf_dat";

my ($FAMILY, $GENE, $BEG, $END, $DOMBEG, $DOMEND, $SCORE, $EVAL) = (0..7);

my $thisLocus = undef;
my @hits = ();

if ( $sorted )
{
	open(IN,"<",$infile) || die "Cannot read $infile";
} else {
	open(IN, "$baseDir/bin/sort -nk2 $infile |") ||
		die "Cannot sort and read $infile";
}
while(<IN>) {
    chomp;
    my @F = split /\t/, $_;
    die "Cannot parse $infile line $_" unless @F >= 8;
    my $fam = $F[$FAMILY];
    my $locus = $F[$GENE];
    die "Read hit to family $fam that is not described in $pirsf_dat"
	unless exists $pirsf{$fam};
    my $len = $F[$END] - $F[$BEG] + 1;
    if ($F[$SCORE] >= $pirsf{$fam}{minscore}
	&& abs($len - $pirsf{$fam}{meanlen}) < 3.5 * $pirsf{$fam}{sdlen}) {

	if ( !defined($thisLocus) )
	{
		$thisLocus = $locus;
		@hits = ();
	} elsif ( $thisLocus ne $locus )
	{
		# process hits
		processLociHits( $thisLocus, \@hits, \%pirsf )
			if ( scalar( @hits ) > 0 );

		$thisLocus = $locus;
		@hits = ();
	}

	push @hits, \@F;
    }
}

# process hits
processLociHits( $thisLocus, \@hits, \%pirsf )
	if ( scalar( @hits ) > 0 );

exit;

sub processLociHits
{
	my $gene = shift;
	my $hitsRef = shift;
	my $pirsf = shift;
	my @hits = sort {$b->[$SCORE] <=> $a->[$SCORE]} @{ $hitsRef }; # highest score first
	my @sofar = ();
	foreach my $hit (@hits)
	{
		my $keep = 1;
		my ($hitbeg,$hitend) = ($hit->[$BEG], $hit->[$END]);
		foreach my $sofar (@sofar)
		{
			my ($sbeg,$send) = ($sofar->[$BEG], $sofar->[$END]);
			if ($hitbeg <= $send && $hitend >= $sbeg)
			{
				# is overlap significant?
				my $obeg = $hitbeg < $sbeg ? $hitbeg : $sbeg;
				my $oend = $hitend < $send ? $hitend : $send;
				if ($oend - $obeg + 1 >= 0.5 * ($hitend - $hitbeg + 1))
				{
					# keep hit to parent of better hit, but not vice versa
					$keep = 0 unless exists $pirsf->{$hit->[$FAMILY]}{children}{$sofar->[$FAMILY]};
#					print STDERR "Keeping hit of $gene to $hit->[$FAMILY] child of $sofar->[$FAMILY]\n"
#						if $keep;
#					print STDERR "Rejecting hit of $gene to $hit->[$FAMILY] in favor of $sofar->[$FAMILY]\n"
#						. join(" ","children",keys %{ $pirsf->{$sofar->[$FAMILY]}{children} })
#						."\n" if !$keep;
				}
			}
		}

		push @sofar, $hit if $keep;
	}

	foreach my $kept (@sofar)
	{
		print join("\t", @$kept)."\n";
	}
}
