#!/usr/bin/perl -w
#
#  $Id: ssfAssignFast.pl,v 1.2 2007/10/16 23:20:20 whuang Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script to assign genes to superfamilies given family hits
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#


#
# Given hmmer output for each superfamily, and the fasta file assign genes to superfamilies
# Unlike hmmer, the output includes only seqid, modelid, begin, end, and not the sequence of the
#	gene with the matching positions in uppercase
# Deliberately ignores hits to sequences that are not in the fasta file
use Getopt::Long;
use strict;
sub overlapRangesSize($$);

    my ($MODEL,$LOCUSID,$BEGIN,$END,$DOMBEG,$DOMEND,$SCORE,$EVALUE,$RANGES,$MODELRANGES) = (0..9);
    #my ($MODEL,$BEGIN,$END,$SCORE,$EVALUE,$RANGES) = (0..5);

    my $parsedFile = undef;
    my $ssfFile = undef;
    my $faaFile = undef;
    my $cutoff = 0.02;
    my $percentsame = 35;
    my $sorted = 0;
    my $coresize = 15;
    my $debug = 0;
    my $mapFile = undef;
    my $minLen = undef;

    my $baseDir = exists( $ENV{FASTHMM_DIR} ) ?
			$ENV{FASTHMM_DIR} :
			"./";

    my %seqLength = (); # id -> length
    my $map = {};


{

    my $usage = "ssfAssignFast.pl -superfams superfamListFile -db faaFile\n"
	. "    -parsed parsedHits  [-sorted]\n"
	. "    [-map <mapFile>] [-minlength <len>] [-percentsame $percentsame]\n"
	. "    [-cutoff $cutoff] [-debug 1] > assignments\n"
	. "\n"
	. "    -map; load <mapFile> and map accessions to family names\n"
	. "    -minlength; skip hits less than the <len> amino acids\n"
	. "    -sorted; the input hits file is already sorted by sequence id\n"
	. "\n"
	. "    superfamslist should just a a list of superfam HMM ids, 1 per line\n"
	. "\n"
	. "    In rare cases, this script gives different results from the assignment.pl\n"
	. "    script that is distributed with the superfamily database. On a test of\n"
	. "    10,920 bacterial proteins, we found 7 differences. The two scripts have\n"
	. "    slightly different rules for determining if a hit conflicts with a\n"
	. "    preexisting assignment and hence should be ignored. This script\n"
	. "    counts every position that overlaps a previous assignment, and assigns\n"
	. "    a conflict if overlap >= percentsame * size-of-new-hit. assignment.pl\n"
	. "    does the same, but counts positions that overlap multiple previous\n"
	. "    assignments multiple times, which makes it slightly more aggressive in\n"
	. "    rejecting additional domain assignments\n"
	. "\n"
	. "    Note that this script is memory-intensive (~800 MB for 5.2M of sequence)\n"
	. "    but in -parsed mode it is fast (~20s), and you can split the database\n"
	. "    to reduce memory requirements\n";

    (GetOptions('superfams=s' => \$ssfFile,
		'parsedFile=s' => \$parsedFile,
		'db=s' => \$faaFile,
		'percentsame=i' => \$percentsame,
		'cutoff=f' => \$cutoff,
		'sorted' => \$sorted,
		'coresize=i' => \$coresize,
		'map=s' => \$mapFile,
		'minlength=i' => \$minLen,
		'debug=i' => \$debug)
     && @ARGV==0) || die $usage;
    die "Missing required arguments: Usage is\n$usage"
	unless defined $ssfFile
	&& defined $parsedFile
	&& defined $faaFile;

    die "map file specified '$mapFile' does not exist"
	if ( defined($mapFile) && !(-e $mapFile) );

    # load map if specified
    $map = ( defined($mapFile) ) ?
		loadMapFile($mapFile) :
		{};

    $minLen = int( $minLen )
	if ( defined($minLen) );
    die "minimum alignment length '$minLen' specified is invalid"
	if ( defined($minLen) && ($minLen <= 0) );


    # First, read the faa file to make %seqLength
    open(FAA,"<",$faaFile) || die "Cannot read $faaFile";
	warn "parsing $faaFile";
    my $faaId = undef;
    while(<FAA>) {
	chomp;
	if (m/^>/) {
	    if (m/^>(\S+)$/) {
		$faaId = $1;
	    } elsif (m/^>(\S+)\,?\s/) {
		$faaId = $1;
	    } else {
		die "Cannot parse  line of $faaFile: $_";
	    }
	    die "Duplicate sequence $faaId in $faaFile" if exists $seqLength{$faaId};
	    $seqLength{$faaId} = 0;
	} else {
	    die "Cannot parse first line of $faaFile: $_" unless defined $faaId;
	    die "Cannot parse line of $faaFile -- not sequence" unless m/^[A-Za-z*]+$/;
	    $seqLength{$faaId} += length($_);
	}
    }
    close(FAA) || die "Error reading $faaFile";

    my @ssf = (); # list of model ids
    open(SSF,"<",$ssfFile) || die "Cannot read $ssfFile";
	warn "reading $ssfFile";
    while(<SSF>) {
	chomp;
	push @ssf, $_;
    }
    close(SSF) || die "Error reading $ssfFile";

    my $nTotHits = 0;
    my %hits = (); # seqid -> list of modelid, begin, end, score, evalue, list of matched-ranges,
    # where each range is of the form [begin,end]

#   if (defined $parsedFile) {

	if ( $sorted )
	{
		open(PARSED,"<",$parsedFile) || die "Cannot read $parsedFile";
	} else {
		# sort by locus id (second column)
		open(PARSED, "$baseDir/bin/sort -nk2 $parsedFile |") || die "Cannot sort and read $parsedFile";
	}
	warn "reading $parsedFile";
	my $thisSeqId = undef;
	my @hits = ();
	while(<PARSED>) {
	    chomp;
	    my @F = split /\t/, $_;
	    my ($model,$seqId,$begin,$end,$domBeg,$domEnd,$score,$e,$ranges,$modelRanges) = @F;
	    if ( !defined($thisSeqId) )
	    {
		$thisSeqId = $seqId;
		@hits = ();
	    } elsif ( $thisSeqId ne $seqId )
	    {
		# now we have $thisSeqId and @hits for iteractive processing
		processLociHits( $thisSeqId, \@hits )
		    if ( scalar( @hits ) > 0 );

		$thisSeqId = $seqId;
		@hits = ();
	    }

		die "no range found for $model $seqId" unless defined $ranges;
		next unless exists $seqLength{$seqId} && 
		    (($e <= $cutoff) && ( (defined($minLen) && (($end-$begin+1) >= $minLen)) || !defined($minLen) ));
		$hits{$seqId} = [] unless exists $hits{$seqId};
		my $parsedRanges = [];
		foreach my $range (split /,/,$ranges) {
		    die "Cannot parse range $range" unless $range =~ m/^([0-9]+):([0-9]+)$/;
		    push @$parsedRanges,[$1,$2];
		}
		$F[$RANGES] = $parsedRanges;

		my $parsedModelRanges = [];
		foreach my $range ( split(/,/,$modelRanges) )
		{
		    die "Cannot parse range $range" unless $range =~ m/^([0-9]+):([0-9]+)$/;
		    push( @{$parsedModelRanges}, [$1, $2] );
		}
		$F[$MODELRANGES] = $parsedModelRanges;
		push( @hits, \@F );
	}

	processLociHits( $thisSeqId, \@hits )
	    if ( scalar( @hits ) > 0 );
#   }

#   print STDERR join(" ", "Parsed $nTotHits hits containing", scalar(keys %hits), "sequences",
#		      "and", scalar(keys %modelToSeqs), "models,\nfrom a list of",
#		      scalar(@ssf), "models and a database of",
#		      scalar(keys %seqLength), "sequences")."\n";


#   if ($debug) { # print out all hits and ranges
#	while (my ($seqId,$hits) = each %hits) {
#	    foreach my $hit (@$hits) {
#		my ($model,undef,$begin,$end,$domBeg,$domEnd,$score,$evalue,$ranges,$modelRanges) = @$hit;
#		#my ($model,$begin,$end,$score,$evalue,$ranges) = @$hit;
#		print STDERR join("\t",$seqId,$model,$begin,$end,$score,$evalue,
#				  join(",",map {$_->[0].":".$_->[1]} @$ranges),
#				  $seqLength{$seqId})."\n";
#	    }
#	}
#   }

    # Select which hits count as assignments, and print
#    my $nAssign = 0;
#    while (my ($seqId,$hits) = each %hits) {
#    }
#    print STDERR "Wrote $nAssign assignments\n";
}


sub processLociHits
{
	my $seqId = shift;
	my $hitsRef = shift;
	my @hits = sort {$a->[$EVALUE] <=> $b->[$EVALUE]} @$hitsRef;
	my @assigned = ();
	my @assignedRange = ();
	foreach my $hit (@hits) {
	    my ($model,undef,$begin,$end,$domBeg,$domEnd,$score,$evalue,$ranges,$modelRanges) = @$hit;
	    #my ($model,$begin,$end,$score,$evalue,$ranges) = @$hit;
	    
	    # is this hit overlapping with previous assignments?
	    # Note -- the original superfam assignment.pl script compares to each of the
	    # original ranges in turn, incrementing alloverlap as it goes
	    # If those overlap, then this calculation will give somewhat
	    # misleading results (I don't see why those should be counted >once)
	    #
	    # Here, I compare the unified range to the overlap -- which could in
	    # some situations give a slightly different answer
	    my $hitSize = 0;
	    foreach my $range (@{$hit->[$RANGES]}) {
		$hitSize += $range->[1] - $range->[0] + 1;
	    }
	    my $allOverlap = &overlapRangesSize($hit->[$RANGES], \@assignedRange);
	    if ($allOverlap <= $hitSize * $percentsame/100) {
                # add to assigned list and update @assignedRange
		push @assigned, $hit;
		#$nAssign++;
		my @newRange = @assignedRange;
		push @newRange, @{ $hit->[$RANGES] };
		@newRange = sort {$a->[0] <=> $b->[0]} @newRange;
		@assignedRange = ();
		foreach my $range (@newRange) {
		    if (@assignedRange==0) {
			push @assignedRange, $range;
		    } elsif ($assignedRange[-1][1] >= $range->[0] - 1) {
			# note we already know range begin > last-range's end
			# merge overlapping or abutting ranges
			if ($assignedRange[-1][1] < $range->[1]) {
			    my $old = pop @assignedRange;
			    push @assignedRange, [$old->[0], $range->[1] ];
			}
		    } else {
			push @assignedRange, $range;
		    }
		}
		if ($debug) {
		    print STDERR "Merged range for $seqId: added "
			. join(",",map {$_->[0].":".$_->[1]} @{$hit->[$RANGES]})
			. " giving " . join(",",map {$_->[0].":".$_->[1]} @assignedRange)
			. " (overlap was $allOverlap out of $hitSize)\n";
		}
	    }

	    my $nCover = 0;
	    foreach my $range (@assignedRange) { $nCover += $range->[1] - $range->[0] + 1; }
	    # give up if we've covered most of the gene already
	    last if $seqLength{$seqId} - $nCover < $coresize;
	}

	# Print the assignments for this locus
	foreach my $hit (@assigned) {
	    $hit->[$MODEL] = $map->{$hit->[$MODEL]}
		if ( exists( $map->{$hit->[$MODEL]} ) );
	    $hit->[$RANGES] = flattenCoordList( $hit->[$RANGES] );
	    $hit->[$MODELRANGES] = flattenCoordList( $hit->[$MODELRANGES] );
	    print join("\t", @{$hit}), "\n";
	    #print join("\t", $name, $seqId, $hit->[$BEGIN], $hit->[$END],
		#       $hit->[$EVALUE], $hit->[$SCORE])."\n";
	}
}

sub overlapRangesSize($$) { # assumes that they are sorted
    my ($r1,$r2) = @_;
    # make copies so we can shift them
    my @r1 = @$r1;
    my @r2 = @$r2; # make a copy so we can shift it

    my $nOverlap  = 0;

    while (@r1 > 0 && @r2 > 0) {
	my ($begin1,$end1) = @{ $r1[0] };
	my ($begin2, $end2) = @{ $r2[0] };
	if ($end1 >= $begin2 && $end2 >= $begin1) { # they intersect
	    my $begin = $begin1 > $begin2 ? $begin1 : $begin2;
	    my $end = $end1 < $end2 ? $end1 : $end2;
	    my $o = $end - $begin + 1;
	    $nOverlap += $o;
	}
	# Next sequence in either sequence is past end, so the range
	# with the lesser end won't overlap anything more
	if ($end1 < $end2) {
	    shift @r1;
	} else {
	    shift @r2;
	}
    }
    return ($nOverlap);
}



sub loadMapFile
{
	my $file = shift;
	my %map = ();
	local *IN;

	open(IN, "<$file");
	while(<IN>)
	{
		chomp;
		my ( $from, $to ) = split(/\s+/, $_);
		$map{$from} = $to;
	}
	close(IN);

	return \%map;
}

sub flattenCoordList
{
	my $list = shift;

	my @coords = ();
	foreach my $cp ( @{$list} )
	{
		push( @coords, join(":", @{$cp}) );
	}

	return join(",", @coords);
}
