#!/usr/bin/perl -w
#
#  $Id: maskBlast.pl,v 1.5 2008/09/23 01:41:38 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for identifying redundant sequences from tab-delimited BLAST output
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

# Takes a tab-delimited set of BLAST results and removes any redundant queries.
# Outputs a reduced set of BLAST results (including only the non-redundant queries)
# Assumes that sequence names are of the form name.begin.end, so that
# it can determine sequence length as end-begin+1 and hence infer coverage

use strict;
use Getopt::Long;

# Given blast hits for a query as a reference to a list of lists,
# a reference to a hash of queries that are masked out -> 1,
# a reference to a hash of query -> subjects masked out by that query -> 1
# the id threshold,
# and the coverage threshold,
# updates the masked hash
sub UpdateMask($$$$$);

{
    my $cover = 0.8;
    my $idThresh = 0.35;
    my $outfile = undef;
    my $usage = "maskBlast.pl [-cover $cover] [-id $idThresh] -out masked_hits blast_file1 ... blast_fileN";

    (GetOptions('cover=f' => \$cover, 
		'id=f' => \$idThresh,
		'out=s' => \$outfile)
     && defined $outfile)
	|| die "$usage\n";
    die "No blast files for maskBlast.pl to analyze:\n$usage" if @ARGV==0;

    # read the input files twice, first to identify sequences that
    # we can "mask out" because they are redundant, and
    # a second time to select the subset of rows from the BLAST hits

    my %masked = (); # masked-out name => 1
    my %masksOut = (); # name -> name it masks out -> 1
    foreach my $infile (@ARGV) {
	open(BLAST,"<",$infile) || die "Cannot read $infile";
	my $lastQuery = "";
	my @rows = ();
	while(my $line = <BLAST>) {
	    chomp $line;
	    my @F  = split /\t/, $line;
	    die "Cannot parse $line" unless scalar(@F) >= 11;
	    my $query = $F[0];
	    if ($query ne $lastQuery) {
		UpdateMask(\@rows,\%masked,\%masksOut,$idThresh,$cover) if scalar(@rows) > 0;
		@rows = ();
	    }
	    $lastQuery = $query;
	    push @rows, \@F;
	}
	close(BLAST) || die "Error reading $infile";
	UpdateMask(\@rows,\%masked,\%masksOut,$idThresh,$cover) if scalar(@rows) > 0;
    }
    print STDERR "Masking out " . scalar(keys %masked) . " sequences in maskBlast.pl\n";

    my $outtmp = "$outfile.tmp";
    open(OUT,">",$outtmp) || die "Cannot write to $outtmp";
    foreach my $infile (@ARGV) {
	open(BLAST,"<",$infile) || die "Cannot read $infile";
	while(<BLAST>) {
	    m/^(\S+)\t/ || die "Cannot parse $_";
	    print OUT $_ if !exists $masked{$1};
	}
	close(BLAST) || die "Error reading $infile";
    }
    close(OUT) || die "Error writing to $outtmp";
    rename($outtmp,$outfile) || die "Cannot rename $outtmp to $outfile";
    print STDERR "Wrote $outfile\n";
}

sub UpdateMask($$$$$) {
    my ($rows,$masked,$masksOut,$idThresh,$cover) = @_;
    return if exists $masked->{$rows->[0][0]}; # query is constant

    my %subjects = map {$_->[1] => 1} @$rows;

    foreach my $row (@$rows) {
	my ($query,$subject,$idPercent,$alnLen,$mm,$gap,$qBeg,$qEnd,$sBeg,$sEnd,$eval,$score) = @$row;
	next if $query eq $subject || exists $masked->{$subject};
	next if $idPercent < $idThresh*100;
	    
	my $subjectMatchLen = $sEnd-$sBeg+1;
	die "Cannot parse\n" . join("\t",@$row)
	    ."\n -- end $sEnd does not exceed begin $sBeg"
	    if $subjectMatchLen <= 1;
	$subject =~ m/\.(\d+)\.(\d+)$/ || die "Cannot parse begin and end from subject $subject";
	my ($sFrom,$sTo) = ($1,$2);
	my $sLen = $sTo-$sFrom+1;
	die "Subjects range for $subject does not exceed 1" unless $sLen>1;
	    
	if ($subjectMatchLen >= $cover * $sLen) {
	    # We've met the coverage and %identity thresholds
	    # But would masking this sequence cause queries with hits
	    # to disappear?
	    my $OK = 1;
	    foreach my $sub2 (keys %{ $masksOut->{$subject} }) {
		if (!exists $subjects{$sub2}) {
		    $OK = 0;
		    last;
		}
	    }
	    if ($OK) {
		$masked->{$subject} = 1;
		$masksOut->{$query} = {} if !exists $masksOut->{$query};
		$masksOut->{$query}{$subject} = 1;
		foreach my $sub2 (keys %{ $masksOut->{$subject} }) {
		    $masksOut->{$query}{$sub2} = 1;
		}
		delete $masksOut->{$subject}; # not needed anymore
	    }
	}
    }
}
