#!/usr/bin/perl -w
#
#  $Id: filterFaaNoHits.pl,v 1.2 2008/09/26 22:03:45 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for removing sequences from a fasta file if they have a "good" hit in
#  tab-delimited BLAST output
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

# Takes a fasta file and a tab-delimited set of BLAST results
# Outputs a reduced fasta file (including only items without good hits as subjects)
# Unlike maskBlast.pl, this does not keep some of the good hits --
# all sequences with hits are removed
#
# Assumes that sequence names are of the form name.begin.end, so that
# it can determine sequence length as end-begin+1 and hence infer coverage
#
# Removes "lcl|" from the start of the names when interpreting either the faa file
# or the BLAST hits, and from the names in the masked list, but
# does not remove the "lcl|" prefix from the names in the output faa file.
#
# Optionally writes a list of excluded sequences in "fb.id" format
# and their masking sequence(s) also in "fb.id" format

use strict;
use Getopt::Long;

{
    my $cover = 0.8;
    my $idThresh = 0.35;
    my $outfile = undef;
    my $excludedfile = undef;
    my $faafile = undef;

    my $usage = "maskBlast.pl [-cover $cover] [-id $idThresh] [-excluded removednames]\n"
	. "   -faa faafile -out faa2 blast_file1 ... blast_fileN";

    (GetOptions('cover=f' => \$cover, 
		'id=f' => \$idThresh,
		'out=s' => \$outfile,
		'excluded=s' => \$excludedfile,
		'faa=s' => \$faafile)
     && defined $outfile
     && defined $faafile)
	|| die "$usage\n";
    die "No blast files for filterFaaNoHits.pl to analyze:\n$usage" if @ARGV==0;

    # first, the list of names in the input file
    my %excludedby = (); # name => masking member => 1
    open(FAA,"<",$faafile) || die "Cannot read $faafile";
    while(<FAA>) {
	if (m/^>(\S+)/) {
	    my $name = $1;
	    $name =~ s/^lcl[|]//;
	    die "duplicate name $name in $faafile" if exists $excludedby{$name};
	    $excludedby{$name} = {};
	}
    }
    close(FAA) || die "Error reading $faafile";

    while(my $line = <>) {
	chomp $line;
	my ($query,$subject,$idPercent,$alnLen,$mm,$gap,$qBeg,$qEnd,$sBeg,$sEnd,$eval,$score) = split /\t/, $line;
	$query =~ s/^lcl[|]//;
	$subject =~ s/^lcl[|]//;
	next if $idPercent < $idThresh*100;
	next if $query eq $subject;

	my $subjectMatchLen = $sEnd-$sBeg+1;
	die "Cannot parse\n$line\n -- end $sEnd does not exceed begin $sBeg"
	    if $subjectMatchLen <= 1;
	$subject =~ m/\.(\d+)\.(\d+)$/ || die "Cannot parse begin and end from subject $subject";
	my ($sFrom,$sTo) = ($1,$2);
	my $sLen = $sTo-$sFrom+1;
	die "Subjects range for $subject does not exceed 1" unless $sLen>1;
	    
	if ($subjectMatchLen >= $cover * $sLen) {
	    # We've met the coverage and %identity thresholds
	    $excludedby{$subject}{$query} = 1;
	}
    }

    # write out the (optional) exclude file
    if (defined $excludedfile) {
	open(EXCLUDED,">",$excludedfile) || die "Cannot write to $excludedfile";
	while (my ($name,$hash) = each %excludedby) {
	    foreach my $excludedby (keys %$hash) {
		print EXCLUDED "fb.$name\tfb.$excludedby\n";
	    }
	}
	close(EXCLUDED) || die "Error writing to $excludedfile";
	print STDERR "Wrote $excludedfile\n";
    }

    # write out the new faa file
    open(FAA,"<",$faafile) || die "Cannot read $faafile";
    open(OUT,">","$outfile.tmp") || die "Cannot write to $outfile.tmp";
    my $writing = 0; # save these lines to outfile?
    while(<FAA>) {
	if (m/^>(\S+)/) {
	    my $name = $1;
	    $name =~ s/^lcl[|]//;
	    $writing = (scalar(keys %{$excludedby{$name}}) == 0);
	}
	print OUT $_ if $writing;
    }
    close(FAA) || die "Error reading $faafile";
    close(OUT) || die "Error writing to $outfile.tmp";
    rename("$outfile.tmp",$outfile) || die "Cannot rename $outfile.tmp to $outfile";
    print STDERR "Wrote $outfile\n";
}
