#!/usr/bin/perl -w

#  $Id: cleanFasta.pl,v 1.1 2008/05/30 00:53:46 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  "Clean" all the identifiers in a fasta file to be compatible with
#  fast-blast and to be unique
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;
use Getopt::Long;

my $usage =
"Usage: cleanFasta.pl [-define 1] -faa input.faa -out out_faa_file

Changes identifiers to be compatible with FastBLAST

By default, writes out definitions as well
(use -define 0 to not write them out)

Also writes the relationship of original identifiers to
new identifiers to out_faa_file.map
";

my @goodrefs = qw/gi gb pdb dbj pir prf sp pdb pat bbs gnl ref lcl/;
my %goodrefs = map {$_ => 1} @goodrefs;
{
    my $in = undef;
    my $out = undef;
    my $define = 1;

    (GetOptions('-faa=s'=>\$in,
	       '-out=s'=>\$out,
		'define=i' => \$define)
	&& defined $in
	&& defined $out)
	|| die $usage;

    open(IN,"<",$in) || die "Cannot read $in";
    open(OUT,">",$out) || die "Cannot write to $out";
    open(MAP,">","$out.map") || die "Cannot write to $out.map";

    my %lowercase = (); # lower case of id seen so far => cleaned id
    my $nline = 0;
    my $nRename = 0;
    while(my $line = <IN>) {
	if (substr($line,0,1) eq ">") {
	    chomp $line;
	    my ($name,$description);
	    if ($line =~ m/^>(\S+)\s+(.*)$/) {
		$name = $1;
		$description = $2;
	    } else {
		$name = $line;
		$description  = undef;
	    }

	    # NR joins redundant ids by control-A characters
	    # We'll just use the first id
	    my @newnames = split /\cA/, $name;
	    my $newname = $newnames[0];
	    die "Empty id in fasta defline\n$line\n" if $newname eq "";
	    die "Spaces in fasta id\n$newname\nin line\n$line\n" if $newname =~ m/\s/;

	    # NR includes ids like gi|158262557 which are problematic because
	    # fastblast will append ranges to them
	    # If we recognize the first ref then we just use it as the id
	    if ($newname =~ m/[|]/) {
		my @refs = split /[|]/, $newname;
		while (@refs > 0) {
		    my $ref = shift @refs;
		    if (exists $goodrefs{$ref} && @refs > 0 && length($refs[0]) > 0) {
			$newname = $refs[0];
			last;
		    }
		}
		# Otherwise we replace | with _
		if ($newname =~ m/[|]/) {
		    $newname =~ s/[|]/_/g;
		}
	    }

	    # Now, check if the name is unique
	    my $lower = lc($newname);
	    my $prefix = $newname;
	    my $n = 2;
	    while (exists $lowercase{$lower}) {
		$newname = "$prefix.$n";
		$lower = lc($newname);
		$n++;
	    }
	    $lowercase{$lower} = $newname;

	    # Finally, we have a unique pipe-free cntrl-A-free name
	    if (defined $description && $define) {
		print OUT ">$newname $description\n";
	    } else {
		print OUT ">$newname\n";
	    }
	    print MAP "$name\t$newname\n";
	    $nRename++ if $name ne $newname;
	} else {
            # Allow * in sequence to represent stop codons
	    die "$in is not a valid fasta file at line $nline:\n$line"
		if  $nline == 0 || $line !~ m/^[A-Z*]+[\r\n]*$/;
	    print OUT $line;
	}
	$nline++;
    }
    close(IN) || die "Error reading $in";
    close(OUT) || die "Error writing $out";
    close(MAP) || die "Error writing $out.map";
    print STDERR "Wrote " . scalar(keys %lowercase) . " sequences to $out and renamed $nRename, see $out.map\n";
}
