#!/usr/bin/perl -w
#
#  $Id: splitFasta.pl,v 1.2 2008/06/11 22:06:21 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  split a fasta file into a specified number of outputs
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

use strict;

# Command line: the input FASTA file, the number of output pieces, and the
# prefix and suffix of the output names.  Pieces are written to
# pre.1.post pre.2.post ... pre.N.post
die "Usage: splitFasta.pl input nOutput outputPrefix outputPost\n"
    unless @ARGV==4;
my ($in,$nOutput,$outPre,$outPost) = @ARGV;
die "No input file $in" unless -e $in;
# Anchor with \A and \z rather than ^ and $: a bare $ also matches just
# before a trailing newline, so a value such as "3\n" would be accepted as
# if it were a plain non-negative integer.
die "nOutput files $nOutput illegal"
    unless $nOutput =~ m/\A\d+\z/ && $nOutput >= 1;

# First pass: count the number of sequences.  Every line whose first
# character is ">" is treated as a FASTA header, i.e. the start of one
# sequence.
my $nTotalSequence = 0;
# A lexical handle keeps this pass from sharing a global bareword handle
# with the rest of the script; $! reports why the open or close failed.
open(my $countFh, "<", $in) || die "Error reading $in: $!";
while (my $line = <$countFh>) {
    $nTotalSequence++ if substr($line,0,1) eq ">";
}
close($countFh) || die "Error reading $in: $!";

# Sequences per output file: the exact ceiling of total/nOutput, so that no
# file (including the last one) has to hold more than its share.
# Truncating after adding 0.9 is not a ceiling: it rounds down whenever the
# fractional part is below 0.1 (1050 sequences in 1000 files would give 1
# per file and leave 51 sequences for the last file).
my $nSeqPerFile = int($nTotalSequence / $nOutput);
$nSeqPerFile++ if $nTotalSequence % $nOutput;
# An input with no sequences would otherwise yield 0 per file.
$nSeqPerFile = 1 if $nSeqPerFile < 1;
print STDERR "$in has $nTotalSequence seqs, writing $nSeqPerFile per file to $outPre.*.$outPost\n";

# Second pass: copy the input into the numbered output files, moving on to
# the next file whenever the current one already holds $nSeqPerFile
# sequences.
my $nFile = 1;          # number of the output file currently being written
my $nSeqThisFile = 0;   # headers seen so far in the current output file
my $outFile = "$outPre.$nFile.$outPost";
open(my $splitIn, "<", $in) || die "Error reading $in: $!";
open(my $splitOut, ">", $outFile) || die "Cannot write to $outFile: $!";
while (my $line = <$splitIn>) {
    if (substr($line,0,1) eq ">") {
        $nSeqThisFile++;
        # Never advance past the last requested file: once file number
        # $nOutput is reached, every remaining sequence goes into it.
        if ($nSeqThisFile > $nSeqPerFile && $nFile < $nOutput) {
            # go to next file
            close($splitOut) || die "Error writing to $outFile: $!";
            $nSeqThisFile = 1;  # the header just read opens the new file
            $nFile++;
            $outFile = "$outPre.$nFile.$outPost";
            open($splitOut, ">", $outFile) || die "Cannot write to $outFile: $!";
        }
    }
    # Any lines that precede the first header end up in the first file.
    print {$splitOut} $line;
}
close($splitIn) || die "Error reading $in: $!";
# Buffered write errors only surface at close, so check it.
close($splitOut) || die "Error writing to $outFile: $!";

# If the input ran out before all requested pieces were used, create each
# remaining piece (up to number $nOutput) as an empty file.
if ($nFile < $nOutput) {
#    print STDERR "Warning: splitFasta.pl wrote up to $outPre.$nFile.$outPost when $nOutput pieces were requested\n";
#    print STDERR "You might want to rerun with splitting into fewer pieces\n";
#    print STDERR "Making empty files for the remainder\n";
    $nFile++;
    while ($nFile <= $nOutput) {
        my $emptyFile = "$outPre.$nFile.$outPost";
        # Opening for write and closing immediately leaves a zero-length file.
        open(my $emptyFh, ">", $emptyFile) || die "Cannot write to $emptyFile: $!";
        close($emptyFh) || die "Error writing to $emptyFile: $!";
        $nFile++;
    }
}
