#!/usr/bin/perl -w
#
#  $Id: splitDomains.pl,v 1.3 2008/10/27 23:38:07 kkeller Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  split a tab-delimited domains file (sorted by domain, domain name in the
#  first column) into a specified number of outputs
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

use strict;

# Command line: input file, number of output files, and the prefix and
# suffix of the output names.  Output goes to
# pre.1.post pre.2.post ... pre.N.post
if (scalar(@ARGV) != 4) {
    die "Usage: splitDomains.pl input nOutput outputPrefix outputPost\n";
}
my ($in,$nOutput,$outPre,$outPost) = @ARGV;

if (! -e $in) {
    die "No input file $in";
}

# nOutput must be a non-negative integer.  nOutput==0 is a special case:
# every domain is split into its own file no matter what.
if ($nOutput !~ m/^\d+$/ || $nOutput < 0) {
    die "nOutput files $nOutput illegal";
}

# First pass: count the distinct domains and verify that the file is sorted
# by domain, i.e. that all lines of a given domain form one contiguous block.
# Every line must start with the domain name (non-whitespace characters)
# followed by a tab.
my %domainsSeen = ();   # domain name => 1 once its block has been entered

my $lastDomain = "";
open(my $countFh, "<", $in) || die "Error reading $in: $!";
while (my $line = <$countFh>) {
    my ($dom) = $line =~ m/^(\S+)\t/;
    unless (defined $dom) {
        # Report the offending line itself (the loop reads into $line, so
        # $_ is never set here); strip the newline to keep the message tidy.
        chomp(my $shown = $line);
        die "Cannot parse domain line $shown in $in";
    }
    if ($dom ne $lastDomain) {
        # A new block starts; the domain must not have had a block before.
        die "Domains file $in is not sorted by domain -- $dom occurs in separate blocks"
            if (exists $domainsSeen{$dom});
        $domainsSeen{$dom} = 1;
        $lastDomain = $dom;
    }
}
close($countFh) || die "Error reading $in: $!";
my $nDomainsTotal = scalar(keys %domainsSeen);

# Decide how many domains go into each output file.  The default of one
# domain per file covers the nOutput==0 case.  Otherwise the average is
# rounded up so the last file doesn't have too many entries; note that
# adding 0.9 before truncating only rounds up when the fractional part is
# (roughly) 0.1 or more.  The result is never allowed to drop below 1.
my $nDomPerFile = 1;
if ($nOutput > 0) {
    my $roundedShare = int(0.9+$nDomainsTotal/$nOutput);
    $nDomPerFile = $roundedShare unless $roundedShare < 1;
}

print STDERR "$in has $nDomainsTotal different domains, writing $nDomPerFile per file to $outPre.*.$outPost\n";

# Second pass: copy every input line to the current output file, moving on
# to the next numbered file each time a new domain would exceed the
# per-file quota.  Lines of one domain are never split across files.
my $nFile = 1;          # number of the output file currently being written
my $nDomThisFile = 0;   # distinct domains written to the current file so far
open(my $splitFh, "<", $in) || die "Error reading $in: $!";
my $outName = "$outPre.$nFile.$outPost";
open(my $outFh, ">", $outName) || die "Cannot write to $outName: $!";
$lastDomain = "";
while (my $line = <$splitFh>) {
    my ($dom) = $line =~ m/^(\S+)\t/;
    unless (defined $dom) {
        # Report the offending line itself (the loop reads into $line, so
        # $_ is never set here); strip the newline to keep the message tidy.
        chomp(my $shown = $line);
        die "Cannot parse domain line $shown in $in";
    }
    if ($dom ne $lastDomain) {
        $nDomThisFile++;
        $lastDomain = $dom;
        # Switch files only while more files are allowed: always when
        # nOutput==0, otherwise until file number nOutput is reached (that
        # last file then absorbs all remaining domains).
        if ($nDomThisFile > $nDomPerFile && ($nOutput == 0 or $nFile < $nOutput)) {
            # go to next file
            close($outFh) || die "Error writing to $outName: $!";
            $nDomThisFile = 1;   # the new domain is the first in the new file
            $nFile++;
            $outName = "$outPre.$nFile.$outPost";
            open($outFh, ">", $outName) || die "Cannot write to $outName: $!";
        }
    }
    print {$outFh} $line or die "Error writing to $outName: $!";
}
close($splitFh) || die "Error reading $in: $!";
# Close the last output file explicitly so that buffered write errors
# (for example a full disk) are reported instead of being silently lost.
close($outFh) || die "Error writing to $outName: $!";

# When fewer files were written than were requested, create empty files for
# the remaining numbers so that every file up to number nOutput exists.
# No warning is printed when this happens.
if ($nFile < $nOutput) {
    for my $nEmpty (($nFile + 1) .. $nOutput) {
        my $emptyName = "$outPre.$nEmpty.$outPost";
        open(OUT,">",$emptyName) || die "Cannot write to $emptyName";
        close(OUT) || die "Error writing to $emptyName";
    }
    # Leave the counter one past the last file created.
    $nFile = $nOutput + 1;
}
