#!/usr/bin/perl -w
#
#  $Id: combine.pl,v 1.2 2008/09/26 22:03:45 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for combining files that are grouped by a key into another file that is grouped by that key
#  and optionally ignoring values according to another key

use strict 'refs';
use Getopt::Long;

my $usage = "combine.pl -key 1 [ optional arguments ] input_files > outfile\n"
    . "   Combine files that are grouped by a key (specified by a column number)\n"
    . "   into another file that is grouped by that key\n"
    . "   Uses an in-memory index of each group's seek position in each file\n"
    . "   all columns are counted starting from 1\n"
    . "\n"
    . "Options:\n"
    . "  -exclude excludelist [-xcol 1]\n"
    . "   Only the first column of excludelist is used\n"
    . "   any input files with column xcol being in that file are ignored\n"
    . "\n"
    . "  -uniq 1,2,3,5\n"
    . "   Remove entries that are duplicates based on matching the list of columns\n"
    . "   (by default, no uniqueness check)\n";

my $column = 1;
my $excludeColumn = 1;
my %exclude = ();
my @uniq = ();

# Given a file handle and a column index (0-based), build an in-memory
# index of where each group with that index starts (as a seek position)
sub IndexFile($) {
    my ($handle) = @_;
    die unless defined $column;

    my %seekTable = ();
    my $lastid = undef;
    seek($handle,0,0);
    my $at;
    for ($at = 0; my $line = <$handle>; $at = tell($handle)) {
	chomp $line;
	my @F = split /\t/, $line;
	die "Not enough columns in $line" if $column >= scalar(@F)
	    || $excludeColumn >= scalar(@F);
	if (!defined $lastid || $F[$column] ne $lastid) {
	    if (exists $seekTable{$F[$column]}) {
		print STDERR "Duplicate key $F[$column]";
		return undef;
	    }
	    $seekTable{$F[$column]} = $at;
	    $lastid = $F[$column];
	}
    }
    return \%seekTable;
}

# $id is the key of the group
# $out is the output handle
# $seen is to keep track of which lines might be redundant
sub PrintGroup($$$$$) {
    my ($handle,$seekTable,$id,$out,$seen) = @_;
    if (exists $seekTable->{$id}) {
	seek($handle,$seekTable->{$id},0);
	my $nLines = 0;
	while(my $line = <$handle>) {
	    chomp $line;
	    my @F = split /\t/, $line;
	    die "Not enough columns in $line" if $column >= scalar(@F);
	    if ($F[$column] ne $id) {
		last;
	    } else {
		$nLines++;
		my $uniqkey = join("\t",@F[@uniq]);
		if (!defined $seen || !exists $seen->{$uniqkey}) {
		    print $out $line,"\n" unless exists $exclude{$F[$excludeColumn]};
		    $seen->{$uniqkey} = 1 if (defined $seen);
		}
	    }
	}
	die "Group for $id not found at $seekTable->{$id}" if $nLines == 0;
    }
}

{
    my $excludeFile = undef;
    my $uniqSpec = undef;
    GetOptions('key=i' => \$column,
	       'exclude=s' => \$excludeFile,
	       'xcol=i' => \$excludeColumn,
	       'uniq=s' => \$uniqSpec)
	|| die $usage;
    my @files = @ARGV;
    die "No input files:\n$usage" if @files == 0;

    # convert to 1-based
    $column--;
    $excludeColumn--;
    die "-key is too small:\n$usage" if $column < 0;
    die "-xcol is too small:\n$usage" if $excludeColumn < 0;

    # parse uniqSpec
    if (defined $uniqSpec) {
	my @list = split /,/, $uniqSpec;
	foreach my $number (@list) {
	    die "Cannot parse uniq-column-specifier $uniqSpec"
		unless $number =~ m/^[0-9]+$/ && $number > 0;
	}
	# not useful to use key inside the unique-list,
	# as we process each group separately
	@uniq = map {$_ - 1} @list;
	@uniq = grep {$_ != $column} @uniq unless scalar(@uniq) == 1;
    }
    
    # make the exclude list
    if (defined $excludeFile) {
	open(EXCLUDE,"<",$excludeFile) || die "Cannot read $excludeFile";
	while(my $line = <EXCLUDE>) {
	    chomp $line;
	    my @F = split /\t/, $line;
	    $exclude{$F[0]} = 1;
	}
	close(EXCLUDE) || die "Error reading $excludeFile";
    }

    my @handles = ();
    my @indexes = ();
    my %keys = (); # record all keys
    foreach my $file (@files) {
	local *IN;
	open(IN,"<",$file) || die "Cannot read $file";
	push @handles, *IN;
	my $index = IndexFile(*IN);
	die "\nProblem reading $file" if (!defined $index);
	foreach my $key (keys %$index) {
	    $keys{$key} = 1;
	}
	push @indexes, $index;
    }
    foreach my $key (sort keys %keys) {
	my $seen = scalar(@uniq) > 0 ? {} : undef; # in/out for tracking what values have been seen
	foreach my $i (0..$#handles) {
	    PrintGroup($handles[$i],$indexes[$i],$key,\*STDOUT,$seen);
	}
    }
    foreach my $i (0..$#handles) {
	close($handles[$i]) || die "Error reading file $files[$i]";
    }
}    
