#!/usr/bin/perl -w
#
#  $Id: reduceDomains.pl,v 1.4 2008/05/13 00:13:12 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Script for reducing domain families using cd-hit
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

use strict;

use lib exists( $ENV{FASTHMM_DIR} ) ?
		"$ENV{FASTHMM_DIR}/lib" :
		"./lib";
use Args;
use Cache;
use File;
use Util;
use Vector;

my $defaults = {
		'cdhit'		=> "$ENV{FASTHMM_DIR}/bin/cd-hit",
		'n'		=> 3,
		'c'		=> 0.55,
		'M'		=> 1000,
		'reduce'	=> "both",
		'numThreads'	=> 1,
		'minDomain'	=> 30,
	       };

#
# USAGE
#
my $usage =
"Usage:
  reduceDomains.pl <options>

Parameters:
  -domains    <inFile>	Specify combined hmmhits file
  -db         <blastDb>	Specify source FASTA file

Optional Parameters:
  -lowMem		Slower but reduces memory footprint
  -debug		Enable debugging output
  -n	      <$defaults->{n}>	Specify cd-hit word length
  -c	      <$defaults->{c}>	Specify cd-hit sequence identity threshold
  -M	      <$defaults->{M}>	Specify cd-hit maximum available memory (MB)
  -minDomain  <$defaults->{minDomain}>	Specify minimum domain overlap amount
  -prefix     <out>	Specify output file prefix
  -reduce     <$defaults->{reduce}>	Specify job type: domains, other, both
  -numThreads <$defaults->{numThreads}>	Specify number of parallel cd-hit jobs to run
  -cdhit      <binary>	Specify cd-hit binary to use;
			$defaults->{cdhit}

Cluster Parameters:
  -cluster		Output command lists for execution on a cluster
  -outdir     <dir>	Specify output directory; must be accessible by all nodes
  -cfpre      <prefix>	Specify command list prefix; default: <domains>.cmds.#
  -listnum    <#>	Specify command list starting # (for embedded use)
  -merge		Merge results after all jobs have run

";

#
# Parse Command Line Options
#
my $fullCmd = "$0 " . join( " ", @ARGV );
my ($opts, $nonOpts) = Args::getArgs( 
	"+domains:|+db:|n:|c:|M:|minDomain:|prefix:|reduce:|lowMem|debug|checkLoci|numThreads:|cdhit:|cluster|+;cluster;outdir:|cfpre:;cluster|listnum:;cluster|merge;cluster*outdir",
	@ARGV, -1, $usage );

$opts->{fullCmd} = $fullCmd;

die "Error: Specified combined hmmhits file '", $opts->{domains}, "' does not exist!\n"
	if ( !(-e $opts->{domains}) );

die "Error: Specified source FASTA file '", $opts->{db}, "' does not exist!\n"
	if ( !(-e $opts->{db}) );

# Set defaults
foreach my $p ( keys( %{$defaults} ) )
{
	$opts->{$p} = $defaults->{$p}
		if ( !exists( $opts->{$p} ) );
}

$opts->{reduce} = lc( $opts->{reduce} );
$opts->{reduce} = $defaults->{reduce}
	if ( $opts->{reduce} !~ /^(?:domains|other|both)$/i );

# backwards compatability
$opts->{lowMem} = 1
	if ( exists( $opts->{checkLoci} ) );

$opts->{prefix} = $opts->{domains} . "." . $opts->{reduce}
	if ( !exists( $opts->{prefix} ) );

# sequence cache
$opts->{cacheId} = Cache::initCache( 500 )
	if ( exists( $opts->{lowMem} ) );

$opts->{cfpre} = $opts->{domains} . ".cmds"
	if ( !exists( $opts->{cfpre} ) );

$opts->{listnum} = 1
	if ( !exists( $opts->{listnum} ) );

reduceDomains( $opts );

Cache::statsCache( $opts->{cacheId} )
	if ( exists( $opts->{lowMem} ) );

exit(0);


sub reduceDomains
{
	my $opts = shift;

	# Load list of loci present in the hmm hits thus excluding
	# unnecessary sequences from being cached.  "lowMem" ops
	# will use pseudo-LRU and fastacmd instead, which is slower
	# but with a much smaller memory footprint.

	my $presentLoci = ( $opts->{reduce} eq 'domains' ) ?
				checkPresentLoci( $opts->{domains} ) :
				undef;
	print STDERR "present loci: ", scalar( keys( %{$presentLoci} ) ), "\n"
		if ( defined( $presentLoci ) && exists( $opts->{debug} ) );
	my ( $seqCache, $lenCache ) = loadSequences( $opts->{db}, $presentLoci, $opts );
	print STDERR "seq cache ents: ", scalar( keys( %{$seqCache} ) ), "\n"
		if ( exists( $opts->{debug} ) );
	print STDERR "len cache ents: ", scalar( keys( %{$lenCache} ) ), "\n"
		if ( exists( $opts->{debug} ) );

	# build masking data structures
	my ( $masked, $toSearch, $domain, $totalLen, $nDomains ) =
		buildMaskedUnmaskedRegions( $lenCache, $opts );

	if ( $opts->{reduce} ne 'other' )
	{
		if ( !exists( $opts->{cluster} ) )
		{
			doReduceDomains( $opts, $seqCache, $lenCache, $domain, $totalLen, $nDomains );
		} else {
			die "output directory '$opts->{outdir}' must be an absolute path"
				if ( $opts->{outdir} !~ /^\// );

			if ( exists( $opts->{merge} ) )
			{
				doClusteredMerge( $opts, $seqCache, $lenCache, $domain, $totalLen, $nDomains );
			} else {
				doClusteredReduceDomains( $opts, $seqCache, $lenCache, $domain, $totalLen, $nDomains );
			}
		}
	}

	doReduceOther( $opts, $seqCache, $lenCache, $toSearch )
		if ( ($opts->{reduce} ne 'domains') && !exists($opts->{merge}) );
}

sub doClusteredReduceDomains
{
	my $opts = shift;
	my $seqCache = shift;
	my $lenCache = shift;
	my $domain = shift;
	my $totalLen = shift;
	my $nDomains = shift;

	my @domKeys = keys( %{$domain} );
	my $numKeys = scalar( @domKeys );

	local *CMD;
	my $clNum = $opts->{listnum};
	open( CMD, ">$opts->{cfpre}.$clNum" );

	# iterate over all domains
	for ( my $i = 0; $i < $numKeys; $i++ )
	{
		my $acc = $domKeys[$i];
		my $hits = $domain->{$acc};
		my $prefix = $opts->{outdir} . "/" . File::baseFile( $opts->{domains} ) . ".dom${acc}." . $opts->{n} . "." . $opts->{c};

		if ( scalar( @{$hits} ) == 1 )
		{
			# Simulate singleton result
			my ( $locusId, $beg, $end ) = @{$hits->[0]};
			my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );

			local *OUT;
			open( OUT, ">$prefix.clstr" );
			print OUT ">Cluster 0\n0\t" . length($seq) . "aa, >$locusId.$beg.$end... *\n";
			close( OUT );
		} elsif ( scalar( @{$hits} ) > 1 )
		{
			my $faaFile = $prefix . ".faa";
			local *FAA;
			open( FAA, ">$faaFile" ) ||
				die "couldn't create cd-hit fasta file '$faaFile': $!";

			my $err = 0;
			foreach ( @{$hits} )
			{
				if ( !defined($_) )
				{
					print STDERR "acc:[$acc] hit is undefined\n";
					$err++;
					next;
				}
				my ( $locusId, $beg, $end ) = @{$_};
				my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );

				print FAA ">$locusId.$beg.$end\n$seq\n";
			}
			close( FAA );

			die "$err errors detected - halting\n"
				if ( $err > 0 );

			my $cdhitOutFile = $prefix;
			my $cmd = $opts->{cdhit} . " -d 0 -M $opts->{M} -n $opts->{n} -c $opts->{c} -i $faaFile -o $cdhitOutFile";
			print CMD $cmd, "\n";
		}
	}

	close( CMD );

	# do command list for merging
	$clNum++;
	open( CMD, ">$opts->{cfpre}.$clNum" );

	print CMD $opts->{fullCmd}, " -merge\n";

	close( CMD );
}

sub doReduceDomains
{
	my $opts = shift;
	my $seqCache = shift;
	my $lenCache = shift;
	my $domain = shift;
	my $totalLen = shift;
	my $nDomains = shift;

	local *COMBCLSTR;
	local *OUTDOM;
	local *OUTFAA;

	my $combClstrFile = $opts->{prefix} . ".reduce.domains.clstr";
	my $reduceDomFile = $opts->{prefix} . ".reduce.domains";
	my $outFaaFile = $opts->{prefix} . ".reduce.domains.faa";

	open( COMBCLSTR, ">$combClstrFile" ) ||
		die "couldn't write to combined clusters file '$combClstrFile': $!";
	open( OUTDOM, ">$reduceDomFile" ) ||
		die "couldn't write to reduced domains file '$reduceDomFile': $!";
	open( OUTFAA, ">$outFaaFile" ) ||
		die "couldn't write to reduced FASTA file '$outFaaFile': $!";

	# for stats (-debug)
	my $nClusters = 0;
	my $nClusterAA = 0;

	my @domKeys = keys( %{$domain} );
	my $numKeys = scalar( @domKeys );

	# maps child pid to domain acc for post-processing
	my %childAccMap = ();

	# map of domain to cluster # to list of [locusId,beg,end]
	my %domCluster = ();

	# number of parallel job slots available
	my $slots = $opts->{numThreads};

	# iterate over all domains
	for ( my $i = 0; $i < $numKeys; $i++ )
	{
		my $acc = $domKeys[$i];
		my $hits = $domain->{$acc};

		if ( scalar( @{$hits} ) == 1 )
		{
			# no need to run cd-hit
			my ( $locusId, $beg, $end ) = @{$hits->[0]};
			$domCluster{$acc} = { 1 => [ $hits->[0] ] };
			$nClusterAA += $end - $beg + 1;
			my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );
			die "couldn't load sequence for sequence id $locusId [$beg, $end]"
				if ( !defined($seq) || (length($seq) < 1) );

			# add singleton to cluster file
			print COMBCLSTR ">Cluster 0\n0\t" . length($seq) . "aa, >$locusId.$beg.$end... *\n";
			print OUTFAA ">$locusId.$beg.$end\n$seq\n";
			print OUTDOM join( "\t", $acc, $locusId, $beg, $end ), "\n";

			$nClusters++;
		} elsif ( scalar( @{$hits} ) > 1 )
		{
			my $faaFile = $opts->{prefix} . ".dom${acc}.faa";
			local *FAA;
			open( FAA, ">$faaFile" ) ||
				die "couldn't create cd-hit fasta file '$faaFile': $!";

			my $err = 0;
			foreach ( @{$hits} )
			{
				if ( !defined($_) )
				{
					print STDERR "acc:[$acc] hit is undefined\n";
					$err++;
					next;
				}
				my ( $locusId, $beg, $end ) = @{$_};
				my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );

				print FAA ">$locusId.$beg.$end\n$seq\n";
			}

			close( FAA );

			die "$err errors detected - halting\n"
				if ( $err > 0 );

			my $cdhitOutFile = $opts->{prefix} . ".dom${acc}." . $opts->{n} . "." . $opts->{c};
			my $cmd = $opts->{cdhit} . " -d 0 -M $opts->{M} -n $opts->{n} -c $opts->{c} -i $faaFile -o $cdhitOutFile";

			# run job
			if ( $slots > 0 )
			{
				# spawn worker process
				my $childPid = fork();
				if ( defined($childPid) )
				{
					if ( $childPid == 0 )
					{
						# BEGIN cdhit EXECUTION CODE

						print STDERR "Running: [$cmd]\n";
						system( $cmd ) == 0 || die "cd-hit failed $? -- $cmd";

						# END cdhit EXECUTION CODE

						# terminate child process
						exit(0);
					} else {
						$childAccMap{$childPid} = $acc;
					}
				} else {
					die "couldn't spawn child worker process: $!";
				}

				$slots--;
			} else {
				$i--;

				print STDERR "no slots - waiting for a child process to complete...\n";
				my $childPid = wait();
				if ( $childPid > 0 )
				{
					# post-proc
					die "couldn't map child pid $childPid to accession"
						if ( !exists( $childAccMap{$childPid} ) );

					my $acc = $childAccMap{$childPid};
					print STDERR "child pid $childPid exited (acc = [$acc])!\n";
					my $fp = $opts->{prefix} . ".dom${acc}." . $opts->{n} . "." . $opts->{c};
					my $cFile = $fp . ".clstr";
					my $cluster = undef;

					open( IN, "<$cFile" ) ||
						die "couldn't open cd-hit output '$cFile': $!";
					while( <IN> )
					{
						print COMBCLSTR;

						if ( /^>Cluster\s+(\d+)/ )
						{
							$cluster = $1;
							$domCluster{$acc}->{$cluster} = [];
							$nClusters++;
						} elsif ( defined($cluster) &&
								( $_ =~ /^\d+\s+\d+aa,\s+>(.+?)\.(\d+)\.(\d+)\.+\s+(.*)$/ ) )
						{
							my ($locusId, $beg, $end, $star) = ($1, $2, $3, $4);
							my $hit = [ $locusId, $beg, $end ];
							if ( $star eq '*' )
							{
								unshift( @{$domCluster{$acc}->{$cluster}}, $hit );
								$nClusterAA += $end - $beg + 1;
								my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );

								print OUTFAA ">$locusId.$beg.$end\n$seq\n";
								print OUTDOM join( "\t", $acc, $locusId, $beg, $end ), "\n";
							} else {
								push( @{$domCluster{$acc}->{$cluster}}, $hit );
							}
						}
					}
					close( IN );

					# clean up
					unlink( $fp, "$fp.bak.clstr", "$fp.clstr", $opts->{prefix} . ".dom$acc.faa" );

					# one job done, increase available slots
					$slots++;
				} elsif ( $childPid < 0 )
				{
					$slots = $opts->{numThreads};
				}
			}
		}
	}

	# wait for worker processes to finish
	my $childPid = -1;
	do
	{
		$childPid = wait();
		die "Child process died: $?" if ($? & 255) != 0;
		
		if ( $childPid > 0 )
		{
			# post-proc
			die "couldn't map child pid $childPid to accession"
				if ( !exists( $childAccMap{$childPid} ) );

			my $acc = $childAccMap{$childPid};
			my $fp = $opts->{prefix} . ".dom$acc." . $opts->{n} . "." . $opts->{c};
			my $cFile = $fp . ".clstr";
			my $cluster = undef;

			open( IN, "<$cFile" ) ||
				die "couldn't open cd-hit output '$cFile': $!";
			while( <IN> )
			{
				print COMBCLSTR;

				if ( /^>Cluster\s+(\d+)/ )
				{
					$cluster = $1;
					$domCluster{$acc}->{$cluster} = [];
					$nClusters++;
				} elsif ( defined($cluster) &&
						( $_ =~ /^\d+\s+\d+aa,\s+>(.+?)\.(\d+)\.(\d+)\.+\s+(.*)$/ ) )
				{
					my ($locusId, $beg, $end, $star) = ($1, $2, $3, $4);
					my $hit = [ $locusId, $beg, $end ];
					if ( $star eq '*' )
					{
						unshift( @{$domCluster{$acc}->{$cluster}}, $hit );
						$nClusterAA += $end - $beg + 1;
						my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );

						print OUTFAA ">$locusId.$beg.$end\n$seq\n";
						print OUTDOM join( "\t", $acc, $locusId, $beg, $end ), "\n";
					} else {
						push( @{$domCluster{$acc}->{$cluster}}, $hit );
					}
				}
			}
			close( IN );

			# clean up
			unlink( $fp, "$fp.bak.clstr", "$fp.clstr", $opts->{prefix} . ".dom$acc.faa" );
		}
	} until ( $childPid < 0 );

	# stats
	if ( exists( $opts->{debug} ) )
	{
		my $s = sprintf( "%.2f%%", (100.0 * $nClusterAA / $totalLen) );
		print STDERR "Reduced $nDomains domains to $nClusters clusters containing $nClusterAA aa ($s of db)\n";
	}

	close( OUTFAA );
	close( OUTDOM );
	close( COMBCLSTR );
}

sub doReduceOther
{
	my $opts = shift;
	my $seqCache = shift;
	my $lenCache = shift;
	my $toSearch = shift;

	#my $clusterFile = $opts->{prefix} . ".reduce.other.clstr";
	my $faaFile = $opts->{prefix} . ".tmp.faa";
	my $outFaaFile = $opts->{prefix} . ".reduce.other.faa";
	#local *OTHER;
	local *FAA;

	open( FAA, ">$faaFile" ) ||
		die "couldn't open $faaFile for write: $!";

	while ( my ( $locusId, $regions ) = each( %{$toSearch} ) )
	{
		foreach my $region ( @{$regions} )
		{
			my ( $beg, $end ) = @{$region};
			my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );

			print FAA ">$locusId.$beg.$end\n$seq\n";
		}
	}
	close( FAA );

	my $cdhitOut = $opts->{prefix} . ".reduce.other." . $opts->{n} . "." . $opts->{c};
	my $cmd = $opts->{cdhit} . " -d 0 -M $opts->{M} -n $opts->{n} -c $opts->{c} -i $faaFile -o $cdhitOut";
	system( $cmd ) == 0 || die "cd-hit failed $? -- $cmd";

	unlink( $faaFile );
	my $cFile = $cdhitOut . ".clstr";
	
	local *CLSTR;
	local *OUTFAA;
	open( OUTFAA, ">$outFaaFile" ) ||
		die "couldn't write cluster FASTA file '$outFaaFile'";
	#open( OTHER, ">$clusterFile" ) ||
	#	die "couldn't write cluster output '$clusterFile'";
	open( CLSTR, "<$cFile" ) ||
		die "couldn't read cd-hit output '$cFile'";
	my $cluster = undef;

	# no clue why we have to do this
	my %domCluster = ( "" => {} );

	while(<CLSTR>)
	{
		chomp;
		if ( /^>Cluster\s+(\d+)/ )
		{
			$cluster = $1;
			$domCluster{""}->{$cluster} = [];
		} elsif ( /^\d+\s+\d+aa,\s+>(.+?)\.(\d+)\.(\d+)\.+\s+(.*)$/ )
		{
			my $locusId = $1;
			my $beg = $2;
			my $end = $3;
			my $star = $4;

			my $hit = [$locusId, $beg, $end];
			if ( $star eq '*' )
			{
				unshift( @{$domCluster{""}{$cluster}}, $hit );
				my $seq = fetchSequence( $opts, $seqCache, $locusId, $beg - 1, $end - $beg + 1 );
				print OUTFAA ">$locusId.$beg.$end\n$seq\n";
			} else {
				push( @{$domCluster{""}{$cluster}}, $hit );
			}

			#print OTHER join( "\t", "Other$cluster", $locusId, $beg, $end ), "\n";
		} else
		{
			# parse error
			die "couldn't parse '$cFile':\n$_";
		}
	}
	close( CLSTR );
	#close( OTHER );
	close( OUTFAA );


	# clean up
	unlink( $cdhitOut, "$cdhitOut.bak.clstr" );
	rename( "$cdhitOut.clstr", $opts->{prefix} . ".reduce.other.clstr" );
}

sub getFastaCmdSeq
{
	my $faFile = shift;
	my $locusId = shift;

	my $cmd = "$ENV{FASTHMM_DIR}/bin/fastacmd -d $faFile -s ";
	if ( $locusId =~ /^\d+$/ )
	{
		$cmd .= "lcl\\|$locusId";
	} else {
		$cmd .= $locusId;
	}

	local *SEQ;
	open(SEQ, "$cmd 2>/dev/null |");
	my $crap = "";
	while( my $line = <SEQ> )
	{
		next if $line =~ /^>/;
		chomp( $line );
		$crap .= $line;
	}
	close(SEQ);

	return (length($crap) > 0) ?
		 $crap : undef;
}

#print STDERR "==> [slots: $slots] [line: ", __LINE__, "] TIGR01580: ", ( defined($domain->{'TIGR01580'}->[0]) ? join(",", @{$domain->{'TIGR01580'}->[0]}) : "undefined" ), "\n"  if defined($domain);

sub fetchSequence
{
	my $opts = shift;
	my $seqCache = shift;
	my $locusId = shift;
	my $start = shift;
	my $len = shift;

	# is it in local seqCache?
	return substr( $seqCache->{$locusId}, $start, $len )
		if ( exists( $seqCache->{$locusId} ) );

	# is it in LRU cache?
	if ( Cache::existsCache( $opts->{cacheId}, $locusId ) )
	{
		my $seq = Cache::getCache( $opts->{cacheId}, $locusId );
		return substr( $seq, $start, $len )
			if ( defined($seq) );
	}

	# get it from fastacmd
	my $seq = getFastaCmdSeq( $opts->{db}, $locusId );
	if ( defined($seq) && (length($seq) > 0) )
	{
		Cache::addCache( $opts->{cacheId}, $locusId, $seq );
		return substr( $seq, $start, $len );
	}

	# error
	die "couldn't find sequence for '$locusId'; tried local and LRU cache, and fastacmd";
}

sub unmasked
{
	my $seqLen = shift;
	my $list = shift;
	my $minDom = shift;
	my ($BEG, $END) = (0, 1);

	return [ [1, $seqLen] ]
		if ( !defined($list) );

	my @regions = ();
	my @copy = @{$list};
	my $last = shift( @copy );

	push( @regions, [1, $last->[$BEG] - 1] )
		if ( $last->[$BEG] > $minDom );

	foreach my $domain ( @copy )
	{
		die "unmerged input to unmasked: last $last->[$BEG]:$last->[$END] this $domain->[$BEG]:$domain->[$END]"
			if ( $domain->[$BEG] - $last->[$END] + 1 < $minDom );
		push( @regions, [$last->[$END] + 1, $domain->[$BEG] - 1] );
		$last = $domain;
	}

	push( @regions, [$last->[$END] + 1, $seqLen] )
		 if ( $seqLen - $last->[$END] >= $minDom );

	return \@regions;
}

sub mergeRegions
{
	my $list = shift;
	my $minDom = shift;
	my ($BEG, $END) = (0, 1);
	my @mask = sort { $a->[$BEG] <=> $b->[$BEG] } @{$list};
	my @merged = ();
	my $last = shift( @mask );
	foreach my $domain ( @mask )
	{
		if ( $domain->[$BEG] - $last->[$END] < $minDom )
		{
			$last->[$END] = $domain->[$END]
				if ( $domain->[$END] > $last->[$END] );
		} else {
			push( @merged, $last );
			$last = $domain;
		}
	}
	push( @merged, $last );
	return \@merged;
}

sub buildMaskedUnmaskedRegions
{
	my $len = shift;
	my $opts = shift;
	my $domFile = $opts->{domains};
	local *IN;

	# for other or both
	my %masked = ();
	my %unmasked = ();

	# for domains or both
	my %domain = ();

	my $totalLen = exists( $opts->{debug} ) ?
			Vector::sum( values( %{$len} ) ) :
			1;

	my $nDomains = 0;
	open(IN, "<$domFile");
	while(<IN>)
	{
		chomp;
		my ( $acc, $locusId, $beg, $end ) = split( /\t/, $_ );
		next if ( $locusId eq 'locusId' );
		die "Cannot parse '$_'"
			unless defined( $end );
		if ( !exists( $len->{$locusId} ) )
		{
			print STDERR "Unable to load sequence information for sequence id $locusId; skipping domain $acc\n";
		} else {
			$beg = 1
				if ( $beg < 1 );
			$end = $len->{$locusId}
				if ( $end > $len->{$locusId} );
			$nDomains++;
			if ( $opts->{reduce} ne 'domains' )
			{
				push( @{$masked{$locusId}}, [ $beg, $end ] );
				$masked{$locusId} = mergeRegions( $masked{$locusId}, $opts->{minDomain} )
					if ( exists( $opts->{lowMem} ) );
			}
			push( @{$domain{$acc}}, [$locusId, $beg, $end] )
				if ( ( $opts->{reduce} ne 'other') ||
					exists( $opts->{debug} ) );
		}
	}
	close(IN);

	print STDERR "Read $nDomains domains for ", scalar( keys( %masked ) ), " loci (", scalar( keys( %domain ) ), " different families)\n"
		if ( exists( $opts->{debug} ) );

	# Merge overlapping regions unless we've already
	# done it incrementally with the lowMem option
	if ( ($opts->{reduce} ne 'domains') && !exists( $opts->{lowMem} ) )
	{
		foreach my $locusId ( keys( %masked ) )
		{
			$masked{$locusId} = mergeRegions( $masked{$locusId}, $opts->{minDomain} );
		}
	}

	if ( ($opts->{reduce} ne 'domains') && exists( $opts->{debug} ) )
	{
		my $nMasked = 0;
		foreach ( values( %masked ) )
		{
			$nMasked += scalar( @{$_} );
		}

		print STDERR "Merged domains into $nMasked masked regions\n";
	}

	# build to-search list for other/both
	if ( $opts->{reduce} ne 'domains' )
	{
		while ( my ( $locusId, $seqLen ) = each( %{$len} ) )
		{
			if ( !exists( $masked{$locusId} ) )
			{
				$unmasked{$locusId} = [ [1, $seqLen] ];
			} else {
				$unmasked{$locusId} = unmasked( $seqLen, $masked{$locusId}, $opts->{minDomain} );
			}
		}

		if ( exists( $opts->{debug} ) )
		{
			my $nToSearch = 0;
			my $searchAA = 0;
			foreach my $list ( values( %unmasked ) )
			{
				$nToSearch += scalar( @{$list} );
				$searchAA += Vector::sum( map {$_->[1] - $_->[0] + 1} @{$list} );
			}
			my $s = sprintf( "%.2f%%", (100.0 * $searchAA / $totalLen) );
			print STDERR "$nToSearch unmasked areas to search containing $searchAA aa total ($s)\n";
		}
	}

	return ( \%masked, \%unmasked, \%domain, $totalLen, $nDomains );
}

sub loadSequences
{
	my $faFile = shift;
	my $include = shift;
	my $opts = shift;
	my %seq = ();
	my %len = ();
	local *IN;

	my $fId = File::openFastaCache( \*IN, $faFile );

	while ( 1 )
	{
		my $seqEnt = File::readNextFastaSeq( $fId );
		last
			if ( !defined( $seqEnt->{defline} ) );

		# should VIMSS-specific parsing be included in
		# the production release?
		my ( $parsedName ) = ( $seqEnt->{defline} =~ /^(?:VIMSS)?(\S+)/i );
		$parsedName = $seqEnt->{defline}
			if ( !defined( $parsedName ) );

		$len{$parsedName} = length( $seqEnt->{seq} );
		$seq{$parsedName} = $seqEnt->{seq}
				if ( !exists( $opts->{lowMem} ) &&
					(
						($opts->{reduce} ne 'domains') ||
						!defined( $include ) ||
						exists( $include->{$parsedName} )
					) );
	}

	File::closeFastaCache( $fId );

	return ( \%seq, \%len );
}

sub checkPresentLoci
{
	my $file = shift;
	local *IN;
	my %present = ();

	open(IN, "<$file");
	while(<IN>)
	{
		my ( undef, $locusId ) = split(/\t/, $_);
		$present{$locusId} = 1;
	}
	close(IN);

	return \%present;
}
