#
#  $Id: File.pm,v 1.6 2008/07/16 23:59:19 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Collection of functions useful for manipulating files/directories
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2.1 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
#  USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.
#

package File;
require Exporter;

use strict;

use vars '$VERSION';
$VERSION = 0.01;

our @ISA = qw(Exporter);
our @EXPORT = qw();

# Deepest directory nesting getFilteredFileList descends to before it
# prints a warning and prunes the results.
our $MAX_RECURSION_DEPTH = 10;
# Registry of the handles opened by openFileIterCache, openPipeIterCache
# and openFastaCache; each element is a hash reference describing one
# handle, and the handle id handed back to callers is its index here.
our @handles = ();
# Handle id that the next successful open*Cache call will return.
our $nextHandleId = 0;

# Parse a simple "name = value" configuration file into a hash.
#
# Arguments:
#   $file - path of the configuration file to read
#
# Returns a reference to a hash mapping each macro name to its value.
# Lines whose first non-blank character is '#' are skipped. On every
# other line the first "word = value" pair is recorded; the value stops
# at the first whitespace or '#' character. When the file name is
# undefined or the file cannot be opened, a reference to an empty hash
# is returned.
sub loadConfMacros
{
	my $file = shift;
	my %conf = ();

	return \%conf
		if ( !defined( $file ) );

	# Three-argument open takes the file name literally, so names with
	# leading/trailing blanks or mode characters are not misinterpreted.
	open( my $in, "<", $file )
		or return \%conf;

	# Read into a lexical so the caller's global $_ is left untouched.
	while ( my $line = <$in> )
	{
		next if ( $line =~ /^\s*#/ );
		chomp( $line );
		$conf{$1} = $2
			if ( $line =~ /(\w+)\s*=\s*([^#\s]+)/ );
	}
	close( $in );

	return \%conf;
}

# Return the last path component of the given path: the run of non-'/'
# characters at the end of the string. Gives undef when there is no such
# run, for example when the path ends in '/'.
sub baseFile
{
	my ( $path ) = @_;

	if ( $path =~ /([^\/]+)$/ )
	{
		return $1;
	}

	return undef;
}

# Open a file for reading, transparently decompressing it when its name
# ends in ".gz" (case-insensitive).
#
# Arguments:
#   $fp   - file handle (e.g. a glob reference) to open
#   $file - path of the file to read
#
# Compressed files are read through "$ENV{FASTHMM_DIR}/bin/gzip -dc".
# Returns the result of open(): true on success, false otherwise. An
# undefined file name is reported as a failure.
sub fileGzOpen
{
	my $fp = shift;
	my $file = shift;

	# three-argument open with an undefined name would open an anonymous
	# temporary file instead of failing, so reject it up front
	return undef
		if ( !defined( $file ) );

	if ( $file =~ /\.gz$/i )
	{
		# List-form pipe open runs gzip directly, without a shell, so
		# blanks or shell metacharacters in the file name are harmless.
		return open( $fp, "-|", "$ENV{FASTHMM_DIR}/bin/gzip", "-dc", $file );
	}

	# Three-argument open takes the file name literally.
	return open( $fp, "<", $file );
}

# Find the first existing file among a base name and a prioritized list
# of extensions.
#
# Arguments:
#   $fileBase - file name to try as-is first
#   ...       - candidate extensions (without the leading '.'), highest
#               priority first; given either as a plain list or as one
#               or more array references
#
# Returns $fileBase when it exists, otherwise the first existing
# "$fileBase.$ext", otherwise undef. Undefined extensions are skipped.
sub fileExtensionPri
{
	my $fileBase = shift;

	return $fileBase
		if ( -e $fileBase );

	# Every remaining argument is a candidate; expand array references so
	# both fileExtensionPri($f, 'a', 'b') and fileExtensionPri($f, ['a', 'b'])
	# try all of the extensions in order.
	my @exts = map { ( ref( $_ ) eq 'ARRAY' ) ? @{$_} : $_ } @_;

	foreach my $ext ( @exts )
	{
		next
			if ( !defined( $ext ) );

		my $file = $fileBase . "." . $ext;
		return $file
			if ( -e $file );
	}

	return undef;
}

# Open a regular file for grouped, cached iteration and register it in
# @handles.
#
# Arguments:
#   $fh            - file handle to open
#   $file          - path of the file to read; must be a regular file
#   $lineToId      - pattern whose first capture extracts the group id
#                    from a line (used by readNextFileIter and
#                    getFileIterIdGroupCache)
#   $idHash        - hash reference of ids consulted when deciding
#                    whether a skipped group is cached (default: {})
#   $exclusion     - when true, skipped groups are cached if their id is
#                    NOT in $idHash; when false, only if it IS in
#                    $idHash (default: 1)
#   $maintainCache - when 0, a cached group is dropped once it has been
#                    returned (default: 0)
#   $debug         - verbosity of the STDERR diagnostics (default: 0)
#
# Returns the new handle id, or -1 when $file is not a regular file or
# cannot be opened.
sub openFileIterCache
{
	my $fh = shift;
	my $file = shift;
	my $lineToId = shift;
	my $idHash = shift;
	my $exclusion = shift;
	my $maintainCache = shift;
	my $debug = shift;
	$debug = 0
		if ( !defined( $debug ) );
	$idHash = {}
		if ( !defined( $idHash ) );
	$exclusion = 1
		if ( !defined( $exclusion ) );
	$maintainCache = 0
		if ( !defined( $maintainCache ) );

	if ( -f $file )
	{
		# Three-argument open reads exactly the name that -f just tested;
		# the two-argument form would re-parse the string and could strip
		# blanks or honour mode characters.
		if ( open( $fh, "<", $file ) )
		{
			my %ent = (
					'file' => $file,
					'debug' => $debug,
					'fh' => $fh,
					'lineToId' => $lineToId,
					'cache' => {},
					'maintainCache' => $maintainCache,
					'buffer' => undef,
					'hash' => $idHash,
					'exclusion' => $exclusion,
				  );
			push( @handles, \%ent );

			return $nextHandleId++;
		}
	}

	return -1;
}

# Start a command and register its output stream in @handles for
# grouped, cached iteration; the pipe counterpart of openFileIterCache.
#
# Arguments:
#   $fh            - file handle to open on the command's output
#   $cmd           - command line to run
#   $lineToId      - pattern whose first capture extracts the group id
#   $idHash        - hash reference of ids used by the caching decision
#                    (default: {})
#   $exclusion     - selects whether ids absent from (true) or present
#                    in (false) $idHash are cached (default: 1)
#   $maintainCache - when 0, cached groups are dropped once returned
#                    (default: 0)
#   $debug         - verbosity of the STDERR diagnostics (default: 0)
#
# Returns the new handle id, or -1 when the pipe cannot be opened.
sub openPipeIterCache
{
	my ( $fh, $cmd, $lineToId, $idHash, $exclusion, $maintainCache, $debug ) = @_;

	$debug = 0 unless ( defined( $debug ) );
	$idHash = {} unless ( defined( $idHash ) );
	$exclusion = 1 unless ( defined( $exclusion ) );
	$maintainCache = 0 unless ( defined( $maintainCache ) );

	unless ( open( $fh, "$cmd |" ) )
	{
		return -1;
	}

	# the registry entry records the command under the 'file' key
	push( @handles, {
			'file' => $cmd,
			'debug' => $debug,
			'fh' => $fh,
			'lineToId' => $lineToId,
			'cache' => {},
			'maintainCache' => $maintainCache,
			'buffer' => undef,
			'hash' => $idHash,
			'exclusion' => $exclusion,
		} );

	my $newId = $nextHandleId;
	$nextHandleId++;

	return $newId;
}

# Open a fasta file for cached sequence lookup and register it in
# @handles.
#
# Arguments:
#   $fh            - file handle to open
#   $file          - path of the fasta file; must be a regular file
#   $hash          - hash reference of sequence names consulted when
#                    deciding whether a skipped sequence is cached
#                    (default: {})
#   $exclusion     - when true, skipped sequences are cached if their
#                    name is NOT in $hash; when false, only if it IS in
#                    $hash (default: 1)
#   $maintainCache - when 0, a cached sequence is dropped once it has
#                    been returned (default: 0)
#   $debug         - verbosity of the STDERR diagnostics (default: 0)
#
# Returns the new handle id, or -1 when $file is not a regular file or
# cannot be opened.
sub openFastaCache
{
	my $fh = shift;
	my $file = shift;
	my $hash = shift;
	my $exclusion = shift;
	my $maintainCache = shift;
	my $debug = shift;
	$debug = 0
		if ( !defined( $debug ) );
	$hash = {}
		if ( !defined( $hash ) );
	$exclusion = 1
		if ( !defined( $exclusion ) );
	$maintainCache = 0
		if ( !defined( $maintainCache ) );

	if ( -f $file )
	{
		# Three-argument open reads exactly the name that -f just tested;
		# the two-argument form would re-parse the string and could strip
		# blanks or honour mode characters.
		if ( open( $fh, "<", $file ) )
		{
			my %ent = (
					'file' => $file,
					'debug' => $debug,
					'fh' => $fh,
					'cache' => {},
					'maintainCache' => $maintainCache,
					'buffer' => undef,
					'hash' => $hash,
					'exclusion' => $exclusion,
				  );
			push( @handles, \%ent );

			return $nextHandleId++;
		}
	}

	return -1;
}

# Fetch the group of lines belonging to one id from an iterator handle,
# using the handle's cache for groups that were already read past.
#
# Arguments:
#   $handleId - id returned by openFileIterCache / openPipeIterCache
#   $groupId  - id of the wanted group
#
# Returns a reference to the array of lines of the group, or undef when
# the handle id is out of range or the group is neither cached nor found
# in the remaining input. Groups read while searching are cached when
# the handle's exclusion/hash settings select them.
sub getFileIterIdGroupCache
{
	my $handleId = shift;
	my $groupId = shift;

	# A negative id (such as the -1 the open functions return on failure)
	# must be rejected; as an array index it would silently address the
	# most recently registered handle.
	return undef
		if ( ( $handleId < 0 )
			|| ( $handleId >= scalar( @handles ) ) );

	my $handle = $handles[$handleId];

	if ( exists( $handle->{cache}->{$groupId} ) )
	{
		my $data = $handle->{cache}->{$groupId};
		if ( $handle->{maintainCache} == 0 )
		{
			# one-shot cache: forget the group once it has been served
			delete( $handle->{cache}->{$groupId} );
			print STDERR "cache hit id group '$groupId'; removing\n"
				if ( $handle->{debug} > 0 );
		} else {
			print STDERR "cache hit id group '$groupId'\n"
				if ( $handle->{debug} > 0 );
		}
		return $data;
	}

	# not cached: read forward, caching the groups that are passed over
	my $numCached = 0;
	my $lineToId = $handle->{lineToId};
	while ( my $group = readNextFileIter( $handleId ) )
	{
		my ( $thisId ) = $group->[0] =~ /$lineToId/i;
		next if ( !defined( $thisId ) );

		if ( $thisId eq $groupId )
		{
			# found the group we were looking for and cached all previous
			print STDERR "found id group '$groupId' after $numCached other groups now cached\n"
				if ( $handle->{debug} > 0 );
			$handle->{cache}->{$thisId} = $group
				if ( $handle->{maintainCache} > 0 );
			return $group;
		}

		# exclusion mode caches ids absent from the hash, inclusion mode
		# caches only ids present in it
		my $go = $handle->{exclusion} ?
				!exists( $handle->{hash}->{$thisId} ) :
				exists( $handle->{hash}->{$thisId} );

		# we need to cache this
		if ( $go )
		{
			$numCached++;
			$handle->{cache}->{$thisId} = $group;
		}
	}

	# this message is printed regardless of the handle's debug level
	print STDERR "id group '$groupId' wasn't found in cache or remainder of input file\n";
	return undef;
}

# Fetch one sequence by name from a fasta handle, using the handle's
# cache for sequences that were already read past.
#
# Arguments:
#   $handleId - id returned by openFastaCache
#   $seqName  - wanted sequence name (the defline text after '>')
#
# Returns the sequence string, or undef when the handle id is out of
# range or the sequence is neither cached nor found in the remaining
# input. Sequences read while searching are cached when the handle's
# exclusion/hash settings select them.
sub getFastaCacheSeq
{
	my $handleId = shift;
	my $seqName = shift;

	# A negative id (such as the -1 openFastaCache returns on failure)
	# must be rejected; as an array index it would silently address the
	# most recently registered handle.
	return undef
		if ( ( $handleId < 0 )
			|| ( $handleId >= scalar( @handles ) ) );

	my $handle = $handles[$handleId];

	if ( exists( $handle->{cache}->{$seqName} ) )
	{
		my $seq = $handle->{cache}->{$seqName}->{seq};
		if ( $handle->{maintainCache} == 0 )
		{
			# one-shot cache: forget the sequence once it has been served
			delete( $handle->{cache}->{$seqName} );
			print STDERR "cache hit for '$seqName'; removing\n"
				if ( $handle->{debug} > 0 );
		} else {
			print STDERR "cache hit for '$seqName'\n"
				if ( $handle->{debug} > 0 );
		}
		return $seq;
	}

	# not cached: read forward, caching the sequences that are passed over
	my $numCached = 0;
	while ( 1 )
	{
		my $seqEnt = readNextFastaSeq( $handleId );

		# an entry without a defline marks the end of the input
		last
			if ( !defined( $seqEnt->{defline} ) );

		if ( $seqEnt->{defline} eq $seqName )
		{
			# found the one we were looking for and cached all previous
			print STDERR "found '$seqName' after $numCached other sequences now cached\n"
				if ( $handle->{debug} );
			$handle->{cache}->{$seqEnt->{defline}} = $seqEnt
				if ( $handle->{maintainCache} > 0 );
			return $seqEnt->{seq};
		}

		# exclusion mode caches names absent from the hash, inclusion mode
		# caches only names present in it
		my $go = $handle->{exclusion} ?
				!exists( $handle->{hash}->{$seqEnt->{defline}} ) :
				exists( $handle->{hash}->{$seqEnt->{defline}} );

		# we need to cache this
		if ( $go )
		{
			$numCached++;
			$handle->{cache}->{$seqEnt->{defline}} = $seqEnt;
		}
	}

	print STDERR "sequence '$seqName' wasn't found in cache or remainder of fasta file\n"
		if ( $handle->{debug} > 0 );
	return undef;
}

# Pipe-handle alias of readNextFileIter: hands all of its arguments to
# that function and returns whatever it returns.
sub readNextPipeIter
{
	my @args = @_;

	return readNextFileIter( @args );
}

# Read the next group of consecutive lines sharing one id from an
# iterator handle.
#
# Arguments:
#   $handleId - id returned by openFileIterCache / openPipeIterCache
#
# Each line is matched against the handle's lineToId pattern and the
# first capture is taken as the line's id. The first line of the
# following group is pushed back into the handle's one-line buffer so
# the next call starts with it. Lines that do not match the pattern are
# ignored, but they end the run of the current group.
#
# Returns a reference to the array of (chomped) lines of the group, or
# undef when the handle id is out of range or the input is exhausted.
sub readNextFileIter
{
	my $handleId = shift;
	return undef
		if ( ( $handleId < 0 )
			|| ( $handleId >= scalar( @handles ) ) );

	my @lines = ();
	my $handle = $handles[$handleId];
	my $fh = $handle->{fh};
	my $lineToId = $handle->{lineToId};
	my $curId = undef;
	my $read = 0;

	while ( 1 )
	{
		# Take the pushed-back line first, then fall back to the file.
		# Both are tested with defined() rather than for truth, so a line
		# consisting of just "0" is neither dropped nor mistaken for the
		# end of the input.
		my $line = $handle->{buffer};
		if ( defined( $line ) )
		{
			$handle->{buffer} = undef;
		} else {
			$line = <$fh>;
			last
				if ( !defined( $line ) );
		}

		if ( $line =~ /$lineToId/i )
		{
			my $thisId = $1;
			chomp( $line );
			if ( !$read && !defined($curId) )
			{
				# first matching line: it starts the group
				$curId = $thisId;
				print STDERR "line $. begins id:[$curId] group of lines\n"
					if ( $handle->{debug} > 2 );
				push( @lines, $line );
				$read = 1;
			} elsif ( $read && ( $curId eq $thisId ) )
			{
				push( @lines, $line );
			} else
			{
				# line belongs to the next group: keep it for the next call
				$handle->{buffer} = $line;
				print STDERR "group id:[$curId] has " . scalar(@lines) . " lines\n"
					if ( $handle->{debug} > 1 );
				return \@lines;
			}
		} else {
			print STDERR "line $. of input does not match id-parsing pattern; ignoring...\n"
				if ( $handle->{debug} > 0 );
			$read = 0;
		}
	}

	if ( defined($curId) )
	{
		# eof and last result set
		print STDERR "group id:[$curId] has " . scalar(@lines) . " lines\n"
			if ( $handle->{debug} > 0 );
		return \@lines;
	} else {
		# eof and buffer is empty, therefore nothing to return
		return undef;
	}
}

# Read the next sequence entry from a fasta handle.
#
# Arguments:
#   $handleId - id returned by openFastaCache
#
# Returns a reference to a hash with the keys 'defline' (the text after
# '>' with surrounding whitespace removed) and 'seq' (the upper-cased
# sequence lines joined together). At the end of the input 'defline' is
# undef and 'seq' is the empty string; for a handle id that is out of
# range both values are undef. The defline that starts the following
# entry is pushed back into the handle's one-line buffer.
sub readNextFastaSeq
{
	my $handleId = shift;

	# A negative id (such as the -1 openFastaCache returns on failure)
	# must be rejected; as an array index it would silently address the
	# most recently registered handle.
	return { 'defline' => undef, 'seq' => undef }
		if ( ( $handleId < 0 )
			|| ( $handleId >= scalar( @handles ) ) );

	my %seq = ( 'defline' => undef, 'seq' => "" );
	my $readSeq = 0;

	my $handle = $handles[$handleId];
	my $fh = $handle->{fh};
	while ( 1 )
	{
		# Take the pushed-back line first, then fall back to the file.
		# Both are tested with defined() rather than for truth, so a final
		# line consisting of just "0" is not mistaken for end of input.
		my $line = $handle->{buffer};
		if ( defined( $line ) )
		{
			$handle->{buffer} = undef;
		} else {
			$line = <$fh>;
			last
				if ( !defined( $line ) );
		}

		$line =~ s/^\s+|\s+$//g;
		if ( !$readSeq && ( $line =~ /^>(.+)/ ) )
		{
			$seq{defline} = $1;
			$readSeq = 1;
		} elsif ( $readSeq )
		{
			if ( $line !~ /^>/ )
			{
				$seq{seq} .= uc($line);
			} else {
				# next entry begins: keep its defline for the next call
				$handle->{buffer} = $line;

				return \%seq;
			}
		}
	}

	return \%seq;
}

# Close a handle opened with openFastaCache; delegates to
# closeFileIterCache and returns its result.
sub closeFastaCache
{
	my ( $id ) = @_;

	return closeFileIterCache( $id );
}

# Drop the cache of a registered handle and close its file handle.
# Handle ids outside the range of @handles are ignored. For a valid id
# the result of close() is returned.
sub closeFileIterCache
{
	my ( $id ) = @_;

	if ( ( $id >= 0 ) && ( $id < scalar( @handles ) ) )
	{
		my $entry = $handles[$id];

		# clear cache and close fh
		undef( $entry->{cache} );
		return close( $entry->{fh} );
	}

	return;
}

# Format a named sequence as a fasta record with wrapped sequence lines.
#
# Arguments:
#   $name  - sequence name, written after '>'
#   $seq   - sequence string
#   $width - number of sequence characters per line (default: 60; the
#            default is also used when the value is not at least 1, and
#            fractional widths are truncated to an integer)
#
# Leading and trailing whitespace is removed from both name and
# sequence. Returns the record as a string ending in a newline, or the
# empty string when $name or $seq is undefined.
sub formatFasta
{
	my $name = shift;
	my $seq = shift;
	my $width = shift;

	# A zero width would make the modulus below die, and a negative or
	# fractional one would not form a valid {n} quantifier in the regex.
	$width = 60
		if ( !defined($width) || ( int($width) < 1 ) );
	$width = int($width);

	return ""
		if ( !defined($name) ||
			!defined($seq) );

	$seq =~ s/^\s+|\s+$//g;
	$name =~ s/^\s+|\s+$//g;

	my $seqLen = length($seq);

	# break the sequence after every $width characters ...
	$seq =~ s/(.{$width})/$1\n/g;
	# ... and terminate a final, shorter line as well
	$seq .= "\n"
		if ( ($seqLen % $width) > 0 );

	return ">$name\n$seq";
}

# Recursively collect the files below a directory whose names match at
# least one of a set of patterns.
#
# Arguments:
#   $dir     - directory to scan
#   $filters - reference to an array of regular expressions, matched
#              against the file name (not the full path); an undefined
#              value is treated as an empty list
#   $depth   - current recursion depth (default: 0); sub-directories are
#              not descended into while this is negative
#
# Returns a reference to an array of matching paths, each built as
# "$dir/$file". Scanning stops with a warning on STDERR once the depth
# exceeds $MAX_RECURSION_DEPTH. A directory that does not exist or
# cannot be opened contributes no files.
sub getFilteredFileList
{
	my $dir = shift;
	my $filters = shift;
	my $depth = shift;
	$depth = 0
		if ( !defined( $depth ) );
	$filters = []
		if ( !defined( $filters ) );

	if ( $depth > $MAX_RECURSION_DEPTH )
	{
		print STDERR "Warning: Maximum recursion depth $MAX_RECURSION_DEPTH exceeded while building file list; pruning results ...\n";
		return [];
	}

	# remove trailing slashes from directory, but leave a path made up
	# only of slashes (the root directory) intact
	$dir =~ s/(?<=[^\/])[\/]+$//;

	my @files = ();

	return \@files
		if ( !( -d $dir ) );

	# lexical directory handle: each recursion level gets its own
	opendir( my $dh, $dir )
		or return \@files;

	# avoid a doubled slash when the directory is the root
	my $prefix = ( $dir =~ /\/$/ ) ? $dir : $dir . "/";

	while ( defined( my $file = readdir( $dh ) ) )
	{
		# skip "." and ".."
		next if ( $file =~ /^\.\.?$/ );

		my $fullFile = $prefix . $file;
		if ( -d $fullFile )
		{
			next
				if ( $depth < 0 );
			my $dirFiles = getFilteredFileList( $fullFile, $filters, $depth + 1 );
			push( @files, @{$dirFiles} );
		} elsif ( -f $fullFile )
		{
			foreach my $filter ( @{$filters} )
			{
				if ( $file =~ /$filter/ )
				{
					# one matching filter is enough; list the file once
					push( @files, $fullFile );
					last;
				}
			}
		}
	}
	closedir( $dh );

	return \@files;
}

# Verify that a tab-delimited file has the expected shape.
#
# Arguments:
#   $filename - path of the file to check
#   $lines    - expected number of lines
#   $columns  - expected number of tab-separated columns on every line
#
# Dies when the file cannot be read or when any line has a different
# number of columns. Returns 1 when the number of lines read equals
# $lines, 0 otherwise.
sub checkFileLinesColumns($$$) {
    my ($filename,$lines,$columns) = @_;
    open(my $check, "<", $filename) || die "Cannot read $filename";
    my $nRead = 0;
    while(my $line = <$check>) {
	chomp $line;
	# A negative limit keeps trailing empty fields, so a row whose last
	# columns are empty is still counted at its full width.
	my @F = split /\t/, $line, -1;
	my $nActual = scalar(@F);
	die "Expected $columns not $nActual columns in \n$line\nin $filename\n"
	    unless $nActual == $columns;
	$nRead++;
    }
    close($check) || die "Error reading $filename";
    return( $lines == $nRead ? 1 : 0);
}

1;
