#
#  $Id: fbmerge.make,v 1.3 2008/09/29 23:07:24 mprice Exp $
#
#  Merge two FastBLAST databases. Run with
#  make FASTHMM_DIR=~/fasthmm DIR1=dir1 FAA1=db1 DIR2=dir2 FAA2=db2 NPIECES=100 -j nCPUs
#	(or with qmake)
#  where dir1/ and dir2/ contain fast-blast databases, including
#  the files from the reduction
#
#  Do NOT run in the same directory as the fast-blast databases.
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2.1 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
#  USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

# Steps:
# Make the fbmerge directory
# Split masked(1) into fbmerge/masked1.*.faa
# BLAST masked(1) vs. reduce(2) into fbmerge/masked1.*.blastp
# Expand these blast hits into fbmerge/masked1.hits.expanded
# Use masked1.*.blastp to mask masked(2) into fbmerge/unmasked2.faa
#       this also writes masked2.list
# Add additional members of these families into fbmerge/masker1.hits
# Sort expanded list and maskermember list into fbmerge/masked1.hits.expanded.bydom
# Replace members of masked domains with potential members of maskers' domains (fbmerge/masked2.maskers.members)


# Split into fbmerge/masked1.*.dom
# Use fastBlastAlignment.pl to make masked1.*.hits and masked1.*.align
# Split unmasked2.faa into fbmerge/unmasked2.*.faa
# BLAST unmasked2 vs. reduce(1) into fbmerge/unmasked2.*.blastp
# Expand into fbmerge/unmasked2.hits.expanded
# Sort expanded hits into fbmerge/unmasked2.hits.expanded.bydom
# Split into fbmerge/unmasked2.*.dom
# Use fastBlastAlignment.pl to make unmasked2.*.hits and unmasked2.*.align
#	Note that there is a tricky step here: the alignment files do
#	not include hits from a seed to itself if the seed had no hits.
#	So, we need to add an alignment to the seed. These may then be redundant.
#	So there will be (rare) redundancies in the domain files for seeds
#	This is filtered out later in combine.pl
# Merge the alignments and the hits files
#	Note need to ignore masked-out seeds that were excluded from masked2.masked.faa
#
# Also make the reduction sets so we can merge a 2nd time if we want to (albeit in another directory):
# fb.other.masked.faa is all masked from 1 and the unmasked set from 2
#	fb.other.reduced.comb.clstr.byexemplar is just the combination, and sorted by name
#	(In principle, masked-out clusters should be removed but that isn't implemented;
#	this shouldn't affect incremental FastBlast)
# fb.domains.merged.faa is just the combination of the two inputs
#	ditto for fb.domains.clstr.byexemplar but it needs to be sorted

# Number of pieces each sequence set is split into.
# "?=" only assigns when NPIECES is not already defined.
# The explanation sits on its own lines: a trailing "# comment" on the
# assignment line would leave the whitespace before the "#" in the value.
NPIECES ?= 100
# e.g. if NPIECES=4 then PLIST = 1 2 3 4
# $$ quotes $ before sending it to perl
# PLIST is a recursive (=) variable, so $(NPIECES) is read each time PLIST
# is expanded rather than when this line is parsed.
PLIST=$(shell perl -e 'map {print "$$_\n";} (1..$$ARGV[0])' $(NPIECES))

# Verify that all required parameters are set.
# ifndef is true when the variable is undefined or has an empty value.
#
# The "current directory" checks compare resolved paths (GNU make >= 3.81
# $(realpath)), so ".", "./", "sub/.." and the absolute path of the working
# directory are all rejected. $(realpath) expands to nothing for a path that
# does not exist, which never equals $(realpath .).
ifndef DIR1
$(error DIR1 must be set to a fast-blast output directory)
endif
ifeq ($(realpath $(DIR1)),$(realpath .))
$(error DIR1 cannot be set to the current directory)
endif

ifndef DIR2
$(error DIR2 must be set to a fast-blast output directory)
endif
ifeq ($(realpath $(DIR2)),$(realpath .))
$(error DIR2 cannot be set to the current directory)
endif

ifndef FASTHMM_DIR
$(error FASTHMM_DIR must be set)
endif

ifndef FAA1
$(error FAA1 must be set)
endif

ifndef FAA2
$(error FAA2 must be set)
endif

# get defaults
# The included file is read at this point, so anything it defines is visible
# to every rule below.
CONF=$(FASTHMM_DIR)/conf/fastBlastMake.conf
include $(CONF)
# Directory holding the executables that the recipes call as $(BIN)/<name>.
BIN=$(FASTHMM_DIR)/bin

# Scratch directory handed to sort and fastBlastAlignment.pl as
# "-T $(TMPDIR)". If it expanded to nothing, "-T" would swallow the next
# option as its argument, so fall back to /tmp. "?=" leaves a definition from
# the environment, the command line or $(CONF) untouched.
TMPDIR ?= /tmp

# Delete target if command exits with non-zero status
.DELETE_ON_ERROR:

# all targets

# Top-level goal: build every merged output file, then report completion.
# "all" is not a file, so it is declared phony by name; a bare ".PHONY:"
# with no prerequisites declares nothing.
.PHONY: all
all: fbmerge/dupnames fb.all.align fb.all.nseq fb.other.masked.faa fb.domains.merged.faa fb.other.reduced.comb.clstr.byexemplar fb.domains.clstr.byexemplar
	echo FastBLAST-merge of fasta $(FAA1) dir $(DIR1)/ with fasta $(FAA2) dir $(DIR2)/ is complete

# Remove the fbmerge/ working directory and the merged outputs written to the
# current directory. Declared phony by name so that a file called "clean"
# cannot stop the recipe from running (a bare ".PHONY:" declares nothing).
.PHONY: clean
clean:
	$(BIN)/rm -Rf fbmerge fb.all.align fb.all.nseq fb.other.masked.faa fb.domains.merged.faa fb.other.reduced.comb.clstr.byexemplar fb.domains.clstr.byexemplar combined.faa* fb.all*

# Stamp file recording that the fbmerge/ working directory exists.
# "mkdir -p" succeeds quietly when the directory is already there, and "&&"
# keeps the stamp from being touched when the directory could not be created.
fbmerge/made:
	mkdir -p fbmerge && touch $@

# Check that identifiers do not overlap; $@ is target; $^ is all dependencies
# The join of the two name lists is written to $@.tmp. If it is non-empty the
# recipe fails and the message points at $@.tmp: .DELETE_ON_ERROR removes $@
# itself after a failed recipe, so the list must live in a file make does not
# delete. An empty list is renamed to $@ to mark the check as passed.
fbmerge/dupnames: fbmerge/names1 fbmerge/names2
	$(BIN)/join.pl -header 0 -match 1.1=2.1 $^ > $@.tmp && perl -e 'die "Duplicate names: see $@.tmp" if -s "$@.tmp"' && $(BIN)/mv $@.tmp $@

# $$ quotes $; $@ is target; $< is first dependency
# For every input line containing ">" followed by non-whitespace (the FASTA
# header lines), print that run of non-whitespace characters.
# The output uses a plain ">" redirect: ">&" is a csh/bash form that a POSIX
# /bin/sh such as dash rejects, and where it is accepted it also sends error
# output into the name list.
fbmerge/names1: $(FAA1) fbmerge/made
	perl -ne 'print "$$1\n" if m/>(\S+)/;' < $< > $@.tmp && $(BIN)/mv $@.tmp $@

fbmerge/names2:  $(FAA2) fbmerge/made
	perl -ne 'print "$$1\n" if m/>(\S+)/;' < $< > $@.tmp && $(BIN)/mv $@.tmp $@


# BLAST seed1 versus reduction of 2
# Per-piece file names fbmerge/masked1.<N>.<ext> for each N in $(PLIST).
# Simple (:=) assignment expands $(PLIST), and so runs its $(shell ...), once
# here instead of on every later reference. Comments are kept on their own
# lines so that no trailing whitespace ends up in the values.
M1_PIECES_PRE := $(addprefix fbmerge/masked1., $(PLIST))
# seeds
M1_PIECES_FAA := $(addsuffix .faa, $(M1_PIECES_PRE))
# initial blast hits before expanding
M1_PIECES_BLAST := $(addsuffix .blastp, $(M1_PIECES_PRE))
# potential members after expanding
M1_PIECES_DOM := $(addsuffix .dom, $(M1_PIECES_PRE))
# confirmed hits after expanding
M1_PIECES_HITS := $(addsuffix .hits, $(M1_PIECES_PRE))
# alignments of confirmed hits
M1_PIECES_ALIGN := $(addsuffix .align, $(M1_PIECES_PRE))

# Convenience goal: run only the masked(1) vs. reduce(2) BLAST jobs.
# Declared phony by name; a bare ".PHONY:" declares nothing.
.PHONY: blast1v2
blast1v2: $(M1_PIECES_BLAST)

# Static pattern rule: each fbmerge/masked1.N.blastp is built from the matching
# .faa piece ($<) once the formatted database fbmerge/reduce2.faa exists.
# blastall writes to $@.tmp, which is renamed only after it succeeds.
$(M1_PIECES_BLAST): %.blastp : %.faa fbmerge/reduce2.faa.pin
	$(BIN)/blastall -p blastp $(BLASTP_OPTIONS_INITIAL) -m 8 -i $< -d fbmerge/reduce2.faa -o $@.tmp && $(BIN)/mv $@.tmp $@

# Sequence file used as the BLAST database for set 2: the concatenation of
# both prerequisites ($^), in the order listed.
# The output goes to $@.tmp first and is renamed only when cat succeeds.
fbmerge/reduce2.faa: $(DIR2)/fb.other.masked.faa \
                     $(DIR2)/fb.domains.merged.faa
	$(BIN)/cat $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Split $(DIR1)/fb.other.masked.faa into $(NPIECES) pieces. splitFasta.pl gets
# the input file, the piece count, the prefix fbmerge/masked1 ($@) and the
# extension "faa". The prefix itself is then touched as an empty stamp file
# recording that the pieces were made.
fbmerge/masked1: $(DIR1)/fb.other.masked.faa
	$(BIN)/splitFasta.pl $< $(NPIECES) $@ faa \
	  && touch $@

# Each piece depends on the stamp, so the split runs before any piece is used.
$(M1_PIECES_FAA): fbmerge/masked1

# Use masked1 blast hits to make unmasked2.faa and then masker1.hits
# Expand fbmerge/masked1.*.blastp into fbmerge/masked1.hits.expanded
# Sort into .bydom
# Convenience goal: build the sorted expanded hit list for masked(1) and the
# unmasked sequences of set 2.
.PHONY: expand1v2
expand1v2: fbmerge/masked1.hits.expanded.bydom \
           fbmerge/unmasked2.faa

# Keep columns 1, 2, 9 and 10 of every masked1 BLAST piece and sort the result
# on column 2 onward in the C locale.
# The output uses a plain ">" redirect: ">&" is a csh/bash form that a POSIX
# /bin/sh such as dash rejects, and where it is accepted it also sends error
# output into the sorted hit list.
fbmerge/masked1.hits.sorted: $(M1_PIECES_BLAST)
	$(BIN)/cut -f 1,2,9,10 $(M1_PIECES_BLAST) | LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 2 > $@.tmp && $(BIN)/mv $@.tmp $@

# Combine both cluster files of set 2 ($^) into one file, sorted in the C
# locale. The output is written to $@.tmp and renamed once sort succeeds.
fbmerge/clusters2: $(DIR2)/fb.domains.clstr.byexemplar \
                   $(DIR2)/fb.other.reduced.comb.clstr.byexemplar
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 1 $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Expand the sorted masked1 hits using the set-2 clusters.
# $< is the first prerequisite (the sorted hits) and $(word 2,$^) is the second
# (the clusters file). expandDomains.pl writes $@ itself through -out.
fbmerge/masked1.hits.expanded: fbmerge/masked1.hits.sorted fbmerge/clusters2
	$(BIN)/expandDomains.pl \
	  -clusters $(word 2,$^) \
	  -domains $< \
	  -out $@ \
	  $(EXPAND_OPTIONS)

# filterFaaNoHits.pl reads the masked1 BLAST pieces ($^) and
# $(DIR2)/fb.other.masked.faa, and is told to write two files: $@ (-out) and
# fbmerge/masked2.list (-excluded).
fbmerge/unmasked2.faa: $(M1_PIECES_BLAST)
	$(BIN)/filterFaaNoHits.pl $(MERGE_MASK_OPTIONS) \
	  -out $@ \
	  -faa $(DIR2)/fb.other.masked.faa \
	  -excluded fbmerge/masked2.list \
	  $^

# masked2.list is tab-delimited of the form masked_domain_id masked_by_domain_id
# It has no recipe of its own: it is produced alongside unmasked2.faa, so it
# only records a dependency on that file.
fbmerge/masked2.list: fbmerge/unmasked2.faa

# Run expandHitsByDomains.pl on the masked2 list ($<, the first prerequisite)
# against the database directory $(DIR2). Its standard output is captured in
# $@.tmp and renamed on success. $(DIR2)/fb.all.align is listed so that this
# target is rebuilt when that file changes.
fbmerge/masker1.hits: fbmerge/masked2.list $(DIR2)/fb.all.align
	$(BIN)/expandHitsByDomains.pl -db $(DIR2) -in $< > $@.tmp \
	  && $(BIN)/mv $@.tmp $@


# Sort the expanded hits together with the masker hits ($^ = both inputs) in
# the C locale. The result goes to $@.tmp and is renamed once sort succeeds.
fbmerge/masked1.hits.expanded.bydom: fbmerge/masked1.hits.expanded \
                                     fbmerge/masker1.hits
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Split unmasked2.faa and blast vs. reduce(1)
# Per-piece file names fbmerge/unmasked2.<N>.<ext> for each N in $(PLIST).
# Simple (:=) assignment expands $(PLIST), and so runs its $(shell ...), once
# here instead of on every later reference. Comments are kept on their own
# lines so that no trailing whitespace ends up in the values.
U2_PIECES_PRE := $(addprefix fbmerge/unmasked2., $(PLIST))
# seeds
U2_PIECES_FAA := $(addsuffix .faa, $(U2_PIECES_PRE))
# initial blast hits before expanding
U2_PIECES_BLAST := $(addsuffix .blastp, $(U2_PIECES_PRE))
# potential members after expanding
U2_PIECES_DOM := $(addsuffix .dom, $(U2_PIECES_PRE))
# confirmed hits after expanding
U2_PIECES_HITS := $(addsuffix .hits, $(U2_PIECES_PRE))
# alignments of confirmed hits
U2_PIECES_ALIGN := $(addsuffix .align, $(U2_PIECES_PRE))

# Convenience goal: run only the unmasked(2) vs. reduce(1) BLAST jobs.
.PHONY: blast2v1
blast2v1: $(U2_PIECES_BLAST)

# Static pattern rule: each fbmerge/unmasked2.N.blastp is built from the
# matching .faa piece ($<) once the formatted database fbmerge/reduce1.faa
# exists. blastall writes to $@.tmp, which is renamed only after it succeeds.
$(U2_PIECES_BLAST): %.blastp : %.faa fbmerge/reduce1.faa.pin
	$(BIN)/blastall -p blastp $(BLASTP_OPTIONS_INITIAL) -m 8 \
	  -i $< -d fbmerge/reduce1.faa -o $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Sequence file used as the BLAST database for set 1: the concatenation of
# both prerequisites ($^), in the order listed.
# The output goes to $@.tmp first and is renamed only when cat succeeds.
fbmerge/reduce1.faa: $(DIR1)/fb.other.masked.faa \
                     $(DIR1)/fb.domains.merged.faa
	$(BIN)/cat $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Split fbmerge/unmasked2.faa into $(NPIECES) pieces. splitFasta.pl gets the
# input file, the piece count, the prefix fbmerge/unmasked2 ($@) and the
# extension "faa". The prefix itself is then touched as an empty stamp file
# recording that the pieces were made.
fbmerge/unmasked2: fbmerge/unmasked2.faa
	$(BIN)/splitFasta.pl $< $(NPIECES) $@ faa \
	  && touch $@

# Each piece depends on the stamp, so the split runs before any piece is used.
$(U2_PIECES_FAA): fbmerge/unmasked2

# Expand fbmerge/unmasked2.*.blastp into fbmerge/unmasked2.hits.expanded and sort into .bydom

# Keep columns 1, 2, 9 and 10 of every unmasked2 BLAST piece and sort the
# result on column 2 onward in the C locale.
# The output uses a plain ">" redirect: ">&" is a csh/bash form that a POSIX
# /bin/sh such as dash rejects, and where it is accepted it also sends error
# output into the sorted hit list.
fbmerge/unmasked2.hits.sorted: $(U2_PIECES_BLAST)
	$(BIN)/cut -f 1,2,9,10 $(U2_PIECES_BLAST) | LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 2 > $@.tmp && $(BIN)/mv $@.tmp $@

# Combine both cluster files of set 1 ($^) into one file, sorted in the C
# locale. The output is written to $@.tmp and renamed once sort succeeds.
fbmerge/clusters1: $(DIR1)/fb.domains.clstr.byexemplar \
                   $(DIR1)/fb.other.reduced.comb.clstr.byexemplar
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 1 $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Expand the sorted unmasked2 hits using the set-1 clusters.
# $< is the first prerequisite (the sorted hits) and $(word 2,$^) is the second
# (the clusters file). expandDomains.pl writes $@ itself through -out.
fbmerge/unmasked2.hits.expanded: fbmerge/unmasked2.hits.sorted fbmerge/clusters1
	$(BIN)/expandDomains.pl \
	  -clusters $(word 2,$^) \
	  -domains $< \
	  -out $@ \
	  $(EXPAND_OPTIONS)

# Sort the expanded unmasked2 hits ($<) in the C locale. The result goes to
# $@.tmp and is renamed once sort succeeds.
fbmerge/unmasked2.hits.expanded.bydom: fbmerge/unmasked2.hits.expanded
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) $< > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Convenience goal: build the per-piece alignments from both blast1v2 and
# blast2v1.
# .PHONY must name this goal itself ("newdom"). fbmerge/newalign, further
# down, is a real file and is unrelated to this declaration.
.PHONY: newdom
newdom: $(M1_PIECES_ALIGN) $(U2_PIECES_ALIGN)

# Split the sorted expanded hits ($<) into $(NPIECES) pieces. splitDomains.pl
# gets the input file, the piece count, the prefix fbmerge/masked1 and the
# extension "dom". $@ is an empty stamp file touched after a successful split.
fbmerge/masked1.splitdom: fbmerge/masked1.hits.expanded.bydom
	$(BIN)/splitDomains.pl $< $(NPIECES) fbmerge/masked1 dom \
	  && touch $@

# Each .dom piece depends on the stamp, so the split runs before any is used.
$(M1_PIECES_DOM): fbmerge/masked1.splitdom

# Static pattern rule for the masked1 alignments.
# $* is the stem (the part matched by "%"), passed to fastBlastAlignment.pl as
# the output prefix (-o). $< is the matching .dom piece, fed on standard input.
# The search database is $(FAA2) and the anchor database is combined.faa, so
# both formatted databases are prerequisites.
$(M1_PIECES_ALIGN): %.align : %.dom fbmerge/masked1.splitdom $(FAA2).pin combined.faa.pin
	$(BIN)/fastBlastAlignment.pl -T $(TMPDIR) \
	  -db $(FAA2) -anchordb combined.faa \
	  $(BLASTP_OPTIONS_FINAL) -o $* < $<

# The .hits files have no recipe of their own; each one depends on the .align
# file with the same stem.
$(M1_PIECES_HITS): %.hits : %.align

# Additional members of domains from blast2v1
# Split the sorted expanded hits ($<) into $(NPIECES) pieces. splitDomains.pl
# gets the input file, the piece count, the prefix fbmerge/unmasked2 and the
# extension "dom". $@ is an empty stamp file touched after a successful split.
fbmerge/unmasked2.splitdom: fbmerge/unmasked2.hits.expanded.bydom
	$(BIN)/splitDomains.pl $< $(NPIECES) fbmerge/unmasked2 dom \
	  && touch $@

# Each .dom piece depends on the stamp, so the split runs before any is used.
$(U2_PIECES_DOM): fbmerge/unmasked2.splitdom

# Static pattern rule for the unmasked2 alignments.
# $* is the stem (the part matched by "%"), passed to fastBlastAlignment.pl as
# the output prefix (-o). $< is the matching .dom piece, fed on standard input.
# The search database is $(FAA1) and the anchor database is combined.faa, so
# both formatted databases are prerequisites.
$(U2_PIECES_ALIGN): %.align : %.dom fbmerge/unmasked2.splitdom $(FAA1).pin combined.faa.pin
	$(BIN)/fastBlastAlignment.pl -T $(TMPDIR) \
	  -db $(FAA1) -anchordb combined.faa \
	  $(BLASTP_OPTIONS_FINAL) -o $* < $<

# The .hits files have no recipe of their own; each one depends on the .align
# file with the same stem.
$(U2_PIECES_HITS): %.hits : %.align

# Merge the alignments and the hits files, and sort as needed

# Concatenate every per-piece alignment file ($^: unmasked2 pieces first, then
# masked1 pieces). The output is written to $@.tmp and renamed on success.
fbmerge/newalign: $(U2_PIECES_ALIGN) $(M1_PIECES_ALIGN)
	$(BIN)/cat $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Sort every per-piece hits file ($^) on column 2 onward in the C locale.
# The output is written to $@.tmp and renamed once sort succeeds.
fbmerge/newdomains.bygene: $(U2_PIECES_HITS) $(M1_PIECES_HITS)
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 2 $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# uniq on domain, seqId, alignseq, begin, end (do not use score/evalue as these may be inconsistent)
# (and use alignseq in case we have multiple valid alignments of the same seq begin/end to different parts of the domain)
# Merge the alignment files of both input databases with the new alignments
# ($^). combine.pl is asked to make rows unique on columns 1-5, key on
# column 1, and exclude entries listed in column 1 of fbmerge/masked2.list.
# Its standard output goes to $@.tmp and is renamed on success.
fb.all.align: $(DIR1)/fb.all.align $(DIR2)/fb.all.align fbmerge/newalign
	$(BIN)/combine.pl -uniq 1,2,3,4,5 -key 1 \
	  -exclude fbmerge/masked2.list -xcol 1 \
	  $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# uniq on domain, seqId, seqbeg, seqend, dombeg, domend
# Merge the by-gene domain files of both input databases with the new domains
# ($^). combine.pl keys on column 2 and excludes entries listed in column 1 of
# fbmerge/masked2.list. Its standard output goes to $@.tmp and is renamed on
# success.
fb.all.domains.bygene: $(DIR1)/fb.all.domains.bygene \
                       $(DIR2)/fb.all.domains.bygene \
                       fbmerge/newdomains.bygene
	$(BIN)/combine.pl -uniq 1,2,3,4,5,6 -key 2 \
	  -exclude fbmerge/masked2.list -xcol 1 \
	  $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Concatenate the two input FASTA files, $(FAA1) first, into one file.
# The output is written to $@.tmp and renamed once cat succeeds.
combined.faa: $(FAA1) $(FAA2)
	$(BIN)/cat $(FAA1) $(FAA2) > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Run makeDBTables.pl on combined.faa with "." as its second argument, once
# the merged alignment and domain files and the formatted combined database
# exist. The recipe does not name $@; presumably makeDBTables.pl writes
# fb.all.nseq into "." -- TODO confirm against the script.
# The command no longer passes $(FAA): this makefile only defines FAA1 and
# FAA2, so $(FAA) normally expanded to nothing, while a stray FAA in the
# environment would have been inserted as an extra leading argument.
fb.all.nseq: fb.all.align fb.all.domains.bygene combined.faa.pin
	$(BIN)/makeDBTables.pl combined.faa .

# Merged masked set: $(DIR1)/fb.other.masked.faa followed by
# fbmerge/unmasked2.faa ($^). Written to $@.tmp and renamed on success.
fb.other.masked.faa: $(DIR1)/fb.other.masked.faa \
                     fbmerge/unmasked2.faa
	$(BIN)/cat $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Combine the domain cluster files of both inputs ($^).
# The step list at the top of this file says this output needs to be sorted,
# and concatenating two sorted files does not give a sorted file. The inputs
# are therefore sorted together, with the same command the fbmerge/clusters1
# and fbmerge/clusters2 rules apply to these files.
fb.domains.clstr.byexemplar: $(DIR1)/fb.domains.clstr.byexemplar $(DIR2)/fb.domains.clstr.byexemplar
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 1 $^ > $@.tmp && $(BIN)/mv $@.tmp $@

# Merged domain sequences: the two input files ($^) concatenated, set 1 first.
# Written to $@.tmp and renamed once cat succeeds.
fb.domains.merged.faa: $(DIR1)/fb.domains.merged.faa \
                       $(DIR2)/fb.domains.merged.faa
	$(BIN)/cat $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Strictly speaking, we should remove the excluded seeds (in fbmerge/masked2.list) from
# being listed here, but it isn't necessary
# The step list at the top of this file describes this output as the
# combination "sorted by name", and concatenating two sorted files does not
# give a sorted file. The inputs are therefore sorted together, with the same
# command the fbmerge/clusters1 and fbmerge/clusters2 rules apply to these files.
fb.other.reduced.comb.clstr.byexemplar: $(DIR1)/fb.other.reduced.comb.clstr.byexemplar $(DIR2)/fb.other.reduced.comb.clstr.byexemplar
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 1 $^ > $@.tmp && $(BIN)/mv $@.tmp $@

# Generic rule to format a database: run formatdb on any .faa file ($<).
# The touch is there in case formatdb writes *.00.pin instead of *.pin, so
# that the target $@ exists after a successful run either way.
%.faa.pin: %.faa
	$(BIN)/formatdb -p T -i $< -o T \
	  && touch $@
