#
#  $Id: fastblast.make,v 1.13 2008/08/07 18:53:16 mprice Exp $
#  fastHmm/fastBlast Alignment Tools
#  http://microbesonline.org/fasthmm (fasthmm@microbesonline.org)
#
#  Controls the dependencies for actually running fastblast
#  Designed to work with parallel make, such as GNU make or qmake
#
#  Copyright (C) 2007 The Regents of the University of California
#  All rights reserved.
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2.1 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
#  USA.
#
#  Disclaimer
#
#  NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
#  NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
#  OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
#  COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
#  OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
#  PRIVATELY OWNED RIGHTS.

# Writes pieces files to fb/ to keep the working directory
# (which must be the current directory) uncluttered.
#
# Inputs: the allhmm file created by fastBlastPrepare.pl and $(FAA)
# Final outputs: fb.all.align and fb.all.domains.bygene
# and the index files *.seek.db and statistics file fb.all.nseq
# which are read by topHomologs.pl
#
# allhmm should be the *.domains files from fastHmm.pl plus hits from COG
# the $(FAA) file should be a fasta file and also a BLASTp database built with -o T
#	so that fastacmd will work
#
# Both output files are tab-delimited:
# fb.all.align has fields domainId, locusId, alignment, seqBegin, seqEnd, score (bits), e-value
#	and is grouped by domainId
#	alignment is an actual sequence, with insertions relative to the family removed,
#	not an alignment specification
# fb.all.domains.bygene has the same format as FastHMM output but is sorted by gene, not by domain
#
# all coordinates are 1-based
#
# The overall flow of work is:
# Reduce non-domain ("other") regions
#	fb.other.reduced.faa, fb.other.reduced.comb.clstr.byexemplar
#	fb.other.reduced.faa is the database for the other-vs-other BLAST
# BLAST them against each other
#	fb/other.{1,2,...}.blastp
# Reduce the non-domains more by masking
#	fb.other.masked.{blastp,id,faa} and fb/masked.{1,2,...}.faa
#	fb.other.masked.faa is the database for the other-vs-domains BLAST
# Reduce the domains & merge the results
#	fb.domains.merged.{domains,faa} and fb.domains.clstr.byexemplar
# BLAST the masked other set against the merged domains
#	fb/masked.{1,2,...}.blastp
# Expand the two sets of hits, and sort by ad-hoc domain
#	fb.hits.expanded.bydom
# Use fastBlastAlignment.pl to confirm & align ad-hoc domain members
#	fb/newdom.{1,2,...}.{align,hits}
# Align all the hmm hits
#	fb/allhmm.{1,2,...}.align
# Combine all the output
#	fb.all.align, fb.all.domains.bygene

# Bits of independent work as defined by NPIECES (e.g., 4) and
# PIECES (e.g., 1 2 3 4).
#
# OTHER_PIECES_PRE might be fb/other.1, fb/other.2, etc.
# OTHER_PIECES_FAA might be fb/other.1.faa, fb/other.2.faa, etc.

# Per-piece file names for the first blastp pass over the reduced "other"
# (non-domain) regions: a prefix, a .faa and a .blastp name per entry of PIECES
OTHER_PIECES_PRE = $(PIECES:%=fb/other.%)
OTHER_PIECES_FAA = $(OTHER_PIECES_PRE:%=%.faa)
OTHER_PIECES_BLAST = $(OTHER_PIECES_PRE:%=%.blastp)

# Per-piece file names for the masked "other" set (what is left after removing
# further sequences based on the first round of blastp), which is blasted
# against the reduced/merged domains
MASKED_PIECES_PRE = $(patsubst %,fb/masked.%,$(PIECES))
MASKED_PIECES_FAA = $(patsubst %,%.faa,$(MASKED_PIECES_PRE))
MASKED_PIECES_BLAST = $(patsubst %,%.blastp,$(MASKED_PIECES_PRE))


# Per-piece file names for the expanded domains file (the new domains):
#	*.dom   potential members
#	*.hits  actual members
#	*.align alignments (with sequences, not lists of ranges) for those hits
NEWDOM_PIECES_PRE = $(foreach piece,$(PIECES),fb/newdom.$(piece))
NEWDOM_PIECES_DOM = $(foreach pre,$(NEWDOM_PIECES_PRE),$(pre).dom)
NEWDOM_PIECES_ALIGN = $(foreach pre,$(NEWDOM_PIECES_PRE),$(pre).align)
NEWDOM_PIECES_HITS = $(foreach pre,$(NEWDOM_PIECES_PRE),$(pre).hits)

# Per-piece file names for the input domains (allhmm): the split input (.dom),
# the reduced domains (.reduce, .reduce.clstr) and the alignments (.align)
ALLHMM_PIECES_PRE = $(PIECES:%=fb/allhmm.%)
ALLHMM_PIECES_DOM = $(ALLHMM_PIECES_PRE:%=%.dom)
ALLHMM_PIECES_REDUCE = $(ALLHMM_PIECES_PRE:%=%.reduce)
ALLHMM_PIECES_REDUCE_CLSTR = $(ALLHMM_PIECES_REDUCE:%=%.clstr)
ALLHMM_PIECES_ALIGN = $(ALLHMM_PIECES_PRE:%=%.align)

# Directory holding every program the recipes below invoke
BIN = ${FASTHMM_DIR}/bin

# Delete target if command exits with non-zero status, so that a failed
# recipe does not leave behind a partial file that looks up to date
.DELETE_ON_ERROR:

# Default goal: the combined alignment file, the by-gene domains file,
# and the statistics file
.PHONY: all
all: \
  fb.all.align \
  fb.all.domains.bygene \
  fb.all.nseq

# Start over: remove the fb/ pieces directory, recreate it empty, and remove
# the fb.* files from the working directory.
# rm -f keeps this from failing when there are no fb.* files to remove
# (e.g., when clean is run twice in a row).
.PHONY: clean
clean:
	rm -Rf fb && mkdir fb && rm -f fb.*

# REDUCE OTHER

# Reducing the non-domain regions uses reduceOther.pl to select the
# other regions, use cd-hit once or twice to reduce it, use clstr_rev
# to combine the clusters (if using two passes), and parse the
# clusters. Define CDHIT_ONEPASS if you want to use just one pass.

# Convenience goal: everything in reduceother plus the split allhmm pieces
.PHONY: reduce1
reduce1: \
  reduceother \
  $(ALLHMM_PIECES_DOM)

# Convenience goal: the reduced "other" database (with its BLAST index),
# its clusters sorted by exemplar, and the per-piece fasta files
.PHONY: reduceother
reduceother: \
  fb.other.reduced.faa.pin \
  fb.other.reduced.comb.clstr.byexemplar \
  $(OTHER_PIECES_FAA)

# Select the "other" (non-domain) regions with reduceOther.pl; cd-hit is run
# by the separate rules below (-nocdhit), not by reduceOther.pl itself
fb.other.faa: $(FAA) allhmm
	$(BIN)/reduceOther.pl -nocdhit \
	  -domains allhmm \
	  -db $(FAA) \
	  -prefix fb.other

# Each cd-hit recipe below writes to $@.tmp and then moves both $@.tmp and
# $@.tmp.clstr into place, so it makes $@.clstr as well as $@.
ifndef CDHIT_ONEPASS

# Two passes: fb.other.faa -> fb.other.reduce1.faa -> fb.other.reduced.faa

# First pass; also makes fb.other.reduce1.faa.clstr
fb.other.reduce1.faa: fb.other.faa
	$(BIN)/cd-hit $(CDHIT_OPTIONS_PASS1) -i fb.other.faa -o $@.tmp \
	  && $(BIN)/mv $@.tmp $@ \
	  && $(BIN)/mv $@.tmp.clstr $@.clstr

# Second pass; also makes fb.other.reduced.faa.clstr
fb.other.reduced.faa: fb.other.reduce1.faa
	$(BIN)/cd-hit $(CDHIT_OPTIONS_PASS2) -i fb.other.reduce1.faa -o $@.tmp \
	  && $(BIN)/mv $@.tmp $@ \
	  && $(BIN)/mv $@.tmp.clstr $@.clstr

# Combine the cluster files of the two passes with clstr_rev.pl
fb.other.reduced.comb.clstr: fb.other.reduce1.faa fb.other.reduced.faa
	$(BIN)/clstr_rev.pl $(addsuffix .clstr,$^) > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

else

# One pass: fb.other.faa -> fb.other.reduced.faa, using the pass-2 options

# Also makes fb.other.reduced.faa.clstr
fb.other.reduced.faa: fb.other.faa
	$(BIN)/cd-hit $(CDHIT_OPTIONS_PASS2) -i fb.other.faa -o $@.tmp \
	  && $(BIN)/mv $@.tmp $@ \
	  && $(BIN)/mv $@.tmp.clstr $@.clstr

# With a single pass the "combined" clusters are just a copy of that pass's clusters
fb.other.reduced.comb.clstr: fb.other.reduced.faa
	$(BIN)/cp $<.clstr $@.tmp \
	  && $(BIN)/mv $@.tmp $@

endif

# Parse a cluster file with parseClstr.pl and sort the result by exemplar id
# (C locale, duplicate lines dropped); written via $@.tmp and moved into place
%.clstr.byexemplar: %.clstr
	$(BIN)/parseClstr.pl < $< \
	  | LC_ALL=C $(BIN)/sort -T $(TMPDIR) -u > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Split fb.other.reduced.faa into $(OTHER_PIECES_FAA), e.g. fb/other.3.faa.
# (The 3rd and 4th arguments to splitFasta.pl are the prefix and the suffix
# for the output file names.)
# fb/other itself is an empty file that records that the pieces were made;
# the piece files depend on it and have no recipe of their own.
fb/other: fb.other.reduced.faa
	$(BIN)/splitFasta.pl fb.other.reduced.faa $(NPIECES) fb/other faa \
	  && touch fb/other
$(OTHER_PIECES_FAA): fb/other

# REDUCE AND MERGE DOMAINS

# Reducing the domains relies on splitting allhmm into pieces,
# reducing each piece with DomReduce, and merging the results with
# mergeDomains.pl. All exemplar domains and the sequences of all these
# must be stored in memory -- e.g., mergeDomains.pl required ~4 GB of
# memory on 1.12 billion amino acids from ~1,000 prokaryotic genomes.

# Convenience goal: the merged domains database (with its BLAST index)
# and the domain clusters sorted by exemplar
.PHONY: reducedomains
reducedomains: \
  fb.domains.merged.faa.pin \
  fb.domains.merged.faa \
  fb.domains.clstr.byexemplar

# Split allhmm into $(ALLHMM_PIECES_DOM), e.g. fb/allhmm.2.dom.
# fb/allhmm is touched once the split has succeeded; the piece files depend
# on it and have no recipe of their own.
fb/allhmm: allhmm
	$(BIN)/splitDomains.pl allhmm $(NPIECES) fb/allhmm dom \
	  && touch fb/allhmm
$(ALLHMM_PIECES_DOM): fb/allhmm

# Run DomReduce on every piece independently (fb/allhmm.N.dom -> fb/allhmm.N.reduce).
# DomReduce also creates fb/allhmm.*.reduce.clstr, which are tab-delimited
# rather than in CD-HIT format.
# Note: DomReduce could be modified to ignore sequences not in the domains
# file, which would save memory, but it only uses ~2x more memory than the
# input fasta file.
# $* is the stem, e.g. fb/allhmm.2
$(ALLHMM_PIECES_REDUCE): %.reduce : %.dom fb/allhmm
	$(BIN)/DomReduce $(FAA) $*.dom $@ $(DOMREDUCE_THRESHOLDS)

# Merge the reduced domains of all pieces (a sequence can be an exemplar for
# multiple domains).
# mergeDomains.pl reads all the domain ranges for exemplars, merges the
# overlapping bits, keeps all of this in memory, and then reads the fasta file
# and outputs the pieces. Most sequences are not exemplars for any domain, and
# the (highly redundant) domains are merged as they are read, so this needs
# less memory than the input file.
fb.domains.merged.faa: $(ALLHMM_PIECES_REDUCE)
	$(BIN)/mergeDomains.pl \
	  -db $(FAA) \
	  -o fb.domains.merged.faa \
	  $(ALLHMM_PIECES_REDUCE)

# Build the BLAST protein database index for any fasta file.
# formatdb may write *.00.pin instead of *.pin, so touch the target to make
# sure it exists.
%.faa.pin: %.faa
	$(BIN)/formatdb -p T -i $(basename $@) -o T \
	  && touch $@

# Combine the per-piece *.reduce.clstr files (made as a side effect of the
# DomReduce rule, hence the dependency on the *.reduce files) into one sorted
# file.
# sort reads the files directly instead of through "cat |": a missing
# *.reduce.clstr file then fails the recipe rather than silently producing a
# truncated result (the status of a pipeline is that of its last command).
# Uses $(BIN)/sort like the other rules, and -T $(TMPDIR) whenever TMPDIR is set.
fb.domains.clstr.byexemplar: $(ALLHMM_PIECES_REDUCE)
	LC_ALL=C $(BIN)/sort $(if $(strip $(TMPDIR)),-T $(TMPDIR)) $(ALLHMM_PIECES_REDUCE_CLSTR) > $@.tmp && $(BIN)/mv $@.tmp $@

# BLAST OTHER AGAINST EACH OTHER
# Convenience goal: the first-pass blastp pieces plus the reduced domain pieces
.PHONY: blast1
blast1: \
  $(OTHER_PIECES_BLAST) \
  $(ALLHMM_PIECES_REDUCE)

# blastp each piece (fb/other.N.faa) against the whole reduced "other" database;
# tabular output (-m 8) is written to $@.tmp and moved into place on success
$(OTHER_PIECES_BLAST): %.blastp : %.faa fb.other.reduced.faa.pin
	$(BIN)/blastall -p blastp $(BLASTP_OPTIONS_INITIAL) -m 8 \
	  -d fb.other.reduced.faa -i $*.faa -o $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Convenience goal: the masked blastp hits, the masked fasta pieces,
# and the index of the merged domains database
.PHONY: reduce2
reduce2: \
  fb.other.masked.blastp \
  $(MASKED_PIECES_FAA) \
  fb.domains.merged.faa.pin

# MASK OTHER
# Run maskBlast.pl over all the first-pass blastp pieces
fb.other.masked.blastp: $(OTHER_PIECES_BLAST)
	$(BIN)/maskBlast.pl $(MASK_OPTIONS) \
	  -out fb.other.masked.blastp \
	  $(OTHER_PIECES_BLAST)

# The distinct values of the first column of the masked hits, sorted (C locale)
fb.other.masked.id: fb.other.masked.blastp
	$(BIN)/cut -f 1 fb.other.masked.blastp \
	  | LC_ALL=C $(BIN)/sort -T $(TMPDIR) -u > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Fetch the sequences listed in fb.other.masked.id from the reduced "other"
# database with fastacmd
fb.other.masked.faa: fb.other.masked.id
	$(BIN)/fastacmd -d fb.other.reduced.faa -i fb.other.masked.id > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Split fb.other.masked.faa into $(MASKED_PIECES_FAA); fb/masked is touched
# once the split has succeeded, and the piece files depend on it
fb/masked: fb.other.masked.faa
	$(BIN)/splitFasta.pl fb.other.masked.faa $(NPIECES) fb/masked faa \
	  && touch fb/masked
$(MASKED_PIECES_FAA): fb/masked

# BLAST MASKED OTHER VS DOMAINS
# Convenience goal: all the masked-vs-domains blastp pieces
.PHONY: blast2
blast2: \
  $(MASKED_PIECES_BLAST)

# blastp each masked piece (fb/masked.N.faa) against the merged domains
# database; tabular output (-m 8) goes to $@.tmp and is moved into place
$(MASKED_PIECES_BLAST): %.blastp : %.faa fb/masked fb.domains.merged.faa.pin
	$(BIN)/blastall -p blastp $(BLASTP_OPTIONS_INITIAL) -m 8 \
	  -d fb.domains.merged.faa -i $*.faa -o $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# EXPAND THE TWO SETS SEPARATELY, AND DO ONE BIG SORT
# Convenience goal: everything up to and including the split of the
# expanded hits into pieces
.PHONY: expand
expand: \
  fb/newdom

# Keep columns 1, 2, 9 and 10 of the masked other-vs-other hits and sort them
# (C locale) with the key starting at field 2
fb.masked.hits.sorted: fb.other.masked.blastp
	$(BIN)/cut -f 1,2,9,10 fb.other.masked.blastp \
	  | LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 2 > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Expand the other-vs-other hits using the combined "other" clusters
# ($< is the sorted hits, the second prerequisite is the cluster file)
fb.masked.hits.expanded: fb.masked.hits.sorted fb.other.reduced.comb.clstr.byexemplar
	$(BIN)/expandDomains.pl \
	  -clusters $(word 2,$^) \
	  -domains $< \
	  -out $@ $(EXPAND_OPTIONS)

# Keep columns 1, 2, 9 and 10 of all the masked-vs-domains blastp pieces and
# sort them (C locale) with the key starting at field 2
fb.domains.hits.sorted: $(MASKED_PIECES_BLAST)
	$(BIN)/cut -f 1,2,9,10 $(MASKED_PIECES_BLAST) \
	  | LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 2 > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# Expand the other-vs-domains hits using the domain clusters
# ($< is the sorted hits, the second prerequisite is the cluster file)
fb.domains.hits.expanded: fb.domains.hits.sorted fb.domains.clstr.byexemplar
	$(BIN)/expandDomains.pl \
	  -clusters $(word 2,$^) \
	  -domains $< \
	  -out $@ $(EXPAND_OPTIONS)

# The one big sort: both sets of expanded hits together, in the C locale
fb.hits.expanded.bydom: fb.masked.hits.expanded fb.domains.hits.expanded
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) $^ > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# BLAST EXEMPLARS AGAINST THEIR MEMBERS AND ALIGN
# Each expanded domain is blasted against its homologs, both to confirm
# membership in the ad-hoc domain and to get alignments.

# Convenience goal: all the new-domain alignment pieces
.PHONY: blast3
blast3: \
  $(NEWDOM_PIECES_ALIGN)

# Split fb.hits.expanded.bydom into $(NEWDOM_PIECES_DOM).
# fb/newdom is a fake target, an empty file that records that the split was
# done; the piece files depend on it and have no recipe of their own.
fb/newdom: fb.hits.expanded.bydom
	$(BIN)/splitDomains.pl fb.hits.expanded.bydom $(NPIECES) fb/newdom dom \
	  && touch fb/newdom
$(NEWDOM_PIECES_DOM): fb/newdom

# Run fastBlastAlignment.pl on each piece; this makes the .hits file as well
# as the .align file.
# $* is the stem (the % part of %.align), e.g. fb/newdom.2; it is passed as
# the output prefix, and $*.dom is read on standard input.
$(NEWDOM_PIECES_ALIGN): %.align : %.dom fb/newdom
	$(BIN)/fastBlastAlignment.pl -T $(TMPDIR) -db $(FAA) \
	  $(BLASTP_OPTIONS_FINAL) -o $* < $*.dom

# MAKE THE FASTHMM ALIGNMENTS
# Convenience goal: the alignments for every piece of the input domains
.PHONY: alignhmm
alignhmm: \
  $(ALLHMM_PIECES_ALIGN)
# Turn the hits of each input-domain piece into alignments with hitsToAlign.pl
# (reads $*.dom on standard input; output goes to $@.tmp and is moved into place)
$(ALLHMM_PIECES_ALIGN): %.align : %.dom fb/allhmm
	$(BIN)/hitsToAlign.pl -db $(FAA) < $*.dom > $@.tmp \
	  && $(BIN)/mv $@.tmp $@

# MAKE THE FINAL OUTPUT FILES
# fb.all.align is the concatenation of the input-domain alignments followed
# by the new-domain alignments
fb.all.align: $(ALLHMM_PIECES_ALIGN) $(NEWDOM_PIECES_ALIGN)
	$(BIN)/cat $(ALLHMM_PIECES_ALIGN) $(NEWDOM_PIECES_ALIGN) > fb.all.align.tmp \
	  && $(BIN)/mv fb.all.align.tmp fb.all.align

# All domain assignments (new-domain hits plus the input allhmm), sorted in
# the C locale with the key starting at field 2.
# The *.hits files are made by the rule for *.align, so *.align is used as the
# prerequisite to record that they have been made.
# sort reads the files directly instead of through "cat |": a missing *.hits
# file then fails the recipe rather than silently producing a truncated result
# (the status of a pipeline is that of its last command).
fb.all.domains.bygene: allhmm $(NEWDOM_PIECES_ALIGN)
	LC_ALL=C $(BIN)/sort -T $(TMPDIR) -k 2 $(NEWDOM_PIECES_HITS) allhmm > $@.tmp && $(BIN)/mv $@.tmp $@

# Make the index files for topHomologs.pl
# Both goal names are phony aliases for fb.all.nseq: "db" is the name the rule
# has always had, and "indexes" is the name that was declared in .PHONY but
# never defined as a target.
.PHONY: indexes db
indexes db: fb.all.nseq

# Run makeDBTables.pl on $(FAA) and the current directory once both final
# output files exist
fb.all.nseq: \
  fb.all.align \
  fb.all.domains.bygene
	$(BIN)/makeDBTables.pl $(FAA) .
