From 881b640b74cde8f6b3b426a8a289ddc6c8578110 Mon Sep 17 00:00:00 2001
From: icwells <smrupp16@gmail.com>
Date: Tue, 13 Jun 2017 09:23:44 -0700
Subject: [PATCH] Automated filetype and analysis selection.

---
 AlignmentProcessor.py    |  32 ++++----
 bin/04_CallCodeML.py     |   7 +-
 bin/ConcatenateCodeML.py | 153 +++++++++++++++++++++++++++------------
 3 files changed, 122 insertions(+), 70 deletions(-)

diff --git a/AlignmentProcessor.py b/AlignmentProcessor.py
index 333f653..2839616 100644
--- a/AlignmentProcessor.py
+++ b/AlignmentProcessor.py
@@ -19,10 +19,13 @@
 from glob import glob
 import os
 
-def checkInput(axt, kaks, phylip, codeml, outdir):
+def checkInput(ref, kaks, codeml, outdir):
 	'''Makes sure necessary programs are installed and proper file conversion
 is run for optional analysis program.'''
 	proceed = True
+	if not ref:
+		print("\n\tError: Please specify a reference species.\n")
+		proceed = False
 	if kaks == True:
 		if codeml == True:
 			print("\n\tError: Please specify only one analysis program.\n")
@@ -32,10 +35,6 @@ def checkInput(axt, kaks, phylip, codeml, outdir):
 			print("\n\tError: Please install KaKs_Calculator in the\
  AlignmentProcessor bin.\n")
 			proceed = False
-		if axt == False:
-			print("\n\tError: Files must be converted into axt format for use\
- with KaKs_Clculator.\n")
-			proceed = False
 	if codeml == True:
 		control = glob(outdir + "*.ctl")
 		if len(control) == 0:
@@ -52,10 +51,6 @@ def checkInput(axt, kaks, phylip, codeml, outdir):
 			proceed = False
 		if phyml == False:
 			print("\n\tError: Please install and rename PhyML for use\
- with CodeML.\n")
-			proceed = False
-		if phylip == False:
-			print("\n\tError: Files must be converted into phylip format for use\
  with CodeML.\n")
 			proceed = False
 	return proceed
@@ -172,9 +167,9 @@ def main():
 help="Specifies that sequences containing internal stop codons should be \
 ratained.")
 	parser.add_argument("--axt", action="store_true", 
-help="Convert files to axt format.")
+help="Convert files to axt format without calling KaKs_Calculator.")
 	parser.add_argument("--phylip", action="store_true",
-help="Convert files to sequential phylip format.")
+help="Convert files to sequential phylip format without calling CodeML.")
 	parser.add_argument("--kaks", action="store_true",
 help="Calls KaKs_Calculator on axt files.")
 	parser.add_argument("-m", default="NG", 
@@ -206,15 +201,14 @@ def main():
 	forward = args.f
 	ntree = args.n
 	cleanup = args.cleanUp
-	# Check inout commands prior to running:
-	if not ref:
-		print("\n\tError: Please specify a reference species.\n")
-		quit()
-	if axt == True and phylip == True:
-		print("\n\tError: Please specify only one file type.\n")
-		quit()
-	proceed = checkInput(axt, kaks, phylip, codeml, outdir)
+	# Check input commands prior to running:
+	proceed = checkInput(ref, kaks, codeml, outdir)
 	if proceed == True:
+		# Set axt or phylip
+		if codeml == True:
+			phylip = True
+		elif kaks == True:
+			axt = True
 		# Set checkpoint variables to False:
 		sf = False
 		ff = False
diff --git a/bin/04_CallCodeML.py b/bin/04_CallCodeML.py
index 3824818..1d9af24 100644
--- a/bin/04_CallCodeML.py
+++ b/bin/04_CallCodeML.py
@@ -72,10 +72,9 @@ def main():
 	# Save path to the AlignmentProcessor directory
 	ap = os.getcwd() + "/"
 	if " " in ap:
-		# Change to warning ########################################################
-		print("\tWARNING: AlignmentProcessor will not run properly if there \
-is a space in its PATH name.")
-		ap = ap.replace(" (ASU)", "")
+		print("\tError: AlignmentProcessor will not run properly if there \
+is a space in its PATH name. Exiting.\n")
+		quit()
 	run = False
 	# Parse command
 	parser = argparse.ArgumentParser(description="Runs CodeML on all files \
diff --git a/bin/ConcatenateCodeML.py b/bin/ConcatenateCodeML.py
index 1627be1..c9a983c 100644
--- a/bin/ConcatenateCodeML.py
+++ b/bin/ConcatenateCodeML.py
@@ -3,24 +3,86 @@
 
 import argparse
 from glob import glob
+import re
 
-def concatenate(multiple, pairwise, indir, outfile):
-	# Parses CodeML output and prints to file
-	if multiple == True and pairwise == True:
-		print("\n\tPlease specify only one alignment type.\n")
-		quit()
-	elif multiple == True:
-		# Read in codeml output for multiple alignments
-		readMultiple(indir, outfile)
-	elif pairwise == True:
-		# Read in codeml output for pairwise alignments
-		readPairwise(indir, outfile)
+def inputList(indir):
+	# Creates list of mlc files and determines which analysis was run
+	infiles = glob(indir + "*")
+	# Remove tmp dir
+	for i in infiles:
+		if i[i.rfind("/")+1:] == "tmp":
+			infiles.remove(i)
+	with open(infiles[0], "r") as ali:
+		# Identify analysis
+		for line in ali:
+			if "dN/dS (w) for site classes" in line:
+				typ = "brsite"
+				break
+			elif "dN & dS for each branch" in line:
+				typ = "brspec"
+				break
+			elif "pairwise comparison, codon frequencies:" in line:
+				typ = "pw"
+				break
+	return infiles, typ
+		
+def branchSite(infiles, outfile):
+	# Saves data from branch site analysis to csv
+	save = False
+	with open(outfile, "w") as output:
+		# Open output file and write header
+		output.write("TranscriptID,Foreground_dN/dS,Background_dN/dS,\
+TreeLength,lnL,Species,SitesUnderSelection (Amino Acid; PosteriorProbability)\n")
+		for infile in infiles:
+			try:
+				# Get gene id and number of species remaining for each file
+				filename = infile.split("/")[-1]
+				transcript = filename.split(".")[0]
+				species = filename.split(".")[1]
+				if int(species) > 2:
+					with open(infile, "r") as mlc:
+						sites = ""
+						for line in mlc:
+							# Get substitution rates, tree lengths, etc
+							if save == True:
+								if "The grid" in line:
+									save = False
+								elif line.strip() and "Positive" not in line:
+									# Record sites and probability
+									splt = line.split()
+									sites += ("{} ({}; {}),").format(splt[0],
+															splt[1], splt[2])
+							elif "tree length =" in line:
+								length = line.split("=")[1].strip()
+							elif "lnL" in line:
+								lnl = line.split("):")[1]
+								lnl = lnl.split()[0].strip()
+							elif "background w" in line:
+								# Convert multiple spaces to single space
+								line = re.sub(" +", " ", line)
+								bw = line.split()[4]
+							elif "foreground w" in line:
+								# Convert multiple spaces to single space
+								line = re.sub(" +", " ", line)
+								fw = line.split()[4]
+							elif "Bayes Empirical Bayes" in line:
+								save = True
+					# Append data to list and convert into string
+					data = [fw, bw, length, lnl, species, sites[:-1]]
+					for i in data:
+						transcript += "," + i
+					output.write(transcript + "\n")
+				else:
+					# Skip files with only two sequences
+					pass
+			except IsADirectoryError:
+				pass
 
-def readMultiple(indir, outfile):
+def branchSpecific(infiles, outfile):
+	# Saves data from branch specific analysis to csv
 	with open(outfile, "w") as output:
 		# Open output file and write header
 		output.write("TranscriptID,dN,dS,dN/dS,TreeLength,lnL,Species\n")
-		infiles = glob(indir + "*")
 		for infile in infiles:
 			try:
 				# Get gene id and number of species remaining for each file
@@ -28,42 +90,39 @@ def readMultiple(indir, outfile):
 				transcript = filename.split(".")[0]
 				species = filename.split(".")[1]
 				if int(species) > 2:
+					with open(infile, "r") as mlc:
+						for line in mlc:
+							# Get substitution rates, tree lengths, etc
+							if "tree length =" in line:
+								length = line.split("=")[1].strip()
+							elif "tree length for dN" in line:
+								dn = line.split(":")[1].strip()
+							elif "tree length for dS" in line:
+								ds = line.split(":")[1].strip()
+							elif "lnL" in line:
+								lnl = line.split("):")[1]
+								lnl = lnl.split()[0].strip()
+					# Calculate dN/dS and save as a string
 					try:
-						with open(infile, "r") as mlc:
-							for line in mlc:
-								# Get substitution rates, tree lengths, etc
-								if "tree length =" in line:
-									length = line.split("=")[1].strip()
-								elif "tree length for dN" in line:
-									dn = line.split(":")[1].strip()
-								elif "tree length for dS" in line:
-									ds = line.split(":")[1].strip()
-								elif "lnL" in line:
-									lnl = line.split("):")[1]
-									lnl = lnl.split()[0].strip()
-						# Calculate dN/dS and save as a string
-						try:
-							dnds = str(float(dn)/float(ds))
-						except ZeroDivisionError:
-							dnds = "NA"
-						# Append data to list and convert into string
-						data = [dn, ds, dnds, length, lnl, species]
-						transcript += ","
-						for i in data:
-							transcript += str(i) + ","
-						transcript = transcript[:-1]
-						output.write(transcript + "\n")
+						dnds = str(float(dn)/float(ds))
+					except ZeroDivisionError:
+						dnds = "NA"
+					# Append data to list and convert into string
+					data = [dn, ds, dnds, length, lnl, species]
+					for i in data:
+						transcript += "," + str(i)
+					output.write(transcript + "\n")
 				else:
 					# Skip files with only two sequences
 					pass
-			except IsADirectoryError, UnboundLocalError:
+			except IsADirectoryError:
 				pass
 
-def readPairwise(indir, outfile):
+def pairwise(infiles, outfile):
+	# Saves data from pairwise analysis to csv
 	with open(outfile, "w") as output:
 		# Open output file and write header
 		output.write("TranscriptID,dN,dS,dN/dS,lnL\n")
-		infiles = glob(indir + "*")
 		for infile in infiles:
 			try:
 				# Get gene id and number of species remaining for each file
@@ -94,10 +153,6 @@ def readPairwise(indir, outfile):
 def main():
 	parser = argparse.ArgumentParser(description="This script will \
 concatenate CodeML output and print them in a single file.")
-	parser.add_argument("--multiple", action="store_true",
- help="Indicates that a multiple alignment was used")
-	parser.add_argument("--pairwise", action="store_true",
- help="Indicates that a pairwise alignment was used.")
 	parser.add_argument("-i", help="Path to CodeML output directory")
 	parser.add_argument("-o", help="Name of output file.")
 	# Parse command
@@ -106,9 +161,13 @@ def main():
 	if indir[-1] != "/":
 		indir += "/"
 	outfile = args.o
-	multiple = args.multiple
-	pairwise = args.pairwise
-	concatenate(multiple, pairwise, indir, outfile)
+	infiles, typ = inputList(indir)
+	if typ == "pw":
+		pairwise(infiles, outfile)
+	elif typ == "brsite":
+		branchSite(infiles, outfile)
+	elif typ == "brspec":
+		branchSpecific(infiles, outfile)
 
 if __name__ == "__main__":
 	main()