From 881b640b74cde8f6b3b426a8a289ddc6c8578110 Mon Sep 17 00:00:00 2001 From: icwells Date: Tue, 13 Jun 2017 09:23:44 -0700 Subject: [PATCH] Automated filetype and analysis selection. --- AlignmentProcessor.py | 32 ++++---- bin/04_CallCodeML.py | 7 +- bin/ConcatenateCodeML.py | 153 +++++++++++++++++++++++++++------------ 3 files changed, 122 insertions(+), 70 deletions(-) diff --git a/AlignmentProcessor.py b/AlignmentProcessor.py index 333f653..2839616 100644 --- a/AlignmentProcessor.py +++ b/AlignmentProcessor.py @@ -19,10 +19,13 @@ from glob import glob import os -def checkInput(axt, kaks, phylip, codeml, outdir): +def checkInput(ref, kaks, codeml, outdir): '''Makes sure necessary programs are installed and proper file conversion is run for optional analysis program.''' proceed = True + if not ref: + print("\n\tError: Please specify a reference species.\n") + proceed = False if kaks == True: if codeml == True: print("\n\tError: Please specify only one analysis program.\n") @@ -32,10 +35,6 @@ def checkInput(axt, kaks, phylip, codeml, outdir): print("\n\tError: Please install KaKs_Calculator in the\ AlignmentProcessor bin.\n") proceed = False - if axt == False: - print("\n\tError: Files must be converted into axt format for use\ - with KaKs_Clculator.\n") - proceed = False if codeml == True: control = glob(outdir + "*.ctl") if len(control) == 0: @@ -52,10 +51,6 @@ def checkInput(axt, kaks, phylip, codeml, outdir): proceed = False if phyml == False: print("\n\tError: Please install and rename PhyML for use\ - with CodeML.\n") - proceed = False - if phylip == False: - print("\n\tError: Files must be converted into phylip format for use\ with CodeML.\n") proceed = False return proceed @@ -172,9 +167,9 @@ def main(): help="Specifies that sequences containing internal stop codons should be \ ratained.") parser.add_argument("--axt", action="store_true", -help="Convert files to axt format.") +help="Convert files to axt format without calling KaKs_Calculator.") parser.add_argument("--phylip", action="store_true", -help="Convert files to sequential phylip format.") +help="Convert files to sequential phylip format without calling CodeML.") parser.add_argument("--kaks", action="store_true", help="Calls KaKs_Calculator on axt files.") parser.add_argument("-m", default="NG", @@ -206,15 +201,14 @@ def main(): forward = args.f ntree = args.n cleanup = args.cleanUp - # Check inout commands prior to running: - if not ref: - print("\n\tError: Please specify a reference species.\n") - quit() - if axt == True and phylip == True: - print("\n\tError: Please specify only one file type.\n") - quit() - proceed = checkInput(axt, kaks, phylip, codeml, outdir) + # Check input commands prior to running: + proceed = checkInput(ref, kaks, codeml, outdir) if proceed == True: + # Set axt or phylip + if codeml == True: + phylip = True + elif kaks == True: + axt = True # Set checkpoint variables to False: sf = False ff = False diff --git a/bin/04_CallCodeML.py b/bin/04_CallCodeML.py index 3824818..1d9af24 100644 --- a/bin/04_CallCodeML.py +++ b/bin/04_CallCodeML.py @@ -72,10 +72,9 @@ def main(): # Save path to the AlignmentProcessor directory ap = os.getcwd() + "/" if " " in ap: - # Change to warning ######################################################## - print("\tWARNING: AlignmentProcessor will not run properly if there \ -is a space in its PATH name.") - ap = ap.replace(" (ASU)", "") + print("\tError: AlignmentProcessor will not run properly if there \ +is a space in its PATH name. Exiting.\n") + quit() run = False # Parse command parser = argparse.ArgumentParser(description="Runs CodeML on all files \ diff --git a/bin/ConcatenateCodeML.py b/bin/ConcatenateCodeML.py index 1627be1..c9a983c 100644 --- a/bin/ConcatenateCodeML.py +++ b/bin/ConcatenateCodeML.py @@ -3,24 +3,86 @@ import argparse from glob import glob +import re -def concatenate(multiple, pairwise, indir, outfile): - # Parses CodeML output and prints to file - if multiple == True and pairwise == True: - print("\n\tPlease specify only one alignment type.\n") - quit() - elif multiple == True: - # Read in codeml output for multiple alignments - readMultiple(indir, outfile) - elif pairwise == True: - # Read in codeml output for pairwise alignments - readPairwise(indir, outfile) +def inputList(indir): + # Creates list of mlc files and determines which analysis was run + infiles = glob(indir + "*") + # Remove tmp dir + for i in infiles: + if i[i.rfind("/")+1:] == "tmp": + infiles.remove(i) + with open(infiles[0], "r") as ali: + # Identify analysis + for line in ali: + if "dN/dS (w) for site classes" in line: + typ = "brsite" + break + elif "dN & dS for each branch" in line: + typ = "brspec" + break + elif "pairwise comparison, codon frequencies:" in line: + typ = "pw" + break + return infiles, typ + +def branchSite(infiles, outfile): + # Saves data from branch site analysis to csv + save = False + with open(outfile, "w") as output: + # Open output file and write header + output.write("TranscriptID,Foreground_dN/dS,Background_dN/dS,\ +TreeLength,lnL,Species,SitesUnderSelection (Amino Acid; PosteriorProbability)\n") + for infile in infiles: + try: + # Get gene id and number of species remaining for each file + filename = infile.split("/")[-1] + transcript = filename.split(".")[0] + species = filename.split(".")[1] + if int(species) > 2: + with open(infile, "r") as mlc: + sites = "" + for line in mlc: + # Get substitution rates, tree lengths, etc + if save == True: + if "The grid" in line: + save = False + elif line.strip() and "Positive" not in line: + # Record sites and probability + splt = line.split() + sites += ("{} ({}; {}),").format(splt[0], + splt[1], splt[2]) + elif "tree length =" in line: + length = line.split("=")[1].strip() + elif "lnL" in line: + lnl = line.split("):")[1] + lnl = lnl.split()[0].strip() + elif "background w" in line: + # Convert multiple spaces to single space + line = re.sub(" +", " ", line) + bw = line.split()[4] + elif "foreground w" in line: + # Convert multiple spaces to single space + line = re.sub(" +", " ", line) + fw = line.split()[4] + elif "Bayes Empirical Bayes" in line: + save = True + # Append data to list and convert into string + data = [fw, bw, length, lnl, species, sites[:-1]] + for i in data: + transcript += "," + i + output.write(transcript + "\n") + else: + # Skip files with only two sequences + pass + except IsADirectoryError: + pass -def readMultiple(indir, outfile): +def branchSpecific(infiles, outfile): + # Saves data from branch specific analysis to csv with open(outfile, "w") as output: # Open output file and write header output.write("TranscriptID,dN,dS,dN/dS,TreeLength,lnL,Species\n") - infiles = glob(indir + "*") for infile in infiles: try: # Get gene id and number of species remaining for each file @@ -28,42 +90,39 @@ def readMultiple(indir, outfile): transcript = filename.split(".")[0] species = filename.split(".")[1] if int(species) > 2: + with open(infile, "r") as mlc: + for line in mlc: + # Get substitution rates, tree lengths, etc + if "tree length =" in line: + length = line.split("=")[1].strip() + elif "tree length for dN" in line: + dn = line.split(":")[1].strip() + elif "tree length for dS" in line: + ds = line.split(":")[1].strip() + elif "lnL" in line: + lnl = line.split("):")[1] + lnl = lnl.split()[0].strip() + # Calculate dN/dS and save as a string try: - with open(infile, "r") as mlc: - for line in mlc: - # Get substitution rates, tree lengths, etc - if "tree length =" in line: - length = line.split("=")[1].strip() - elif "tree length for dN" in line: - dn = line.split(":")[1].strip() - elif "tree length for dS" in line: - ds = line.split(":")[1].strip() - elif "lnL" in line: - lnl = line.split("):")[1] - lnl = lnl.split()[0].strip() - # Calculate dN/dS and save as a string - try: - dnds = str(float(dn)/float(ds)) - except ZeroDivisionError: - dnds = "NA" - # Append data to list and convert into string - data = [dn, ds, dnds, length, lnl, species] - transcript += "," - for i in data: - transcript += str(i) + "," - transcript = transcript[:-1] - output.write(transcript + "\n") + dnds = str(float(dn)/float(ds)) + except ZeroDivisionError: + dnds = "NA" + # Append data to list and convert into string + data = [dn, ds, dnds, length, lnl, species] + for i in data: + transcript += "," + str(i) + output.write(transcript + "\n") else: # Skip files with only two sequences pass - except IsADirectoryError, UnboundLocalError: + except IsADirectoryError: pass -def readPairwise(indir, outfile): +def pairwise(infiles, outfile): + # Saves data from pairwise analysis to csv with open(outfile, "w") as output: # Open output file and write header output.write("TranscriptID,dN,dS,dN/dS,lnL\n") - infiles = glob(indir + "*") for infile in infiles: try: # Get gene id and number of species remaining for each file @@ -94,10 +153,6 @@ def readPairwise(indir, outfile): def main(): parser = argparse.ArgumentParser(description="This script will \ concatenate CodeML output and print them in a single file.") - parser.add_argument("--multiple", action="store_true", - help="Indicates that a multiple alignment was used") - parser.add_argument("--pairwise", action="store_true", - help="Indicates that a pairwise alignment was used.") parser.add_argument("-i", help="Path to CodeML output directory") parser.add_argument("-o", help="Name of output file.") # Parse command @@ -106,9 +161,13 @@ def main(): if indir[-1] != "/": indir += "/" outfile = args.o - multiple = args.multiple - pairwise = args.pairwise - concatenate(multiple, pairwise, indir, outfile) + infiles, typ = inputList(indir) + if typ == "pw": + pairwise(infiles, outfile) + elif typ == "brsite": + branchSite(infiles, outfile) + elif typ == "brspec": + branchSpecific(infiles, outfile) if __name__ == "__main__": main()