Skip to content

Commit

Permalink
Automated filetype and analysis selection.
Browse files Browse the repository at this point in the history
  • Loading branch information
icwells committed Jun 13, 2017
1 parent 2153240 commit 881b640
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 70 deletions.
32 changes: 13 additions & 19 deletions AlignmentProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@
from glob import glob
import os

def checkInput(axt, kaks, phylip, codeml, outdir):
def checkInput(ref, kaks, codeml, outdir):
'''Makes sure necessary programs are installed and proper file conversion
is run for optional analysis program.'''
proceed = True
if not ref:
print("\n\tError: Please specify a reference species.\n")
proceed = False
if kaks == True:
if codeml == True:
print("\n\tError: Please specify only one analysis program.\n")
Expand All @@ -32,10 +35,6 @@ def checkInput(axt, kaks, phylip, codeml, outdir):
print("\n\tError: Please install KaKs_Calculator in the\
AlignmentProcessor bin.\n")
proceed = False
if axt == False:
print("\n\tError: Files must be converted into axt format for use\
with KaKs_Clculator.\n")
proceed = False
if codeml == True:
control = glob(outdir + "*.ctl")
if len(control) == 0:
Expand All @@ -52,10 +51,6 @@ def checkInput(axt, kaks, phylip, codeml, outdir):
proceed = False
if phyml == False:
print("\n\tError: Please install and rename PhyML for use\
with CodeML.\n")
proceed = False
if phylip == False:
print("\n\tError: Files must be converted into phylip format for use\
with CodeML.\n")
proceed = False
return proceed
Expand Down Expand Up @@ -172,9 +167,9 @@ def main():
help="Specifies that sequences containing internal stop codons should be \
ratained.")
parser.add_argument("--axt", action="store_true",
help="Convert files to axt format.")
help="Convert files to axt format without calling KaKs_Calculator.")
parser.add_argument("--phylip", action="store_true",
help="Convert files to sequential phylip format.")
help="Convert files to sequential phylip format without calling CodeML.")
parser.add_argument("--kaks", action="store_true",
help="Calls KaKs_Calculator on axt files.")
parser.add_argument("-m", default="NG",
Expand Down Expand Up @@ -206,15 +201,14 @@ def main():
forward = args.f
ntree = args.n
cleanup = args.cleanUp
# Check inout commands prior to running:
if not ref:
print("\n\tError: Please specify a reference species.\n")
quit()
if axt == True and phylip == True:
print("\n\tError: Please specify only one file type.\n")
quit()
proceed = checkInput(axt, kaks, phylip, codeml, outdir)
# Check input commands prior to running:
proceed = checkInput(ref, kaks, codeml, outdir)
if proceed == True:
# Set axt or phylip
if codeml == True:
phylip = True
elif kaks == True:
axt = True
# Set checkpoint variables to False:
sf = False
ff = False
Expand Down
7 changes: 3 additions & 4 deletions bin/04_CallCodeML.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,9 @@ def main():
# Save path to the AlignmentProcessor directory
ap = os.getcwd() + "/"
if " " in ap:
# Change to warning ########################################################
print("\tWARNING: AlignmentProcessor will not run properly if there \
is a space in its PATH name.")
ap = ap.replace(" (ASU)", "")
print("\tError: AlignmentProcessor will not run properly if there \
is a space in its PATH name. Exiting.\n")
quit()
run = False
# Parse command
parser = argparse.ArgumentParser(description="Runs CodeML on all files \
Expand Down
153 changes: 106 additions & 47 deletions bin/ConcatenateCodeML.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,67 +3,126 @@

import argparse
from glob import glob
import re

def concatenate(multiple, pairwise, indir, outfile):
# Parses CodeML output and prints to file
if multiple == True and pairwise == True:
print("\n\tPlease specify only one alignment type.\n")
quit()
elif multiple == True:
# Read in codeml output for multiple alignments
readMultiple(indir, outfile)
elif pairwise == True:
# Read in codeml output for pairwise alignments
readPairwise(indir, outfile)
def inputList(indir):
# Creates list of mlc files and determines which analysis was run
infiles = glob(indir + "*")
# Remove tmp dir
for i in infiles:
if i[i.rfind("/")+1:] == "tmp":
infiles.remove(i)
with open(infiles[0], "r") as ali:
# Identify analysis
for line in ali:
if "dN/dS (w) for site classes" in line:
typ = "brsite"
break
elif "dN & dS for each branch" in line:
typ = "brspec"
break
elif "pairwise comparison, codon frequencies:" in line:
typ = "pw"
break
return infiles, typ

def branchSite(infiles, outfile):
# Saves data from branch site analysis to csv
save = False
with open(outfile, "w") as output:
# Open output file and write header
output.write("TranscriptID,Foreground_dN/dS,Background_dN/dS,\
TreeLength,lnL,Species,SitesUnderSelection (Amino Acid; PosteriorProbability)\n")
for infile in infiles:
try:
# Get gene id and number of species remaining for each file
filename = infile.split("/")[-1]
transcript = filename.split(".")[0]
species = filename.split(".")[1]
if int(species) > 2:
with open(infile, "r") as mlc:
sites = ""
for line in mlc:
# Get substitution rates, tree lengths, etc
if save == True:
if "The grid" in line:
save = False
elif line.strip() and "Positive" not in line:
# Record sites and probability
splt = line.split()
sites += ("{} ({}; {}),").format(splt[0],
splt[1], splt[2])
elif "tree length =" in line:
length = line.split("=")[1].strip()
elif "lnL" in line:
lnl = line.split("):")[1]
lnl = lnl.split()[0].strip()
elif "background w" in line:
# Convert multiple spaces to single space
line = re.sub(" +", " ", line)
bw = line.split()[4]
elif "foreground w" in line:
# Convert multiple spaces to single space
line = re.sub(" +", " ", line)
fw = line.split()[4]
elif "Bayes Empirical Bayes" in line:
save = True
# Append data to list and convert into string
data = [fw, bw, length, lnl, species, sites[:-1]]
for i in data:
transcript += "," + i
output.write(transcript + "\n")
else:
# Skip files with only two sequences
pass
except IsADirectoryError:
pass

def readMultiple(indir, outfile):
def branchSpecific(infiles, outfile):
# Saves data from branch specific analysis to csv
with open(outfile, "w") as output:
# Open output file and write header
output.write("TranscriptID,dN,dS,dN/dS,TreeLength,lnL,Species\n")
infiles = glob(indir + "*")
for infile in infiles:
try:
# Get gene id and number of species remaining for each file
filename = infile.split("/")[-1]
transcript = filename.split(".")[0]
species = filename.split(".")[1]
if int(species) > 2:
with open(infile, "r") as mlc:
for line in mlc:
# Get substitution rates, tree lengths, etc
if "tree length =" in line:
length = line.split("=")[1].strip()
elif "tree length for dN" in line:
dn = line.split(":")[1].strip()
elif "tree length for dS" in line:
ds = line.split(":")[1].strip()
elif "lnL" in line:
lnl = line.split("):")[1]
lnl = lnl.split()[0].strip()
# Calculate dN/dS and save as a string
try:
with open(infile, "r") as mlc:
for line in mlc:
# Get substitution rates, tree lengths, etc
if "tree length =" in line:
length = line.split("=")[1].strip()
elif "tree length for dN" in line:
dn = line.split(":")[1].strip()
elif "tree length for dS" in line:
ds = line.split(":")[1].strip()
elif "lnL" in line:
lnl = line.split("):")[1]
lnl = lnl.split()[0].strip()
# Calculate dN/dS and save as a string
try:
dnds = str(float(dn)/float(ds))
except ZeroDivisionError:
dnds = "NA"
# Append data to list and convert into string
data = [dn, ds, dnds, length, lnl, species]
transcript += ","
for i in data:
transcript += str(i) + ","
transcript = transcript[:-1]
output.write(transcript + "\n")
dnds = str(float(dn)/float(ds))
except ZeroDivisionError:
dnds = "NA"
# Append data to list and convert into string
data = [dn, ds, dnds, length, lnl, species]
for i in data:
transcript += "," + str(i)
output.write(transcript + "\n")
else:
# Skip files with only two sequences
pass
except IsADirectoryError, UnboundLocalError:
except IsADirectoryError:
pass

def readPairwise(indir, outfile):
def pairwise(infiles, outfile):
# Saves data from pairwise analysis to csv
with open(outfile, "w") as output:
# Open output file and write header
output.write("TranscriptID,dN,dS,dN/dS,lnL\n")
infiles = glob(indir + "*")
for infile in infiles:
try:
# Get gene id and number of species remaining for each file
Expand Down Expand Up @@ -94,10 +153,6 @@ def readPairwise(indir, outfile):
def main():
parser = argparse.ArgumentParser(description="This script will \
concatenate CodeML output and print them in a single file.")
parser.add_argument("--multiple", action="store_true",
help="Indicates that a multiple alignment was used")
parser.add_argument("--pairwise", action="store_true",
help="Indicates that a pairwise alignment was used.")
parser.add_argument("-i", help="Path to CodeML output directory")
parser.add_argument("-o", help="Name of output file.")
# Parse command
Expand All @@ -106,9 +161,13 @@ def main():
if indir[-1] != "/":
indir += "/"
outfile = args.o
multiple = args.multiple
pairwise = args.pairwise
concatenate(multiple, pairwise, indir, outfile)
infiles, typ = inputList(indir)
if typ == "pw":
pairwise(infiles, outfile)
elif typ == "brsite":
branchSite(infiles, outfile)
elif typ == "brspec":
branchSpecific(infiles, outfile)

if __name__ == "__main__":
main()

0 comments on commit 881b640

Please sign in to comment.