Skip to content

Commit 0fbdafd

Browse files
committed
Allows user supplied phylogenic tree.
1 parent ee6b8c9 commit 0fbdafd

6 files changed

Lines changed: 112 additions & 82 deletions

File tree

AlignmentProcessor.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -135,15 +135,17 @@ def calculateKaKs(outdir, method, cpu):
135135
if ck.returncode == 0:
136136
return True
137137

138-
def runcodeml(cpu, outdir, forward, cleanup):
138+
def runcodeml(cpu, outdir, forward, ntree, cleanup):
139139
'''Runs codeml on a directory.'''
140140
# Build commands and add options if necessary
141141
cmd = ("python bin/04_CallCodeML.py -t " + str(cpu) + " -i " + outdir +
142142
"03_phylipFiles" + " -o " + outdir + "04_CodemlOutput")
143-
if cleanup == False:
143+
if cleanup == True:
144144
cmd += " --cleanUp"
145145
if forward:
146146
cmd += " -f " + forward
147+
if ntree:
148+
cmd += " -n " + ntree
147149
cm = Popen(split(cmd))
148150
cm.wait()
149151
if cm.returncode == 0:
@@ -156,7 +158,7 @@ def main():
156158
# Set arguments
157159
parser = argparse.ArgumentParser(description="AlignmentProcessor will run \
158160
the subsituion rate pipeline to produce trimmed axt or phylip files for use \
159-
with KaKs_calculator or PhyMl.\nAlignmentProcessor1.2 Copyright 2016 by \
161+
with KaKs_calculator or PhyMl.\nAlignmentProcessor1.4 Copyright 2016 by \
160162
Shawn Rupp\nThis program comes with ABSOLUTELY NO WARRANTY\nThis is free \
161163
software, and you are welcome to redistribute it under certain conditions\n")
162164
parser.add_argument("-i", help="Path to input file.")
@@ -182,7 +184,8 @@ def main():
182184
parser.add_argument("-t", type=int, default=1, help="Number of threads.")
183185
parser.add_argument("-f", default="",
184186
help="Forward species (name must be the same as it appears in input files.")
185-
parser.add_argument("--cleanUp", action="store_false",
187+
parser.add_argument("-n", help = "Path to optional Newick tree.")
188+
parser.add_argument("--cleanUp", action="store_true",
186189
help="Remove temporary files (it may be useful to retain phylogenic trees \
187190
for future use).")
188191
# Parse arguments and assign to variables
@@ -201,6 +204,7 @@ def main():
201204
codeml = args.codeml
202205
cpu = args.t
203206
forward = args.f
207+
ntree = args.n
204208
cleanup = args.cleanUp
205209
# Check inout commands prior to running:
206210
if not ref:
@@ -231,7 +235,7 @@ def main():
231235
done = calculateKaKs(outdir, method, cpu)
232236
# Run codeml
233237
elif codeml == True:
234-
done = runcodeml(cpu, outdir, forward, cleanup)
238+
done = runcodeml(cpu, outdir, forward, ntree, cleanup)
235239
else:
236240
# Exit if neither program was called
237241
done = True

AlignmentProcessorReadMe.pdf

1.74 KB
Binary file not shown.

bin/01_SplitFasta.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,40 +24,51 @@ def splitFasta(infile, outdir):
2424
# Parse input fasta
2525
with open(infile, "r") as fasta:
2626
newid = True
27+
prev = True
2728
seq = ""
2829
n = 0
2930
for line in fasta:
30-
if line != "\n":
31+
if line.strip():
3132
if line[0] == ">":
32-
line, geneid = convertHeader(line)
33+
prev = True
34+
build, geneid = convertHeader(line)
3335
# Concatenate lines for all species for each gene
34-
seq += str(line)
35-
# Determine number of sequences and species names
36-
n += 1
3736
if newid == True:
3837
# Set reference species ID as file name
3938
filename = geneid
4039
newid = False
4140
else:
4241
# Concatenate remaining lines
43-
seq += str(line)
44-
elif line == "\n" and newid == False:
42+
line = line.upper()
43+
if ("A" not in line or "C" not in line or "G" not in line
44+
or "T" not in line):
45+
pass
46+
else:
47+
# Save gene if it contains nucleotides
48+
if prev == True:
49+
n += 1
50+
seq += build
51+
prev == False
52+
seq += str(line)
53+
elif not line.strip() and newid == False:
4554
# Use empty lines to determine where genes end
46-
if n >= 2:
55+
if n >= 2 and seq.count("\n") > 3:
4756
# Print gene sequences to file if there are at least two
4857
# species and reset for next gene
4958
outfile = (outdir + filename + "." + str(n) + ".fa")
5059
with open(outfile, "w") as output:
5160
output.write(seq)
5261
newid = True
62+
prev = False
5363
seq = ""
5464
n = 0
5565
passed += 1
56-
elif n < 2:
66+
else:
5767
# Skip genes with only one sequence and save ID in log
5868
with open(log, "a") as logfile:
5969
logfile.write(geneid + "\n")
6070
excluded += 1
71+
newid = True
6172
with open(log, "a") as logfile:
6273
logfile.write(("\nTotal transcripts written to file: {}\n").format(passed))
6374
logfile.write(("Total transcripts with only one sequence: {}").format(excluded))
@@ -68,13 +79,24 @@ def convertHeader(line):
6879
# Extract relevant data from UCSC header
6980
genebuild = line[1:].split()[0]
7081
genebuild = genebuild.split("_")
71-
build = ">" + str(genebuild[1]) + "\n"
72-
geneid = str(genebuild[0].split(".")[0])
82+
if line[1] == "E":
83+
# Ensembl IDs
84+
build = ">" + str(genebuild[1]) + "\n"
85+
geneid = str(genebuild[0].split(".")[0])
86+
elif line[1] == "N":
87+
# NCBI IDs
88+
build = ">" + str(genebuild[2]) + "\n"
89+
geneid = str(genebuild[0]) + "_" + str(genebuild[1])
7390
else:
7491
# Extract build and geneid
7592
build = ">" + line.split(".")[0][1:].rstrip() + "\n"
7693
geneid = str(line.split(".")[1]).rstrip()
77-
return build, geneid
94+
if geneid and build:
95+
return build, geneid
96+
else:
97+
print("Please use a fasta file with Ensembl, NCBI, or Galaxy Stitch \
98+
Gene Blocks IDs.")
99+
quit()
78100

79101
def main():
80102
parser = argparse.ArgumentParser(description="This will take the \

bin/02_FilterFasta.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def seqDict(fasta, n):
8282
species = line[1:].strip()
8383
if line[0] != ">":
8484
codons = []
85-
seq = line.strip().upper()
85+
seq = line.strip()
8686
for i in range(0, len(seq), 3):
8787
codons.append(seq[i:i +3])
8888
i += 3
@@ -130,7 +130,6 @@ def fixCodons(newseq):
130130
def removeStops(n, newseq, retainStops, geneid, log):
131131
'''This will replace internal stop codons with gaps, create a list
132132
of genes with internal stop codons, and remove those sequences.'''
133-
count = n
134133
remove = []
135134
for species in newseq:
136135
# Replace terminal stop codons so the program can identify
@@ -142,7 +141,7 @@ def removeStops(n, newseq, retainStops, geneid, log):
142141
newseq[species][-1] = "---"
143142
except (IndexError, ValueError) as e:
144143
# Remove empty sequences
145-
count -= 1
144+
n -= 1
146145
del newseq[species]
147146
with open(log, "a") as logfile:
148147
logfile.write(geneid + "\t" + species +
@@ -163,18 +162,18 @@ def removeStops(n, newseq, retainStops, geneid, log):
163162
if retainStops == False:
164163
# Remove key after iterating to avoid using multiple breaks
165164
for species in remove:
166-
count -= 1
165+
n -= 1
167166
del newseq[species]
168-
if count >= 2:
169-
return True, newseq, count
170-
elif count < 2:
167+
if n >= 2:
168+
return True, newseq, n
169+
elif n < 2:
171170
# Skip files with only one sequence left
172171
pass
173172

174173
def countBases(n, newseq, percent, geneid, log):
175174
'''Counts the number of nucleotides and only writes the sequence to an
176175
output file if they compose greater than the cutoff threshold of the sequence'''
177-
count = n
176+
failed = []
178177
for species in newseq:
179178
seq = ""
180179
for i in newseq[species]:
@@ -187,21 +186,20 @@ def countBases(n, newseq, percent, geneid, log):
187186
aligned = acount + tcount + ccount + gcount
188187
try:
189188
# Determine whether or not the sequence passes the treshold
190-
if aligned/len(seq) >= percent:
191-
pass
192-
else:
189+
if aligned/len(seq) < percent:
193190
# Remove sequences that do not meet the threshold
194-
del newseq[species]
191+
failed.append(species)
195192
with open(log, "a") as logfile:
196193
logfile.write(geneid + "\t" + species +
197194
"\tLow Nucleotide Count\n")
198-
count -= 1
199-
break
200195
except ZeroDivisionError:
201-
del newseq[species]
202-
if count >= 2:
203-
return True, newseq, count
204-
elif count < 2:
196+
failed.append(species)
197+
for i in failed:
198+
del newseq[i]
199+
n -= 1
200+
if n >= 2:
201+
return True, newseq, n
202+
elif n < 2:
205203
# Do not proceed if dict does not have at least two sequences
206204
pass
207205

bin/04_CallCodeML.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,9 @@ def main():
7373
ap = os.getcwd() + "/"
7474
if " " in ap:
7575
# Change to warning ########################################################
76-
print("\tError: AlignmentProcessor will not run properly if there \
76+
print("\tWARNING: AlignmentProcessor will not run properly if there \
7777
is a space in its PATH name.")
78-
quit()
78+
ap = ap.replace(" (ASU)", "")
7979
run = False
8080
# Parse command
8181
parser = argparse.ArgumentParser(description="Runs CodeML on all files \
@@ -85,6 +85,7 @@ def main():
8585
parser.add_argument("-t", type=int, default=1, help="Number of threads.")
8686
parser.add_argument("-f", default="",
8787
help="Forward species (name must be the same as it appears in input files.")
88+
parser.add_argument("-n", help = "Path to optional Newick tree.")
8889
parser.add_argument("--cleanUp", action="store_true",
8990
help="Remove temporary files (it may be useful to retain phylogenic trees \
9091
for future use).")
@@ -100,6 +101,7 @@ def main():
100101
if cpu > MAXCPU:
101102
cpu = MAXCPU
102103
forward = args.f
104+
ntree = args.n
103105
# Reads in required data
104106
finished, completed = outputFiles(outdir)
105107
ctl, multiple = controlFiles(indir, outdir, forward, cpu)
@@ -109,7 +111,7 @@ def main():
109111
genes = glob(indir + "*.phylip")
110112
l = int(len(genes))
111113
func = partial(parallelize, ap, outdir, finished, completed, multiple,
112-
cpu, ctl, forward)
114+
cpu, ctl, forward, ntree)
113115
print(("\tRunning CodeML on {0!s} genes with {1!s} threads...."
114116
).format(l, cpu))
115117
pool = Pool(processes = cpu)

bin/parallelCodeML.py

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from glob import glob
1010
from subprocess import Popen
1111
from shlex import split
12+
from shutil import rmtree
1213
import math
1314
import shutil
1415
import os
@@ -21,40 +22,39 @@ def makeTree(ap, gene, wd, treefile, forward):
2122
# Call PhyML to make gene tree
2223
phy = Popen(split(ap + "PhyML/PhyML -q -i " + gene), stdout = DEVNULL)
2324
phy.wait()
24-
if phy.returncode == 0:
25-
# Move PhyML output to temp directory
26-
output = glob(gene + "_phyml_*")
27-
for i in output:
28-
shutil.copy(i, wd)
29-
os.remove(i)
30-
# Read in PhyML tree
31-
with open(treefile, "r") as genetree:
32-
try:
33-
tree = genetree.readlines()[0]
34-
except IndexError:
35-
pass
36-
# Remove branch lables introduced by PhyML
37-
tree = re.sub(r"\d+\.\d+:", ":", tree)
38-
# Add forward node to tree if specified
39-
if forward:
40-
if forward in tree:
41-
# Determine location and length of species name
42-
i = tree.index(forward) + len(forward)
43-
if ":" in tree:
44-
# Find end of branch length
45-
comma = tree.find(",", i)
46-
paren = tree.find(")", i)
47-
i = min([comma, paren])
48-
# Insert space and node symbol after species name
49-
tree = (tree[:i] + " #1" + tree[i:])
50-
elif forward not in tree:
25+
# Move PhyML output to temp directory
26+
output = glob(gene + "_phyml_*")
27+
for i in output:
28+
shutil.copy(i, wd)
29+
os.remove(i)
30+
# Read in PhyML tree
31+
with open(treefile, "r") as genetree:
32+
try:
33+
tree = genetree.readlines()[0]
34+
except IndexError:
5135
pass
52-
with open(treefile, "w") as outtree:
53-
# Overwrite treefile
54-
string = ""
55-
for i in tree:
56-
string += i
57-
outtree.write(string)
36+
# Remove branch lables introduced by PhyML
37+
tree = re.sub(r"\d+\.\d+:", ":", tree)
38+
# Add forward node to tree if specified
39+
if forward:
40+
if forward in tree:
41+
# Determine location and length of species name
42+
i = tree.index(forward) + len(forward)
43+
if ":" in tree:
44+
# Find end of branch length
45+
comma = tree.find(",", i)
46+
paren = tree.find(")", i)
47+
i = min([comma, paren])
48+
# Insert space and node symbol after species name
49+
tree = (tree[:i] + " #1" + tree[i:])
50+
elif forward not in tree:
51+
pass
52+
with open(treefile, "w") as outtree:
53+
# Overwrite treefile
54+
string = ""
55+
for i in tree:
56+
string += i
57+
outtree.write(string)
5858

5959
def makeCtl(gene, outfile, tempctl, treefile, ctl):
6060
'''Creates unique control file'''
@@ -72,7 +72,7 @@ def makeCtl(gene, outfile, tempctl, treefile, ctl):
7272
#-----------------------------------------------------------------------------
7373

7474
def parallelize(ap, outdir, finished, completed, multiple, cpu, ctl,
75-
forward, gene):
75+
forward, treefile, gene):
7676
filename = gene.split("/")[-1]
7777
geneid = filename.split(".")[0]
7878
if (geneid + "\n") in completed:
@@ -90,27 +90,31 @@ def parallelize(ap, outdir, finished, completed, multiple, cpu, ctl,
9090
if multiple == True:
9191
if filename.split(".")[1] == "2":
9292
# Skip pairwise genes and remove directory
93-
os.rmdir(wd)
93+
rmtree(wd)
9494
pass
9595
else:
96-
treefile = wd + filename + "_phyml_tree.txt"
97-
# Make control file and run PhyML
96+
if not treefile:
97+
# Run Phyml
98+
treefile = wd + filename + "_phyml_tree.txt"
99+
makeTree(ap, gene, wd, treefile, forward)
100+
# Make control file
98101
makeCtl(gene, outfile, tempctl, treefile, ctl)
99-
makeTree(ap, gene, wd, treefile, forward)
100102
os.chdir(wd)
101103
# Call CodeML
102-
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
103-
stdout = DEVNULL)
104-
cm.wait()
104+
with open(wd + "codemlLog.txt", "w") as tmpout:
105+
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
106+
shell = True, stdout = tmpout)
107+
cm.wait()
105108
elif multiple == False:
106109
# Make control file
107110
treefile = wd + filename + ""
108111
makeCtl(gene, outfile, tempctl, treefile, ctl)
109112
# Call CodeML for all files
110113
os.chdir(wd)
111-
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
112-
stdout = DEVNULL)
113-
cm.wait()
114+
with open(wd + "codemlLog.txt", "w") as tmpout:
115+
cm = Popen(split(ap + "paml/bin/codeml " + tempctl),
116+
shell = True, stdout = tmpout)
117+
cm.wait()
114118
with open(finished, "a") as fin:
115119
# Append gene id to list when done
116120
fin.write(geneid + "\n")

0 commit comments

Comments
 (0)