-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathInTrans.py
More file actions
424 lines (366 loc) · 15.1 KB
/
InTrans.py
File metadata and controls
424 lines (366 loc) · 15.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
# -*- coding: utf-8 -*-
import configparser
import os,sys,shutil
sys.path.append('..')
import logging
import time
import subprocess
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename='./log',
filemode='w')
cfg=sys.argv[1]
__config__ = configparser.ConfigParser()
__config__.read(cfg,encoding="utf-8")
def loggingtime(funcname,starttime):
logging.info(funcname + ' Time Running %f s' %(time.time()-starttime))
global start
start = time.time()
# Check if the folder exists
def CheckFileDirs():
try:
if not os.path.exists('./output'):
os.mkdir('./output')
if not os.path.exists('./temp_output'):
os.mkdir('./temp_output')
except Exception as e:
logging.error("CheckFileDirs Function Wrong",e)
print("CheckFileDirs Wrong",e)
# find the first different word's index between str1 and str2
def FindStrIndex(str1,str2):
length = len(str1)
if len(str1) > len(str2): length = len(str2)
for i in range(length):
if str1[i] != str2[i]:
break
return i
# IDBA find the best file to use
def IDBAFindBestFile(filedir):
if not os.path.exists(filedir):
print("Didn't find the file, Check '" + filedir + "/' has the log file")
return ''
try:
BestKmer = {}; TempKmer = ''
logfile = filedir + '/log'
fp = open(logfile,'r')
line = fp.readline()
i = 0
while line:
if 'kmer' in line and "kmers" not in line:
TempKmer = line.replace('kmer','').strip()
BestKmer[TempKmer] = ''
if 'contigs' in line and 'length' in line:
i += 1
if i == 3 and 'contigs' in line:
if line.find('n50') != -1:
tempN50 = line[line.find('n50')+4:line.find('max')-1]
BestKmer[TempKmer] = int(tempN50.strip())
i = 0
line = fp.readline()
BestKmerNum = sorted(BestKmer.items(), key=lambda item:item[1], reverse=True)[0][0]
fp.close()
# print(BestKmerNum)
logging.info("for " + filedir +" we seletced the de novo assemble result when kmer=" + str(BestKmerNum) + " for the biggest N50 we got with this kmer parameter")
return 'transcript-' + BestKmerNum + '.fa'
except Exception as e:
logging.error("IDBAFindBestFile Function Wrong", e)
print("IDBAFindBestFile Wrong",e)
exit(0)
# IDBA Function #
def IDBA():
logging.info("Start Running IDBA:")
try:
path = __config__.get("seq data","fastq_dir")
if path is None or not os.path.exists(path):
logging.error("Run.cfg didn't set right fastq_dir, can't find your path")
print("Run.cfg didn't set right fastq_dir")
exit(0)
if path[-1] != '/':
path = path + '/'
try:
Pairsamplelist = eval(__config__.get('seq data', 'fastq_files'))
except Exception as e:
logging.error("fastq_files format error!",e)
print("fastq_files format error!")
mink = __config__.get("IDBA","mink")
maxk = __config__.get("IDBA","maxk")
step = __config__.get("IDBA","step")
num_threads = __config__.get("IDBA","num_threads")
IDBABestFiles = []
for sample in Pairsamplelist:
IDBACombineStr = "fq2fa --merge "
samplelist = sample.strip().split(" ")
# print(samplelist)
for sam in samplelist:
if not os.path.exists(path + sam):
logging.error("Didn't find your file. Check your fastq_files's format is right.")
print("Didn't find your file. Check your fastq_files's format is right.")
exit(0)
IDBACombineStr = IDBACombineStr + str(path + sam) + ' '
if len(samplelist) > 1:
sampledir = './temp_output/' + samplelist[0][0:FindStrIndex(samplelist[0],samplelist[1])-1] + '.fa'
else:
sampledir = './temp_output/' + samplelist[0] + '.fa'
IDBACombineStr = IDBACombineStr + str(sampledir) + ' '
# print(IDBACombineStr)
try:
subprocess.call(IDBACombineStr,shell=True)
# pp.communicate()
# os.system(IDBACombineStr)
except subprocess.CalledProcessError as e:
logging.error("IDBA first Command Line is wrong.",e.output,e.returncode)
exit(0)
tempdir = sampledir.replace('.fa','')
IDBAExecute = "idba_tran -r &sample -o &output --mink &mink --maxk &maxk --step &step --num_threads &num_threads".replace("&sample",sampledir).replace("&output",tempdir).replace("&mink",mink).replace("&maxk",maxk).replace("&step",step).replace("&num_threads",num_threads)
# print(IDBAExecute)
try:
os.system(IDBAExecute)
except Exception as e:
logging.error("IDBA second Command line is wrong.")
exit(0)
# print(tempdir)
IDBABestFiles.append(tempdir + '/' + IDBAFindBestFile(tempdir))
# print(IDBABestFiles)
logging.info("IDBA Completed")
return IDBABestFiles
except Exception as e:
logging.error("IDBA Function Wrong:" + str(e))
print("IDBA Wrong",e)
exit(0)
# change name of HeteroFiles
def HeteroNewFiles(heterodir):
if not os.path.exists(heterodir):
logging.error("Didn't find the heterodir file")
exit(0)
try:
newHeteroFiles = './temp_output/HeteroFile/' + heterodir.split('/')[-1]
if not os.path.exists("./temp_output/HeteroFile/"):
os.makedirs('./temp_output/HeteroFile/')
rb = open(heterodir,'r')
wb = open(newHeteroFiles, 'w')
rline = rb.readline()
while rline:
if '>' in rline:
rline = rline[:rline.find('>') + 1] + " Hetero " + rline [rline.find('>') + 1:]
wb.write(rline)
rline = rb.readline()
wb.close()
rb.close()
return newHeteroFiles
except Exception as e:
logging.error("HeteroNewFiles Function Wrong", e)
print("HeteroNewFiles Wrong:" + str(e))
exit(0)
# CAT Function
def CATCombine(IDBABestList):
logging.info("Start Running CAT:")
if IDBABestFiles is None or (type(IDBABestList[0]) is str and '.fa' not in IDBABestList[0]):
print("Can't find the IDBA files")
logging.error("Can't find the IDBA files")
exit(0)
try:
heter_prefix_path = __config__.get("seq data","heter_prefix")
# define heter_file
is_heterfile = True
if heter_prefix_path is None or len(heter_prefix_path.strip(' ')) == 0:
logging.warning("Run.cfg didn't set heter_prefix")
is_heterfile = False
if len(heter_prefix_path) !=0 and heter_prefix_path[-1] != '/':
heter_prefix_path = heter_prefix_path + '/'
try:
Heterogeneous_data_files = eval(__config__.get('seq data', 'heter_files'))
#Heterogeneous_data_files = __config__.get('seq data', 'heter_files')
if Heterogeneous_data_files is None:
logging.warning("Run.cfg didn't set heter_files list")
is_heterfile = False
elif len(Heterogeneous_data_files) == 0:
logging.warning("Run.cfg didn't set heter_files list")
is_heterfile = False
except Exception as e:
logging.error("Check your heter_files format.")
print("heter_files format Error!")
exit(0)
if is_heterfile:
HeteroNewFileList = []
for heterofile in Heterogeneous_data_files:
heterofiledir = heter_prefix_path + heterofile
temp = HeteroNewFiles(heterofiledir)
if temp is None:
return
HeteroNewFileList.append(temp)
# filename = IDBABestList[0].split('/')[-2]
cmdline = 'cat '
for IDBAFile in IDBABestList:
cmdline = cmdline + IDBAFile + ' '
if is_heterfile:
for newfile in HeteroNewFileList:
cmdline = cmdline + newfile + ' '
cmdline = cmdline + '> ./temp_output/total.fa'
try:
subprocess.call(cmdline,shell=True)
except Exception as e:
logging.error("CAT Command line is wrong")
exit(0)
if not os.path.exists('./temp_output/total.fa'):
logging.error("Must be something wrong here!")
exit(0)
logging.info("CAT Completed")
return './temp_output/total.fa'
except Exception as e:
logging.error("CAT Function Wrong:" + str(e))
print("CAT Combine Wrong",e)
exit(0)
# CD-HIT-EST Function
def CDHITEST(CATFile):
logging.info("Start Runing CD-HIT-EST")
if CATFile is None or ('.fa' not in CATFile):
logging.error("Can't find the CAT files")
exit(0)
try:
c = __config__.get("CD-hit-est","c")
aS = __config__.get("CD-hit-est","aS")
T = __config__.get("CD-hit-est","T")
if not os.path.exists(CATFile):
logging.error(str(CATFile) + " not exit.")
exit(0)
chefunction = "cd-hit-est -i &inputfile -o &outputfile -c &c -aS &aS -T &T -M 0".replace("&inputfile",CATFile).replace("&outputfile","./temp_output/CD-HIT").replace("&c",c).replace("&aS",aS).replace("&T",T)
# print(chefunction)
try:
subprocess.call(chefunction,shell=True)
except Exception as e:
logging.error("CD-HIT-EST Command line is wrong.")
exit(0)
if not os.path.exists('./temp_output/CD-HIT'):
logging.error("Didn't generate the CD-HIT file")
exit(0)
logging.info("CD-HIT-EST Completed")
return './temp_output/CD-HIT'
except Exception as e:
logging.error("CD-HIT-EST Function Wrong:" + str(e))
print("CD-HIT-EST Wrong",e)
exit(0)
# CAP3 Funtion
def CAP3(CHEFile):
logging.info("Start Running CAP3:")
if CHEFile is None:
logging.error("Can't find the CD-HIT-EST files")
exit(0)
try:
if not os.path.exists("./temp_output/CAP"):
os.makedirs("./temp_output/CAP")
filename = CHEFile.split('/')[-1]
newFiledir = './temp_output/CAP/'+ filename
shutil.copy(CHEFile,newFiledir)
cmdline = 'cap3 ' + newFiledir
subprocess.call(cmdline,shell=True)
if not os.path.exists(newFiledir+'.cap.contigs') or not os.path.exists(newFiledir+'.cap.singlets'):
logging.error("cap files are not exits.")
exit(0)
cmdline = 'cat ' + newFiledir+'.cap.contigs' + ' ' + newFiledir+'.cap.singlets' + ' > ./output/Final.fa'
try:
subprocess.call(cmdline,shell=True)
except Exception as e:
logging.error("CAP3 Command line is wrong.")
exit(0)
# print(cmdline)
logging.info("CAP3 Completed")
return './output/Final.fa'
except Exception as e:
logging.error("CAP3 Function Wrong:" + str(e))
print("CAP3 Wrong",e)
exit(0)
# Refine Function
def Refine(FinalFile):
try:
removelen = __config__.get("transcript","removelen")
try:
if 'bp' not in removelen:
logging.error("removelen must be interger + bg. like 300bp.")
print("removelen must be interger + bg. like 300bp")
exit(0)
removelen = int(removelen[:-2])
print(removelen)
except Exception as e:
logging.error("removelen must be interger + bg. like 300bp.")
print("removelen must be interger + bg. like 300bp")
exit(0)
newFileName = './output/transcript.fa'
fp = open(FinalFile,'r')
wr = open(newFileName,'w')
line = fp.readline()
tempname = 'ThisisByAnt'
tempGene = []
tempGeneNum = 0
if os.path.exists('./temp_output/HeteroFile/'):
HeteroFiles = [ './temp_output/HeteroFile/' + x for x in os.listdir('./temp_output/HeteroFile/')]
is_HeteroFiles = True
else:
is_HeteroFiles = False
# print(HeteroFiles)
while line:
if '>' in line:
# print(tempGeneNum)
if tempGeneNum < removelen:
# print("a:")
tempname = line
line = fp.readline()
continue
if is_HeteroFiles:
for file in HeteroFiles:
# print("b:")
fr = open(file,'r')
if line in fr.read():
tempname = line
fr.close()
break
fr.close()
# print(tempname,line)
if tempname != line:
# print("c:")
wr.write(tempname)
wr.writelines(tempGene)
tempname = line
tempGene = []
tempGeneNum = 0
else:
tempGeneNum = tempGeneNum + len(line) - 1
tempGene.append(line)
line = fp.readline()
wr.close()
fp.close()
return True
except Exception as e:
logging.error("Refine Function Wrong:" + str(e))
print("Refine Wrong",e)
exit(0)
if __name__=='__main__':
global start
start = time.time()
CheckFileDirs()
loggingtime('ChenkFileDirs', start)
IDBABestFiles = IDBA()
loggingtime('IDBA', start)
Totalfile = CATCombine(IDBABestFiles)
loggingtime('CAT', start)
CHEFile = CDHITEST(Totalfile)
loggingtime('CD-HIT-EST', start)
FinalFile = CAP3(CHEFile)
loggingtime('CAP3', start)
defaultFunction = __config__.get("transcript","refinement")
if defaultFunction.lower() != 'yes' and FinalFile is not None:
if os.path.exists(FinalFile):
os.rename(FinalFile,"./output/transcript.fa")
print("The file is integrated in the current output directory.")
#shutil.rmtree('./temp_output')
elif defaultFunction.lower() == 'yes' and FinalFile is not None:
if Refine(FinalFile):
if os.path.exists(FinalFile):
os.remove(FinalFile)
print("The file is integrated in the current output directory.")
#shutil.rmtree('./temp_output')
else:
logging.error("Output Wrong")
print("Output Wrong")
loggingtime('Final', start)
logging.info("Success!")