forked from Snitkin-Lab-Umich/cauris_dotplot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdotplot.py
More file actions
310 lines (287 loc) · 14.7 KB
/
dotplot.py
File metadata and controls
310 lines (287 loc) · 14.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import os
import subprocess
import argparse
import shutil
import pandas as pd
def generate_highlight_data(blast_query,dotplot_query,dotplot_subject,output_file,batch_name, minimum_evalue = 1e-5, minimum_identity = 0.8, minimum_coverage = 0.8):
    """Run caurisblast for *blast_query* against both dotplot fastas and write
    a combined highlight_data TSV to *output_file*.

    Parameters:
        blast_query: fasta of sequences whose hits should be highlighted.
        dotplot_query, dotplot_subject: the two fastas being dotplotted.
        output_file: destination TSV (columns: type, name, contig, start, end).
        batch_name: run name used to label the caurisblast output directories.
        minimum_evalue, minimum_identity, minimum_coverage: hit filters
            forwarded to blastdf_to_highlight_data.

    Raises SystemExit(1) if the clone fails, blastn is missing, or a
    caurisblast run returns a non-zero exit code.
    """
    highlight_data_all = []
    # if the caurisblast directory doesn't exist, clone it from github
    if not os.path.isdir('caurisblast'):
        clone_res = subprocess.run(['git','clone','https://github.com/Snitkin-Lab-Umich/caurisblast','caurisblast'])
        # previously the clone result was ignored; a failed clone would only
        # surface later as a confusing "blast.py not found" error
        if clone_res.returncode != 0:
            print('Failed to clone caurisblast from github.')
            raise SystemExit(1)
    # ensure the blast+ suite is available before launching any runs
    if shutil.which('blastn') is None:
        print('blastn not found in PATH. Please ensure blast+ is installed or loaded and blastn is in your PATH.')
        raise SystemExit(1)
    # run caurisblast using both arrangements of query and subject
    formatstr = '7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen sstrand'
    for specific in ['dotplot_subject','dotplot_query']:
        batch_name_specific = batch_name + '_' + specific
        print(f'Starting caurisblast for {specific}...')
        if specific == 'dotplot_query':
            blast_subject = dotplot_query
            blast_subject_name = suffix_trim(dotplot_query)
        if specific == 'dotplot_subject':
            blast_subject = dotplot_subject
            blast_subject_name = suffix_trim(dotplot_subject)
        # NOTE(review): the blast e-value passed to blast.py is hard-coded to
        # 1e-5 (it also appears in the results path below); minimum_evalue is
        # only applied when filtering the parsed hits afterwards — confirm
        # this asymmetry is intended.
        command = ['python3','caurisblast/blast.py','-q',blast_query,'-s',blast_subject,'-t','nucl','-n',batch_name_specific,'-k','blastn','-e','1e-5','-f',formatstr,'-v']
        res = subprocess.run(command)
        if res.returncode != 0:
            print(f'Error running caurisblast for {specific}, please check the log files.')
            raise SystemExit(1)
        # read the raw blast output as a csv, and use it to generate highlight rows
        raw_blast_output = f'caurisblast/results/{batch_name_specific}/{batch_name_specific}_nucl_blastn_1e-5_blast_results.csv'
        blastdf = read_blast(raw_blast_output)
        highlight_data = blastdf_to_highlight_data(blastdf,specific.replace('dotplot_',''),blast_subject_name,minimum_evalue,minimum_identity,minimum_coverage)
        highlight_data_all += highlight_data
    # write all collected rows to a tsv file
    with open(output_file,'w') as fh:
        _ = fh.write('type\tname\tcontig\tstart\tend\n')
        for hdata in highlight_data_all:
            towrite = [str(x) for x in hdata]
            _ = fh.write('\t'.join(towrite) + '\n')
def blastdf_to_highlight_data(blastdf,sequence_type,sequence_name, minimum_evalue = 1e-5, minimum_identity = 0.8, minimum_coverage = 0.8):
    """Convert a raw blast result DataFrame into highlight_data rows.

    Each hit that passes the identity/evalue/coverage filters becomes one row
    [type, name, contig, start, end], with start <= end regardless of the
    hit's strand orientation.

    Parameters:
        blastdf: DataFrame produced by read_blast().
        sequence_type: 'query' or 'subject' (which dotplot axis the hits
            belong to).
        sequence_name: label written into the 'name' column.
        minimum_evalue: maximum accepted e-value.
        minimum_identity: minimum fractional percent identity (0.8 = 80%).
        minimum_coverage: minimum alignment_length / query_length.

    Returns a list of 5-element lists.
    Side effect: adds 'assembly_name' and 'subject_scaffold' columns to the
    caller's DataFrame (both copies of 'subject_seqid').
    """
    highlight_data = []
    # kept for compatibility with the original column layout; both are
    # currently plain copies of subject_seqid
    blastdf['assembly_name'] = blastdf['subject_seqid']
    blastdf['subject_scaffold'] = blastdf['subject_seqid']
    # compute coverage once, on a copy, BEFORE grouping: assigning a new
    # column on a groupby slice (as the original did) is chained assignment
    # and triggers pandas' SettingWithCopyWarning
    working = blastdf.copy()
    working['coverage'] = working['alignment_length'] / working['query_length']
    for assembly_name, assembly_blastdf in working.groupby('assembly_name'):
        blastdf_filtered = assembly_blastdf[
            (assembly_blastdf['percent_identity'] >= minimum_identity * 100) &
            (assembly_blastdf['evalue'] <= minimum_evalue) &
            (assembly_blastdf['coverage'] >= minimum_coverage)
        ]
        for _, row in blastdf_filtered.iterrows():
            # order is type,name,contig,start,end; min/max normalizes
            # minus-strand hits whose start > end
            highlight_data.append([
                sequence_type,
                sequence_name,
                row['subject_seqid'],
                min(row['subject_start'], row['subject_end']),
                max(row['subject_start'], row['subject_end']),
            ])
    return highlight_data
def read_blast(file_path):
    """Load a tabular BLAST results file into a pandas DataFrame.

    Comment lines starting with '#' (blast outfmt 7 headers) are skipped;
    the remaining rows are tab-separated values matching the column set
    requested by generate_highlight_data's format string.
    """
    column_names = [
        'query_seqid', 'subject_seqid', 'percent_identity', 'alignment_length', 'mismatches_count', 'gapopen_count',
        'query_start', 'query_end', 'subject_start', 'subject_end', 'evalue', 'bitscore', 'query_length', 'subject_length', 'subject_strand',
    ]
    # name the columns at parse time instead of renaming afterwards
    return pd.read_csv(file_path, sep='\t', comment='#', header=None, names=column_names)
def check_query(input_path):
    """Validate the query path and return it.

    The query must be a single fasta file (.fasta/.fa/.fna); a directory or
    any other unrecognized path terminates the program.

    Raises SystemExit(1) on invalid input.
    """
    # directories were supported by an older multi-query version; now only a
    # single file is accepted
    if os.path.isdir(input_path):
        print('Please provide a single input file for the query!')
        raise SystemExit(1)
    if os.path.isfile(input_path) and check_suffix(input_path):
        return input_path
    print(f'Unrecognized path for {input_path}')
    raise SystemExit(1)
def check_suffix(path, suffix_list=('.fasta', '.fa', '.fna')):
    """Return True if *path* ends with one of the accepted fasta suffixes.

    The default is a tuple rather than the original mutable list (avoids the
    shared-mutable-default pitfall); str.endswith accepts a tuple of options
    directly, so no explicit loop is needed.
    """
    # tuple() guards against callers that still pass a list
    return path.endswith(tuple(suffix_list))
def suffix_trim(fname):
    """Return the basename of *fname* with its fasta extension removed.

    Only the TRAILING extension is stripped. The original implementation
    used split('.fa')[0], which truncated at the first '.fa' occurrence
    anywhere in the name (e.g. 'my.favorite.fasta' -> 'my').

    Raises SystemExit(1) for unrecognized extensions.
    """
    base = os.path.basename(fname)
    # '.fasta' must be checked before '.fa' so the longer suffix wins
    for suffix in ('.fasta', '.fa', '.fna'):
        if base.endswith(suffix):
            return base[:-len(suffix)]
    print(f'Incorrect file name when generating contig data for {fname}')
    raise SystemExit(1)
def run_nucmer(query_list,subject,output_dir,debug,rname):
    """Align each query fasta to *subject* with nucmer and produce .coord files.

    Parameters:
        query_list: list of query fasta paths (currently a single-element list).
        subject: subject fasta path.
        output_dir: directory the .delta/.coord files are moved into.
        debug: path of the debug log; each external command is appended to it.
        rname: run name used as the nucmer output file prefix.

    Returns the list of .coord file names (relative to output_dir).
    """
    output_file_list = []
    with open(debug,'a') as debug_log:
        for query in query_list:
            file_prefix = rname
            # create the alignment file (<rname>.delta)
            command1 = ['nucmer','-p',file_prefix,subject,query]
            subprocess.run(command1)
            _ = debug_log.write(' '.join(command1)+'\n')
            # create the coordinate file (<rname>.coord)
            with open(file_prefix + '.coord','w') as fh_coord:
                command2 = ['show-coords','-r','-c','-l','-T',file_prefix + '.delta']
                subprocess.run(command2,stdout = fh_coord)
                # bug fix: the original logged command1 here a second time
                # instead of the show-coords command actually executed
                _ = debug_log.write(' '.join(command2)+'\n')
            # move both files to the results directory
            subprocess.run(['mv',file_prefix + '.delta',output_dir])
            subprocess.run(['mv',file_prefix + '.coord',output_dir])
            # append the final output to the list
            output_file_list.append(file_prefix + '.coord')
    return output_file_list
def count_contig_len(input_file,output_file):
    """Write a TSV of contig_name -> contig_length for a fasta file.

    Contig names are truncated at the first space to stay consistent with
    how mummer and BLAST report sequence ids.

    Fixes over the original: blank lines no longer raise IndexError
    (line[0] on ''), and an empty file / sequence data before the first
    '>' header no longer crashes on a None record.
    """
    data_list = []
    current = None  # [name, length] of the contig being accumulated
    with open(input_file,'r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # skip blank lines (previously crashed on line[0])
                continue
            if line.startswith('>'):
                if current is not None:
                    data_list.append(current)
                # remove anything after a space to keep contig names
                # consistent with mummer and BLAST
                contig_name = line[1:].split(' ')[0]
                current = [contig_name, 0]
            elif current is not None:
                current[1] += len(line)
        if current is not None:
            data_list.append(current)
    with open(output_file,'w') as fh:
        _ = fh.write('contig_name\tcontig_length\n')
        for el in data_list:
            _ = fh.write('\t'.join([str(x) for x in el])+'\n')
def get_nucmer_coord(nucmer_dir):
    """List the .coord file names inside a nucmer results directory.

    Mirrors the output format of run_nucmer (file names, not full paths).
    Returns an empty list when *nucmer_dir* is not a directory.
    """
    # guard clause: a missing/invalid directory yields no files
    if not os.path.isdir(nucmer_dir):
        return []
    return [fname for fname in os.listdir(nucmer_dir) if fname.endswith('.coord')]
def make_plots(nucmer_dir,nucmer_files,contig_data_dir,highlight_data,output_dir,query_contig_data,subject_contig_data,rname):
    """Invoke make_plots.R once per nucmer .coord file to render the dotplot PDF.

    contig_data_dir is kept for interface compatibility (an earlier version
    derived the contig-data paths from each coord file name); the explicit
    query/subject contig-data paths are used instead.
    """
    for coord_file in nucmer_files:
        pdf_path = output_dir + f'{rname}.pdf'
        rscript_args = [
            'Rscript',
            'make_plots.R',
            nucmer_dir + coord_file,
            subject_contig_data,
            query_contig_data,
            highlight_data,
            pdf_path,
        ]
        subprocess.run(rscript_args)
def main():
    """CLI entry point for the dotplot pipeline.

    Either aligns a query fasta to a subject fasta with nucmer and renders a
    dotplot PDF, or re-plots a previously generated results directory
    (--alignments). A highlight file may be supplied directly (--highlight)
    or generated from a blast query fasta (--blast_query).
    """
    # define all args
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--query','-q',type=str,
        help='''Provide a query to perform the alignments to. This can be only be a single file!
        All files should be in the fasta format (.fasta or .fa).''',
        default=None
    )
    parser.add_argument(
        '--subject','-s',type=str,
        help='''Provide a subject sequence in the nucleotide fasta format (.fasta, .fa, or .fna). This should be a single file.''',
        default=None
    )
    parser.add_argument(
        '--name','-n',type=str,help='''(Optional) Provide a name for the run. This name will be used for all outputs.''',
        default='new_search'
    )
    parser.add_argument(
        '--highlight','-hl',type=str,help='''(Optional) Provide a file containing regions to highlight on the plot. This needs to
        match the format of the provided highlight_data.tsv file.''',
        default='NA'
    )
    parser.add_argument(
        '--blast_query','-bq',type=str,help='''(Optional) Provide a fasta file containing sequences to highlight on the plot. A highlight_data.tsv file will be generated
        using caurisblast.''',
        default='NA'
    )
    parser.add_argument(
        '--alignments','-a',type=str,help='''(Optional) Provide a path to a previously-generated results directory. This will remake the plots using the
        Nucmer alignments and contig data present in the directory.''',
        default=None
    )
    args = parser.parse_args()
    # mutual-exclusion checks; these statements don't cover all possible inputs
    # (e.g. query without subject plus alignments slips through)
    if (args.query is None or args.subject is None) and (args.alignments is None):
        print('No query+subject or results directory provided')
        quit(1)
    if (args.query is not None and args.subject is not None) and (args.alignments is not None):
        print('Please provide either a query+subject or a results directory with Nucmer alignments, not both.')
        quit(1)
    # change working directory to location of script so the relative output
    # paths below land next to the script regardless of invocation cwd
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    # define all output directories, rooted at the run name
    nucmer_output_dir = f'{args.name}/nucmer/'
    contig_data_dir = f'{args.name}/contig_data/'
    plot_output_dir = f'{args.name}/plots/'
    debug_log_dir = f'{args.name}/'
    debug_log_file = f'{args.name}/{args.name}_debug_log.txt'
    # generate the directories (nucmer/contig_data dirs are skipped when
    # reusing an existing alignments directory)
    dirlist = [debug_log_dir,contig_data_dir,nucmer_output_dir,plot_output_dir]
    if args.alignments is not None:
        dirlist = [debug_log_dir,plot_output_dir]
    for d in dirlist:
        if not os.path.isdir(d):
            subprocess.call(['mkdir','-p',d])
    # create (truncate) the debug log file
    with open(debug_log_file,'w') as fh:
        _ = fh.write(f'debug log for {args.name}\n')
    # if the alignments aren't already provided, generate them
    if args.alignments is None:
        # check that subject and query exist
        for f in [args.query,args.subject]:
            if not os.path.exists(f):
                print(f'Could not locate file or directory at {f}')
                quit(1)
        # validate the query path; this is a single file path (an older
        # version returned a list of fastas)
        query_fasta = check_query(args.query)
        # determine name for query, then get contig lengths
        query_name = suffix_trim(query_fasta)
        query_contig_data = contig_data_dir + query_name + '_contig_data.csv'
        count_contig_len(query_fasta,query_contig_data)
        # determine name for subject, then get contig lengths
        subject_name = suffix_trim(args.subject)
        subject_contig_data = contig_data_dir + subject_name + '_contig_data.csv'
        count_contig_len(args.subject,subject_contig_data)
        # generate the highlight_data file if a blast query is provided;
        # note this overrides any --highlight value passed on the CLI
        if args.blast_query != 'NA':
            if not os.path.isfile(args.blast_query):
                print(f'Could not locate blast query file at {args.blast_query}')
                quit(1)
            else:
                args.highlight = args.name + '_highlight_data.tsv'
                generate_highlight_data(args.blast_query,query_fasta,args.subject,args.highlight,args.name, minimum_evalue = 1e-5, minimum_identity = 0.8, minimum_coverage = 0.8)
        # run nucmer, aligning each query to the subject
        coord_file_list = run_nucmer(query_list=[query_fasta], subject=args.subject, output_dir=nucmer_output_dir, debug=debug_log_file, rname=args.name)
    # if the alignments are provided, make sure everything looks as expected
    else:
        if os.path.isdir(args.alignments):
            if not args.alignments.endswith('/'):
                args.alignments+='/'
            # replace paths with new versions rooted at the provided directory
            nucmer_output_dir = args.alignments + 'nucmer/'
            contig_data_dir = args.alignments + 'contig_data/'
            # get the list of .coord files
            coord_file_list = get_nucmer_coord(nucmer_output_dir)
            if coord_file_list == []:
                print(f'No .coord files located in {args.alignments}')
                quit(1)
        else:
            print(f'Could not locate results directory at {args.alignments}')
            quit(1)
    # NOTE(review): in the --alignments branch above, query_contig_data and
    # subject_contig_data are never assigned, so the make_plots call below
    # raises NameError when reusing a results directory — confirm whether
    # those paths should be derived from contig_data_dir and fix upstream.
    # make plots using all current paths
    make_plots(
        nucmer_dir=nucmer_output_dir,nucmer_files=coord_file_list,contig_data_dir=contig_data_dir,
        highlight_data=args.highlight,output_dir=plot_output_dir, query_contig_data = query_contig_data,
        subject_contig_data=subject_contig_data,rname=args.name)
    print(f'Finished making plots!')
    print(f'Location of plots: {plot_output_dir}')
    print(f'Location of Nucmer alignments: {nucmer_output_dir}')

if __name__ == "__main__":
    main()