forked from Snitkin-Lab-Umich/cauris_dotplot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdotplot.py
More file actions
310 lines (287 loc) · 14.7 KB
/
dotplot.py
File metadata and controls
310 lines (287 loc) · 14.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import os
import subprocess
import argparse
import shutil
import pandas as pd
def generate_highlight_data(blast_query,dotplot_query,dotplot_subject,output_file,batch_name, minimum_evalue = 1e-5, minimum_identity = 0.8, minimum_coverage = 0.8):
    """Run caurisblast for *blast_query* against both dotplot fastas and write
    a combined highlight_data TSV to *output_file*.

    Parameters:
        blast_query: fasta of sequences whose hits should be highlighted.
        dotplot_query, dotplot_subject: the two fastas being dotplotted.
        output_file: destination TSV (columns: type, name, contig, start, end).
        batch_name: run name used to label the caurisblast output directories.
        minimum_evalue, minimum_identity, minimum_coverage: hit filters
            forwarded to blastdf_to_highlight_data.

    Raises SystemExit(1) if the clone fails, blastn is missing, or a
    caurisblast run returns a non-zero exit code.
    """
    highlight_data_all = []
    # if the caurisblast directory doesn't exist, clone it from github
    if not os.path.isdir('caurisblast'):
        clone_res = subprocess.run(['git','clone','https://github.com/Snitkin-Lab-Umich/caurisblast','caurisblast'])
        # previously the clone result was ignored; a failed clone would only
        # surface later as a confusing "blast.py not found" error
        if clone_res.returncode != 0:
            print('Failed to clone caurisblast from github.')
            raise SystemExit(1)
    # ensure the blast+ suite is available before launching any runs
    if shutil.which('blastn') is None:
        print('blastn not found in PATH. Please ensure blast+ is installed or loaded and blastn is in your PATH.')
        raise SystemExit(1)
    # run caurisblast using both arrangements of query and subject
    formatstr = '7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen sstrand'
    for specific in ['dotplot_subject','dotplot_query']:
        batch_name_specific = batch_name + '_' + specific
        print(f'Starting caurisblast for {specific}...')
        if specific == 'dotplot_query':
            blast_subject = dotplot_query
            blast_subject_name = suffix_trim(dotplot_query)
        if specific == 'dotplot_subject':
            blast_subject = dotplot_subject
            blast_subject_name = suffix_trim(dotplot_subject)
        # NOTE(review): the blast e-value passed to blast.py is hard-coded to
        # 1e-5 (it also appears in the results path below); minimum_evalue is
        # only applied when filtering the parsed hits afterwards — confirm
        # this asymmetry is intended.
        command = ['python3','caurisblast/blast.py','-q',blast_query,'-s',blast_subject,'-t','nucl','-n',batch_name_specific,'-k','blastn','-e','1e-5','-f',formatstr,'-v']
        res = subprocess.run(command)
        if res.returncode != 0:
            print(f'Error running caurisblast for {specific}, please check the log files.')
            raise SystemExit(1)
        # read the raw blast output as a csv, and use it to generate highlight rows
        raw_blast_output = f'caurisblast/results/{batch_name_specific}/{batch_name_specific}_nucl_blastn_1e-5_blast_results.csv'
        blastdf = read_blast(raw_blast_output)
        highlight_data = blastdf_to_highlight_data(blastdf,specific.replace('dotplot_',''),blast_subject_name,minimum_evalue,minimum_identity,minimum_coverage)
        highlight_data_all += highlight_data
    # write all collected rows to a tsv file
    with open(output_file,'w') as fh:
        _ = fh.write('type\tname\tcontig\tstart\tend\n')
        for hdata in highlight_data_all:
            towrite = [str(x) for x in hdata]
            _ = fh.write('\t'.join(towrite) + '\n')
def blastdf_to_highlight_data(blastdf,sequence_type,sequence_name, minimum_evalue = 1e-5, minimum_identity = 0.8, minimum_coverage = 0.8):
    """Convert a raw blast result DataFrame into highlight_data rows.

    Each hit that passes the identity/evalue/coverage filters becomes one row
    [type, name, contig, start, end], with start <= end regardless of the
    hit's strand orientation.

    Parameters:
        blastdf: DataFrame produced by read_blast().
        sequence_type: 'query' or 'subject' (which dotplot axis the hits
            belong to).
        sequence_name: label written into the 'name' column.
        minimum_evalue: maximum accepted e-value.
        minimum_identity: minimum fractional percent identity (0.8 = 80%).
        minimum_coverage: minimum alignment_length / query_length.

    Returns a list of 5-element lists.
    Side effect: adds 'assembly_name' and 'subject_scaffold' columns to the
    caller's DataFrame (both copies of 'subject_seqid').
    """
    highlight_data = []
    # kept for compatibility with the original column layout; both are
    # currently plain copies of subject_seqid
    blastdf['assembly_name'] = blastdf['subject_seqid']
    blastdf['subject_scaffold'] = blastdf['subject_seqid']
    # compute coverage once, on a copy, BEFORE grouping: assigning a new
    # column on a groupby slice (as the original did) is chained assignment
    # and triggers pandas' SettingWithCopyWarning
    working = blastdf.copy()
    working['coverage'] = working['alignment_length'] / working['query_length']
    for assembly_name, assembly_blastdf in working.groupby('assembly_name'):
        blastdf_filtered = assembly_blastdf[
            (assembly_blastdf['percent_identity'] >= minimum_identity * 100) &
            (assembly_blastdf['evalue'] <= minimum_evalue) &
            (assembly_blastdf['coverage'] >= minimum_coverage)
        ]
        for _, row in blastdf_filtered.iterrows():
            # order is type,name,contig,start,end; min/max normalizes
            # minus-strand hits whose start > end
            highlight_data.append([
                sequence_type,
                sequence_name,
                row['subject_seqid'],
                min(row['subject_start'], row['subject_end']),
                max(row['subject_start'], row['subject_end']),
            ])
    return highlight_data
def read_blast(file_path):
    """Load a tabular BLAST results file into a pandas DataFrame.

    Comment lines starting with '#' (blast outfmt 7 headers) are skipped;
    the remaining rows are tab-separated values matching the column set
    requested by generate_highlight_data's format string.
    """
    column_names = [
        'query_seqid', 'subject_seqid', 'percent_identity', 'alignment_length', 'mismatches_count', 'gapopen_count',
        'query_start', 'query_end', 'subject_start', 'subject_end', 'evalue', 'bitscore', 'query_length', 'subject_length', 'subject_strand',
    ]
    # name the columns at parse time instead of renaming afterwards
    return pd.read_csv(file_path, sep='\t', comment='#', header=None, names=column_names)
def check_query(input_path):
    """Validate the query path and return it.

    The query must be a single fasta file (.fasta/.fa/.fna); a directory or
    any other unrecognized path terminates the program.

    Raises SystemExit(1) on invalid input.
    """
    # directories were supported by an older multi-query version; now only a
    # single file is accepted
    if os.path.isdir(input_path):
        print('Please provide a single input file for the query!')
        raise SystemExit(1)
    if os.path.isfile(input_path) and check_suffix(input_path):
        return input_path
    print(f'Unrecognized path for {input_path}')
    raise SystemExit(1)
def check_suffix(path, suffix_list=('.fasta', '.fa', '.fna')):
    """Return True if *path* ends with one of the accepted fasta suffixes.

    The default is a tuple rather than the original mutable list (avoids the
    shared-mutable-default pitfall); str.endswith accepts a tuple of options
    directly, so no explicit loop is needed.
    """
    # tuple() guards against callers that still pass a list
    return path.endswith(tuple(suffix_list))
def suffix_trim(fname):
    """Return the basename of *fname* with its fasta extension removed.

    Only the TRAILING extension is stripped. The original implementation
    used split('.fa')[0], which truncated at the first '.fa' occurrence
    anywhere in the name (e.g. 'my.favorite.fasta' -> 'my').

    Raises SystemExit(1) for unrecognized extensions.
    """
    base = os.path.basename(fname)
    # '.fasta' must be checked before '.fa' so the longer suffix wins
    for suffix in ('.fasta', '.fa', '.fna'):
        if base.endswith(suffix):
            return base[:-len(suffix)]
    print(f'Incorrect file name when generating contig data for {fname}')
    raise SystemExit(1)
def run_nucmer(query_list,subject,output_dir,debug,rname):
    """Align each query fasta to *subject* with nucmer and produce .coord files.

    Parameters:
        query_list: list of query fasta paths (currently a single-element list).
        subject: subject fasta path.
        output_dir: directory the .delta/.coord files are moved into.
        debug: path of the debug log; each external command is appended to it.
        rname: run name used as the nucmer output file prefix.

    Returns the list of .coord file names (relative to output_dir).
    """
    output_file_list = []
    with open(debug,'a') as debug_log:
        for query in query_list:
            file_prefix = rname
            # create the alignment file (<rname>.delta)
            command1 = ['nucmer','-p',file_prefix,subject,query]
            subprocess.run(command1)
            _ = debug_log.write(' '.join(command1)+'\n')
            # create the coordinate file (<rname>.coord)
            with open(file_prefix + '.coord','w') as fh_coord:
                command2 = ['show-coords','-r','-c','-l','-T',file_prefix + '.delta']
                subprocess.run(command2,stdout = fh_coord)
                # bug fix: the original logged command1 here a second time
                # instead of the show-coords command actually executed
                _ = debug_log.write(' '.join(command2)+'\n')
            # move both files to the results directory
            subprocess.run(['mv',file_prefix + '.delta',output_dir])
            subprocess.run(['mv',file_prefix + '.coord',output_dir])
            # append the final output to the list
            output_file_list.append(file_prefix + '.coord')
    return output_file_list
def count_contig_len(input_file,output_file):
    """Write a TSV of contig_name -> contig_length for a fasta file.

    Contig names are truncated at the first space to stay consistent with
    how mummer and BLAST report sequence ids.

    Fixes over the original: blank lines no longer raise IndexError
    (line[0] on ''), and an empty file / sequence data before the first
    '>' header no longer crashes on a None record.
    """
    data_list = []
    current = None  # [name, length] of the contig being accumulated
    with open(input_file,'r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # skip blank lines (previously crashed on line[0])
                continue
            if line.startswith('>'):
                if current is not None:
                    data_list.append(current)
                # remove anything after a space to keep contig names
                # consistent with mummer and BLAST
                contig_name = line[1:].split(' ')[0]
                current = [contig_name, 0]
            elif current is not None:
                current[1] += len(line)
        if current is not None:
            data_list.append(current)
    with open(output_file,'w') as fh:
        _ = fh.write('contig_name\tcontig_length\n')
        for el in data_list:
            _ = fh.write('\t'.join([str(x) for x in el])+'\n')
def get_nucmer_coord(nucmer_dir):
    """List the .coord file names inside a nucmer results directory.

    Mirrors the output format of run_nucmer (file names, not full paths).
    Returns an empty list when *nucmer_dir* is not a directory.
    """
    # guard clause: a missing/invalid directory yields no files
    if not os.path.isdir(nucmer_dir):
        return []
    return [fname for fname in os.listdir(nucmer_dir) if fname.endswith('.coord')]
def make_plots(nucmer_dir,nucmer_files,contig_data_dir,highlight_data,output_dir,query_contig_data,subject_contig_data,rname):
    """Invoke make_plots.R once per nucmer .coord file to render the dotplot PDF.

    contig_data_dir is kept for interface compatibility (an earlier version
    derived the contig-data paths from each coord file name); the explicit
    query/subject contig-data paths are used instead.
    """
    for coord_file in nucmer_files:
        pdf_path = output_dir + f'{rname}.pdf'
        rscript_args = [
            'Rscript',
            'make_plots.R',
            nucmer_dir + coord_file,
            subject_contig_data,
            query_contig_data,
            highlight_data,
            pdf_path,
        ]
        subprocess.run(rscript_args)
def main():
    """CLI entry point for the dotplot pipeline.

    Either aligns a query fasta to a subject fasta with nucmer and renders a
    dotplot PDF, or re-plots a previously generated results directory
    (--alignments). A highlight file may be supplied directly (--highlight)
    or generated from a blast query fasta (--blast_query).
    """
    # define all args
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--query','-q',type=str,
        help='''Provide a query to perform the alignments to. This can be only be a single file!
        All files should be in the fasta format (.fasta or .fa).''',
        default=None
    )
    parser.add_argument(
        '--subject','-s',type=str,
        help='''Provide a subject sequence in the nucleotide fasta format (.fasta, .fa, or .fna). This should be a single file.''',
        default=None
    )
    parser.add_argument(
        '--name','-n',type=str,help='''(Optional) Provide a name for the run. This name will be used for all outputs.''',
        default='new_search'
    )
    parser.add_argument(
        '--highlight','-hl',type=str,help='''(Optional) Provide a file containing regions to highlight on the plot. This needs to
        match the format of the provided highlight_data.tsv file.''',
        default='NA'
    )
    parser.add_argument(
        '--blast_query','-bq',type=str,help='''(Optional) Provide a fasta file containing sequences to highlight on the plot. A highlight_data.tsv file will be generated
        using caurisblast.''',
        default='NA'
    )
    parser.add_argument(
        '--alignments','-a',type=str,help='''(Optional) Provide a path to a previously-generated results directory. This will remake the plots using the
        Nucmer alignments and contig data present in the directory.''',
        default=None
    )
    args = parser.parse_args()
    # mutual-exclusion checks; these statements don't cover all possible inputs
    # (e.g. query without subject plus alignments slips through)
    if (args.query is None or args.subject is None) and (args.alignments is None):
        print('No query+subject or results directory provided')
        quit(1)
    if (args.query is not None and args.subject is not None) and (args.alignments is not None):
        print('Please provide either a query+subject or a results directory with Nucmer alignments, not both.')
        quit(1)
    # change working directory to location of script so the relative output
    # paths below land next to the script regardless of invocation cwd
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    # define all output directories, rooted at the run name
    nucmer_output_dir = f'{args.name}/nucmer/'
    contig_data_dir = f'{args.name}/contig_data/'
    plot_output_dir = f'{args.name}/plots/'
    debug_log_dir = f'{args.name}/'
    debug_log_file = f'{args.name}/{args.name}_debug_log.txt'
    # generate the directories (nucmer/contig_data dirs are skipped when
    # reusing an existing alignments directory)
    dirlist = [debug_log_dir,contig_data_dir,nucmer_output_dir,plot_output_dir]
    if args.alignments is not None:
        dirlist = [debug_log_dir,plot_output_dir]
    for d in dirlist:
        if not os.path.isdir(d):
            subprocess.call(['mkdir','-p',d])
    # create (truncate) the debug log file
    with open(debug_log_file,'w') as fh:
        _ = fh.write(f'debug log for {args.name}\n')
    # if the alignments aren't already provided, generate them
    if args.alignments is None:
        # check that subject and query exist
        for f in [args.query,args.subject]:
            if not os.path.exists(f):
                print(f'Could not locate file or directory at {f}')
                quit(1)
        # validate the query path; this is a single file path (an older
        # version returned a list of fastas)
        query_fasta = check_query(args.query)
        # determine name for query, then get contig lengths
        query_name = suffix_trim(query_fasta)
        query_contig_data = contig_data_dir + query_name + '_contig_data.csv'
        count_contig_len(query_fasta,query_contig_data)
        # determine name for subject, then get contig lengths
        subject_name = suffix_trim(args.subject)
        subject_contig_data = contig_data_dir + subject_name + '_contig_data.csv'
        count_contig_len(args.subject,subject_contig_data)
        # generate the highlight_data file if a blast query is provided;
        # note this overrides any --highlight value passed on the CLI
        if args.blast_query != 'NA':
            if not os.path.isfile(args.blast_query):
                print(f'Could not locate blast query file at {args.blast_query}')
                quit(1)
            else:
                args.highlight = args.name + '_highlight_data.tsv'
                generate_highlight_data(args.blast_query,query_fasta,args.subject,args.highlight,args.name, minimum_evalue = 1e-5, minimum_identity = 0.8, minimum_coverage = 0.8)
        # run nucmer, aligning each query to the subject
        coord_file_list = run_nucmer(query_list=[query_fasta], subject=args.subject, output_dir=nucmer_output_dir, debug=debug_log_file, rname=args.name)
    # if the alignments are provided, make sure everything looks as expected
    else:
        if os.path.isdir(args.alignments):
            if not args.alignments.endswith('/'):
                args.alignments+='/'
            # replace paths with new versions rooted at the provided directory
            nucmer_output_dir = args.alignments + 'nucmer/'
            contig_data_dir = args.alignments + 'contig_data/'
            # get the list of .coord files
            coord_file_list = get_nucmer_coord(nucmer_output_dir)
            if coord_file_list == []:
                print(f'No .coord files located in {args.alignments}')
                quit(1)
        else:
            print(f'Could not locate results directory at {args.alignments}')
            quit(1)
    # NOTE(review): in the --alignments branch above, query_contig_data and
    # subject_contig_data are never assigned, so the make_plots call below
    # raises NameError when reusing a results directory — confirm whether
    # those paths should be derived from contig_data_dir and fix upstream.
    # make plots using all current paths
    make_plots(
        nucmer_dir=nucmer_output_dir,nucmer_files=coord_file_list,contig_data_dir=contig_data_dir,
        highlight_data=args.highlight,output_dir=plot_output_dir, query_contig_data = query_contig_data,
        subject_contig_data=subject_contig_data,rname=args.name)
    print(f'Finished making plots!')
    print(f'Location of plots: {plot_output_dir}')
    print(f'Location of Nucmer alignments: {nucmer_output_dir}')

if __name__ == "__main__":
    main()