evaluation.py
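# Evaluate previously generated kernel responses by replaying each saved
# {op}.txt log through eval_single_runner.py in a subprocess.
#
# A typical invocation (flags and defaults per the argparse setup below):
#   python3 evaluation.py --model deepseek-chat --language cuda --categories activation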
import argparse
import json
import os
import subprocess
import tempfile

from config import temperature, top_p
from dataset import dataset, category2exampleop
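
# Assumed contract of eval_single_runner.py, inferred from the call below:
# it is invoked as
#   python3 eval_single_runner.py <response_file> <op> <language> <result_file>
# and writes a JSON object to <result_file> with at least the keys
# 'compiled', 'correctness', 'performance', and (on compile failure)
# 'compile_info'.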

def eval_all(out_dir, language, categories, op_tested=dataset.keys()):
    result = {}
    if categories == ['all']:
        output_file = os.path.join(out_dir, 'result.json')
    else:
        output_file = os.path.join(out_dir, f'result_{"_".join(categories)}.json')
    if os.path.exists(output_file):
        print(f"[INFO] Already evaluated, please see {output_file}")
        return
    for op in op_tested:
        print(f"[INFO] Evaluating op {op}")
        with open(os.path.join(out_dir, f'{op}.txt'), 'r') as saved_log:
            response_txt = saved_log.read()
        # Pass the model response in and read the result back through temp files.
        with tempfile.NamedTemporaryFile(mode='w+', delete=True) as tf_input, \
                tempfile.NamedTemporaryFile(mode='r', delete=True) as tf_output:
            tf_input.write(response_txt)
            tf_input.flush()
            try:
                captured_text = subprocess.run(
                    ['python3', 'eval_single_runner.py', tf_input.name, op, language, tf_output.name],
                    check=True,
                    capture_output=True,  # capture stdout and stderr
                    text=True,  # decode bytes to str
                    timeout=180
                )
                result_item = json.load(tf_output)
                # On compile failure, append any compiler diagnostics found in the
                # runner's stdout/stderr to the compile info.
                if not result_item['compiled']:
                    detailed_compiler_error = '\n'
                    for line in captured_text.stdout.split('\n'):
                        if '[ERROR]' in line or 'error:' in line:
                            detailed_compiler_error += line + '\n'
                    for line in captured_text.stderr.split('\n'):
                        if '[ERROR]' in line or 'error:' in line:
                            detailed_compiler_error += line + '\n'
                    result_item['compile_info'] += detailed_compiler_error
            except subprocess.CalledProcessError as e:
                if 'FileNotFoundError' in e.stderr:
                    print("[FAIL] FileNotFoundError - possibly due to an incorrect 'project_root_path' setting in config.py")
                    break
                elif e.returncode == -11:  # killed by SIGSEGV
                    print("[FAIL] Segmentation fault")
                    result[op] = {'compiled': True, 'correctness': None, 'performance': None,
                                  'correctness_info': 'Segmentation fault'}
                    continue
                else:
                    print("[FAIL] Unknown error, please report or fix the bug")
                    print(e.stderr)
                    result[op] = {'compiled': True, 'correctness': None, 'performance': None,
                                  'correctness_info': 'Unknown fault'}
                    continue
            except subprocess.TimeoutExpired:
                print("[FAIL] Run timed out")
                result[op] = {'compiled': True, 'correctness': None, 'performance': None,
                              'correctness_info': 'Timeout fault'}
                continue
            result[op] = result_item
            print(f'[INFO] {result_item}')
    with open(output_file, 'w') as f:
        print(f"[INFO] Evaluated successfully, writing results to {output_file}")
        json.dump(result, f, indent=2)
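
# Each entry stored in result (and serialized to result.json) is expected to
# match the shape of the fallback records above, e.g. (hypothetical op name
# and values):
#   "relu": {"compiled": true, "correctness": true, "performance": 1.23,
#            "correctness_info": "..."}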

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process command line arguments.')
    parser.add_argument('--runs', type=int, default=1, help='Number of runs')
    parser.add_argument('--model', type=str, default='deepseek-chat', help='Model name')
    parser.add_argument('--language', type=str, default='cuda', help='Programming language')
    parser.add_argument('--strategy', type=str, default='add_shot', help='Strategy type')
    parser.add_argument('--categories', nargs='+', default=['activation'], help='List of categories')
    args = parser.parse_args()

    runs = args.runs
    model = args.model
    language = args.language
    strategy = args.strategy
    categories = args.categories

    print(f"Runs: {runs}")
    print(f"Model: {model}")
    print(f"Language: {language}")
    print(f"Strategy: {strategy}")
    print(f"Categories: {categories}")

    op_tested = list(dataset.keys())
    if categories != ['all']:
        op_tested = [op for op in op_tested if dataset[op]['category'] in categories]

    if '/' in model:
        # OpenRouter model names look like 'provider/model'; keep only the model part.
        model_name = model.split('/')[1]
    else:
        model_name = model

    for run in range(runs):
        out_dir = f'output/{language}/{strategy}/{temperature}-{top_p}/{model_name}/run{run}'
        eval_all(out_dir, language, categories, op_tested)