evaluation.py
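# Evaluate previously generated kernel responses by replaying each saved
# {op}.txt log through eval_single_runner.py in a subprocess.
#
# A typical invocation (flags and defaults per the argparse setup below):
#   python3 evaluation.py --model deepseek-chat --language cuda --categories activation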
import argparse
import json
import os
import subprocess
import tempfile

from config import temperature, top_p
from dataset import dataset, category2exampleop
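
# Assumed contract of eval_single_runner.py, inferred from the call below:
# it is invoked as
#   python3 eval_single_runner.py <response_file> <op> <language> <result_file>
# and writes a JSON object to <result_file> with at least the keys
# 'compiled', 'correctness', 'performance', and (on compile failure)
# 'compile_info'.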

def eval_all(out_dir, language, categories, op_tested=dataset.keys()):
    result = {}
    if categories == ['all']:
        output_file = os.path.join(out_dir, 'result.json')
    else:
        output_file = os.path.join(out_dir, f'result_{"_".join(categories)}.json')
    if os.path.exists(output_file):
        print(f"[INFO] Already evaluated, please see {output_file}")
        return
    for op in op_tested:
        print(f"[INFO] Evaluating op {op}")
        with open(os.path.join(out_dir, f'{op}.txt'), 'r') as saved_log:
            response_txt = saved_log.read()
        # Pass the model response in and read the result back through temp files.
        with tempfile.NamedTemporaryFile(mode='w+', delete=True) as tf_input, \
                tempfile.NamedTemporaryFile(mode='r', delete=True) as tf_output:
            tf_input.write(response_txt)
            tf_input.flush()
            try:
                captured_text = subprocess.run(
                    ['python3', 'eval_single_runner.py', tf_input.name, op, language, tf_output.name],
                    check=True,
                    capture_output=True,  # capture stdout and stderr
                    text=True,  # decode bytes to str
                    timeout=180
                )
                result_item = json.load(tf_output)
                # On compile failure, append any compiler diagnostics found in the
                # runner's stdout/stderr to the compile info.
                if not result_item['compiled']:
                    detailed_compiler_error = '\n'
                    for line in captured_text.stdout.split('\n'):
                        if '[ERROR]' in line or 'error:' in line:
                            detailed_compiler_error += line + '\n'
                    for line in captured_text.stderr.split('\n'):
                        if '[ERROR]' in line or 'error:' in line:
                            detailed_compiler_error += line + '\n'
                    result_item['compile_info'] += detailed_compiler_error
            except subprocess.CalledProcessError as e:
                if 'FileNotFoundError' in e.stderr:
                    print("[FAIL] FileNotFoundError - possibly due to an incorrect 'project_root_path' setting in config.py")
                    break
                elif e.returncode == -11:  # killed by SIGSEGV
                    print("[FAIL] Segmentation fault")
                    result[op] = {'compiled': True, 'correctness': None, 'performance': None,
                                  'correctness_info': 'Segmentation fault'}
                    continue
                else:
                    print("[FAIL] Unknown error, please report or fix the bug")
                    print(e.stderr)
                    result[op] = {'compiled': True, 'correctness': None, 'performance': None,
                                  'correctness_info': 'Unknown fault'}
                    continue
            except subprocess.TimeoutExpired:
                print("[FAIL] Run timed out")
                result[op] = {'compiled': True, 'correctness': None, 'performance': None,
                              'correctness_info': 'Timeout fault'}
                continue
            result[op] = result_item
            print(f'[INFO] {result_item}')
    with open(output_file, 'w') as f:
        print(f"[INFO] Evaluated successfully, writing results to {output_file}")
        json.dump(result, f, indent=2)
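
# Each entry stored in result (and serialized to result.json) is expected to
# match the shape of the fallback records above, e.g. (hypothetical op name
# and values):
#   "relu": {"compiled": true, "correctness": true, "performance": 1.23,
#            "correctness_info": "..."}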

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process command line arguments.')
    parser.add_argument('--runs', type=int, default=1, help='Number of runs')
    parser.add_argument('--model', type=str, default='deepseek-chat', help='Model name')
    parser.add_argument('--language', type=str, default='cuda', help='Programming language')
    parser.add_argument('--strategy', type=str, default='add_shot', help='Strategy type')
    parser.add_argument('--categories', nargs='+', default=['activation'], help='List of categories')
    args = parser.parse_args()

    runs = args.runs
    model = args.model
    language = args.language
    strategy = args.strategy
    categories = args.categories

    print(f"Runs: {runs}")
    print(f"Model: {model}")
    print(f"Language: {language}")
    print(f"Strategy: {strategy}")
    print(f"Categories: {categories}")

    op_tested = list(dataset.keys())
    if categories != ['all']:
        op_tested = [op for op in op_tested if dataset[op]['category'] in categories]

    if '/' in model:
        # OpenRouter model names look like 'provider/model'; keep only the model part.
        model_name = model.split('/')[1]
    else:
        model_name = model

    for run in range(runs):
        out_dir = f'output/{language}/{strategy}/{temperature}-{top_p}/{model_name}/run{run}'
        eval_all(out_dir, language, categories, op_tested)