-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgraph_efficiency.py
More file actions
122 lines (96 loc) · 4.3 KB
/
graph_efficiency.py
File metadata and controls
122 lines (96 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import matplotlib.pyplot as plt
import numpy as np
# Load per-version timing results from the benchmark CSV.
# Row layout appears to be: version, then (label, time) field pairs —
# only the time half of each pair is kept. TODO confirm against the
# producer of performance_results.csv.
with open('output/performance_results.csv', 'r') as f:
    raw_rows = [row.strip() for row in f]
datasets = raw_rows[0].split(',')[1:]
data = {}
for row in raw_rows[1:]:
    if not row:
        continue
    fields = [field.strip() for field in row.split(',')]
    # Every second field starting at index 2 holds a time value.
    data[fields[0]] = [float(fields[i + 1]) for i in range(1, len(fields), 2)]
# Problem sizes for the three benchmark datasets: n points, k clusters,
# d dimensions (n and d vary together per dataset).
n_values = [2048, 16384, 65536]
k = 16
d_values = [16, 24, 32]
# Hardware ceilings used as the 100% reference.
peak_flops = 52e12        # FLOP/s — presumably the target GPU's peak; TODO confirm
peak_bandwidth = 736e9    # bytes/s

compute_efficiency = {}
memory_efficiency = {}
for idx, (n, d) in enumerate(zip(n_values, d_values)):
    # Work model for one k-means iteration: 3 FLOPs per point/cluster/dim
    # for the distance terms, plus per-point and per-centroid terms.
    flops = n * k * d * 3 + n * d + k * d
    # Bytes moved per iteration (8-byte doubles, 4-byte ints/floats);
    # assumes every access goes to DRAM with no reuse.
    memory_bytes = (n * d * 8 + n * k * d * 8 + n * 4
                    + n * d * 8 + n * 4 + n * d * 8
                    + k * 4 + k * d * 8 + k * d * 8)
    # Theoretical best-case times (ms) if the kernel were purely
    # compute-bound or purely bandwidth-bound.
    compute_floor_ms = flops / peak_flops * 1000
    memory_floor_ms = memory_bytes / peak_bandwidth * 1000
    for version in ('cuda_basic', 'cuda_shared', 'thrust'):
        if version in data:
            measured_ms = data[version][idx]
            # Efficiency = theoretical floor / measured, as a percentage.
            compute_efficiency.setdefault(version, []).append(
                compute_floor_ms / measured_ms * 100)
            memory_efficiency.setdefault(version, []).append(
                memory_floor_ms / measured_ms * 100)
# Grouped bar chart: compute efficiency per implementation across datasets.
x = np.arange(len(datasets))
width = 0.25
fig, ax = plt.subplots(figsize=(10, 6))
series = [
    (-width, 'cuda_basic', 'CUDA Basic'),
    (0, 'cuda_shared', 'CUDA Shared'),
    (width, 'thrust', 'Thrust'),
]
bar_groups = [
    ax.bar(x + offset, compute_efficiency[key], width, label=label)
    for offset, key, label in series
]
ax.set_ylabel('Compute Efficiency (%)', fontsize=12)
ax.set_xlabel('Dataset Size', fontsize=12)
ax.set_title('K-means Compute Efficiency\n(Theoretical Peak Compute Performance)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(datasets)
ax.set_ylim(0, 0.5)  # values are far below 1%, so zoom the axis in
ax.legend()
ax.grid(axis='y', alpha=0.3)
# Annotate each bar with its exact value.
for group in bar_groups:
    for rect in group:
        h = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., h,
                f'{h:.2f}%',
                ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.savefig('output/compute_efficiency_plot.png', dpi=300, bbox_inches='tight')
print("Compute efficiency plot saved to output/compute_efficiency_plot.png")
# Grouped bar chart: memory-bandwidth efficiency per implementation.
fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width, memory_efficiency['cuda_basic'], width, label='CUDA Basic')
bars2 = ax.bar(x, memory_efficiency['cuda_shared'], width, label='CUDA Shared')
bars3 = ax.bar(x + width, memory_efficiency['thrust'], width, label='Thrust')
ax.set_ylabel('Memory Efficiency (%)', fontsize=12)
ax.set_xlabel('Dataset Size', fontsize=12)
ax.set_title('K-means Memory Efficiency\n(Theoretical Peak Memory Bandwidth)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(datasets)
# BUGFIX: draw the 100%-of-peak reference line BEFORE calling legend();
# matplotlib's legend only includes artists that exist when it is built,
# so the original order (legend first, axhline after) silently dropped
# the '100% (Peak)' legend entry.
ax.axhline(y=100, color='red', linestyle='--', linewidth=1, alpha=0.5, label='100% (Peak)')
ax.legend()
ax.grid(axis='y', alpha=0.3)
# Annotate each bar with its exact value.
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{height:.2f}%',
                ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.savefig('output/memory_efficiency_plot.png', dpi=300, bbox_inches='tight')
print("Memory efficiency plot saved to output/memory_efficiency_plot.png")
def _print_efficiency_table(title, table):
    # Print one fixed-width efficiency table (values already in percent).
    print(title)
    print(f"{'Version':<15} {datasets[0]:<10} {datasets[1]:<10} {datasets[2]:<10}")
    print("-" * 45)
    for ver in ('cuda_basic', 'cuda_shared', 'thrust'):
        row = table[ver]
        print(f"{ver:<15} {row[0]:<10.2f} {row[1]:<10.2f} {row[2]:<10.2f}")

# Dump both tables to stdout for quick inspection alongside the plots.
_print_efficiency_table("\nCompute Efficiency (%):", compute_efficiency)
_print_efficiency_table("\nMemory Efficiency (%):", memory_efficiency)