Skip to content

Commit 398b0f5

Browse files
author
Theresa
committed
gpt2: managed -> async and pinned memory (2x faster loading)
1 parent 2098774 commit 398b0f5

1 file changed

Lines changed: 87 additions & 57 deletions

File tree

benchmarks/GPT2/gpt2.cpp

Lines changed: 87 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -66,28 +66,52 @@ using mat_ref = multi::array_ref<float, 2, fptr>;
6666
using const_mat_ref = multi::array_ref<float const, 2, const_fptr>;
6767

6868
// ── Memory resources ────────────────────────────────────────────────────
// GPU: no managed memory. Weights loaded directly to device.
// Scratch buffers use stream-ordered async allocation (cudaMallocAsync).
// CPU: standard new/delete for everything.
#ifdef DALOTIA_E_WITH_CUBLAS
static cudaStream_t inference_stream = 0;

// Weights: dalotia loads directly into device pointers (GDS or host-staging).
std::pmr::memory_resource* device_resource() {
    return dalotia::cuda_device_resource();
}

// Scratch: stream-ordered async allocation (cudaMallocAsync on stream 0).
// Created lazily on first use; main() deletes it at shutdown.
// NOTE(review): lazy init is not thread-safe — presumably this benchmark is
// single-threaded; confirm before calling scratch_resource() concurrently.
static dalotia::cuda_async_memory_resource* scratch_res = nullptr;

std::pmr::memory_resource* scratch_resource() {
    if (scratch_res == nullptr) {
        scratch_res = new dalotia::cuda_async_memory_resource(/*stream=*/0);
    }
    return scratch_res;
}
#else
std::pmr::memory_resource* scratch_resource() { return std::pmr::new_delete_resource(); }
#endif
8690

// Scratch buffer: raw PMR allocation, no zero-initialization.
// On GPU this uses cudaMallocAsync; on CPU it uses new[].
// Move-only RAII owner of `count` floats allocated from `mr`.
struct ScratchBuf {
    float* ptr = nullptr;
    size_t count = 0;
    std::pmr::memory_resource* mr = nullptr;

    ScratchBuf() = default;

    // Allocates n floats from r; contents are left uninitialized on purpose
    // (zero-fill would force a host write, defeating device-only memory).
    ScratchBuf(size_t n, std::pmr::memory_resource* r) : count(n), mr(r) {
        ptr = static_cast<float*>(mr->allocate(n * sizeof(float), alignof(float)));
    }

    ~ScratchBuf() {
        if (ptr) mr->deallocate(ptr, count * sizeof(float), alignof(float));
    }

    ScratchBuf(ScratchBuf&& o) noexcept : ptr(o.ptr), count(o.count), mr(o.mr) {
        o.ptr = nullptr;  // leave the source empty so its dtor is a no-op
    }

    ScratchBuf& operator=(ScratchBuf&& o) noexcept {
        if (this != &o) {
            // Release our allocation directly. The previous version called
            // this->~ScratchBuf() here, which formally ends this object's
            // lifetime — the member writes that followed were UB.
            if (ptr) mr->deallocate(ptr, count * sizeof(float), alignof(float));
            ptr = o.ptr;
            count = o.count;
            mr = o.mr;
            o.ptr = nullptr;
        }
        return *this;
    }

    ScratchBuf(const ScratchBuf&) = delete;
    ScratchBuf& operator=(const ScratchBuf&) = delete;

    // Raw pointer to the buffer; nullptr after being moved from.
    float* data() const { return ptr; }
};

// Factory for scratch buffers: raw PMR allocation, no zero-initialization.
ScratchBuf make_buffer(size_t n, std::pmr::memory_resource* mr) {
    return ScratchBuf(n, mr);
}
92116

93117
// ── GPT-2 124M hyperparameters ──────────────────────────────────────────
@@ -316,76 +340,82 @@ void add_bias(float* x, const float* bias, int rows, int cols) {
316340
}
317341

318342
// ── Model weights ───────────────────────────────────────────────────────
343+
// On GPU: device memory. On CPU: standard heap memory.
344+
345+
// Reuse ScratchBuf for weight storage — same RAII, different resource.
346+
using WeightBuf = ScratchBuf;
319347

320348
struct TransformerBlock {
321-
dalotia::vector<float> ln_1_weight, ln_1_bias;
322-
dalotia::vector<float> c_attn_weight, c_attn_bias;
323-
dalotia::vector<float> c_proj_weight, c_proj_bias;
324-
dalotia::vector<float> ln_2_weight, ln_2_bias;
325-
dalotia::vector<float> c_fc_weight, c_fc_bias;
326-
dalotia::vector<float> c_proj_mlp_weight, c_proj_mlp_bias;
327-
328-
explicit TransformerBlock(std::pmr::memory_resource* mr = std::pmr::new_delete_resource())
329-
: ln_1_weight(mr), ln_1_bias(mr),
330-
c_attn_weight(mr), c_attn_bias(mr),
331-
c_proj_weight(mr), c_proj_bias(mr),
332-
ln_2_weight(mr), ln_2_bias(mr),
333-
c_fc_weight(mr), c_fc_bias(mr),
334-
c_proj_mlp_weight(mr), c_proj_mlp_bias(mr) {}
349+
WeightBuf ln_1_weight, ln_1_bias;
350+
WeightBuf c_attn_weight, c_attn_bias;
351+
WeightBuf c_proj_weight, c_proj_bias;
352+
WeightBuf ln_2_weight, ln_2_bias;
353+
WeightBuf c_fc_weight, c_fc_bias;
354+
WeightBuf c_proj_mlp_weight, c_proj_mlp_bias;
335355
};
336356

337357
struct GPT2Model {
338-
dalotia::vector<float> wte, wpe;
358+
WeightBuf wte, wpe;
339359
std::vector<TransformerBlock> blocks;
340-
dalotia::vector<float> ln_f_weight, ln_f_bias;
341-
342-
explicit GPT2Model(std::pmr::memory_resource* mr = std::pmr::new_delete_resource())
343-
: wte(mr), wpe(mr), ln_f_weight(mr), ln_f_bias(mr) {}
360+
WeightBuf ln_f_weight, ln_f_bias;
344361
};
345362

346363
// Loads every GPT-2 weight tensor from `filename` into WeightBufs.
// GPU build: buffers live in device memory and dalotia fills them directly
// (it detects the device pointer and uses GDS or host-staging internally).
// CPU build: tensors land in a pmr::vector and are copied into heap buffers.
GPT2Model load_model(const std::string& filename) {
    std::unique_ptr<dalotia::TensorFile> file(dalotia::make_tensor_file(filename));
    GPT2Model model;

#ifdef DALOTIA_E_WITH_CUBLAS
    auto* dev_mr = device_resource();

    // Fetch one named tensor into a freshly-allocated device buffer.
    auto fetch = [&](const std::string& name) -> WeightBuf {
        const auto extents = file->get_tensor_extents(name);
        const size_t n = std::accumulate(extents.begin(), extents.end(),
                                         size_t{1}, std::multiplies<>());
        WeightBuf dev(n, dev_mr);
        file->load_tensor_dense(name, dalotia_float_32, dalotia_C_ordering,
                                reinterpret_cast<dalotia_byte*>(dev.data()));
        return dev;
    };
#else
    auto* cpu_mr = std::pmr::new_delete_resource();
    std::pmr::polymorphic_allocator<dalotia_byte> cpu_alloc(cpu_mr);

    // Fetch one named tensor; the pmr::vector frees its storage on scope
    // exit, so the bytes are copied into a WeightBuf that owns them.
    auto fetch = [&](const std::string& name) -> WeightBuf {
        auto [ext, data] = file->load_tensor_dense<float>(
            name, dalotia_float_32, dalotia_C_ordering, {}, cpu_alloc);
        WeightBuf buf(data.size(), cpu_mr);
        std::memcpy(buf.data(), data.data(), data.size() * sizeof(float));
        return buf;
    };
#endif

    model.wte = fetch("wte.weight");
    model.wpe = fetch("wpe.weight");

    model.blocks.resize(N_LAYER);
    for (int layer = 0; layer < N_LAYER; ++layer) {
        const std::string p = "h." + std::to_string(layer) + ".";
        TransformerBlock& b = model.blocks[layer];
        b.ln_1_weight       = fetch(p + "ln_1.weight");
        b.ln_1_bias         = fetch(p + "ln_1.bias");
        b.c_attn_weight     = fetch(p + "attn.c_attn.weight");
        b.c_attn_bias       = fetch(p + "attn.c_attn.bias");
        b.c_proj_weight     = fetch(p + "attn.c_proj.weight");
        b.c_proj_bias       = fetch(p + "attn.c_proj.bias");
        b.ln_2_weight       = fetch(p + "ln_2.weight");
        b.ln_2_bias         = fetch(p + "ln_2.bias");
        b.c_fc_weight       = fetch(p + "mlp.c_fc.weight");
        b.c_fc_bias         = fetch(p + "mlp.c_fc.bias");
        b.c_proj_mlp_weight = fetch(p + "mlp.c_proj.weight");
        b.c_proj_mlp_bias   = fetch(p + "mlp.c_proj.bias");
    }

    model.ln_f_weight = fetch("ln_f.weight");
    model.ln_f_bias   = fetch("ln_f.bias");
    return model;
}
384414

385415
// ── Forward pass ────────────────────────────────────────────────────────
386416
// GPU: all ops on default stream 0 — implicitly ordered, no inter-op sync.
387-
// Single cudaDeviceSynchronize at entry (managed memory coherence)
388-
// and cudaStreamSynchronize at exit (before host reads logits).
417+
// All buffers are device memory (no managed). Only sync is the final
418+
// cudaStreamSynchronize before reading logits back to host.
389419
// CPU: sequential host ops.
390420

391421
std::vector<float> forward(const GPT2Model& model,
@@ -602,7 +632,7 @@ int main(int argc, char* argv[]) {
602632
}
603633

604634
#ifdef DALOTIA_E_WITH_CUBLAS
605-
// inference_stream == 0 (default stream), no destroy needed
635+
delete scratch_res; scratch_res = nullptr;
606636
#endif
607637

608638
std::cout << "success!" << std::endl;

0 commit comments

Comments
 (0)