@@ -66,28 +66,52 @@ using mat_ref = multi::array_ref<float, 2, fptr>;
6666using const_mat_ref = multi::array_ref<float const , 2 , const_fptr>;
6767
6868// ── Memory resources ────────────────────────────────────────────────────
69+ // GPU: no managed memory. Weights loaded directly to device.
70+ // Scratch buffers use stream-ordered async allocation (cudaMallocAsync).
71+ // CPU: standard new/delete for everything.
#ifdef DALOTIA_E_WITH_CUBLAS
static cudaStream_t inference_stream = 0;

// Weights: dalotia loads directly into device pointers (GDS or host-staging).
std::pmr::memory_resource* device_resource() {
    return dalotia::cuda_device_resource();
}

// Scratch: stream-ordered async allocation (cudaMallocAsync on stream 0).
// NOTE(review): the lazy init below is not thread-safe — fine while inference
// is single-threaded; revisit if that changes. The pointer is freed
// explicitly in main(), so a function-local static is deliberately avoided.
static dalotia::cuda_async_memory_resource* scratch_res = nullptr;

std::pmr::memory_resource* scratch_resource() {
    if (!scratch_res) {
        scratch_res = new dalotia::cuda_async_memory_resource(/*stream=*/0);
    }
    return scratch_res;
}
#else
// CPU build: plain heap allocation for scratch buffers.
std::pmr::memory_resource* scratch_resource() { return std::pmr::new_delete_resource(); }
#endif
8690
// Scratch buffer: raw PMR allocation, no zero-initialization.
// On GPU this uses cudaMallocAsync; on CPU it uses new[].
// Move-only RAII owner of `count` floats allocated from `mr`.
struct ScratchBuf {
    float* ptr = nullptr;
    size_t count = 0;
    std::pmr::memory_resource* mr = nullptr;

    ScratchBuf() = default;

    // Allocates n floats from r; contents are uninitialized.
    ScratchBuf(size_t n, std::pmr::memory_resource* r) : count(n), mr(r) {
        ptr = static_cast<float*>(mr->allocate(n * sizeof(float), alignof(float)));
    }

    ~ScratchBuf() {
        if (ptr) mr->deallocate(ptr, count * sizeof(float), alignof(float));
    }

    ScratchBuf(ScratchBuf&& o) noexcept : ptr(o.ptr), count(o.count), mr(o.mr) {
        o.ptr = nullptr;
    }

    ScratchBuf& operator=(ScratchBuf&& o) noexcept {
        if (this != &o) {
            // Release our own allocation, then steal o's. The previous
            // implementation called `this->~ScratchBuf()` and kept using the
            // destroyed object — undefined behavior; never end the lifetime
            // of *this inside a member function that continues to use it.
            if (ptr) mr->deallocate(ptr, count * sizeof(float), alignof(float));
            ptr = o.ptr;
            count = o.count;
            mr = o.mr;
            o.ptr = nullptr;
        }
        return *this;
    }

    ScratchBuf(const ScratchBuf&) = delete;
    ScratchBuf& operator=(const ScratchBuf&) = delete;

    float* data() const { return ptr; }
};

// Factory kept for call-site symmetry with the old dalotia::vector helper.
ScratchBuf make_buffer(size_t n, std::pmr::memory_resource* mr) {
    return ScratchBuf(n, mr);
}
92116
93117// ── GPT-2 124M hyperparameters ──────────────────────────────────────────
@@ -316,76 +340,82 @@ void add_bias(float* x, const float* bias, int rows, int cols) {
316340}
317341
318342// ── Model weights ───────────────────────────────────────────────────────
343+ // On GPU: device memory. On CPU: standard heap memory.
344+
345+ // Reuse ScratchBuf for weight storage — same RAII, different resource.
346+ using WeightBuf = ScratchBuf;
319347
320348struct TransformerBlock {
321- dalotia::vector<float > ln_1_weight, ln_1_bias;
322- dalotia::vector<float > c_attn_weight, c_attn_bias;
323- dalotia::vector<float > c_proj_weight, c_proj_bias;
324- dalotia::vector<float > ln_2_weight, ln_2_bias;
325- dalotia::vector<float > c_fc_weight, c_fc_bias;
326- dalotia::vector<float > c_proj_mlp_weight, c_proj_mlp_bias;
327-
328- explicit TransformerBlock (std::pmr::memory_resource* mr = std::pmr::new_delete_resource())
329- : ln_1_weight(mr), ln_1_bias(mr),
330- c_attn_weight(mr), c_attn_bias(mr),
331- c_proj_weight(mr), c_proj_bias(mr),
332- ln_2_weight(mr), ln_2_bias(mr),
333- c_fc_weight(mr), c_fc_bias(mr),
334- c_proj_mlp_weight(mr), c_proj_mlp_bias(mr) {}
349+ WeightBuf ln_1_weight, ln_1_bias;
350+ WeightBuf c_attn_weight, c_attn_bias;
351+ WeightBuf c_proj_weight, c_proj_bias;
352+ WeightBuf ln_2_weight, ln_2_bias;
353+ WeightBuf c_fc_weight, c_fc_bias;
354+ WeightBuf c_proj_mlp_weight, c_proj_mlp_bias;
335355};
336356
337357struct GPT2Model {
338- dalotia::vector< float > wte, wpe;
358+ WeightBuf wte, wpe;
339359 std::vector<TransformerBlock> blocks;
340- dalotia::vector<float > ln_f_weight, ln_f_bias;
341-
342- explicit GPT2Model (std::pmr::memory_resource* mr = std::pmr::new_delete_resource())
343- : wte(mr), wpe(mr), ln_f_weight(mr), ln_f_bias(mr) {}
360+ WeightBuf ln_f_weight, ln_f_bias;
344361};
345362
346363GPT2Model load_model (const std::string& filename) {
347364 auto file = std::unique_ptr<dalotia::TensorFile>(
348365 dalotia::make_tensor_file (filename));
366+ GPT2Model model;
349367
350- auto * mr = weight_resource ();
351- GPT2Model model (mr);
352-
353- std::pmr::polymorphic_allocator<dalotia_byte> alloc (mr);
368+ #ifdef DALOTIA_E_WITH_CUBLAS
369+ // Allocate device buffer, load directly into it via dalotia.
370+ // dalotia detects the device pointer and uses GDS or host-staging internally.
371+ auto * dev_mr = device_resource ();
372+
373+ auto load = [&](const std::string& name) -> WeightBuf {
374+ auto extents = file->get_tensor_extents (name);
375+ size_t n = std::accumulate (extents.begin (), extents.end (), size_t {1 }, std::multiplies<>());
376+ WeightBuf dev (n, dev_mr);
377+ file->load_tensor_dense (name, dalotia_float_32, dalotia_C_ordering,
378+ reinterpret_cast <dalotia_byte*>(dev.data ()));
379+ return dev;
380+ };
381+ #else
382+ auto * cpu_mr = std::pmr::new_delete_resource ();
383+ std::pmr::polymorphic_allocator<dalotia_byte> cpu_alloc (cpu_mr);
354384
355- auto load_into = [&](dalotia::vector< float >& dst, const std::string& name) {
385+ auto load = [&](const std::string& name) -> WeightBuf {
356386 auto [ext, data] = file->load_tensor_dense <float >(
357- name, dalotia_float_32, dalotia_C_ordering, {}, alloc);
358- dst = std::move (data);
387+ name, dalotia_float_32, dalotia_C_ordering, {}, cpu_alloc);
388+ // Wrap the pmr::vector data in a WeightBuf — need to copy since
389+ // pmr::vector will free on scope exit.
390+ WeightBuf buf (data.size (), cpu_mr);
391+ std::memcpy (buf.data (), data.data (), data.size () * sizeof (float ));
392+ return buf;
359393 };
394+ #endif
360395
361- load_into ( model.wte , " wte.weight" );
362- load_into ( model.wpe , " wpe.weight" );
396+ model.wte = load ( " wte.weight" );
397+ model.wpe = load ( " wpe.weight" );
363398
364- model.blocks .reserve (N_LAYER);
399+ model.blocks .resize (N_LAYER);
365400 for (int i = 0 ; i < N_LAYER; ++i) {
366- model.blocks .emplace_back (mr);
367401 std::string p = " h." + std::to_string (i) + " ." ;
368- auto & b = model.blocks . back () ;
369- load_into ( b.ln_1_weight , p+" ln_1.weight" ); load_into ( b.ln_1_bias , p+" ln_1.bias" );
370- load_into ( b.c_attn_weight , p+" attn.c_attn.weight" ); load_into ( b.c_attn_bias , p+" attn.c_attn.bias" );
371- load_into ( b.c_proj_weight , p+" attn.c_proj.weight" ); load_into ( b.c_proj_bias , p+" attn.c_proj.bias" );
372- load_into ( b.ln_2_weight , p+" ln_2.weight" ); load_into ( b.ln_2_bias , p+" ln_2.bias" );
373- load_into ( b.c_fc_weight , p+" mlp.c_fc.weight" ); load_into ( b.c_fc_bias , p+" mlp.c_fc.bias" );
374- load_into ( b.c_proj_mlp_weight , p+" mlp.c_proj.weight" ); load_into ( b.c_proj_mlp_bias , p+" mlp.c_proj.bias" );
402+ auto & b = model.blocks [i] ;
403+ b.ln_1_weight = load ( p+" ln_1.weight" ); b.ln_1_bias = load ( p+" ln_1.bias" );
404+ b.c_attn_weight = load ( p+" attn.c_attn.weight" ); b.c_attn_bias = load ( p+" attn.c_attn.bias" );
405+ b.c_proj_weight = load ( p+" attn.c_proj.weight" ); b.c_proj_bias = load ( p+" attn.c_proj.bias" );
406+ b.ln_2_weight = load ( p+" ln_2.weight" ); b.ln_2_bias = load ( p+" ln_2.bias" );
407+ b.c_fc_weight = load ( p+" mlp.c_fc.weight" ); b.c_fc_bias = load ( p+" mlp.c_fc.bias" );
408+ b.c_proj_mlp_weight = load ( p+" mlp.c_proj.weight" ); b.c_proj_mlp_bias = load ( p+" mlp.c_proj.bias" );
375409 }
376- load_into (model.ln_f_weight , " ln_f.weight" );
377- load_into (model.ln_f_bias , " ln_f.bias" );
378-
379- #ifdef DALOTIA_E_WITH_CUBLAS
380- CHECK_CUDA (cudaDeviceSynchronize ());
381- #endif
410+ model.ln_f_weight = load (" ln_f.weight" );
411+ model.ln_f_bias = load (" ln_f.bias" );
382412 return model;
383413}
384414
385415// ── Forward pass ────────────────────────────────────────────────────────
386416// GPU: all ops on default stream 0 — implicitly ordered, no inter-op sync.
387- // Single cudaDeviceSynchronize at entry ( managed memory coherence)
388- // and cudaStreamSynchronize at exit (before host reads logits) .
417+ // All buffers are device memory (no managed). Only sync is the final
418+ // cudaStreamSynchronize before reading logits back to host .
389419// CPU: sequential host ops.
390420
391421std::vector<float > forward (const GPT2Model& model,
@@ -602,7 +632,7 @@ int main(int argc, char* argv[]) {
602632 }
603633
604634#ifdef DALOTIA_E_WITH_CUBLAS
605- // inference_stream == 0 (default stream), no destroy needed
635+ delete scratch_res; scratch_res = nullptr ;
606636#endif
607637
608638 std::cout << " success!" << std::endl;
0 commit comments