From f0b5b5f8a8af55d856a17898c7fc8b9694ec92d5 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Thu, 19 Mar 2026 15:05:41 -0700 Subject: [PATCH] DYNAMIC_UNBOUND support for portable runtime: lazy KV cache allocation Enable DYNAMIC_UNBOUND tensors in the portable runtime, allowing KV cache buffers to be dynamically managed rather than statically memory-planned. This is the architectural foundation for pay-as-you-go memory allocation in ExecuTorch LLM inference. Core changes: - DynamicAllocator interface with allocate/reallocate/free - PalDynamicAllocator default impl (PAL-backed, 2x growth policy) - TrackingDynamicAllocator for memory stats observability - MemoryManager gains 4th slot for DynamicAllocator (backward compatible) - TensorImpl gains dynamic_allocator_ and capacity_bytes_ fields - TensorImpl::internal_resize_contiguous handles DYNAMIC_UNBOUND resize - tensor_parser_portable.cpp: remove DYNAMIC_UNBOUND rejection, wire up allocator at load time for tensors with no memory-planned data - method.cpp: FreeCall frees dynamic memory; destructor cleans up all - Module API auto-creates PalDynamicAllocator (DYNAMIC_UNBOUND just works) Export changes: - MarkDynamicUnboundPass marks KV cache buffers as DYNAMIC_UNBOUND - --lazy_kv_cache flag for Llama export Co-authored-by: Claude --- examples/models/llama/export_llama_lib.py | 16 +++ exir/passes/mark_dynamic_unbound_pass.py | 47 ++++++++ extension/llm/export/config/llm_config.py | 3 + extension/module/module.cpp | 8 +- extension/module/module.h | 2 + extension/module/targets.bzl | 1 + runtime/core/portable_type/targets.bzl | 1 + runtime/core/portable_type/tensor_impl.cpp | 47 +++++++- runtime/core/portable_type/tensor_impl.h | 28 +++++ runtime/executor/dynamic_allocator.h | 73 +++++++++++++ runtime/executor/memory_manager.h | 16 ++- runtime/executor/method.cpp | 22 ++++ runtime/executor/pal_dynamic_allocator.h | 102 ++++++++++++++++++ runtime/executor/targets.bzl | 32 ++++++ runtime/executor/tensor_parser_portable.cpp | 39 +++++-- runtime/executor/tracking_dynamic_allocator.h | 92 ++++++++++++++++ 16 files changed, 517 insertions(+), 12 deletions(-) create mode 100644 exir/passes/mark_dynamic_unbound_pass.py create mode 100644 runtime/executor/dynamic_allocator.h create mode 100644 runtime/executor/pal_dynamic_allocator.h create mode 100644 runtime/executor/tracking_dynamic_allocator.h diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 0394bf7f320..b8f5e3cf170 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -418,6 +418,15 @@ def build_args_parser() -> argparse.ArgumentParser: help="maximum length of context for model to remember", ) + parser.add_argument( + "--lazy_kv_cache", + action="store_true", + default=False, + help="Mark KV cache buffers as DYNAMIC_UNBOUND so they are allocated " + "lazily at runtime instead of at load time. Reduces initial memory " + "usage when max_context_length is large.", + ) + parser.add_argument( "--local_global_attention", type=parse_list_of_ints, @@ -1362,6 +1371,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS: additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])] + if llm_config.export.lazy_kv_cache: + from executorch.exir.passes.mark_dynamic_unbound_pass import ( + MarkDynamicUnboundPass, + ) + + additional_passes.append(MarkDynamicUnboundPass()) + # export_to_edge builder_manager = _prepare_for_llama_export(llm_config) if ( diff --git a/exir/passes/mark_dynamic_unbound_pass.py b/exir/passes/mark_dynamic_unbound_pass.py new file mode 100644 index 00000000000..b0bb6912a29 --- /dev/null +++ b/exir/passes/mark_dynamic_unbound_pass.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +import torch +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.schema import TensorShapeDynamism +from executorch.exir.tensor import TensorSpec + + +class MarkDynamicUnboundPass(ExportPass): + """ + Marks mutable buffer TensorSpecs matching given name patterns as + DYNAMIC_UNBOUND. This causes the memory planner to skip them (no + allocation_info in the flatbuffer), and the runtime will allocate their + memory lazily via DynamicAllocator. + + Typical usage: mark KV cache buffers so they start unallocated and grow + on demand, avoiding the full upfront memory cost of max_context_length. + """ + + def __init__( + self, + name_patterns: Optional[List[str]] = None, + ) -> None: + super().__init__() + self.name_patterns = name_patterns or ["k_cache", "v_cache"] + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + for node in graph_module.graph.nodes: + if node.op != "placeholder": + continue + spec = node.meta.get("spec") + if not isinstance(spec, TensorSpec): + continue + if not spec.const: + # Only process mutable buffers (const=False means mutable). + name = node.name + if any(pattern in name for pattern in self.name_patterns): + spec.shape_dynamism = TensorShapeDynamism.DYNAMIC_UNBOUND + modified = True + return PassResult(graph_module, modified) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index 47ad2f4374a..2fbcfd2adb5 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -256,6 +256,7 @@ class ExportConfig: export_only: bool = False foundation_weights_file: Optional[str] = None lora_weights_file: Optional[str] = None + lazy_kv_cache: bool = False def __post_init__(self): if self.max_context_length < self.max_seq_length: @@ -695,6 +696,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 llm_config.export.foundation_weights_file = args.foundation_weights_file if hasattr(args, "lora_weights_file"): llm_config.export.lora_weights_file = args.lora_weights_file + if hasattr(args, "lazy_kv_cache"): + llm_config.export.lazy_kv_cache = args.lazy_kv_cache # QuantizationConfig if hasattr(args, "quantization_mode"): diff --git a/extension/module/module.cpp b/extension/module/module.cpp index ec7236276f5..c4f9a101398 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace executorch { @@ -389,8 +390,13 @@ runtime::Error Module::load_method( planned_memory = method_holder.planned_memory->planned_memory.get(); } + method_holder.dynamic_allocator = + std::make_unique(); method_holder.memory_manager = std::make_unique( - memory_allocator_.get(), planned_memory, temp_allocator_.get()); + memory_allocator_.get(), + planned_memory, + temp_allocator_.get(), + method_holder.dynamic_allocator.get()); auto res_method = program_->load_method( method_name.c_str(), method_holder.memory_manager.get(), diff --git a/extension/module/module.h b/extension/module/module.h index 08a68b2676b..0e26ba297cc 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -14,6 +14,7 @@ #include #include +#include #include #ifdef USE_ATEN_LIB @@ -694,6 +695,7 @@ class Module { struct MethodHolder { std::unique_ptr planned_memory; + std::unique_ptr dynamic_allocator; std::unique_ptr memory_manager; std::unique_ptr method; }; diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 6d60429bc51..d613da8b35f 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -25,6 +25,7 @@ def define_common_targets(): "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix, "//executorch/extension/named_data_map:merged_data_map" + aten_suffix, + "//executorch/runtime/executor:pal_dynamic_allocator", ], exported_deps = [ "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix, diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 5b6e67fa213..9d5b261a9b2 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -41,6 +41,7 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_shape_to_c_string", "//executorch/runtime/core:tag", + "//executorch/runtime/executor:dynamic_allocator", ], ) diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index ede5a3d4101..5b1d2ad850b 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -113,12 +113,53 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef new_sizes) { } break; - case TensorShapeDynamism::DYNAMIC_BOUND: - // TODO(T175194371): Unbounded dynamic tensor resizing is not yet - // supported: treat them as upper-bounded. case TensorShapeDynamism::DYNAMIC_UNBOUND: { const auto new_numel = compute_numel(new_sizes.data(), dim_); + ET_CHECK_OR_RETURN_ERROR( + static_cast(new_numel) <= numel_bound_, + NotSupported, + "Attempted to resize a dynamic unbound tensor beyond its ceiling of %zu elements to %zu elements.", + numel_bound_, + new_numel); + + const size_t needed_bytes = + static_cast(new_numel) * elementSize(type_); + if (needed_bytes > capacity_bytes_) { + ET_CHECK_OR_RETURN_ERROR( + dynamic_allocator_ != nullptr, + NotSupported, + "DYNAMIC_UNBOUND tensor needs reallocation but has no DynamicAllocator"); + size_t actual_size = 0; + void* new_data = dynamic_allocator_->reallocate( + data_, + capacity_bytes_, + needed_bytes, + alignof(std::max_align_t), + &actual_size); + ET_CHECK_OR_RETURN_ERROR( + new_data != nullptr, + MemoryAllocationFailed, + "Failed to reallocate DYNAMIC_UNBOUND tensor to %zu bytes", + needed_bytes); + data_ = new_data; + capacity_bytes_ = actual_size; + } + + if (strides_ && dim_order_) { + auto error = + dim_order_to_stride(new_sizes.data(), dim_order_, dim_, strides_); + if (error != Error::Ok) { + return error; + } + } + numel_ = new_numel; + std::copy(new_sizes.begin(), new_sizes.end(), sizes_); + } break; + + case TensorShapeDynamism::DYNAMIC_BOUND: { + const auto new_numel = compute_numel(new_sizes.data(), dim_); + ET_CHECK_OR_RETURN_ERROR( static_cast(new_numel) <= numel_bound_, NotSupported, diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h index 1e2b3620ca2..e1267f6b8ca 100644 --- a/runtime/core/portable_type/tensor_impl.h +++ b/runtime/core/portable_type/tensor_impl.h @@ -12,6 +12,7 @@ #include #include #include +#include // Forward declaration of a helper that provides access to internal resizing // methods of TensorImpl. Real definition is in @@ -203,6 +204,26 @@ class TensorImpl { data_ = ptr; } + /// Returns the dynamic allocator for DYNAMIC_UNBOUND tensors, or nullptr. + DynamicAllocator* dynamic_allocator() const { + return dynamic_allocator_; + } + + /// Sets the dynamic allocator for lazy allocation. + void set_dynamic_allocator(DynamicAllocator* allocator) { + dynamic_allocator_ = allocator; + } + + /// Returns the capacity in bytes of the current dynamic allocation. + size_t capacity_bytes() const { + return capacity_bytes_; + } + + /// Sets the capacity in bytes of the current dynamic allocation. + void set_capacity_bytes(size_t capacity) { + capacity_bytes_ = capacity; + } + /* * DEPRECATED: Use torch::executor::resize_tensor() or * torch::executor::resize_tensor_impl(). @@ -261,6 +282,13 @@ class TensorImpl { /// Specifies the mutability of the shape of the tensor. const TensorShapeDynamism shape_dynamism_; + + /// Allocator for DYNAMIC_UNBOUND tensors. nullptr for other dynamism types. + DynamicAllocator* dynamic_allocator_ = nullptr; + + /// Capacity in bytes of the buffer pointed to by data_, when managed by + /// dynamic_allocator_. 0 means no allocation yet. + size_t capacity_bytes_ = 0; }; /** diff --git a/runtime/executor/dynamic_allocator.h b/runtime/executor/dynamic_allocator.h new file mode 100644 index 00000000000..8e3c3cb479f --- /dev/null +++ b/runtime/executor/dynamic_allocator.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace runtime { + +/** + * Interface for dynamic memory allocation used by DYNAMIC_UNBOUND tensors. + * + * Tensors marked as DYNAMIC_UNBOUND have their memory allocated lazily at + * runtime rather than at load time. This interface allows plugging in custom + * allocation strategies (e.g., page-aligned, tracking, virtual memory). + */ +class DynamicAllocator { + public: + virtual ~DynamicAllocator() = default; + + /** + * Allocate memory. + * + * @param[in] size Minimum number of bytes to allocate. + * @param[in] alignment Required alignment of the returned pointer. + * @param[out] actual_size If non-null, receives the actual allocation size + * (may be larger than requested, e.g., due to growth policy). + * @returns Pointer to allocated memory, or nullptr on failure. + */ + virtual void* allocate( + size_t size, + size_t alignment, + size_t* actual_size) = 0; + + /** + * Reallocate memory, potentially growing the buffer. + * + * The allocator may implement a growth policy (e.g., 2x) so that the + * actual allocation exceeds new_size. Old data up to min(old_size, new_size) + * is preserved. + * + * @param[in] ptr Pointer previously returned by allocate() or reallocate(), + * or nullptr (in which case this behaves like allocate()). + * @param[in] old_size Size of the existing allocation at ptr. + * @param[in] new_size Minimum number of bytes needed. + * @param[in] alignment Required alignment of the returned pointer. + * @param[out] actual_size If non-null, receives the actual allocation size. + * @returns Pointer to reallocated memory, or nullptr on failure. On failure, + * the old allocation at ptr remains valid. + */ + virtual void* reallocate( + void* ptr, + size_t old_size, + size_t new_size, + size_t alignment, + size_t* actual_size) = 0; + + /** + * Free memory previously returned by allocate() or reallocate(). + * + * @param[in] ptr Pointer to free. May be nullptr (no-op). + */ + virtual void free(void* ptr) = 0; +}; + +} // namespace runtime +} // namespace executorch diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h index 42edd9f0bea..0d22232f8b8 100644 --- a/runtime/executor/memory_manager.h +++ b/runtime/executor/memory_manager.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch { namespace runtime { @@ -52,10 +53,12 @@ class MemoryManager final { explicit MemoryManager( MemoryAllocator* method_allocator, HierarchicalAllocator* planned_memory = nullptr, - MemoryAllocator* temp_allocator = nullptr) + MemoryAllocator* temp_allocator = nullptr, + DynamicAllocator* dynamic_allocator = nullptr) : method_allocator_(method_allocator), planned_memory_(planned_memory), - temp_allocator_(temp_allocator) { + temp_allocator_(temp_allocator), + dynamic_allocator_(dynamic_allocator) { ET_CHECK_MSG( method_allocator != temp_allocator, "method allocator cannot be the same as temp allocator"); @@ -105,10 +108,19 @@ class MemoryManager final { return temp_allocator_; } + /** + * Returns the allocator to use for DYNAMIC_UNBOUND tensor data. + * May be nullptr if the program does not use DYNAMIC_UNBOUND tensors. + */ + DynamicAllocator* dynamic_allocator() const { + return dynamic_allocator_; + } + private: MemoryAllocator* method_allocator_; HierarchicalAllocator* planned_memory_; MemoryAllocator* temp_allocator_; + DynamicAllocator* dynamic_allocator_; }; } // namespace runtime diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 474afc446d2..c1400dd9270 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -1502,6 +1502,14 @@ Error Method::execute_instruction() { // at init time. auto free_call = instruction->instr_args_as_FreeCall(); auto t = values_[free_call->value_index()].toTensor(); + // For DYNAMIC_UNBOUND tensors, actually free the dynamically allocated + // memory rather than just nulling the pointer. + auto* impl = t.unsafeGetTensorImpl(); + if (impl->dynamic_allocator() != nullptr && + impl->mutable_data() != nullptr) { + impl->dynamic_allocator()->free(impl->mutable_data()); + impl->set_capacity_bytes(0); + } internal::reset_data_ptr(t); } break; default: @@ -1776,6 +1784,20 @@ EventTracer* Method::get_event_tracer() { } Method::~Method() { + // Free any dynamically allocated tensor memory (DYNAMIC_UNBOUND). + if (values_ != nullptr) { + for (size_t i = 0; i < n_value_; ++i) { + if (values_[i].isTensor()) { + auto* impl = values_[i].toTensor().unsafeGetTensorImpl(); + if (impl->dynamic_allocator() != nullptr && + impl->mutable_data() != nullptr) { + impl->dynamic_allocator()->free(impl->mutable_data()); + impl->set_data(nullptr); + impl->set_capacity_bytes(0); + } + } + } + } // Destroy the values. It's necessary in ATen mode, where the refcount of // Tensors needs to be decremented properly. if (values_ != nullptr) { diff --git a/runtime/executor/pal_dynamic_allocator.h b/runtime/executor/pal_dynamic_allocator.h new file mode 100644 index 00000000000..ac0ca4e2d72 --- /dev/null +++ b/runtime/executor/pal_dynamic_allocator.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace executorch { +namespace runtime { + +/** + * Default DynamicAllocator implementation backed by the PAL (Platform + * Abstraction Layer). Uses et_pal_allocate/et_pal_free for memory management + * and a 2x growth policy on reallocation to amortize allocation cost. + */ +class PalDynamicAllocator : public DynamicAllocator { + public: + void* allocate(size_t size, size_t alignment, size_t* actual_size) override { + // Over-allocate to accommodate alignment and the raw-pointer bookkeeping. + size_t alloc_size = size + sizeof(void*) + alignment; + void* raw = pal_allocate(alloc_size); + if (raw == nullptr) { + return nullptr; + } + void* aligned = align_pointer(raw, alignment); + // Store the raw pointer just before the aligned pointer so we can free it. + store_raw_pointer(aligned, raw); + if (actual_size) { + *actual_size = size; + } + return aligned; + } + + void* reallocate( + void* ptr, + size_t old_size, + size_t new_size, + size_t alignment, + size_t* actual_size) override { + if (ptr == nullptr) { + return allocate(new_size, alignment, actual_size); + } + // Growth policy: at least 2x old_size to amortize repeated resizes. + size_t target = std::max(new_size, old_size * 2); + size_t alloc_size = target + sizeof(void*) + alignment; + void* raw = pal_allocate(alloc_size); + if (raw == nullptr) { + return nullptr; + } + void* aligned = align_pointer(raw, alignment); + store_raw_pointer(aligned, raw); + // Copy old data. + size_t copy_size = std::min(old_size, new_size); + if (copy_size > 0) { + std::memcpy(aligned, ptr, copy_size); + } + // Free old allocation. + free(ptr); + if (actual_size) { + *actual_size = target; + } + return aligned; + } + + void free(void* ptr) override { + if (ptr == nullptr) { + return; + } + void* raw = load_raw_pointer(ptr); + pal_free(raw); + } + + private: + static void* align_pointer(void* ptr, size_t alignment) { + uintptr_t addr = reinterpret_cast(ptr); + // Reserve space for the raw pointer bookkeeping. + addr += sizeof(void*); + uintptr_t aligned = (addr + alignment - 1) & ~(alignment - 1); + return reinterpret_cast(aligned); + } + + static void store_raw_pointer(void* aligned, void* raw) { + // Store the raw (unaligned) pointer immediately before the aligned pointer. + reinterpret_cast(aligned)[-1] = raw; + } + + static void* load_raw_pointer(void* aligned) { + return reinterpret_cast(aligned)[-1]; + } +}; + +} // namespace runtime +} // namespace executorch diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 90f8d0221e9..62fbb74373c 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -29,12 +29,44 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ + runtime.cxx_library( + name = "dynamic_allocator", + exported_headers = [ + "dynamic_allocator.h", + ], + visibility = ["PUBLIC"], + ) + + runtime.cxx_library( + name = "pal_dynamic_allocator", + exported_headers = [ + "pal_dynamic_allocator.h", + ], + exported_deps = [ + ":dynamic_allocator", + "//executorch/runtime/platform:platform", + ], + visibility = ["PUBLIC"], + ) + + runtime.cxx_library( + name = "tracking_dynamic_allocator", + exported_headers = [ + "tracking_dynamic_allocator.h", + ], + exported_deps = [ + ":dynamic_allocator", + ], + visibility = ["PUBLIC"], + ) + runtime.cxx_library( name = "memory_manager", exported_headers = [ "memory_manager.h", ], exported_deps = [ + ":dynamic_allocator", "//executorch/runtime/core:memory_allocator", ], visibility = ["PUBLIC"], diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index 02cb019a1da..0fb61c30ff0 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -51,12 +51,13 @@ Result parseTensor( TensorShapeDynamism dynamism = static_cast(s_tensor->shape_dynamism()); - // TODO(T175194371): Remove this check once fully dynamic shapes are - // supported. - ET_CHECK_OR_RETURN_ERROR( - dynamism != TensorShapeDynamism::DYNAMIC_UNBOUND, - NotSupported, - "Fully dynamic tensor shapes not yet supported: T175194371"); + if (dynamism == TensorShapeDynamism::DYNAMIC_UNBOUND) { + ET_CHECK_OR_RETURN_ERROR( + memory_manager->dynamic_allocator() != nullptr, + NotSupported, + "Model contains DYNAMIC_UNBOUND tensors but no DynamicAllocator was " + "provided. Pass a DynamicAllocator to MemoryManager."); + } ET_CHECK_OR_RETURN_ERROR( s_tensor->sizes() != nullptr, InvalidProgram, "Missing sizes field"); @@ -180,6 +181,32 @@ Result parseTensor( } tensor_impl->set_data(data_ptr.get()); + // For DYNAMIC_UNBOUND tensors, wire up the dynamic allocator. Memory is + // managed by the DynamicAllocator rather than the memory planner, making it + // freeable via FreeCall and growable via resize. + if (dynamism == TensorShapeDynamism::DYNAMIC_UNBOUND) { + auto* dyn_alloc = memory_manager->dynamic_allocator(); + tensor_impl->set_dynamic_allocator(dyn_alloc); + if (data_ptr.get() == nullptr && tensor_impl->nbytes() > 0) { + // No memory-planned data (KV cache case: data_buffer_idx=0, + // allocation_info=null). Allocate from DynamicAllocator now. + size_t actual_size = 0; + void* dyn_data = dyn_alloc->allocate( + tensor_impl->nbytes(), alignof(std::max_align_t), &actual_size); + ET_CHECK_OR_RETURN_ERROR( + dyn_data != nullptr, + MemoryAllocationFailed, + "Failed to allocate %" PRIu64 + " bytes for DYNAMIC_UNBOUND tensor", + static_cast(tensor_impl->nbytes())); + tensor_impl->set_data(dyn_data); + tensor_impl->set_capacity_bytes(actual_size); + } else { + tensor_impl->set_capacity_bytes( + data_ptr.get() != nullptr ? tensor_impl->nbytes() : 0); + } + } + return Tensor(tensor_impl); } diff --git a/runtime/executor/tracking_dynamic_allocator.h b/runtime/executor/tracking_dynamic_allocator.h new file mode 100644 index 00000000000..394017e42b2 --- /dev/null +++ b/runtime/executor/tracking_dynamic_allocator.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +namespace executorch { +namespace runtime { + +/** + * A DynamicAllocator wrapper that tracks allocation statistics. + * Delegates actual allocation to an inner DynamicAllocator and records + * current bytes, peak bytes, and allocation count. + */ +class TrackingDynamicAllocator : public DynamicAllocator { + public: + explicit TrackingDynamicAllocator(DynamicAllocator* inner) : inner_(inner) {} + + void* allocate(size_t size, size_t alignment, size_t* actual_size) override { + size_t actual = 0; + void* ptr = inner_->allocate(size, alignment, &actual); + if (ptr != nullptr) { + current_bytes_ += actual; + peak_bytes_ = std::max(peak_bytes_, current_bytes_); + num_allocations_++; + if (actual_size) { + *actual_size = actual; + } + } + return ptr; + } + + void* reallocate( + void* ptr, + size_t old_size, + size_t new_size, + size_t alignment, + size_t* actual_size) override { + size_t actual = 0; + void* new_ptr = + inner_->reallocate(ptr, old_size, new_size, alignment, &actual); + if (new_ptr != nullptr) { + current_bytes_ -= old_size; + current_bytes_ += actual; + peak_bytes_ = std::max(peak_bytes_, current_bytes_); + num_allocations_++; + if (actual_size) { + *actual_size = actual; + } + } + return new_ptr; + } + + void free(void* ptr) override { + // Note: we don't track which allocation was which size, so current_bytes_ + // can only be accurately decremented if the caller tracks capacity_bytes. + inner_->free(ptr); + } + + /// Notify the tracker that `bytes` have been freed. + void record_free(size_t bytes) { + current_bytes_ -= std::min(bytes, current_bytes_); + } + + size_t current_bytes() const { + return current_bytes_; + } + size_t peak_bytes() const { + return peak_bytes_; + } + size_t num_allocations() const { + return num_allocations_; + } + + private: + DynamicAllocator* inner_; + size_t current_bytes_ = 0; + size_t peak_bytes_ = 0; + size_t num_allocations_ = 0; +}; + +} // namespace runtime +} // namespace executorch