Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/models/llama/export_llama_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,15 @@ def build_args_parser() -> argparse.ArgumentParser:
help="maximum length of context for model to remember",
)

parser.add_argument(
"--lazy_kv_cache",
action="store_true",
default=False,
help="Mark KV cache buffers as DYNAMIC_UNBOUND so they are allocated "
"lazily at runtime instead of at load time. Reduces initial memory "
"usage when max_context_length is large.",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this because we do actually touch the full memory during attention?

)

parser.add_argument(
"--local_global_attention",
type=parse_list_of_ints,
Expand Down Expand Up @@ -1362,6 +1371,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS:
additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]

if llm_config.export.lazy_kv_cache:
from executorch.exir.passes.mark_dynamic_unbound_pass import (
MarkDynamicUnboundPass,
)

additional_passes.append(MarkDynamicUnboundPass())

# export_to_edge
builder_manager = _prepare_for_llama_export(llm_config)
if (
Expand Down
47 changes: 47 additions & 0 deletions exir/passes/mark_dynamic_unbound_pass.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import List, Optional

import torch
from executorch.exir.pass_base import ExportPass, PassResult
from executorch.exir.schema import TensorShapeDynamism
from executorch.exir.tensor import TensorSpec


class MarkDynamicUnboundPass(ExportPass):
    """
    Tags mutable-buffer TensorSpecs whose placeholder names match any of the
    configured substrings as DYNAMIC_UNBOUND.

    The memory planner skips DYNAMIC_UNBOUND specs (no allocation_info is
    emitted into the flatbuffer), so the runtime allocates their storage
    lazily via DynamicAllocator instead of up front.

    Typical usage: mark KV cache buffers so they start unallocated and grow
    on demand, avoiding the full upfront memory cost of max_context_length.
    """

    def __init__(
        self,
        name_patterns: Optional[List[str]] = None,
    ) -> None:
        super().__init__()
        # A falsy pattern list (None or empty) falls back to the KV-cache
        # defaults, which is the primary use case for this pass.
        self.name_patterns = name_patterns or ["k_cache", "v_cache"]

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        changed = False
        for node in graph_module.graph.nodes:
            if node.op != "placeholder":
                continue
            spec = node.meta.get("spec")
            # Only TensorSpecs carry shape dynamism; specs with const=True
            # are immutable (weights/constants) and must stay planned.
            if not isinstance(spec, TensorSpec) or spec.const:
                continue
            if any(pattern in node.name for pattern in self.name_patterns):
                spec.shape_dynamism = TensorShapeDynamism.DYNAMIC_UNBOUND
                changed = True
        return PassResult(graph_module, changed)
3 changes: 3 additions & 0 deletions extension/llm/export/config/llm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ class ExportConfig:
export_only: bool = False
foundation_weights_file: Optional[str] = None
lora_weights_file: Optional[str] = None
lazy_kv_cache: bool = False

def __post_init__(self):
if self.max_context_length < self.max_seq_length:
Expand Down Expand Up @@ -695,6 +696,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
llm_config.export.foundation_weights_file = args.foundation_weights_file
if hasattr(args, "lora_weights_file"):
llm_config.export.lora_weights_file = args.lora_weights_file
if hasattr(args, "lazy_kv_cache"):
llm_config.export.lazy_kv_cache = args.lazy_kv_cache

# QuantizationConfig
if hasattr(args, "quantization_mode"):
Expand Down
8 changes: 7 additions & 1 deletion extension/module/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
#include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
#include <executorch/extension/named_data_map/merged_data_map.h>
#include <executorch/runtime/executor/pal_dynamic_allocator.h>
#include <executorch/runtime/platform/runtime.h>

namespace executorch {
Expand Down Expand Up @@ -389,8 +390,13 @@ runtime::Error Module::load_method(
planned_memory = method_holder.planned_memory->planned_memory.get();
}

method_holder.dynamic_allocator =
std::make_unique<runtime::PalDynamicAllocator>();
method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
memory_allocator_.get(), planned_memory, temp_allocator_.get());
memory_allocator_.get(),
planned_memory,
temp_allocator_.get(),
method_holder.dynamic_allocator.get());
auto res_method = program_->load_method(
method_name.c_str(),
method_holder.memory_manager.get(),
Expand Down
2 changes: 2 additions & 0 deletions extension/module/module.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <unordered_set>
#include <vector>

#include <executorch/runtime/executor/dynamic_allocator.h>
#include <executorch/runtime/executor/program.h>

#ifdef USE_ATEN_LIB
Expand Down Expand Up @@ -694,6 +695,7 @@ class Module {

struct MethodHolder {
std::unique_ptr<PlannedMemory> planned_memory;
std::unique_ptr<runtime::DynamicAllocator> dynamic_allocator;
std::unique_ptr<runtime::MemoryManager> memory_manager;
std::unique_ptr<Method> method;
};
Expand Down
1 change: 1 addition & 0 deletions extension/module/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def define_common_targets():
"//executorch/extension/data_loader:mmap_data_loader",
"//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix,
"//executorch/extension/named_data_map:merged_data_map" + aten_suffix,
"//executorch/runtime/executor:pal_dynamic_allocator",
],
exported_deps = [
"//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
Expand Down
1 change: 1 addition & 0 deletions runtime/core/portable_type/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def define_common_targets():
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/runtime/core/exec_aten/util:tensor_shape_to_c_string",
"//executorch/runtime/core:tag",
"//executorch/runtime/executor:dynamic_allocator",
],
)

Expand Down
47 changes: 44 additions & 3 deletions runtime/core/portable_type/tensor_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,53 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef<SizesType> new_sizes) {
}

break;
case TensorShapeDynamism::DYNAMIC_BOUND:
// TODO(T175194371): Unbounded dynamic tensor resizing is not yet
// supported: treat them as upper-bounded.
case TensorShapeDynamism::DYNAMIC_UNBOUND: {
const auto new_numel = compute_numel(new_sizes.data(), dim_);

ET_CHECK_OR_RETURN_ERROR(
static_cast<size_t>(new_numel) <= numel_bound_,
NotSupported,
"Attempted to resize a dynamic unbound tensor beyond its ceiling of %zu elements to %zu elements.",
numel_bound_,
new_numel);

const size_t needed_bytes =
static_cast<size_t>(new_numel) * elementSize(type_);
if (needed_bytes > capacity_bytes_) {
ET_CHECK_OR_RETURN_ERROR(
dynamic_allocator_ != nullptr,
NotSupported,
"DYNAMIC_UNBOUND tensor needs reallocation but has no DynamicAllocator");
size_t actual_size = 0;
void* new_data = dynamic_allocator_->reallocate(
data_,
capacity_bytes_,
needed_bytes,
alignof(std::max_align_t),
&actual_size);
ET_CHECK_OR_RETURN_ERROR(
new_data != nullptr,
MemoryAllocationFailed,
"Failed to reallocate DYNAMIC_UNBOUND tensor to %zu bytes",
needed_bytes);
data_ = new_data;
capacity_bytes_ = actual_size;
}

if (strides_ && dim_order_) {
auto error =
dim_order_to_stride(new_sizes.data(), dim_order_, dim_, strides_);
if (error != Error::Ok) {
return error;
}
}
numel_ = new_numel;
std::copy(new_sizes.begin(), new_sizes.end(), sizes_);
} break;

case TensorShapeDynamism::DYNAMIC_BOUND: {
const auto new_numel = compute_numel(new_sizes.data(), dim_);

ET_CHECK_OR_RETURN_ERROR(
static_cast<size_t>(new_numel) <= numel_bound_,
NotSupported,
Expand Down
28 changes: 28 additions & 0 deletions runtime/core/portable_type/tensor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/portable_type/scalar_type.h>
#include <executorch/runtime/core/tensor_shape_dynamism.h>
#include <executorch/runtime/executor/dynamic_allocator.h>

// Forward declaration of a helper that provides access to internal resizing
// methods of TensorImpl. Real definition is in
Expand Down Expand Up @@ -203,6 +204,26 @@ class TensorImpl {
data_ = ptr;
}

  /// Returns the allocator used to lazily (re)allocate this tensor's data
  /// when its shape dynamism is DYNAMIC_UNBOUND, or nullptr if none was set.
  /// The returned pointer is non-owning.
  DynamicAllocator* dynamic_allocator() const {
    return dynamic_allocator_;
  }

  /// Sets the allocator used for lazy allocation of DYNAMIC_UNBOUND data.
  /// The TensorImpl does not take ownership; the allocator must outlive any
  /// resize of this tensor that may trigger a (re)allocation.
  void set_dynamic_allocator(DynamicAllocator* allocator) {
    dynamic_allocator_ = allocator;
  }

  /// Returns the capacity in bytes of the current dynamic allocation backing
  /// this tensor's data. 0 means no dynamically-allocated buffer yet.
  size_t capacity_bytes() const {
    return capacity_bytes_;
  }

  /// Sets the recorded capacity in bytes of the current dynamic allocation.
  /// Callers must keep this value in sync with the buffer that actually
  /// backs the tensor's data pointer.
  void set_capacity_bytes(size_t capacity) {
    capacity_bytes_ = capacity;
  }

/*
* DEPRECATED: Use torch::executor::resize_tensor() or
* torch::executor::resize_tensor_impl().
Expand Down Expand Up @@ -261,6 +282,13 @@ class TensorImpl {

/// Specifies the mutability of the shape of the tensor.
const TensorShapeDynamism shape_dynamism_;

/// Allocator for DYNAMIC_UNBOUND tensors. nullptr for other dynamism types.
DynamicAllocator* dynamic_allocator_ = nullptr;

/// Capacity in bytes of the buffer pointed to by data_, when managed by
/// dynamic_allocator_. 0 means no allocation yet.
size_t capacity_bytes_ = 0;
};

/**
Expand Down
73 changes: 73 additions & 0 deletions runtime/executor/dynamic_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cstddef>

namespace executorch {
namespace runtime {

/**
* Interface for dynamic memory allocation used by DYNAMIC_UNBOUND tensors.
*
* Tensors marked as DYNAMIC_UNBOUND have their memory allocated lazily at
* runtime rather than at load time. This interface allows plugging in custom
* allocation strategies (e.g., page-aligned, tracking, virtual memory).
*/
/**
 * Interface for dynamic memory allocation used by DYNAMIC_UNBOUND tensors.
 *
 * Tensors marked as DYNAMIC_UNBOUND have their memory allocated lazily at
 * runtime rather than at load time. This interface allows plugging in custom
 * allocation strategies (e.g., page-aligned, tracking, virtual memory).
 *
 * NOTE(review): thread-safety is not specified by this interface. If a
 * single allocator instance may be shared across methods or threads,
 * implementations should document (and provide) their own synchronization.
 */
class DynamicAllocator {
 public:
  virtual ~DynamicAllocator() = default;

  /**
   * Allocate memory.
   *
   * @param[in] size Minimum number of bytes to allocate.
   * @param[in] alignment Required alignment of the returned pointer.
   * @param[out] actual_size If non-null, receives the actual allocation size
   *     (may be larger than requested, e.g., due to growth policy). May be
   *     null if the caller does not need it.
   * @returns Pointer to allocated memory, or nullptr on failure.
   */
  virtual void* allocate(
      size_t size,
      size_t alignment,
      size_t* actual_size) = 0;

  /**
   * Reallocate memory, potentially growing the buffer.
   *
   * The allocator may implement a growth policy (e.g., 2x) so that the
   * actual allocation exceeds new_size. Old data up to min(old_size,
   * new_size) is preserved.
   *
   * @param[in] ptr Pointer previously returned by allocate() or
   *     reallocate(), or nullptr (in which case this behaves like
   *     allocate()).
   * @param[in] old_size Size of the existing allocation at ptr. Callers are
   *     responsible for passing the size that was actually allocated
   *     (e.g., the actual_size reported by the previous call).
   * @param[in] new_size Minimum number of bytes needed.
   * @param[in] alignment Required alignment of the returned pointer.
   * @param[out] actual_size If non-null, receives the actual allocation
   *     size. May be null if the caller does not need it.
   * @returns Pointer to reallocated memory, or nullptr on failure. On
   *     failure, the old allocation at ptr remains valid.
   */
  virtual void* reallocate(
      void* ptr,
      size_t old_size,
      size_t new_size,
      size_t alignment,
      size_t* actual_size) = 0;

  /**
   * Free memory previously returned by allocate() or reallocate().
   *
   * @param[in] ptr Pointer to free. May be nullptr (no-op).
   */
  virtual void free(void* ptr) = 0;
};

} // namespace runtime
} // namespace executorch
16 changes: 14 additions & 2 deletions runtime/executor/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/executor/dynamic_allocator.h>

namespace executorch {
namespace runtime {
Expand Down Expand Up @@ -52,10 +53,12 @@ class MemoryManager final {
explicit MemoryManager(
MemoryAllocator* method_allocator,
HierarchicalAllocator* planned_memory = nullptr,
MemoryAllocator* temp_allocator = nullptr)
MemoryAllocator* temp_allocator = nullptr,
DynamicAllocator* dynamic_allocator = nullptr)
: method_allocator_(method_allocator),
planned_memory_(planned_memory),
temp_allocator_(temp_allocator) {
temp_allocator_(temp_allocator),
dynamic_allocator_(dynamic_allocator) {
ET_CHECK_MSG(
method_allocator != temp_allocator,
"method allocator cannot be the same as temp allocator");
Expand Down Expand Up @@ -105,10 +108,19 @@ class MemoryManager final {
return temp_allocator_;
}

  /**
   * Returns the allocator to use for DYNAMIC_UNBOUND tensor data, or
   * nullptr if none was provided at construction (e.g., the program does
   * not use DYNAMIC_UNBOUND tensors). The returned pointer is non-owning.
   */
  DynamicAllocator* dynamic_allocator() const {
    return dynamic_allocator_;
  }

private:
MemoryAllocator* method_allocator_;
HierarchicalAllocator* planned_memory_;
MemoryAllocator* temp_allocator_;
DynamicAllocator* dynamic_allocator_;
};

} // namespace runtime
Expand Down
Loading
Loading