From f0b5b5f8a8af55d856a17898c7fc8b9694ec92d5 Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada <sidart@meta.com>
Date: Thu, 19 Mar 2026 15:05:41 -0700
Subject: [PATCH] DYNAMIC_UNBOUND support for portable runtime: lazy KV cache
 allocation

Enable DYNAMIC_UNBOUND tensors in the portable runtime, allowing KV cache
buffers to be dynamically managed rather than statically memory-planned.
This is the architectural foundation for pay-as-you-go memory allocation
in ExecuTorch LLM inference.

Core changes:
- DynamicAllocator interface with allocate/reallocate/free
- PalDynamicAllocator default impl (PAL-backed, 2x growth policy)
- TrackingDynamicAllocator for memory stats observability
- MemoryManager gains 4th slot for DynamicAllocator (backward compatible)
- TensorImpl gains dynamic_allocator_ and capacity_bytes_ fields
- TensorImpl::internal_resize_contiguous handles DYNAMIC_UNBOUND resize
- tensor_parser_portable.cpp: remove DYNAMIC_UNBOUND rejection, wire up
  allocator at load time for tensors with no memory-planned data
- method.cpp: FreeCall frees dynamic memory; destructor cleans up all
- Module API auto-creates PalDynamicAllocator (DYNAMIC_UNBOUND just works)

Export changes:
- MarkDynamicUnboundPass marks KV cache buffers as DYNAMIC_UNBOUND
- --lazy_kv_cache flag for Llama export

Co-authored-by: Claude <noreply@anthropic.com>
---
 examples/models/llama/export_llama_lib.py     |  16 +++
 exir/passes/mark_dynamic_unbound_pass.py      |  47 ++++++++
 extension/llm/export/config/llm_config.py     |   3 +
 extension/module/module.cpp                   |   8 +-
 extension/module/module.h                     |   2 +
 extension/module/targets.bzl                  |   1 +
 runtime/core/portable_type/targets.bzl        |   1 +
 runtime/core/portable_type/tensor_impl.cpp    |  47 +++++++-
 runtime/core/portable_type/tensor_impl.h      |  28 +++++
 runtime/executor/dynamic_allocator.h          |  73 +++++++++++++
 runtime/executor/memory_manager.h             |  16 ++-
 runtime/executor/method.cpp                   |  22 ++++
 runtime/executor/pal_dynamic_allocator.h      | 102 ++++++++++++++++++
 runtime/executor/targets.bzl                  |  32 ++++++
 runtime/executor/tensor_parser_portable.cpp   |  39 +++++--
 runtime/executor/tracking_dynamic_allocator.h |  92 ++++++++++++++++
 16 files changed, 517 insertions(+), 12 deletions(-)
 create mode 100644 exir/passes/mark_dynamic_unbound_pass.py
 create mode 100644 runtime/executor/dynamic_allocator.h
 create mode 100644 runtime/executor/pal_dynamic_allocator.h
 create mode 100644 runtime/executor/tracking_dynamic_allocator.h

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 0394bf7f320..b8f5e3cf170 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -418,6 +418,15 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="maximum length of context for model to remember",
     )
 
+    parser.add_argument(
+        "--lazy_kv_cache",
+        action="store_true",
+        default=False,
+        help="Mark KV cache buffers as DYNAMIC_UNBOUND so they are allocated "
+        "lazily at runtime instead of at load time. Reduces initial memory "
+        "usage when max_context_length is large.",
+    )
+
     parser.add_argument(
         "--local_global_attention",
         type=parse_list_of_ints,
@@ -1362,6 +1371,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
     if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS:
         additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
 
+    if llm_config.export.lazy_kv_cache:
+        from executorch.exir.passes.mark_dynamic_unbound_pass import (
+            MarkDynamicUnboundPass,
+        )
+
+        additional_passes.append(MarkDynamicUnboundPass())
+
     # export_to_edge
     builder_manager = _prepare_for_llama_export(llm_config)
     if (
diff --git a/exir/passes/mark_dynamic_unbound_pass.py b/exir/passes/mark_dynamic_unbound_pass.py
new file mode 100644
index 00000000000..b0bb6912a29
--- /dev/null
+++ b/exir/passes/mark_dynamic_unbound_pass.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.schema import TensorShapeDynamism
+from executorch.exir.tensor import TensorSpec
+
+
+class MarkDynamicUnboundPass(ExportPass):
+    """
+    Flags mutable-buffer TensorSpecs whose node name contains one of the given
+    patterns as DYNAMIC_UNBOUND. The memory planner then skips those buffers
+    (no allocation_info in the flatbuffer) and the runtime allocates their
+    memory lazily through a DynamicAllocator.
+
+    Intended for KV cache buffers: they start unallocated and grow on demand
+    instead of paying the full max_context_length cost up front.
+    """
+
+    def __init__(
+        self,
+        name_patterns: Optional[List[str]] = None,
+    ) -> None:
+        super().__init__()
+        # None (or an empty list) selects the default KV cache name patterns.
+        self.name_patterns = name_patterns or ["k_cache", "v_cache"]
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """Update matching placeholder specs in place; report if any changed."""
+        changed = False
+        placeholders = (n for n in graph_module.graph.nodes if n.op == "placeholder")
+        for node in placeholders:
+            spec = node.meta.get("spec")
+            # Skip anything without a TensorSpec, and constant tensors: only
+            # mutable buffers (const=False) are eligible.
+            if not isinstance(spec, TensorSpec) or spec.const:
+                continue
+            if any(pattern in node.name for pattern in self.name_patterns):
+                spec.shape_dynamism = TensorShapeDynamism.DYNAMIC_UNBOUND
+                changed = True
+        return PassResult(graph_module, changed)
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index 47ad2f4374a..2fbcfd2adb5 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -256,6 +256,7 @@ class ExportConfig:
     export_only: bool = False
     foundation_weights_file: Optional[str] = None
     lora_weights_file: Optional[str] = None
+    lazy_kv_cache: bool = False
 
     def __post_init__(self):
         if self.max_context_length < self.max_seq_length:
@@ -695,6 +696,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
             llm_config.export.foundation_weights_file = args.foundation_weights_file
         if hasattr(args, "lora_weights_file"):
             llm_config.export.lora_weights_file = args.lora_weights_file
+        if hasattr(args, "lazy_kv_cache"):
+            llm_config.export.lazy_kv_cache = args.lazy_kv_cache
 
         # QuantizationConfig
         if hasattr(args, "quantization_mode"):
diff --git a/extension/module/module.cpp b/extension/module/module.cpp
index ec7236276f5..c4f9a101398 100644
--- a/extension/module/module.cpp
+++ b/extension/module/module.cpp
@@ -13,6 +13,7 @@
 #include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
 #include <executorch/extension/named_data_map/merged_data_map.h>
+#include <executorch/runtime/executor/pal_dynamic_allocator.h>
 #include <executorch/runtime/platform/runtime.h>
 
 namespace executorch {
@@ -389,8 +390,13 @@ runtime::Error Module::load_method(
       planned_memory = method_holder.planned_memory->planned_memory.get();
     }
 
+    method_holder.dynamic_allocator =
+        std::make_unique<runtime::PalDynamicAllocator>();
     method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
-        memory_allocator_.get(), planned_memory, temp_allocator_.get());
+        memory_allocator_.get(),
+        planned_memory,
+        temp_allocator_.get(),
+        method_holder.dynamic_allocator.get());
     auto res_method = program_->load_method(
         method_name.c_str(),
         method_holder.memory_manager.get(),
diff --git a/extension/module/module.h b/extension/module/module.h
index 08a68b2676b..0e26ba297cc 100644
--- a/extension/module/module.h
+++ b/extension/module/module.h
@@ -14,6 +14,7 @@
 #include <unordered_set>
 #include <vector>
 
+#include <executorch/runtime/executor/dynamic_allocator.h>
 #include <executorch/runtime/executor/program.h>
 
 #ifdef USE_ATEN_LIB
@@ -694,6 +695,7 @@ class Module {
 
   struct MethodHolder {
     std::unique_ptr<PlannedMemory> planned_memory;
+    std::unique_ptr<runtime::DynamicAllocator> dynamic_allocator;
     std::unique_ptr<runtime::MemoryManager> memory_manager;
     std::unique_ptr<Method> method;
   };
diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl
index 6d60429bc51..d613da8b35f 100644
--- a/extension/module/targets.bzl
+++ b/extension/module/targets.bzl
@@ -25,6 +25,7 @@ def define_common_targets():
                 "//executorch/extension/data_loader:mmap_data_loader",
                 "//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix,
                 "//executorch/extension/named_data_map:merged_data_map" + aten_suffix,
+                "//executorch/runtime/executor:pal_dynamic_allocator",
             ],
             exported_deps = [
                 "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 5b6e67fa213..9d5b261a9b2 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -41,6 +41,7 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/runtime/core/exec_aten/util:tensor_shape_to_c_string",
             "//executorch/runtime/core:tag",
+            "//executorch/runtime/executor:dynamic_allocator",
         ],
     )
 
diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp
index ede5a3d4101..5b1d2ad850b 100644
--- a/runtime/core/portable_type/tensor_impl.cpp
+++ b/runtime/core/portable_type/tensor_impl.cpp
@@ -113,12 +113,53 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef<SizesType> new_sizes) {
       }
 
       break;
-    case TensorShapeDynamism::DYNAMIC_BOUND:
-      // TODO(T175194371): Unbounded dynamic tensor resizing is not yet
-      // supported: treat them as upper-bounded.
     case TensorShapeDynamism::DYNAMIC_UNBOUND: {
       const auto new_numel = compute_numel(new_sizes.data(), dim_);
 
+      ET_CHECK_OR_RETURN_ERROR(
+          static_cast<size_t>(new_numel) <= numel_bound_,
+          NotSupported,
+          "Attempted to resize a dynamic unbound tensor beyond its ceiling of %zu elements to %zu elements.",
+          numel_bound_,
+          new_numel);
+
+      const size_t needed_bytes =
+          static_cast<size_t>(new_numel) * elementSize(type_);
+      if (needed_bytes > capacity_bytes_) {
+        ET_CHECK_OR_RETURN_ERROR(
+            dynamic_allocator_ != nullptr,
+            NotSupported,
+            "DYNAMIC_UNBOUND tensor needs reallocation but has no DynamicAllocator");
+        size_t actual_size = 0;
+        void* new_data = dynamic_allocator_->reallocate(
+            data_,
+            capacity_bytes_,
+            needed_bytes,
+            alignof(std::max_align_t),
+            &actual_size);
+        ET_CHECK_OR_RETURN_ERROR(
+            new_data != nullptr,
+            MemoryAllocationFailed,
+            "Failed to reallocate DYNAMIC_UNBOUND tensor to %zu bytes",
+            needed_bytes);
+        data_ = new_data;
+        capacity_bytes_ = actual_size;
+      }
+
+      if (strides_ && dim_order_) {
+        auto error =
+            dim_order_to_stride(new_sizes.data(), dim_order_, dim_, strides_);
+        if (error != Error::Ok) {
+          return error;
+        }
+      }
+      numel_ = new_numel;
+      std::copy(new_sizes.begin(), new_sizes.end(), sizes_);
+    } break;
+
+    case TensorShapeDynamism::DYNAMIC_BOUND: {
+      const auto new_numel = compute_numel(new_sizes.data(), dim_);
+
       ET_CHECK_OR_RETURN_ERROR(
           static_cast<size_t>(new_numel) <= numel_bound_,
           NotSupported,
diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h
index 1e2b3620ca2..e1267f6b8ca 100644
--- a/runtime/core/portable_type/tensor_impl.h
+++ b/runtime/core/portable_type/tensor_impl.h
@@ -12,6 +12,7 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/portable_type/scalar_type.h>
 #include <executorch/runtime/core/tensor_shape_dynamism.h>
+#include <executorch/runtime/executor/dynamic_allocator.h>
 
 // Forward declaration of a helper that provides access to internal resizing
 // methods of TensorImpl. Real definition is in
@@ -203,6 +204,26 @@ class TensorImpl {
     data_ = ptr;
   }
 
+  /// Returns the dynamic allocator for DYNAMIC_UNBOUND tensors, or nullptr.
+  DynamicAllocator* dynamic_allocator() const {
+    return dynamic_allocator_;
+  }
+
+  /// Sets the dynamic allocator for lazy allocation.
+  void set_dynamic_allocator(DynamicAllocator* allocator) {
+    dynamic_allocator_ = allocator;
+  }
+
+  /// Returns the capacity in bytes of the current dynamic allocation.
+  size_t capacity_bytes() const {
+    return capacity_bytes_;
+  }
+
+  /// Sets the capacity in bytes of the current dynamic allocation.
+  void set_capacity_bytes(size_t capacity) {
+    capacity_bytes_ = capacity;
+  }
+
   /*
    * DEPRECATED: Use torch::executor::resize_tensor() or
    * torch::executor::resize_tensor_impl().
@@ -261,6 +282,13 @@ class TensorImpl {
 
   /// Specifies the mutability of the shape of the tensor.
   const TensorShapeDynamism shape_dynamism_;
+
+  /// Allocator for DYNAMIC_UNBOUND tensors. nullptr for other dynamism types.
+  DynamicAllocator* dynamic_allocator_ = nullptr;
+
+  /// Capacity in bytes of the buffer pointed to by data_, when managed by
+  /// dynamic_allocator_. 0 means no allocation yet.
+  size_t capacity_bytes_ = 0;
 };
 
 /**
diff --git a/runtime/executor/dynamic_allocator.h b/runtime/executor/dynamic_allocator.h
new file mode 100644
index 00000000000..8e3c3cb479f
--- /dev/null
+++ b/runtime/executor/dynamic_allocator.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Interface for dynamic memory allocation used by DYNAMIC_UNBOUND tensors.
+ *
+ * Tensors marked as DYNAMIC_UNBOUND have their memory allocated lazily at
+ * runtime rather than at load time. This interface allows plugging in custom
+ * allocation strategies (e.g., page-aligned, tracking, virtual memory).
+ */
+class DynamicAllocator {
+ public:
+  virtual ~DynamicAllocator() = default;
+
+  /**
+   * Allocate memory.
+   *
+   * @param[in] size Minimum number of bytes to allocate.
+   * @param[in] alignment Required alignment of the returned pointer, in bytes.
+   * @param[out] actual_size If non-null, receives the actual allocation size
+   *     (may be larger than requested, e.g., due to growth policy).
+   * @returns Pointer to allocated memory, or nullptr on failure.
+   */
+  virtual void* allocate(
+      size_t size,
+      size_t alignment,
+      size_t* actual_size) = 0;
+
+  /**
+   * Reallocate memory, potentially growing the buffer.
+   *
+   * The allocator may implement a growth policy (e.g., 2x) so that the actual
+   * allocation exceeds new_size. Old data up to min(old_size, new_size) is
+   * preserved. On success, callers must use the returned pointer, not ptr.
+   *
+   * @param[in] ptr Pointer previously returned by allocate() or reallocate(),
+   *     or nullptr (in which case this behaves like allocate()).
+   * @param[in] old_size Size of the existing allocation at ptr.
+   * @param[in] new_size Minimum number of bytes needed.
+   * @param[in] alignment Required alignment of the returned pointer, in bytes.
+   * @param[out] actual_size If non-null, receives the actual allocation size.
+   * @returns Pointer to reallocated memory, or nullptr on failure. On failure,
+   *     the old allocation at ptr remains valid.
+   */
+  virtual void* reallocate(
+      void* ptr,
+      size_t old_size,
+      size_t new_size,
+      size_t alignment,
+      size_t* actual_size) = 0;
+
+  /**
+   * Free memory previously returned by allocate() or reallocate().
+   *
+   * @param[in] ptr Pointer to free. May be nullptr (no-op).
+   */
+  virtual void free(void* ptr) = 0;
+};
+
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h
index 42edd9f0bea..0d22232f8b8 100644
--- a/runtime/executor/memory_manager.h
+++ b/runtime/executor/memory_manager.h
@@ -10,6 +10,7 @@
 
 #include <executorch/runtime/core/hierarchical_allocator.h>
 #include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/executor/dynamic_allocator.h>
 
 namespace executorch {
 namespace runtime {
@@ -52,10 +53,12 @@ class MemoryManager final {
   explicit MemoryManager(
       MemoryAllocator* method_allocator,
       HierarchicalAllocator* planned_memory = nullptr,
-      MemoryAllocator* temp_allocator = nullptr)
+      MemoryAllocator* temp_allocator = nullptr,
+      DynamicAllocator* dynamic_allocator = nullptr)
       : method_allocator_(method_allocator),
         planned_memory_(planned_memory),
-        temp_allocator_(temp_allocator) {
+        temp_allocator_(temp_allocator),
+        dynamic_allocator_(dynamic_allocator) {
     ET_CHECK_MSG(
         method_allocator != temp_allocator,
         "method allocator cannot be the same as temp allocator");
@@ -105,10 +108,19 @@ class MemoryManager final {
     return temp_allocator_;
   }
 
+  /**
+   * Returns the allocator to use for DYNAMIC_UNBOUND tensor data.
+   * May be nullptr if the program does not use DYNAMIC_UNBOUND tensors.
+   */
+  DynamicAllocator* dynamic_allocator() const {
+    return dynamic_allocator_;
+  }
+
  private:
   MemoryAllocator* method_allocator_;
   HierarchicalAllocator* planned_memory_;
   MemoryAllocator* temp_allocator_;
+  DynamicAllocator* dynamic_allocator_;
 };
 
 } // namespace runtime
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index 474afc446d2..c1400dd9270 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -1502,6 +1502,14 @@ Error Method::execute_instruction() {
       // at init time.
       auto free_call = instruction->instr_args_as_FreeCall();
       auto t = values_[free_call->value_index()].toTensor();
+      // For DYNAMIC_UNBOUND tensors, actually free the dynamically allocated
+      // memory rather than just nulling the pointer.
+      auto* impl = t.unsafeGetTensorImpl();
+      if (impl->dynamic_allocator() != nullptr &&
+          impl->mutable_data() != nullptr) {
+        impl->dynamic_allocator()->free(impl->mutable_data());
+        impl->set_capacity_bytes(0);
+      }
       internal::reset_data_ptr(t);
     } break;
     default:
@@ -1776,6 +1784,20 @@ EventTracer* Method::get_event_tracer() {
 }
 
 Method::~Method() {
+  // Free any dynamically allocated tensor memory (DYNAMIC_UNBOUND).
+  if (values_ != nullptr) {
+    for (size_t i = 0; i < n_value_; ++i) {
+      if (values_[i].isTensor()) {
+        auto* impl = values_[i].toTensor().unsafeGetTensorImpl();
+        if (impl->dynamic_allocator() != nullptr &&
+            impl->mutable_data() != nullptr) {
+          impl->dynamic_allocator()->free(impl->mutable_data());
+          impl->set_data(nullptr);
+          impl->set_capacity_bytes(0);
+        }
+      }
+    }
+  }
   // Destroy the values. It's necessary in ATen mode, where the refcount of
   // Tensors needs to be decremented properly.
   if (values_ != nullptr) {
diff --git a/runtime/executor/pal_dynamic_allocator.h b/runtime/executor/pal_dynamic_allocator.h
new file mode 100644
index 00000000000..ac0ca4e2d72
--- /dev/null
+++ b/runtime/executor/pal_dynamic_allocator.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+
+#include <executorch/runtime/executor/dynamic_allocator.h>
+#include <executorch/runtime/platform/platform.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Default DynamicAllocator implementation backed by the PAL (Platform
+ * Abstraction Layer). Uses et_pal_allocate/et_pal_free for memory management
+ * and a 2x growth policy on reallocation to amortize allocation cost.
+ */
+class PalDynamicAllocator : public DynamicAllocator {
+ public:
+  /// Allocates `size` bytes aligned to `alignment`; nullptr on failure.
+  void* allocate(size_t size, size_t alignment, size_t* actual_size) override {
+    void* aligned = allocate_aligned(size, alignment);
+    if (aligned != nullptr && actual_size != nullptr) {
+      *actual_size = size;
+    }
+    return aligned;
+  }
+
+  /// Moves the buffer at `ptr` into a fresh allocation of at least `new_size`
+  /// bytes. `ptr` is freed on success and left untouched on failure.
+  void* reallocate(
+      void* ptr,
+      size_t old_size,
+      size_t new_size,
+      size_t alignment,
+      size_t* actual_size) override {
+    if (ptr == nullptr) {
+      return allocate(new_size, alignment, actual_size);
+    }
+    // Growth policy: at least 2x old_size to amortize repeated resizes. If
+    // doubling would overflow size_t, settle for exactly new_size.
+    const size_t doubled = old_size <= kMaxSize / 2 ? old_size * 2 : new_size;
+    const size_t target = std::max(new_size, doubled);
+    void* aligned = allocate_aligned(target, alignment);
+    if (aligned == nullptr) {
+      return nullptr;
+    }
+    const size_t copy_size = std::min(old_size, new_size);
+    if (copy_size > 0) {
+      std::memcpy(aligned, ptr, copy_size);
+    }
+    free(ptr);
+    if (actual_size != nullptr) {
+      *actual_size = target;
+    }
+    return aligned;
+  }
+
+  /// Releases a pointer obtained from this allocator; nullptr is a no-op.
+  void free(void* ptr) override {
+    if (ptr != nullptr) {
+      // The raw PAL pointer lives in the void* slot directly below ptr.
+      pal_free(reinterpret_cast<void**>(ptr)[-1]);
+    }
+  }
+
+ private:
+  static constexpr size_t kMaxSize = static_cast<size_t>(-1);
+
+  /// Returns `size` usable bytes aligned to `alignment`, with the raw PAL
+  /// pointer stored in the void* slot directly below the returned address.
+  static void* allocate_aligned(size_t size, size_t alignment) {
+    // The bookkeeping slot must itself be pointer-aligned, so never align to
+    // less than alignof(void*) (this also covers alignment == 0).
+    alignment = std::max(alignment, alignof(void*));
+    const size_t overhead = sizeof(void*) + alignment;
+    // The mask arithmetic below needs a power of two; the sum must not wrap.
+    if ((alignment & (alignment - 1)) != 0 || size > kMaxSize - overhead) {
+      return nullptr;
+    }
+    void* raw = pal_allocate(size + overhead);
+    if (raw == nullptr) {
+      return nullptr;
+    }
+    // Skip past the bookkeeping slot, then round up to the alignment.
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(raw) + sizeof(void*);
+    const uintptr_t aligned = (addr + alignment - 1) & ~(alignment - 1);
+    void* result = reinterpret_cast<void*>(aligned);
+    reinterpret_cast<void**>(result)[-1] = raw;
+    return result;
+  }
+};
+
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl
index 90f8d0221e9..62fbb74373c 100644
--- a/runtime/executor/targets.bzl
+++ b/runtime/executor/targets.bzl
@@ -29,12 +29,44 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    runtime.cxx_library(
+        name = "dynamic_allocator",
+        exported_headers = [
+            "dynamic_allocator.h",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    runtime.cxx_library(
+        name = "pal_dynamic_allocator",
+        exported_headers = [
+            "pal_dynamic_allocator.h",
+        ],
+        exported_deps = [
+            ":dynamic_allocator",
+            "//executorch/runtime/platform:platform",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    runtime.cxx_library(
+        name = "tracking_dynamic_allocator",
+        exported_headers = [
+            "tracking_dynamic_allocator.h",
+        ],
+        exported_deps = [
+            ":dynamic_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "memory_manager",
         exported_headers = [
             "memory_manager.h",
         ],
         exported_deps = [
+            ":dynamic_allocator",
             "//executorch/runtime/core:memory_allocator",
         ],
         visibility = ["PUBLIC"],
diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp
index 02cb019a1da..0fb61c30ff0 100644
--- a/runtime/executor/tensor_parser_portable.cpp
+++ b/runtime/executor/tensor_parser_portable.cpp
@@ -51,12 +51,13 @@ Result<Tensor> parseTensor(
 
   TensorShapeDynamism dynamism =
       static_cast<TensorShapeDynamism>(s_tensor->shape_dynamism());
-  // TODO(T175194371): Remove this check once fully dynamic shapes are
-  // supported.
-  ET_CHECK_OR_RETURN_ERROR(
-      dynamism != TensorShapeDynamism::DYNAMIC_UNBOUND,
-      NotSupported,
-      "Fully dynamic tensor shapes not yet supported: T175194371");
+  if (dynamism == TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    ET_CHECK_OR_RETURN_ERROR(
+        memory_manager->dynamic_allocator() != nullptr,
+        NotSupported,
+        "Model contains DYNAMIC_UNBOUND tensors but no DynamicAllocator was "
+        "provided. Pass a DynamicAllocator to MemoryManager.");
+  }
 
   ET_CHECK_OR_RETURN_ERROR(
       s_tensor->sizes() != nullptr, InvalidProgram, "Missing sizes field");
@@ -180,6 +181,32 @@ Result<Tensor> parseTensor(
   }
   tensor_impl->set_data(data_ptr.get());
 
+  // A DYNAMIC_UNBOUND tensor that has no memory-planned data is owned by the
+  // DynamicAllocator, making it freeable via FreeCall and growable via resize.
+  // One that already has data is NOT wired to the allocator: that pointer was
+  // never handed out by it, so FreeCall/~Method must not pass it to free().
+  if (dynamism == TensorShapeDynamism::DYNAMIC_UNBOUND) {
+    tensor_impl->set_capacity_bytes(
+        data_ptr.get() != nullptr ? tensor_impl->nbytes() : 0);
+    if (data_ptr.get() == nullptr) {
+      auto* dyn_alloc = memory_manager->dynamic_allocator();
+      tensor_impl->set_dynamic_allocator(dyn_alloc);
+      if (tensor_impl->nbytes() > 0) {
+        // KV cache case: data_buffer_idx=0, allocation_info=null.
+        size_t actual_size = 0;
+        void* dyn_data = dyn_alloc->allocate(
+            tensor_impl->nbytes(), alignof(std::max_align_t), &actual_size);
+        ET_CHECK_OR_RETURN_ERROR(
+            dyn_data != nullptr,
+            MemoryAllocationFailed,
+            "Failed to allocate %" PRIu64 " bytes for DYNAMIC_UNBOUND tensor",
+            static_cast<uint64_t>(tensor_impl->nbytes()));
+        tensor_impl->set_data(dyn_data);
+        tensor_impl->set_capacity_bytes(actual_size);
+      }
+    }
+  }
+
   return Tensor(tensor_impl);
 }
 
diff --git a/runtime/executor/tracking_dynamic_allocator.h b/runtime/executor/tracking_dynamic_allocator.h
new file mode 100644
index 00000000000..394017e42b2
--- /dev/null
+++ b/runtime/executor/tracking_dynamic_allocator.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+
+#include <executorch/runtime/executor/dynamic_allocator.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * A DynamicAllocator wrapper that tracks allocation statistics.
+ * Delegates actual allocation to an inner DynamicAllocator and records
+ * current bytes, peak bytes, and allocation count.
+ */
+class TrackingDynamicAllocator : public DynamicAllocator {
+ public:
+  explicit TrackingDynamicAllocator(DynamicAllocator* inner) : inner_(inner) {}
+
+  void* allocate(size_t size, size_t alignment, size_t* actual_size) override {
+    size_t actual = 0;
+    void* ptr = inner_->allocate(size, alignment, &actual);
+    if (ptr != nullptr) {
+      current_bytes_ += actual;
+      peak_bytes_ = std::max(peak_bytes_, current_bytes_);
+      num_allocations_++;
+      if (actual_size) {
+        *actual_size = actual;
+      }
+    }
+    return ptr;
+  }
+
+  void* reallocate(
+      void* ptr,
+      size_t old_size,
+      size_t new_size,
+      size_t alignment,
+      size_t* actual_size) override {
+    size_t actual = 0;
+    void* out = inner_->reallocate(ptr, old_size, new_size, alignment, &actual);
+    if (out != nullptr) {
+      // Saturate like record_free() so an untracked old_size cannot wrap.
+      current_bytes_ -= std::min(old_size, current_bytes_);
+      current_bytes_ += actual;
+      peak_bytes_ = std::max(peak_bytes_, current_bytes_);
+      num_allocations_++;
+      if (actual_size) {
+        *actual_size = actual;
+      }
+    }
+    return out;
+  }
+
+  void free(void* ptr) override {
+    // free() has no size argument, so current_bytes_ is not adjusted here;
+    // callers that know the freed size should report it via record_free().
+    inner_->free(ptr);
+  }
+
+  /// Notify the tracker that `bytes` have been freed.
+  void record_free(size_t bytes) {
+    current_bytes_ -= std::min(bytes, current_bytes_);
+  }
+
+  size_t current_bytes() const {
+    return current_bytes_;
+  }
+  size_t peak_bytes() const {
+    return peak_bytes_;
+  }
+  size_t num_allocations() const {
+    return num_allocations_;
+  }
+
+ private:
+  DynamicAllocator* inner_;
+  size_t current_bytes_ = 0;
+  size_t peak_bytes_ = 0;
+  size_t num_allocations_ = 0;
+};
+
+} // namespace runtime
+} // namespace executorch