Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/models/llama/export_llama_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,15 @@ def build_args_parser() -> argparse.ArgumentParser:
help="maximum length of context for model to remember",
)

parser.add_argument(
"--lazy_kv_cache",
action="store_true",
default=False,
help="Mark KV cache buffers as DYNAMIC_UNBOUND so they are allocated "
"lazily at runtime instead of at load time. Reduces initial memory "
"usage when max_context_length is large.",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this because we do actually touch the full memory during attention?

)

parser.add_argument(
"--local_global_attention",
type=parse_list_of_ints,
Expand Down Expand Up @@ -1362,6 +1371,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS:
additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]

if llm_config.export.lazy_kv_cache:
from executorch.exir.passes.mark_dynamic_unbound_pass import (
MarkDynamicUnboundPass,
)

additional_passes.append(MarkDynamicUnboundPass())

# export_to_edge
builder_manager = _prepare_for_llama_export(llm_config)
if (
Expand Down
47 changes: 47 additions & 0 deletions exir/passes/mark_dynamic_unbound_pass.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import List, Optional

import torch
from executorch.exir.pass_base import ExportPass, PassResult
from executorch.exir.schema import TensorShapeDynamism
from executorch.exir.tensor import TensorSpec


class MarkDynamicUnboundPass(ExportPass):
    """
    Tags mutable-buffer TensorSpecs whose placeholder names match any of the
    configured substrings as DYNAMIC_UNBOUND.

    The memory planner skips DYNAMIC_UNBOUND specs (no allocation_info is
    emitted into the flatbuffer), so the runtime allocates their storage
    lazily via DynamicAllocator instead of up front.

    Typical usage: mark KV cache buffers so they start unallocated and grow
    on demand, avoiding the full upfront memory cost of max_context_length.
    """

    def __init__(
        self,
        name_patterns: Optional[List[str]] = None,
    ) -> None:
        super().__init__()
        # A falsy pattern list (None or empty) falls back to the KV-cache
        # defaults, which is the primary use case for this pass.
        self.name_patterns = name_patterns or ["k_cache", "v_cache"]

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        changed = False
        for node in graph_module.graph.nodes:
            if node.op != "placeholder":
                continue
            spec = node.meta.get("spec")
            # Only TensorSpecs carry shape dynamism; specs with const=True
            # are immutable (weights/constants) and must stay planned.
            if not isinstance(spec, TensorSpec) or spec.const:
                continue
            if any(pattern in node.name for pattern in self.name_patterns):
                spec.shape_dynamism = TensorShapeDynamism.DYNAMIC_UNBOUND
                changed = True
        return PassResult(graph_module, changed)
3 changes: 3 additions & 0 deletions extension/llm/export/config/llm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ class ExportConfig:
export_only: bool = False
foundation_weights_file: Optional[str] = None
lora_weights_file: Optional[str] = None
lazy_kv_cache: bool = False

def __post_init__(self):
if self.max_context_length < self.max_seq_length:
Expand Down Expand Up @@ -695,6 +696,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
llm_config.export.foundation_weights_file = args.foundation_weights_file
if hasattr(args, "lora_weights_file"):
llm_config.export.lora_weights_file = args.lora_weights_file
if hasattr(args, "lazy_kv_cache"):
llm_config.export.lazy_kv_cache = args.lazy_kv_cache

# QuantizationConfig
if hasattr(args, "quantization_mode"):
Expand Down
8 changes: 7 additions & 1 deletion extension/module/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
#include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
#include <executorch/extension/named_data_map/merged_data_map.h>
#include <executorch/runtime/executor/pal_dynamic_allocator.h>
#include <executorch/runtime/platform/runtime.h>

namespace executorch {
Expand Down Expand Up @@ -389,8 +390,13 @@ runtime::Error Module::load_method(
planned_memory = method_holder.planned_memory->planned_memory.get();
}

method_holder.dynamic_allocator =
std::make_unique<runtime::PalDynamicAllocator>();
method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
memory_allocator_.get(), planned_memory, temp_allocator_.get());
memory_allocator_.get(),
planned_memory,
temp_allocator_.get(),
method_holder.dynamic_allocator.get());
auto res_method = program_->load_method(
method_name.c_str(),
method_holder.memory_manager.get(),
Expand Down
2 changes: 2 additions & 0 deletions extension/module/module.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <unordered_set>
#include <vector>

#include <executorch/runtime/executor/dynamic_allocator.h>
#include <executorch/runtime/executor/program.h>

#ifdef USE_ATEN_LIB
Expand Down Expand Up @@ -694,6 +695,7 @@ class Module {

struct MethodHolder {
std::unique_ptr<PlannedMemory> planned_memory;
std::unique_ptr<runtime::DynamicAllocator> dynamic_allocator;
std::unique_ptr<runtime::MemoryManager> memory_manager;
std::unique_ptr<Method> method;
};
Expand Down
1 change: 1 addition & 0 deletions extension/module/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def define_common_targets():
"//executorch/extension/data_loader:mmap_data_loader",
"//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix,
"//executorch/extension/named_data_map:merged_data_map" + aten_suffix,
"//executorch/runtime/executor:pal_dynamic_allocator",
],
exported_deps = [
"//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
Expand Down
1 change: 1 addition & 0 deletions runtime/core/portable_type/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def define_common_targets():
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/runtime/core/exec_aten/util:tensor_shape_to_c_string",
"//executorch/runtime/core:tag",
"//executorch/runtime/executor:dynamic_allocator",
],
)

Expand Down
47 changes: 44 additions & 3 deletions runtime/core/portable_type/tensor_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,53 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef<SizesType> new_sizes) {
}

break;
case TensorShapeDynamism::DYNAMIC_BOUND:
// TODO(T175194371): Unbounded dynamic tensor resizing is not yet
// supported: treat them as upper-bounded.
case TensorShapeDynamism::DYNAMIC_UNBOUND: {
const auto new_numel = compute_numel(new_sizes.data(), dim_);

ET_CHECK_OR_RETURN_ERROR(
static_cast<size_t>(new_numel) <= numel_bound_,
NotSupported,
"Attempted to resize a dynamic unbound tensor beyond its ceiling of %zu elements to %zu elements.",
numel_bound_,
new_numel);

const size_t needed_bytes =
static_cast<size_t>(new_numel) * elementSize(type_);
if (needed_bytes > capacity_bytes_) {
ET_CHECK_OR_RETURN_ERROR(
dynamic_allocator_ != nullptr,
NotSupported,
"DYNAMIC_UNBOUND tensor needs reallocation but has no DynamicAllocator");
size_t actual_size = 0;
void* new_data = dynamic_allocator_->reallocate(
data_,
capacity_bytes_,
needed_bytes,
alignof(std::max_align_t),
&actual_size);
ET_CHECK_OR_RETURN_ERROR(
new_data != nullptr,
MemoryAllocationFailed,
"Failed to reallocate DYNAMIC_UNBOUND tensor to %zu bytes",
needed_bytes);
data_ = new_data;
capacity_bytes_ = actual_size;
}

if (strides_ && dim_order_) {
auto error =
dim_order_to_stride(new_sizes.data(), dim_order_, dim_, strides_);
if (error != Error::Ok) {
return error;
}
}
numel_ = new_numel;
std::copy(new_sizes.begin(), new_sizes.end(), sizes_);
} break;

case TensorShapeDynamism::DYNAMIC_BOUND: {
const auto new_numel = compute_numel(new_sizes.data(), dim_);

ET_CHECK_OR_RETURN_ERROR(
static_cast<size_t>(new_numel) <= numel_bound_,
NotSupported,
Expand Down
28 changes: 28 additions & 0 deletions runtime/core/portable_type/tensor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/portable_type/scalar_type.h>
#include <executorch/runtime/core/tensor_shape_dynamism.h>
#include <executorch/runtime/executor/dynamic_allocator.h>

// Forward declaration of a helper that provides access to internal resizing
// methods of TensorImpl. Real definition is in
Expand Down Expand Up @@ -203,6 +204,26 @@ class TensorImpl {
data_ = ptr;
}

  /// Returns the allocator used to lazily (re)allocate this tensor's data
  /// when its shape dynamism is DYNAMIC_UNBOUND, or nullptr if none was set.
  /// The returned pointer is non-owning.
  DynamicAllocator* dynamic_allocator() const {
    return dynamic_allocator_;
  }

  /// Sets the allocator used for lazy allocation of DYNAMIC_UNBOUND data.
  /// The TensorImpl does not take ownership; the allocator must outlive any
  /// resize of this tensor that may trigger a (re)allocation.
  void set_dynamic_allocator(DynamicAllocator* allocator) {
    dynamic_allocator_ = allocator;
  }

  /// Returns the capacity in bytes of the current dynamic allocation backing
  /// this tensor's data. 0 means no dynamically-allocated buffer yet.
  size_t capacity_bytes() const {
    return capacity_bytes_;
  }

  /// Sets the recorded capacity in bytes of the current dynamic allocation.
  /// Callers must keep this value in sync with the buffer that actually
  /// backs the tensor's data pointer.
  void set_capacity_bytes(size_t capacity) {
    capacity_bytes_ = capacity;
  }

/*
* DEPRECATED: Use torch::executor::resize_tensor() or
* torch::executor::resize_tensor_impl().
Expand Down Expand Up @@ -261,6 +282,13 @@ class TensorImpl {

/// Specifies the mutability of the shape of the tensor.
const TensorShapeDynamism shape_dynamism_;

/// Allocator for DYNAMIC_UNBOUND tensors. nullptr for other dynamism types.
DynamicAllocator* dynamic_allocator_ = nullptr;

/// Capacity in bytes of the buffer pointed to by data_, when managed by
/// dynamic_allocator_. 0 means no allocation yet.
size_t capacity_bytes_ = 0;
};

/**
Expand Down
73 changes: 73 additions & 0 deletions runtime/executor/dynamic_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cstddef>

namespace executorch {
namespace runtime {

/**
* Interface for dynamic memory allocation used by DYNAMIC_UNBOUND tensors.
*
* Tensors marked as DYNAMIC_UNBOUND have their memory allocated lazily at
* runtime rather than at load time. This interface allows plugging in custom
* allocation strategies (e.g., page-aligned, tracking, virtual memory).
*/
/**
 * Interface for dynamic memory allocation used by DYNAMIC_UNBOUND tensors.
 *
 * Tensors marked as DYNAMIC_UNBOUND have their memory allocated lazily at
 * runtime rather than at load time. This interface allows plugging in custom
 * allocation strategies (e.g., page-aligned, tracking, virtual memory).
 *
 * NOTE(review): thread-safety is not specified by this interface. If a
 * single allocator instance may be shared across methods or threads,
 * implementations should document (and provide) their own synchronization.
 */
class DynamicAllocator {
 public:
  virtual ~DynamicAllocator() = default;

  /**
   * Allocate memory.
   *
   * @param[in] size Minimum number of bytes to allocate.
   * @param[in] alignment Required alignment of the returned pointer.
   * @param[out] actual_size If non-null, receives the actual allocation size
   *     (may be larger than requested, e.g., due to growth policy). May be
   *     null if the caller does not need it.
   * @returns Pointer to allocated memory, or nullptr on failure.
   */
  virtual void* allocate(
      size_t size,
      size_t alignment,
      size_t* actual_size) = 0;

  /**
   * Reallocate memory, potentially growing the buffer.
   *
   * The allocator may implement a growth policy (e.g., 2x) so that the
   * actual allocation exceeds new_size. Old data up to min(old_size,
   * new_size) is preserved.
   *
   * @param[in] ptr Pointer previously returned by allocate() or
   *     reallocate(), or nullptr (in which case this behaves like
   *     allocate()).
   * @param[in] old_size Size of the existing allocation at ptr. Callers are
   *     responsible for passing the size that was actually allocated
   *     (e.g., the actual_size reported by the previous call).
   * @param[in] new_size Minimum number of bytes needed.
   * @param[in] alignment Required alignment of the returned pointer.
   * @param[out] actual_size If non-null, receives the actual allocation
   *     size. May be null if the caller does not need it.
   * @returns Pointer to reallocated memory, or nullptr on failure. On
   *     failure, the old allocation at ptr remains valid.
   */
  virtual void* reallocate(
      void* ptr,
      size_t old_size,
      size_t new_size,
      size_t alignment,
      size_t* actual_size) = 0;

  /**
   * Free memory previously returned by allocate() or reallocate().
   *
   * @param[in] ptr Pointer to free. May be nullptr (no-op).
   */
  virtual void free(void* ptr) = 0;
};

} // namespace runtime
} // namespace executorch
16 changes: 14 additions & 2 deletions runtime/executor/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/executor/dynamic_allocator.h>

namespace executorch {
namespace runtime {
Expand Down Expand Up @@ -52,10 +53,12 @@ class MemoryManager final {
explicit MemoryManager(
MemoryAllocator* method_allocator,
HierarchicalAllocator* planned_memory = nullptr,
MemoryAllocator* temp_allocator = nullptr)
MemoryAllocator* temp_allocator = nullptr,
DynamicAllocator* dynamic_allocator = nullptr)
: method_allocator_(method_allocator),
planned_memory_(planned_memory),
temp_allocator_(temp_allocator) {
temp_allocator_(temp_allocator),
dynamic_allocator_(dynamic_allocator) {
ET_CHECK_MSG(
method_allocator != temp_allocator,
"method allocator cannot be the same as temp allocator");
Expand Down Expand Up @@ -105,10 +108,19 @@ class MemoryManager final {
return temp_allocator_;
}

  /**
   * Returns the allocator to use for DYNAMIC_UNBOUND tensor data, or
   * nullptr if none was provided at construction (e.g., the program does
   * not use DYNAMIC_UNBOUND tensors). The returned pointer is non-owning.
   */
  DynamicAllocator* dynamic_allocator() const {
    return dynamic_allocator_;
  }

private:
MemoryAllocator* method_allocator_;
HierarchicalAllocator* planned_memory_;
MemoryAllocator* temp_allocator_;
DynamicAllocator* dynamic_allocator_;
};

} // namespace runtime
Expand Down
Loading
Loading