From 7f328f7f56843f96b7e740c9f51704d6514ffb37 Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Wed, 18 Mar 2026 11:21:11 -0700 Subject: [PATCH 1/7] initial checkin --- .../model_static_shape.py | 61 +++++++++++++ py/torch_tensorrt/_compile.py | 50 ++++++++++- .../dynamo/runtime/_TorchTensorRTModule.py | 2 +- py/torch_tensorrt/executorch/__init__.py | 9 ++ py/torch_tensorrt/executorch/backend.py | 85 +++++++++++++++++++ .../executorch/operator_support.py | 26 ++++++ py/torch_tensorrt/executorch/partitioner.py | 63 ++++++++++++++ py/torch_tensorrt/executorch/serialization.py | 32 +++++++ 8 files changed, 323 insertions(+), 5 deletions(-) create mode 100644 examples/torchtrt_executorch_example/model_static_shape.py create mode 100644 py/torch_tensorrt/executorch/__init__.py create mode 100644 py/torch_tensorrt/executorch/backend.py create mode 100644 py/torch_tensorrt/executorch/operator_support.py create mode 100644 py/torch_tensorrt/executorch/partitioner.py create mode 100644 py/torch_tensorrt/executorch/serialization.py diff --git a/examples/torchtrt_executorch_example/model_static_shape.py b/examples/torchtrt_executorch_example/model_static_shape.py new file mode 100644 index 0000000000..f94a7d3c0a --- /dev/null +++ b/examples/torchtrt_executorch_example/model_static_shape.py @@ -0,0 +1,61 @@ +""" +.. _executorch_export: + +Saving a Torch-TensorRT Model in ExecuTorch Format (.pte) +========================================================= + +This example demonstrates how to compile a model with Torch-TensorRT and save it +as an ExecuTorch ``.pte`` file, which can be loaded by the ExecuTorch runtime +(e.g., on embedded or mobile devices with a TensorRT-capable backend). + +Prerequisites +------------- +Install ExecuTorch before running this example:: + + pip install executorch + +See https://pytorch.org/executorch/stable/getting-started-setup.html for details. 
+""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +import torch +import torch_tensorrt + + +class MyModel(torch.nn.Module): + def forward(self, x): + return x + 1 + + +# %% +# Compile with Torch-TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Export the model, compile it with TensorRT, then save as .pte + +with torch.no_grad(): + model = MyModel().eval().cuda() + example_input = (torch.randn((2, 3, 4, 4)).cuda(),) + + exported_program = torch.export.export(model, example_input) + compile_settings = { + "arg_inputs": [ + torch_tensorrt.Input(shape=(2, 3, 4, 4), dtype=torch.float32), + ], + "min_block_size": 1, + } + trt_gm = torch_tensorrt.dynamo.compile(exported_program, **compile_settings) + + # %% + # Save as ExecuTorch .pte format (loadable by the ExecuTorch runtime) + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # The TensorRT engine is serialized inside the .pte using the same blob format + # as the Torch-TensorRT runtime (vector of strings), so one engine format for + # both ExecuTorch and non-ExecuTorch deployment. + torch_tensorrt.save( + trt_gm, "model.pte", output_format="executorch", arg_inputs=example_input + ) + + print("Saved model.pte successfully.") diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c4dbb1c148..3e25cce239 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -653,7 +653,7 @@ def save( inputs (Union[torch.Tensor, torch_tensorrt.Input]): Torch input tensors or Input specifications arg_inputs (Tuple[Union[torch.Tensor, torch_tensorrt.Input], ...]): Same as inputs. Alias for better understanding with kwarg_inputs. kwarg_inputs (dict[str, Union[torch.Tensor, torch_tensorrt.Input]]): Optional, kwarg inputs to the module forward function. - output_format (str): Format to save the model. Options include exported_program | torchscript | aot_inductor. + output_format (str): Format to save the model. 
Options include exported_program | torchscript | aot_inductor | executorch. retrace (bool): When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it. For TRT-compiled modules with dynamic shapes, both retrace=True and retrace=False are supported: @@ -726,7 +726,7 @@ def save( if isinstance(module, CudaGraphsTorchTensorRTModule): module = module.compiled_module module_type = _parse_module_type(module) - accepted_formats = {"exported_program", "torchscript", "aot_inductor"} + accepted_formats = {"exported_program", "torchscript", "aot_inductor", "executorch"} if arg_inputs is not None and not all( isinstance(input, (torch.Tensor, Input)) for input in arg_inputs ): @@ -847,12 +847,16 @@ def _extract_tensor(obj: Any) -> Any: if output_format not in accepted_formats: raise ValueError( - f"Provided output_format {output_format} is not supported. Supported options are exported_program | torchscript" + f"Provided output_format {output_format} is not supported. Supported options are exported_program | torchscript | aot_inductor | executorch" ) if output_format == "aot_inductor" and platform.system() != "Linux": raise ValueError( f"The AOT Inductor format is only supported on Linux, {platform.system()} is not a supported platform for this format" ) + if output_format == "executorch" and platform.system() != "Linux": + raise ValueError( + f"The executorch format is only supported on Linux, {platform.system()} is not a supported platform for this format" + ) if not file_path: raise ValueError("File path cannot be empty. Please provide a valid file path") @@ -906,6 +910,8 @@ def _extract_tensor(obj: Any) -> Any: inductor_configs=inductor_configs, package_path=file_path, ) + elif output_format == "executorch": + _save_as_executorch(module, file_path) else: raise RuntimeError( "Attempted to serialize an exported program with an unsupported format. 
Exported programs support exported_program and aot_inductor" @@ -963,6 +969,8 @@ def _extract_tensor(obj: Any) -> Any: inductor_configs=inductor_configs, package_path=file_path, ) + elif output_format == "executorch": + _save_as_executorch(exp_program, file_path) else: raise RuntimeError( "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor" @@ -1014,7 +1022,7 @@ def _extract_tensor(obj: Any) -> Any: "Provided model is a torch.fx.GraphModule without existing shape metadata and retrace is True, however no inputs specs were provided. " "Please provide valid torch.Tensors or torch_tensorrt.Input objects as inputs to retrace and save the model" ) - + breakpoint() exp_program = torch.export.export( module, args=tuple(arg_tensors), @@ -1042,12 +1050,46 @@ def _extract_tensor(obj: Any) -> Any: inductor_configs=inductor_configs, package_path=file_path, ) + elif output_format == "executorch": + _save_as_executorch(exp_program, file_path) else: raise RuntimeError( "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor" ) +def _save_as_executorch(exp_program: Any, file_path: str) -> None: + """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. + + Partitions the graph by torch.ops.tensorrt.execute_engine, serializes each engine + to the same blob format as the TRT runtime (vector of strings), and embeds it + in the .pte. Requires the ``executorch`` package and torch_tensorrt_runtime. See + https://pytorch.org/executorch/stable/getting-started-setup.html + """ + if not ENABLED_FEATURES.torch_tensorrt_runtime: + raise RuntimeError( + "output_format='executorch' requires the Torch-TensorRT runtime " + "(torch_tensorrt_runtime). Reinstall torch_tensorrt with the runtime extension." 
+ ) + try: + from executorch.exir import to_edge_transform_and_lower + except ImportError: + raise ImportError( + "ExecuTorch is not installed. Please install it to use output_format='executorch'. " + "See https://pytorch.org/executorch/stable/getting-started-setup.html" + ) + from torch_tensorrt.executorch import TensorRTPartitioner + + breakpoint() + edge_program = to_edge_transform_and_lower( + exp_program, + partitioner=[TensorRTPartitioner()], + ) + executorch_program = edge_program.to_executorch() + with open(file_path, "wb") as f: + executorch_program.write_to_file(f) + + def function_overload_with_kwargs( fn: Callable[..., Any], *args: Any, **kwargs: Any ) -> Any: diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index d77c0bf39f..91994ca49b 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -338,6 +338,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ + breakpoint() if self.engine is None: raise RuntimeError("Engine has not been setup yet.") @@ -354,7 +355,6 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: (i if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) for i in inputs ] - outputs: List[torch.Tensor] = torch.ops.tensorrt.execute_engine( list(input_tensors), self.engine ) diff --git a/py/torch_tensorrt/executorch/__init__.py b/py/torch_tensorrt/executorch/__init__.py new file mode 100644 index 0000000000..81aa088610 --- /dev/null +++ b/py/torch_tensorrt/executorch/__init__.py @@ -0,0 +1,9 @@ +# ExecuTorch backend for Torch-TensorRT: save/load .pte with TensorRT delegate. 
+ +from torch_tensorrt.executorch.backend import TensorRTBackend +from torch_tensorrt.executorch.partitioner import TensorRTPartitioner + +__all__ = [ + "TensorRTBackend", + "TensorRTPartitioner", +] diff --git a/py/torch_tensorrt/executorch/backend.py b/py/torch_tensorrt/executorch/backend.py new file mode 100644 index 0000000000..1c9ba4c615 --- /dev/null +++ b/py/torch_tensorrt/executorch/backend.py @@ -0,0 +1,85 @@ +# ExecuTorch TensorRT backend: serialize engine to same blob format as TRT runtime. + +import base64 +from typing import Any, List, final + +import torch +from executorch.exir.backend.backend_details import ( + BackendDetails, + CompileSpec, + PreprocessResult, +) +from torch.export.exported_program import ExportedProgram +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ( + ENGINE_IDX, + SERIALIZATION_LEN, +) +from torch_tensorrt.executorch.serialization import serialize_engine_info + + +def _get_engine_info_from_edge_program(edge_program: ExportedProgram) -> List[Any]: + """Extract engine info (list of strings/bytes) from the partition's execute_engine node.""" + gm = edge_program.graph_module + execute_engine_op = torch.ops.tensorrt.execute_engine.default + + for node in gm.graph.nodes: + if node.op != "call_function" or node.target is not execute_engine_op: + continue + if len(node.args) < 2: + continue + engine_arg = node.args[1] + if engine_arg.op == "get_attr": + val = getattr(gm, engine_arg.target, None) + if val is None: + raise RuntimeError( + f"Engine get_attr({engine_arg.target}) not found on partition module." + ) + if hasattr(val, "__getstate__"): + engine_info = val.__getstate__() + else: + engine_info = getattr(val, "engine_info", val) + if ( + isinstance(engine_info, (list, tuple)) + and len(engine_info) >= SERIALIZATION_LEN + ): + return list(engine_info) + raise RuntimeError( + f"Engine argument get_attr({engine_arg.target}) did not yield engine info list (len >= {SERIALIZATION_LEN})." 
+ ) + raise RuntimeError( + "TensorRT ExecuTorch backend expects execute_engine(inputs, engine) " + "where engine is a get_attr; cannot find engine." + ) + raise RuntimeError( + "TensorRT ExecuTorch backend: no execute_engine node found in partition." + ) + + +@final +class TensorRTBackend(BackendDetails): # type: ignore[misc] + """Backend that serializes TensorRT engine to the same blob format as the TRT runtime. + + The partition contains a single execute_engine node; we extract the engine + and metadata and encode them as a vector of strings (same layout as + core/runtime/runtime.h SerializedInfoIndex) so the same blob works for + both ExecuTorch and non-ExecuTorch TRT runtime. + """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + engine_info = _get_engine_info_from_edge_program(edge_program) + engine_info = list(engine_info) + serialized_engine = engine_info[ENGINE_IDX] + if isinstance(serialized_engine, str): + engine_info[ENGINE_IDX] = base64.b64decode( + serialized_engine.encode("utf-8") + ) + elif not isinstance(serialized_engine, (bytes, bytearray)): + engine_info[ENGINE_IDX] = bytes(serialized_engine) + if len(engine_info) > 7 and isinstance(engine_info[7], bytes): + engine_info[7] = engine_info[7].decode("utf-8", errors="replace") + blob = serialize_engine_info(engine_info) + return PreprocessResult(processed_bytes=blob) diff --git a/py/torch_tensorrt/executorch/operator_support.py b/py/torch_tensorrt/executorch/operator_support.py new file mode 100644 index 0000000000..32763665c2 --- /dev/null +++ b/py/torch_tensorrt/executorch/operator_support.py @@ -0,0 +1,26 @@ +# Operator support for ExecuTorch TensorRT partitioner: only execute_engine is supported. 
+ +from typing import Dict + +import torch +from torch.fx.passes.operator_support import OperatorSupportBase + + +class TensorRTOperatorSupport(OperatorSupportBase): # type: ignore[misc] + """Supports only torch.ops.tensorrt.execute_engine for partitioning. + + Used so that TRT-compiled graphs (which already contain execute_engine nodes) + are partitioned per engine; each partition is then lowered to TensorRTBackend + which serializes the engine to the same blob format as the TRT runtime. + """ + + def __init__(self) -> None: + super().__init__() + self._execute_engine_op = torch.ops.tensorrt.execute_engine.default + + def is_node_supported( + self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node + ) -> bool: + if node.op != "call_function": + return False + return node.target is self._execute_engine_op diff --git a/py/torch_tensorrt/executorch/partitioner.py b/py/torch_tensorrt/executorch/partitioner.py new file mode 100644 index 0000000000..9fcab9f709 --- /dev/null +++ b/py/torch_tensorrt/executorch/partitioner.py @@ -0,0 +1,63 @@ +# ExecuTorch partitioner: partition by execute_engine nodes. + +from typing import Callable, Dict, List, Optional, Tuple + +import torch +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.backend.utils import tag_constant_data +from torch.export import ExportedProgram +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch_tensorrt.executorch.backend import TensorRTBackend +from torch_tensorrt.executorch.operator_support import TensorRTOperatorSupport + + +class TensorRTPartitioner(Partitioner): # type: ignore[misc] + """Partitions the graph for TensorRT delegation. 
+ + Only nodes that are torch.ops.tensorrt.execute_engine are supported; + each such node becomes its own partition so the backend can serialize + the engine to the same format as the TRT runtime. + """ + + def __init__( + self, + compile_specs: Optional[List[CompileSpec]] = None, + ) -> None: + super().__init__() + self.compile_specs = compile_specs or [] + self.delegation_spec = DelegationSpec( + backend_id=TensorRTBackend.__name__, + compile_specs=self.compile_specs, + ) + + def partition(self, exported_program: ExportedProgram) -> PartitionResult: + capability_partitioner = CapabilityBasedPartitioner( + exported_program.graph_module, + TensorRTOperatorSupport(), + allows_single_node_partition=True, + ) + partition_list = capability_partitioner.propose_partitions() + + partition_tags: Dict[str, DelegationSpec] = {} + for partition in partition_list: + tag = f"tensorrt_{partition.id}" + for node in partition.nodes: + node.meta["delegation_tag"] = tag + partition_tags[tag] = self.delegation_spec + + tag_constant_data(exported_program) + + return PartitionResult( + tagged_exported_program=exported_program, + partition_tags=partition_tags, + ) + + def ops_to_not_decompose( + self, ep: ExportedProgram + ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + return ([], None) diff --git a/py/torch_tensorrt/executorch/serialization.py b/py/torch_tensorrt/executorch/serialization.py new file mode 100644 index 0000000000..742269973d --- /dev/null +++ b/py/torch_tensorrt/executorch/serialization.py @@ -0,0 +1,32 @@ +# Serialization for ExecuTorch TensorRT blob: same format as TRT runtime (vector of strings). +# Uses the same list format as TorchTensorRTModule._pack_engine_info, then encodes to bytes. +# Only valid when ENABLED_FEATURES.torch_tensorrt_runtime is True. 
+ +import struct +from typing import List, Union + +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import SERIALIZATION_LEN + + +def serialize_engine_info(engine_info: List[Union[str, bytes]]) -> bytes: + """Encode engine info list (same format as TorchTensorRTModule._pack_engine_info) to bytes. + + Takes the list produced by _pack_engine_info (or equivalent) and writes it in the + TRT runtime vector format: 4-byte count (SERIALIZATION_LEN), then for each + entry 4-byte length (LE) + raw bytes. C++ can deserialize to std::vector + and pass to TRTEngine(std::vector serialized_info). + """ + if len(engine_info) < SERIALIZATION_LEN: + engine_info = list(engine_info) + [""] * (SERIALIZATION_LEN - len(engine_info)) + parts: List[bytes] = [] + for i in range(SERIALIZATION_LEN): + raw = engine_info[i] + if isinstance(raw, str): + raw = raw.encode("utf-8") + elif raw is None: + raw = b"" + else: + raw = bytes(raw) + parts.append(struct.pack(" Date: Wed, 18 Mar 2026 11:54:13 -0700 Subject: [PATCH 2/7] test1 --- .../model_static_shape.py | 8 ++++- py/torch_tensorrt/_compile.py | 5 +-- .../dynamo/runtime/_TorchTensorRTModule.py | 2 +- .../runtime/meta_ops/register_meta_ops.py | 34 ++++++++++++++++--- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/examples/torchtrt_executorch_example/model_static_shape.py b/examples/torchtrt_executorch_example/model_static_shape.py index f94a7d3c0a..f6f9e9006d 100644 --- a/examples/torchtrt_executorch_example/model_static_shape.py +++ b/examples/torchtrt_executorch_example/model_static_shape.py @@ -54,8 +54,14 @@ def forward(self, x): # The TensorRT engine is serialized inside the .pte using the same blob format # as the Torch-TensorRT runtime (vector of strings), so one engine format for # both ExecuTorch and non-ExecuTorch deployment. + # Use retrace=False so the legacy exporter is used; the engine is then available + # when ExecuTorch's partitioner runs the graph. 
torch_tensorrt.save( - trt_gm, "model.pte", output_format="executorch", arg_inputs=example_input + trt_gm, + "model.pte", + output_format="executorch", + arg_inputs=example_input, + retrace=False, ) print("Saved model.pte successfully.") diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 3e25cce239..6491d91c1f 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1022,7 +1022,6 @@ def _extract_tensor(obj: Any) -> Any: "Provided model is a torch.fx.GraphModule without existing shape metadata and retrace is True, however no inputs specs were provided. " "Please provide valid torch.Tensors or torch_tensorrt.Input objects as inputs to retrace and save the model" ) - breakpoint() exp_program = torch.export.export( module, args=tuple(arg_tensors), @@ -1078,9 +1077,11 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: "ExecuTorch is not installed. Please install it to use output_format='executorch'. " "See https://pytorch.org/executorch/stable/getting-started-setup.html" ) + # Ensure execute_engine fake kernel is registered so partitioner can run + # when the engine is a CustomObjArgument (export placeholder). 
+ import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner - breakpoint() edge_program = to_edge_transform_and_lower( exp_program, partitioner=[TensorRTPartitioner()], diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 91994ca49b..2de6b10810 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -338,7 +338,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ - breakpoint() + if self.engine is None: raise RuntimeError("Engine has not been setup yet.") diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index e03c88153c..83ac0644b0 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -189,6 +189,22 @@ def fake_aten_cudnn_grid_sampler( return torch.empty(out_shape, dtype=input.dtype, device=input.device) +def _is_placeholder_engine(engine: Any) -> bool: + """True if engine is a placeholder (CustomObjArgument/FakeScriptObject) from export.""" + if engine is None: + return True + type_name = type(engine).__name__ + if type_name == "CustomObjArgument": + return True + if type_name == "FakeScriptObject": + return True + if hasattr(engine, "fake_val") and engine.fake_val is not None: + return True + if not hasattr(engine, "get_serialized_metadata"): + return True + return False + + @torch.library.register_fake("tensorrt::execute_engine") # type: ignore def fake_tensorrt_execute_engine( inputs: List[torch.Tensor], fake_trt_engine: Any @@ -196,13 +212,23 @@ def fake_tensorrt_execute_engine( """ Meta kernel for TensorRT engine execution. 
- Uses symbolic shape expressions captured at compile time to correctly infer - output shapes while preserving symbolic SymInt relationships. + When the engine is a placeholder (CustomObjArgument/FakeScriptObject from + torch.export/ExecuTorch), returns one fake output per input (same shape/dtype) + so partitioners can run without a real engine. Otherwise uses symbolic shape + expressions from metadata to infer output shapes. """ + if _is_placeholder_engine(fake_trt_engine): + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode(inputs) if inputs else None + if not inputs: + return [torch.empty(())] + if fake_mode is not None: + return [fake_mode.from_tensor(inputs[0])] + return [torch.empty_like(inputs[0])] metadata = None if hasattr(fake_trt_engine, "real_obj"): - # Wrapped C++ engine with real_obj trt_engine = fake_trt_engine.real_obj metadata = TorchTensorRTModule.decode_metadata( trt_engine.get_serialized_metadata() @@ -215,8 +241,6 @@ def fake_tensorrt_execute_engine( shape_info = metadata.get("inout_symexprs") if metadata else None if shape_info: - # Apply the symbolic shape expressions to create output fake tensors - # shape_info now contains both 'inputs' and 'outputs' keys return _apply_symbolic_shape_expressions(inputs, shape_info) else: raise RuntimeError( From c8b92ef381d9b2d862977f70f92838fbf5b25751 Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 15:16:56 -0700 Subject: [PATCH 3/7] test --- .gitignore | 1 + core/runtime/executorch/TensorRTBackend.cpp | 313 ++++++++++++++++++ core/runtime/executorch/TensorRTBackend.h | 39 +++ ...static_shape.py => export_static_shape.py} | 0 py/torch_tensorrt/_compile.py | 97 +++++- py/torch_tensorrt/dynamo/_compiler.py | 3 + py/torch_tensorrt/dynamo/_leaf_spec_compat.py | 60 ++++ .../runtime/meta_ops/register_meta_ops.py | 41 +++ py/torch_tensorrt/executorch/backend.py | 49 ++- .../executorch/operator_support.py | 14 +- setup.py | 2 +- 11 files changed, 580 insertions(+), 39 
deletions(-) create mode 100644 core/runtime/executorch/TensorRTBackend.cpp create mode 100644 core/runtime/executorch/TensorRTBackend.h rename examples/torchtrt_executorch_example/{model_static_shape.py => export_static_shape.py} (100%) create mode 100644 py/torch_tensorrt/dynamo/_leaf_spec_compat.py diff --git a/.gitignore b/.gitignore index f08d97d448..6a4c6eda74 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,4 @@ coverage.xml *.log *.pt2 examples/torchtrt_aoti_example/torchtrt_aoti_example +CLAUDE.md \ No newline at end of file diff --git a/core/runtime/executorch/TensorRTBackend.cpp b/core/runtime/executorch/TensorRTBackend.cpp new file mode 100644 index 0000000000..93f97dc8ce --- /dev/null +++ b/core/runtime/executorch/TensorRTBackend.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "core/runtime/executorch/TensorRTBackend.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "core/runtime/TRTEngine.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace executorch_backend { + +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BackendExecutionContext; +using ::executorch::runtime::BackendInitContext; +using ::executorch::runtime::CompileSpec; +using ::executorch::runtime::DelegateHandle; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::FreeableBuffer; +using ::executorch::runtime::MemoryAllocator; +using ::executorch::runtime::Result; +using ::executorch::runtime::Span; + +namespace { + +// --------------------------------------------------------------------------- +// Blob deserialization +// +// Wire format written by +// py/torch_tensorrt/executorch/serialization.py::serialize_engine_info() +// +// 
[uint32_t count (LE)] +// for each of `count` entries: +// [uint32_t len (LE)] [uint8_t data[len]] +// +// The resulting vector is passed directly to +// core::runtime::TRTEngine(std::vector serialized_info) +// which expects the 11-element list defined by SerializedInfoIndex in +// core/runtime/runtime.h +// --------------------------------------------------------------------------- +std::vector deserialize_engine_info(const void* data, size_t size) { + const uint8_t* ptr = static_cast(data); + const uint8_t* const end = ptr + size; + + if (ptr + sizeof(uint32_t) > end) { + return {}; + } + + uint32_t count = 0; + std::memcpy(&count, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + std::vector result; + result.reserve(count); + + for (uint32_t i = 0; i < count; ++i) { + if (ptr + sizeof(uint32_t) > end) { + return {}; + } + uint32_t len = 0; + std::memcpy(&len, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + if (ptr + len > end) { + return {}; + } + result.emplace_back(reinterpret_cast(ptr), len); + ptr += len; + } + + return result; +} + +// --------------------------------------------------------------------------- +// Build a nvinfer1::Dims from an ExecuTorch tensor's shape +// --------------------------------------------------------------------------- +nvinfer1::Dims to_trt_dims(const exec_aten::Tensor& t) { + nvinfer1::Dims dims{}; + dims.nbDims = t.dim(); + for (int d = 0; d < t.dim(); ++d) { + dims.d[d] = static_cast(t.size(d)); + } + return dims; +} + +} // namespace + +// --------------------------------------------------------------------------- +// is_available +// --------------------------------------------------------------------------- +bool TensorRTBackend::is_available() const { + return true; +} + +// --------------------------------------------------------------------------- +// init +// +// Deserializes the processed blob into a TRTEngine and returns it as the +// opaque DelegateHandle. 
The engine is placement-new'd into memory +// provided by the ExecuTorch MemoryAllocator so that ExecuTorch owns the +// lifetime; destroy() calls the destructor explicitly. +// --------------------------------------------------------------------------- +Result TensorRTBackend::init(BackendInitContext& context, FreeableBuffer* processed) const { + if (processed == nullptr || processed->data() == nullptr) { + ET_LOG(Error, "TensorRTBackend::init: null processed buffer"); + return Error::InvalidArgument; + } + + auto serialized_info = deserialize_engine_info(processed->data(), processed->size()); + + if (serialized_info.empty()) { + ET_LOG(Error, "TensorRTBackend::init: failed to deserialize engine blob"); + return Error::InvalidArgument; + } + + // Validate the vector length before handing to TRTEngine + // (verify_serialization_fmt throws on mismatch) + core::runtime::TRTEngine::verify_serialization_fmt(serialized_info); + + MemoryAllocator* allocator = context.get_runtime_allocator(); + if (allocator == nullptr) { + ET_LOG(Error, "TensorRTBackend::init: null runtime allocator"); + return Error::InvalidState; + } + + // Allocate raw storage for TRTEngine from ExecuTorch's arena + core::runtime::TRTEngine* engine = allocator->allocateInstance(); + if (engine == nullptr) { + ET_LOG(Error, "TensorRTBackend::init: allocateInstance failed"); + return Error::MemoryAllocationFailed; + } + + // Construct in-place; TRTEngine(std::vector) deserializes the + // engine bytes, builds the IRuntime/ICudaEngine/IExecutionContext, and + // populates in_binding_names / out_binding_names / num_io. 
+ new (engine) core::runtime::TRTEngine(std::move(serialized_info)); + + // Release the blob; we no longer need it + processed->Free(); + + ET_LOG( + Info, + "TensorRTBackend::init: engine '%s' ready (%zu inputs, %zu outputs)", + engine->name.c_str(), + engine->num_io.first, + engine->num_io.second); + + return static_cast(engine); +} + +// --------------------------------------------------------------------------- +// execute +// +// Binds the ExecuTorch input/output tensor data pointers directly to the +// TRT IExecutionContext and calls enqueueV3(). ExecuTorch pre-allocates +// all output tensors before calling execute(), so we only need to register +// their addresses; no separate output allocation is required. +// +// Args layout (mirroring the Python exporter): +// args[0 .. num_inputs-1] – input EValues +// args[num_inputs .. num_inputs+num_outputs-1] – output EValues +// --------------------------------------------------------------------------- +Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* handle, Span args) const { + (void)context; + + if (handle == nullptr) { + ET_LOG(Error, "TensorRTBackend::execute: null delegate handle"); + return Error::InvalidArgument; + } + + auto* engine = static_cast(handle); + + const size_t num_inputs = engine->num_io.first; + const size_t num_outputs = engine->num_io.second; + + if (args.size() < num_inputs + num_outputs) { + ET_LOG( + Error, "TensorRTBackend::execute: expected at least %zu args, got %zu", num_inputs + num_outputs, args.size()); + return Error::InvalidArgument; + } + + // IExecutionContext::enqueueV3 is not thread-safe; use the engine mutex + std::unique_lock lock(engine->mu); + + nvinfer1::IExecutionContext* ctx = engine->exec_ctx.get(); + + // ------------------------------------------------------------------ + // 1. 
Bind input shapes and addresses + // ------------------------------------------------------------------ + for (size_t i = 0; i < num_inputs; ++i) { + EValue* arg = args[i]; + if (arg == nullptr || !arg->isTensor()) { + ET_LOG(Error, "TensorRTBackend::execute: input %zu is not a tensor", i); + return Error::InvalidArgument; + } + + exec_aten::Tensor et_in = arg->toTensor(); + const std::string& name = engine->in_binding_names[i]; + nvinfer1::Dims dims = to_trt_dims(et_in); + + if (!ctx->setInputShape(name.c_str(), dims)) { + ET_LOG(Error, "TensorRTBackend::execute: setInputShape failed for '%s'", name.c_str()); + return Error::InvalidState; + } + + void* ptr = et_in.mutable_data_ptr(); + // TRT requires a non-null address even for 0-element tensors + static char placeholder[16] = {}; + if (ptr == nullptr || et_in.numel() == 0) { + ptr = placeholder; + } + + if (!ctx->setTensorAddress(name.c_str(), ptr)) { + ET_LOG(Error, "TensorRTBackend::execute: setTensorAddress failed for input '%s'", name.c_str()); + return Error::InvalidState; + } + } + + // ------------------------------------------------------------------ + // 2. Infer output shapes (requires all input shapes to be set first) + // ------------------------------------------------------------------ + { + const int32_t io_size = engine->cuda_engine->getNbIOTensors(); + std::vector unresolved(static_cast(io_size), nullptr); + const int32_t n_unresolved = ctx->inferShapes(io_size, unresolved.data()); + if (n_unresolved != 0) { + ET_LOG(Error, "TensorRTBackend::execute: inferShapes could not resolve %d tensor(s)", n_unresolved); + return Error::InvalidState; + } + } + + // ------------------------------------------------------------------ + // 3. 
Bind output addresses (ExecuTorch pre-allocates the buffers) + // ------------------------------------------------------------------ + for (size_t o = 0; o < num_outputs; ++o) { + EValue* arg = args[num_inputs + o]; + if (arg == nullptr || !arg->isTensor()) { + ET_LOG(Error, "TensorRTBackend::execute: output %zu is not a tensor", o); + return Error::InvalidArgument; + } + + exec_aten::Tensor et_out = arg->toTensor(); + const std::string& name = engine->out_binding_names[o]; + void* ptr = et_out.mutable_data_ptr(); + + if (!ctx->setTensorAddress(name.c_str(), ptr)) { + ET_LOG(Error, "TensorRTBackend::execute: setTensorAddress failed for output '%s'", name.c_str()); + return Error::InvalidState; + } + } + + // ------------------------------------------------------------------ + // 4. Enqueue inference on the current CUDA stream + // ------------------------------------------------------------------ + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(engine->device_info.id)); + + if (!ctx->enqueueV3(stream)) { + ET_LOG(Error, "TensorRTBackend::execute: enqueueV3 failed"); + return Error::InvalidState; + } + + // Synchronize so that outputs are visible to downstream ExecuTorch ops + cudaStreamSynchronize(stream); + + return Error::Ok; +} + +// --------------------------------------------------------------------------- +// destroy +// +// Explicitly destructs the TRTEngine. The underlying memory was allocated +// by ExecuTorch's MemoryAllocator and will be reclaimed by the arena. 
+// --------------------------------------------------------------------------- +void TensorRTBackend::destroy(DelegateHandle* handle) const { + if (handle != nullptr) { + static_cast(handle)->~TRTEngine(); + } +} + +} // namespace executorch_backend +} // namespace torch_tensorrt + +// --------------------------------------------------------------------------- +// Static registration – links the name "TensorRTBackend" used in the .pte +// file to this implementation at program startup. +// --------------------------------------------------------------------------- +namespace { + +torch_tensorrt::executorch_backend::TensorRTBackend& get_backend() { + static torch_tensorrt::executorch_backend::TensorRTBackend backend; + return backend; +} + +const ::executorch::runtime::Backend kBackendId{"TensorRTBackend", &get_backend()}; +const auto kRegistered = ::executorch::runtime::register_backend(kBackendId); + +} // namespace diff --git a/core/runtime/executorch/TensorRTBackend.h b/core/runtime/executorch/TensorRTBackend.h new file mode 100644 index 0000000000..3855942251 --- /dev/null +++ b/core/runtime/executorch/TensorRTBackend.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * + * ExecuTorch backend delegate that runs TensorRT engines serialized by + * torch_tensorrt. The processed blob must be in the vector-of-strings wire + * format produced by + * py/torch_tensorrt/executorch/serialization.py::serialize_engine_info() + * which maps 1-to-1 to the std::vector accepted by + * core/runtime/TRTEngine::TRTEngine(std::vector). 
+ */ +#pragma once + +#include + +namespace torch_tensorrt { +namespace executorch_backend { + +class TensorRTBackend final : public ::executorch::runtime::BackendInterface { + public: + bool is_available() const override; + + ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init( + ::executorch::runtime::BackendInitContext& context, + ::executorch::runtime::FreeableBuffer* processed, + ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec> compile_specs) const override; + + ::executorch::runtime::Error execute( + ::executorch::runtime::BackendExecutionContext& context, + ::executorch::runtime::DelegateHandle* handle, + ::executorch::runtime::Span<::executorch::runtime::EValue*> args) const override; + + void destroy(::executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace executorch_backend +} // namespace torch_tensorrt diff --git a/examples/torchtrt_executorch_example/model_static_shape.py b/examples/torchtrt_executorch_example/export_static_shape.py similarity index 100% rename from examples/torchtrt_executorch_example/model_static_shape.py rename to examples/torchtrt_executorch_example/export_static_shape.py diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 6491d91c1f..0b4470d3ef 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1057,13 +1057,95 @@ def _extract_tensor(obj: Any) -> Any: ) +def _replace_execute_engine_with_no_op(exp_program: Any) -> Any: + """Replace execute_engine nodes with no_op_placeholder_for_execute_engine. + + ExecuTorch's edge-lowering passes symbolically execute every node before + partitioning runs. The execute_engine schema requires a + ``__torch__.torch.classes.tensorrt.Engine`` argument, but after export the + engine is represented as a ``CustomObjArgument`` — causing a schema type + error inside the pass interpreter. 
+ + The no_op_placeholder op uses flat string arguments instead of a custom + class, so it passes through the edge passes without issue. The + TensorRTPartitioner and TensorRTBackend are updated to work with the + no-op form. + """ + import base64 + + import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 + + gm = exp_program.graph_module + execute_engine_op = torch.ops.tensorrt.execute_engine.default + no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default + + nodes_to_replace = [ + n + for n in gm.graph.nodes + if n.op == "call_function" and n.target is execute_engine_op + ] + for node in nodes_to_replace: + inputs_arg = node.args[0] + engine_node = node.args[1] + + if engine_node.op == "get_attr": + engine_obj = getattr(gm, engine_node.target, None) + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': get_attr target " + f"'{engine_node.target}' not found on graph module" + ) + elif engine_node.op == "placeholder": + # After torch.export, get_attr nodes for custom objects are lifted + # into placeholder inputs; the actual object lives in exp_program.constants. 
+ constants = getattr(exp_program, "constants", {}) + engine_obj = constants.get(engine_node.name) or constants.get( + engine_node.target + ) + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': placeholder engine node " + f"'{engine_node.name}' not found in exp_program.constants" + ) + else: + raise RuntimeError( + f"execute_engine node '{node.name}': expected engine arg to be " + f"a get_attr or placeholder node, got op='{engine_node.op}'" + ) + # Get engine info list via __getstate__ (same format as _pack_engine_info()) + from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX + + engine_info = list(engine_obj.__getstate__()) + engine_info = engine_info[0] + # Base64-encode the engine bytes, matching the reference cross-compile path + engine_bytes = engine_info[ENGINE_IDX] + if isinstance(engine_bytes, (bytes, bytearray)): + engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") + + with gm.graph.inserting_before(node): + no_op_node = gm.graph.call_function(no_op, (inputs_arg, *engine_info)) + no_op_node.meta = dict(node.meta) + node.replace_all_uses_with(no_op_node) + gm.graph.erase_node(node) + # Only erase get_attr engine nodes; placeholder nodes belong to the + # exported program's input signature and must not be removed here. + if engine_node.op == "get_attr": + gm.graph.erase_node(engine_node) + + gm.graph.eliminate_dead_code() + gm.graph.lint() + gm.recompile() + return exp_program + + def _save_as_executorch(exp_program: Any, file_path: str) -> None: """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. - Partitions the graph by torch.ops.tensorrt.execute_engine, serializes each engine - to the same blob format as the TRT runtime (vector of strings), and embeds it - in the .pte. Requires the ``executorch`` package and torch_tensorrt_runtime. 
See - https://pytorch.org/executorch/stable/getting-started-setup.html + Partitions the graph by torch.ops.tensorrt.no_op_placeholder_for_execute_engine + (execute_engine is pre-converted to avoid schema type errors in edge passes), + serializes each engine to the same blob format as the TRT runtime (vector of + strings), and embeds it in the .pte. Requires the ``executorch`` package and + torch_tensorrt_runtime. See https://pytorch.org/executorch/stable/getting-started-setup.html """ if not ENABLED_FEATURES.torch_tensorrt_runtime: raise RuntimeError( @@ -1077,11 +1159,14 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: "ExecuTorch is not installed. Please install it to use output_format='executorch'. " "See https://pytorch.org/executorch/stable/getting-started-setup.html" ) - # Ensure execute_engine fake kernel is registered so partitioner can run - # when the engine is a CustomObjArgument (export placeholder). import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner + # Replace execute_engine with no_op_placeholder before edge lowering so that + # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class + # schema check. 
+ exp_program = _replace_execute_engine_with_no_op(exp_program) + breakpoint() edge_program = to_edge_transform_and_lower( exp_program, partitioner=[TensorRTPartitioner()], diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index bc3cdc5721..117a87abc8 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -9,6 +9,9 @@ import torch from torch.export import ExportedProgram +from torch_tensorrt.dynamo._leaf_spec_compat import _apply_leaf_spec_patch + +_apply_leaf_spec_patch() from torch.fx.node import Target from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype diff --git a/py/torch_tensorrt/dynamo/_leaf_spec_compat.py b/py/torch_tensorrt/dynamo/_leaf_spec_compat.py new file mode 100644 index 0000000000..7bbd3a97ad --- /dev/null +++ b/py/torch_tensorrt/dynamo/_leaf_spec_compat.py @@ -0,0 +1,60 @@ +""" +Compatibility shim for a PyTorch 2.11 bug where ``LeafSpec`` (frozen dataclass +with ``slots=True``) inherits the ``type`` slot from ``TreeSpec`` but never +initialises it, leaving the slot empty. This causes + + AttributeError: 'LeafSpec' object has no attribute 'type' + +inside ``ExportedProgram.run_decompositions()`` when a model returns a single +tensor (i.e. the output pytree spec is a leaf rather than a list/tuple). + +The fix is applied once at import time and is a no-op on versions that already +set the attribute correctly. + +Upstream fix: https://github.com/pytorch/pytorch/issues/ +""" + +from __future__ import annotations + +import logging + +logger = logging.getLogger(__name__) + + +def _apply_leaf_spec_patch() -> None: + """Patch ``LeafSpec`` so its inherited ``type`` slot is always set to ``None``. + + Safe to call multiple times; the patch is idempotent. 
+ """ + try: + from torch.utils._pytree import _LEAF_SPEC, LeafSpec + except ImportError: + return # too old / too new, nothing to do + + # Check whether the bug is present on the singleton instance + try: + _ = _LEAF_SPEC.type # noqa: F841 + return # attribute accessible — no patch needed + except AttributeError: + pass + + logger.debug( + "torch_tensorrt: applying LeafSpec.type compatibility patch " + "(PyTorch bug: frozen-dataclass slot not initialised in subclass)" + ) + + # Fix the pre-existing singleton that all pytree leaf specs share + object.__setattr__(_LEAF_SPEC, "type", None) + object.__setattr__(_LEAF_SPEC, "_context", None) + object.__setattr__(_LEAF_SPEC, "_children", []) + + # Patch __post_init__ so any new LeafSpec() instances are also fixed + _orig_post_init = LeafSpec.__post_init__ + + def _post_init_with_type(self: LeafSpec) -> None: + _orig_post_init(self) + object.__setattr__(self, "type", None) + object.__setattr__(self, "_context", None) + object.__setattr__(self, "_children", []) + + LeafSpec.__post_init__ = _post_init_with_type diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 83ac0644b0..d33faa5b21 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -344,7 +344,48 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, serialized_require_output_allocator: str, + serialized_require_output_alocator_idx: str, ) -> List[torch.Tensor]: raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
) + + +@no_op_placeholder_for_execute_engine.register_fake # type: ignore +def fake_no_op_placeholder_for_execute_engine( + inputs: List[torch.Tensor], + abi_version: str, + name: str, + serialized_device_info: str, + serialized_engine: str, + serialized_in_binding_names: str, + serialized_out_binding_names: str, + serialized_hardware_compatible: str, + serialized_metadata: str, + serialized_target_platform: str, + serialized_require_output_allocator: str, + serialized_require_output_alocator_idx: str, +) -> List[torch.Tensor]: + """Fake kernel for no_op_placeholder_for_execute_engine. + + Parses serialized_metadata to derive output shapes, mirroring the + execute_engine fake kernel logic. + """ + if serialized_metadata: + try: + metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) + shape_info = metadata.get("inout_symexprs") if metadata else None + if shape_info: + return _apply_symbolic_shape_expressions(inputs, shape_info) + except Exception: + pass + + # Fallback: return one tensor with same shape/dtype as the first input + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode(inputs) if inputs else None + if not inputs: + return [torch.empty(())] + if fake_mode is not None: + return [fake_mode.from_tensor(inputs[0])] + return [torch.empty_like(inputs[0])] diff --git a/py/torch_tensorrt/executorch/backend.py b/py/torch_tensorrt/executorch/backend.py index 1c9ba4c615..b95cf55121 100644 --- a/py/torch_tensorrt/executorch/backend.py +++ b/py/torch_tensorrt/executorch/backend.py @@ -18,40 +18,37 @@ def _get_engine_info_from_edge_program(edge_program: ExportedProgram) -> List[Any]: - """Extract engine info (list of strings/bytes) from the partition's execute_engine node.""" + """Extract engine info (list of strings/bytes) from the partition's no_op_placeholder node. 
+ + Before calling to_edge_transform_and_lower, _save_as_executorch replaces + execute_engine nodes with no_op_placeholder_for_execute_engine whose args are + (inputs_tuple, abi_version, name, device, engine_b64, in_names, out_names, + hw_compat, metadata, platform, requires_oa). This function reads those flat + args back out and returns them as a list indexed by SerializedInfoIndex. + """ gm = edge_program.graph_module - execute_engine_op = torch.ops.tensorrt.execute_engine.default + no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default for node in gm.graph.nodes: - if node.op != "call_function" or node.target is not execute_engine_op: + if node.op != "call_function" or node.target is not no_op: continue + # args layout: (inputs_tuple, *engine_info_strings) + # engine_info_strings has SERIALIZATION_LEN - 1 entries (no RESOURCE_ALLOCATION_STRATEGY) if len(node.args) < 2: - continue - engine_arg = node.args[1] - if engine_arg.op == "get_attr": - val = getattr(gm, engine_arg.target, None) - if val is None: - raise RuntimeError( - f"Engine get_attr({engine_arg.target}) not found on partition module." - ) - if hasattr(val, "__getstate__"): - engine_info = val.__getstate__() - else: - engine_info = getattr(val, "engine_info", val) - if ( - isinstance(engine_info, (list, tuple)) - and len(engine_info) >= SERIALIZATION_LEN - ): - return list(engine_info) raise RuntimeError( - f"Engine argument get_attr({engine_arg.target}) did not yield engine info list (len >= {SERIALIZATION_LEN})." + f"no_op_placeholder node '{node.name}' has too few args: {len(node.args)}" ) - raise RuntimeError( - "TensorRT ExecuTorch backend expects execute_engine(inputs, engine) " - "where engine is a get_attr; cannot find engine." 
- ) + engine_info = list(node.args[1:]) + if len(engine_info) < SERIALIZATION_LEN - 1: + raise RuntimeError( + f"no_op_placeholder node '{node.name}' has {len(engine_info)} engine " + f"info args, expected at least {SERIALIZATION_LEN - 1}" + ) + return engine_info + raise RuntimeError( - "TensorRT ExecuTorch backend: no execute_engine node found in partition." + "TensorRT ExecuTorch backend: no no_op_placeholder_for_execute_engine " + "node found in partition." ) diff --git a/py/torch_tensorrt/executorch/operator_support.py b/py/torch_tensorrt/executorch/operator_support.py index 32763665c2..29991611f9 100644 --- a/py/torch_tensorrt/executorch/operator_support.py +++ b/py/torch_tensorrt/executorch/operator_support.py @@ -7,20 +7,22 @@ class TensorRTOperatorSupport(OperatorSupportBase): # type: ignore[misc] - """Supports only torch.ops.tensorrt.execute_engine for partitioning. + """Supports torch.ops.tensorrt.no_op_placeholder_for_execute_engine for partitioning. - Used so that TRT-compiled graphs (which already contain execute_engine nodes) - are partitioned per engine; each partition is then lowered to TensorRTBackend - which serializes the engine to the same blob format as the TRT runtime. + Prior to calling to_edge_transform_and_lower, _save_as_executorch replaces + execute_engine nodes with no_op_placeholder_for_execute_engine so that + ExecuTorch's edge-lowering passes (which symbolically execute every node) do + not trip on the Engine custom-class schema check. The partitioner therefore + targets the no-op placeholder instead of execute_engine directly. 
""" def __init__(self) -> None: super().__init__() - self._execute_engine_op = torch.ops.tensorrt.execute_engine.default + self._no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default def is_node_supported( self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node ) -> bool: if node.op != "call_function": return False - return node.target is self._execute_engine_op + return node.target is self._no_op diff --git a/setup.py b/setup.py index 72eab98bd5..6115328b47 100644 --- a/setup.py +++ b/setup.py @@ -787,7 +787,7 @@ def get_x86_64_requirements(base_requirements): if IS_DLFW_CI: return requirements else: - requirements = requirements + ["torch>=2.12.0.dev,<2.13.0"] + requirements = requirements + ["torch>=2.11.0,<2.12.0"] if USE_TRT_RTX: return requirements + [ "tensorrt_rtx>=1.3.0.35", From 29a12a9efb4b4479a8dc9b4c20a2bfcd9ad37dcf Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 15:42:52 -0700 Subject: [PATCH 4/7] test --- py/torch_tensorrt/_compile.py | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 0b4470d3ef..c68057a516 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1153,7 +1153,7 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: "(torch_tensorrt_runtime). Reinstall torch_tensorrt with the runtime extension." ) try: - from executorch.exir import to_edge_transform_and_lower + from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig except ImportError: raise ImportError( "ExecuTorch is not installed. Please install it to use output_format='executorch'. " @@ -1165,11 +1165,11 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: # Replace execute_engine with no_op_placeholder before edge lowering so that # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class # schema check. 
- exp_program = _replace_execute_engine_with_no_op(exp_program) - breakpoint() + # exp_program = _replace_execute_engine_with_no_op(exp_program) edge_program = to_edge_transform_and_lower( exp_program, partitioner=[TensorRTPartitioner()], + compile_config=EdgeCompileConfig(_check_ir_validity=False), ) executorch_program = edge_program.to_executorch() with open(file_path, "wb") as f: diff --git a/setup.py b/setup.py index 6115328b47..171f8f481b 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$") # CI_PIPELINE_ID is the environment variable set by DLFW ci build IS_DLFW_CI = os.environ.get("CI_PIPELINE_ID") is not None - +IS_DLFW_CI = True def get_root_dir() -> Path: return Path(__file__).parent.absolute() From ad77701c5a60f56a712105ab84c5226909e32c51 Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 15:53:18 -0700 Subject: [PATCH 5/7] test --- py/torch_tensorrt/_compile.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c68057a516..4d070650e7 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1138,7 +1138,7 @@ def _replace_execute_engine_with_no_op(exp_program: Any) -> Any: return exp_program -def _save_as_executorch(exp_program: Any, file_path: str) -> None: +def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. Partitions the graph by torch.ops.tensorrt.no_op_placeholder_for_execute_engine @@ -1161,14 +1161,18 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: ) import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner + extra_partitioners = kwargs.get("partitioners", []) + partitioners = [TensorRTPartitioner()] + extra_partitioners + # DO I NEED THIS? 
# Replace execute_engine with no_op_placeholder before edge lowering so that # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class # schema check. # exp_program = _replace_execute_engine_with_no_op(exp_program) + edge_program = to_edge_transform_and_lower( exp_program, - partitioner=[TensorRTPartitioner()], + partitioner=partitioners, compile_config=EdgeCompileConfig(_check_ir_validity=False), ) executorch_program = edge_program.to_executorch() From 87ac93103171ebf3b91b8c6ec3e7f3451f3e954e Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 16:33:18 -0700 Subject: [PATCH 6/7] test --- py/torch_tensorrt/_compile.py | 92 +------------------ .../runtime/meta_ops/register_meta_ops.py | 50 ---------- py/torch_tensorrt/executorch/backend.py | 57 +++++++----- .../executorch/operator_support.py | 14 ++- 4 files changed, 42 insertions(+), 171 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 4d070650e7..d0246710e2 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1057,88 +1057,7 @@ def _extract_tensor(obj: Any) -> Any: ) -def _replace_execute_engine_with_no_op(exp_program: Any) -> Any: - """Replace execute_engine nodes with no_op_placeholder_for_execute_engine. - - ExecuTorch's edge-lowering passes symbolically execute every node before - partitioning runs. The execute_engine schema requires a - ``__torch__.torch.classes.tensorrt.Engine`` argument, but after export the - engine is represented as a ``CustomObjArgument`` — causing a schema type - error inside the pass interpreter. - - The no_op_placeholder op uses flat string arguments instead of a custom - class, so it passes through the edge passes without issue. The - TensorRTPartitioner and TensorRTBackend are updated to work with the - no-op form. 
- """ - import base64 - - import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 - - gm = exp_program.graph_module - execute_engine_op = torch.ops.tensorrt.execute_engine.default - no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default - - nodes_to_replace = [ - n - for n in gm.graph.nodes - if n.op == "call_function" and n.target is execute_engine_op - ] - for node in nodes_to_replace: - inputs_arg = node.args[0] - engine_node = node.args[1] - - if engine_node.op == "get_attr": - engine_obj = getattr(gm, engine_node.target, None) - if engine_obj is None: - raise RuntimeError( - f"execute_engine node '{node.name}': get_attr target " - f"'{engine_node.target}' not found on graph module" - ) - elif engine_node.op == "placeholder": - # After torch.export, get_attr nodes for custom objects are lifted - # into placeholder inputs; the actual object lives in exp_program.constants. - constants = getattr(exp_program, "constants", {}) - engine_obj = constants.get(engine_node.name) or constants.get( - engine_node.target - ) - if engine_obj is None: - raise RuntimeError( - f"execute_engine node '{node.name}': placeholder engine node " - f"'{engine_node.name}' not found in exp_program.constants" - ) - else: - raise RuntimeError( - f"execute_engine node '{node.name}': expected engine arg to be " - f"a get_attr or placeholder node, got op='{engine_node.op}'" - ) - # Get engine info list via __getstate__ (same format as _pack_engine_info()) - from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX - - engine_info = list(engine_obj.__getstate__()) - engine_info = engine_info[0] - # Base64-encode the engine bytes, matching the reference cross-compile path - engine_bytes = engine_info[ENGINE_IDX] - if isinstance(engine_bytes, (bytes, bytearray)): - engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") - - with gm.graph.inserting_before(node): - no_op_node = gm.graph.call_function(no_op, (inputs_arg, 
*engine_info)) - no_op_node.meta = dict(node.meta) - node.replace_all_uses_with(no_op_node) - gm.graph.erase_node(node) - # Only erase get_attr engine nodes; placeholder nodes belong to the - # exported program's input signature and must not be removed here. - if engine_node.op == "get_attr": - gm.graph.erase_node(engine_node) - - gm.graph.eliminate_dead_code() - gm.graph.lint() - gm.recompile() - return exp_program - - -def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: +def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None: """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. Partitions the graph by torch.ops.tensorrt.no_op_placeholder_for_execute_engine @@ -1153,7 +1072,7 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: "(torch_tensorrt_runtime). Reinstall torch_tensorrt with the runtime extension." ) try: - from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig + from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower except ImportError: raise ImportError( "ExecuTorch is not installed. Please install it to use output_format='executorch'. " @@ -1161,15 +1080,10 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: ) import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner + extra_partitioners = kwargs.get("partitioners", []) partitioners = [TensorRTPartitioner()] + extra_partitioners - # DO I NEED THIS? - # Replace execute_engine with no_op_placeholder before edge lowering so that - # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class - # schema check. 
- # exp_program = _replace_execute_engine_with_no_op(exp_program) - edge_program = to_edge_transform_and_lower( exp_program, partitioner=partitioners, diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index d33faa5b21..78a517e6fc 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -217,15 +217,6 @@ def fake_tensorrt_execute_engine( so partitioners can run without a real engine. Otherwise uses symbolic shape expressions from metadata to infer output shapes. """ - if _is_placeholder_engine(fake_trt_engine): - from torch._guards import detect_fake_mode - - fake_mode = detect_fake_mode(inputs) if inputs else None - if not inputs: - return [torch.empty(())] - if fake_mode is not None: - return [fake_mode.from_tensor(inputs[0])] - return [torch.empty_like(inputs[0])] metadata = None if hasattr(fake_trt_engine, "real_obj"): @@ -239,7 +230,6 @@ def fake_tensorrt_execute_engine( ) shape_info = metadata.get("inout_symexprs") if metadata else None - if shape_info: return _apply_symbolic_shape_expressions(inputs, shape_info) else: @@ -349,43 +339,3 @@ def no_op_placeholder_for_execute_engine( raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
) - - -@no_op_placeholder_for_execute_engine.register_fake # type: ignore -def fake_no_op_placeholder_for_execute_engine( - inputs: List[torch.Tensor], - abi_version: str, - name: str, - serialized_device_info: str, - serialized_engine: str, - serialized_in_binding_names: str, - serialized_out_binding_names: str, - serialized_hardware_compatible: str, - serialized_metadata: str, - serialized_target_platform: str, - serialized_require_output_allocator: str, - serialized_require_output_alocator_idx: str, -) -> List[torch.Tensor]: - """Fake kernel for no_op_placeholder_for_execute_engine. - - Parses serialized_metadata to derive output shapes, mirroring the - execute_engine fake kernel logic. - """ - if serialized_metadata: - try: - metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) - shape_info = metadata.get("inout_symexprs") if metadata else None - if shape_info: - return _apply_symbolic_shape_expressions(inputs, shape_info) - except Exception: - pass - - # Fallback: return one tensor with same shape/dtype as the first input - from torch._guards import detect_fake_mode - - fake_mode = detect_fake_mode(inputs) if inputs else None - if not inputs: - return [torch.empty(())] - if fake_mode is not None: - return [fake_mode.from_tensor(inputs[0])] - return [torch.empty_like(inputs[0])] diff --git a/py/torch_tensorrt/executorch/backend.py b/py/torch_tensorrt/executorch/backend.py index b95cf55121..5ed8a01893 100644 --- a/py/torch_tensorrt/executorch/backend.py +++ b/py/torch_tensorrt/executorch/backend.py @@ -10,45 +10,54 @@ PreprocessResult, ) from torch.export.exported_program import ExportedProgram -from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ( - ENGINE_IDX, - SERIALIZATION_LEN, -) +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX from torch_tensorrt.executorch.serialization import serialize_engine_info def _get_engine_info_from_edge_program(edge_program: ExportedProgram) -> List[Any]: - """Extract engine 
info (list of strings/bytes) from the partition's no_op_placeholder node. + """Extract engine info (list of strings/bytes) from the partition's execute_engine node. - Before calling to_edge_transform_and_lower, _save_as_executorch replaces - execute_engine nodes with no_op_placeholder_for_execute_engine whose args are - (inputs_tuple, abi_version, name, device, engine_b64, in_names, out_names, - hw_compat, metadata, platform, requires_oa). This function reads those flat - args back out and returns them as a list indexed by SerializedInfoIndex. + The partition contains a single execute_engine node whose second argument is + either a get_attr node (engine on the graph module) or a placeholder node + (engine lifted into edge_program.constants by torch.export). Either way, + the engine object's __getstate__() returns the SERIALIZATION_LEN-item list + used by the TRT runtime blob format. """ gm = edge_program.graph_module - no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default + execute_engine_op = torch.ops.tensorrt.execute_engine.default for node in gm.graph.nodes: - if node.op != "call_function" or node.target is not no_op: + if node.op != "call_function" or node.target is not execute_engine_op: continue - # args layout: (inputs_tuple, *engine_info_strings) - # engine_info_strings has SERIALIZATION_LEN - 1 entries (no RESOURCE_ALLOCATION_STRATEGY) - if len(node.args) < 2: - raise RuntimeError( - f"no_op_placeholder node '{node.name}' has too few args: {len(node.args)}" + + engine_node = node.args[1] + if engine_node.op == "get_attr": + engine_obj = getattr(gm, engine_node.target, None) + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': get_attr target " + f"'{engine_node.target}' not found on graph module" + ) + elif engine_node.op == "placeholder": + constants = getattr(edge_program, "constants", {}) + engine_obj = constants.get(engine_node.name) or constants.get( + engine_node.target ) - engine_info = 
list(node.args[1:]) - if len(engine_info) < SERIALIZATION_LEN - 1: + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': placeholder engine " + f"'{engine_node.name}' not found in edge_program.constants" + ) + else: raise RuntimeError( - f"no_op_placeholder node '{node.name}' has {len(engine_info)} engine " - f"info args, expected at least {SERIALIZATION_LEN - 1}" + f"execute_engine node '{node.name}': unexpected engine arg op " + f"'{engine_node.op}'" ) - return engine_info + + return list(engine_obj.__getstate__()) raise RuntimeError( - "TensorRT ExecuTorch backend: no no_op_placeholder_for_execute_engine " - "node found in partition." + "TensorRT ExecuTorch backend: no execute_engine node found in partition." ) diff --git a/py/torch_tensorrt/executorch/operator_support.py b/py/torch_tensorrt/executorch/operator_support.py index 29991611f9..32763665c2 100644 --- a/py/torch_tensorrt/executorch/operator_support.py +++ b/py/torch_tensorrt/executorch/operator_support.py @@ -7,22 +7,20 @@ class TensorRTOperatorSupport(OperatorSupportBase): # type: ignore[misc] - """Supports torch.ops.tensorrt.no_op_placeholder_for_execute_engine for partitioning. + """Supports only torch.ops.tensorrt.execute_engine for partitioning. - Prior to calling to_edge_transform_and_lower, _save_as_executorch replaces - execute_engine nodes with no_op_placeholder_for_execute_engine so that - ExecuTorch's edge-lowering passes (which symbolically execute every node) do - not trip on the Engine custom-class schema check. The partitioner therefore - targets the no-op placeholder instead of execute_engine directly. + Used so that TRT-compiled graphs (which already contain execute_engine nodes) + are partitioned per engine; each partition is then lowered to TensorRTBackend + which serializes the engine to the same blob format as the TRT runtime. 
""" def __init__(self) -> None: super().__init__() - self._no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default + self._execute_engine_op = torch.ops.tensorrt.execute_engine.default def is_node_supported( self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node ) -> bool: if node.op != "call_function": return False - return node.target is self._no_op + return node.target is self._execute_engine_op From faac9094a7468ee406884482992936525a016f5b Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 16:40:13 -0700 Subject: [PATCH 7/7] test --- py/torch_tensorrt/dynamo/_compiler.py | 3 +++ .../dynamo/runtime/_TorchTensorRTModule.py | 2 +- .../runtime/meta_ops/register_meta_ops.py | 27 +++++-------------- setup.py | 4 ++- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 117a87abc8..79cd025925 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -9,6 +9,9 @@ import torch from torch.export import ExportedProgram + +# TODO: remove this in future, this is just for test executorch which uses torch 2.11 which has a bug in the leaf spec compat +# the bug has been fixed in the torch 2.12 in the upstream. 
from torch_tensorrt.dynamo._leaf_spec_compat import _apply_leaf_spec_patch _apply_leaf_spec_patch() diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 2de6b10810..d77c0bf39f 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -338,7 +338,6 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ - if self.engine is None: raise RuntimeError("Engine has not been setup yet.") @@ -355,6 +354,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: (i if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) for i in inputs ] + outputs: List[torch.Tensor] = torch.ops.tensorrt.execute_engine( list(input_tensors), self.engine ) diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 78a517e6fc..e03c88153c 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -189,22 +189,6 @@ def fake_aten_cudnn_grid_sampler( return torch.empty(out_shape, dtype=input.dtype, device=input.device) -def _is_placeholder_engine(engine: Any) -> bool: - """True if engine is a placeholder (CustomObjArgument/FakeScriptObject) from export.""" - if engine is None: - return True - type_name = type(engine).__name__ - if type_name == "CustomObjArgument": - return True - if type_name == "FakeScriptObject": - return True - if hasattr(engine, "fake_val") and engine.fake_val is not None: - return True - if not hasattr(engine, "get_serialized_metadata"): - return True - return False - - @torch.library.register_fake("tensorrt::execute_engine") # type: ignore def fake_tensorrt_execute_engine( inputs: List[torch.Tensor], 
fake_trt_engine: Any @@ -212,14 +196,13 @@ def fake_tensorrt_execute_engine( """ Meta kernel for TensorRT engine execution. - When the engine is a placeholder (CustomObjArgument/FakeScriptObject from - torch.export/ExecuTorch), returns one fake output per input (same shape/dtype) - so partitioners can run without a real engine. Otherwise uses symbolic shape - expressions from metadata to infer output shapes. + Uses symbolic shape expressions captured at compile time to correctly infer + output shapes while preserving symbolic SymInt relationships. """ metadata = None if hasattr(fake_trt_engine, "real_obj"): + # Wrapped C++ engine with real_obj trt_engine = fake_trt_engine.real_obj metadata = TorchTensorRTModule.decode_metadata( trt_engine.get_serialized_metadata() @@ -230,7 +213,10 @@ def fake_tensorrt_execute_engine( ) shape_info = metadata.get("inout_symexprs") if metadata else None + if shape_info: + # Apply the symbolic shape expressions to create output fake tensors + # shape_info now contains both 'inputs' and 'outputs' keys return _apply_symbolic_shape_expressions(inputs, shape_info) else: raise RuntimeError( @@ -334,7 +320,6 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, serialized_require_output_allocator: str, - serialized_require_output_alocator_idx: str, ) -> List[torch.Tensor]: raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
diff --git a/setup.py b/setup.py
index 171f8f481b..7445818cd9 100644
--- a/setup.py
+++ b/setup.py
@@ -35,8 +35,10 @@
 LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")
 # CI_PIPELINE_ID is the environment variable set by DLFW ci build
 IS_DLFW_CI = os.environ.get("CI_PIPELINE_ID") is not None
+# TODO: remove this in the future; it is only for testing ExecuTorch, which tries to use whatever torch version ExecuTorch itself pins
 IS_DLFW_CI = True
+
 
 def get_root_dir() -> Path:
     return Path(__file__).parent.absolute()
 
@@ -787,7 +789,7 @@ def get_x86_64_requirements(base_requirements):
     if IS_DLFW_CI:
         return requirements
     else:
-        requirements = requirements + ["torch>=2.11.0,<2.12.0"]
+        requirements = requirements + ["torch>=2.12.0.dev,<2.13.0"]
    if USE_TRT_RTX:
         return requirements + [
             "tensorrt_rtx>=1.3.0.35",