From 7f328f7f56843f96b7e740c9f51704d6514ffb37 Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Wed, 18 Mar 2026 11:21:11 -0700 Subject: [PATCH 1/7] initial checkin --- .../model_static_shape.py | 61 +++++++++++++ py/torch_tensorrt/_compile.py | 50 ++++++++++- .../dynamo/runtime/_TorchTensorRTModule.py | 2 +- py/torch_tensorrt/executorch/__init__.py | 9 ++ py/torch_tensorrt/executorch/backend.py | 85 +++++++++++++++++++ .../executorch/operator_support.py | 26 ++++++ py/torch_tensorrt/executorch/partitioner.py | 63 ++++++++++++++ py/torch_tensorrt/executorch/serialization.py | 32 +++++++ 8 files changed, 323 insertions(+), 5 deletions(-) create mode 100644 examples/torchtrt_executorch_example/model_static_shape.py create mode 100644 py/torch_tensorrt/executorch/__init__.py create mode 100644 py/torch_tensorrt/executorch/backend.py create mode 100644 py/torch_tensorrt/executorch/operator_support.py create mode 100644 py/torch_tensorrt/executorch/partitioner.py create mode 100644 py/torch_tensorrt/executorch/serialization.py diff --git a/examples/torchtrt_executorch_example/model_static_shape.py b/examples/torchtrt_executorch_example/model_static_shape.py new file mode 100644 index 0000000000..f94a7d3c0a --- /dev/null +++ b/examples/torchtrt_executorch_example/model_static_shape.py @@ -0,0 +1,61 @@ +""" +.. _executorch_export: + +Saving a Torch-TensorRT Model in ExecuTorch Format (.pte) +========================================================= + +This example demonstrates how to compile a model with Torch-TensorRT and save it +as an ExecuTorch ``.pte`` file, which can be loaded by the ExecuTorch runtime +(e.g., on embedded or mobile devices with a TensorRT-capable backend). + +Prerequisites +------------- +Install ExecuTorch before running this example:: + + pip install executorch + +See https://pytorch.org/executorch/stable/getting-started-setup.html for details. 
+""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +import torch +import torch_tensorrt + + +class MyModel(torch.nn.Module): + def forward(self, x): + return x + 1 + + +# %% +# Compile with Torch-TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Export the model, compile it with TensorRT, then save as .pte + +with torch.no_grad(): + model = MyModel().eval().cuda() + example_input = (torch.randn((2, 3, 4, 4)).cuda(),) + + exported_program = torch.export.export(model, example_input) + compile_settings = { + "arg_inputs": [ + torch_tensorrt.Input(shape=(2, 3, 4, 4), dtype=torch.float32), + ], + "min_block_size": 1, + } + trt_gm = torch_tensorrt.dynamo.compile(exported_program, **compile_settings) + + # %% + # Save as ExecuTorch .pte format (loadable by the ExecuTorch runtime) + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # The TensorRT engine is serialized inside the .pte using the same blob format + # as the Torch-TensorRT runtime (vector of strings), so one engine format for + # both ExecuTorch and non-ExecuTorch deployment. + torch_tensorrt.save( + trt_gm, "model.pte", output_format="executorch", arg_inputs=example_input + ) + + print("Saved model.pte successfully.") diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c4dbb1c148..3e25cce239 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -653,7 +653,7 @@ def save( inputs (Union[torch.Tensor, torch_tensorrt.Input]): Torch input tensors or Input specifications arg_inputs (Tuple[Union[torch.Tensor, torch_tensorrt.Input], ...]): Same as inputs. Alias for better understanding with kwarg_inputs. kwarg_inputs (dict[str, Union[torch.Tensor, torch_tensorrt.Input]]): Optional, kwarg inputs to the module forward function. - output_format (str): Format to save the model. Options include exported_program | torchscript | aot_inductor. + output_format (str): Format to save the model. 
Options include exported_program | torchscript | aot_inductor | executorch. retrace (bool): When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it. For TRT-compiled modules with dynamic shapes, both retrace=True and retrace=False are supported: @@ -726,7 +726,7 @@ def save( if isinstance(module, CudaGraphsTorchTensorRTModule): module = module.compiled_module module_type = _parse_module_type(module) - accepted_formats = {"exported_program", "torchscript", "aot_inductor"} + accepted_formats = {"exported_program", "torchscript", "aot_inductor", "executorch"} if arg_inputs is not None and not all( isinstance(input, (torch.Tensor, Input)) for input in arg_inputs ): @@ -847,12 +847,16 @@ def _extract_tensor(obj: Any) -> Any: if output_format not in accepted_formats: raise ValueError( - f"Provided output_format {output_format} is not supported. Supported options are exported_program | torchscript" + f"Provided output_format {output_format} is not supported. Supported options are exported_program | torchscript | aot_inductor | executorch" ) if output_format == "aot_inductor" and platform.system() != "Linux": raise ValueError( f"The AOT Inductor format is only supported on Linux, {platform.system()} is not a supported platform for this format" ) + if output_format == "executorch" and platform.system() != "Linux": + raise ValueError( + f"The executorch format is only supported on Linux, {platform.system()} is not a supported platform for this format" + ) if not file_path: raise ValueError("File path cannot be empty. Please provide a valid file path") @@ -906,6 +910,8 @@ def _extract_tensor(obj: Any) -> Any: inductor_configs=inductor_configs, package_path=file_path, ) + elif output_format == "executorch": + _save_as_executorch(module, file_path) else: raise RuntimeError( "Attempted to serialize an exported program with an unsupported format. 
Exported programs support exported_program and aot_inductor" @@ -963,6 +969,8 @@ def _extract_tensor(obj: Any) -> Any: inductor_configs=inductor_configs, package_path=file_path, ) + elif output_format == "executorch": + _save_as_executorch(exp_program, file_path) else: raise RuntimeError( "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor" @@ -1014,7 +1022,7 @@ def _extract_tensor(obj: Any) -> Any: "Provided model is a torch.fx.GraphModule without existing shape metadata and retrace is True, however no inputs specs were provided. " "Please provide valid torch.Tensors or torch_tensorrt.Input objects as inputs to retrace and save the model" ) - + breakpoint() exp_program = torch.export.export( module, args=tuple(arg_tensors), @@ -1042,12 +1050,46 @@ def _extract_tensor(obj: Any) -> Any: inductor_configs=inductor_configs, package_path=file_path, ) + elif output_format == "executorch": + _save_as_executorch(exp_program, file_path) else: raise RuntimeError( "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor" ) +def _save_as_executorch(exp_program: Any, file_path: str) -> None: + """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. + + Partitions the graph by torch.ops.tensorrt.execute_engine, serializes each engine + to the same blob format as the TRT runtime (vector of strings), and embeds it + in the .pte. Requires the ``executorch`` package and torch_tensorrt_runtime. See + https://pytorch.org/executorch/stable/getting-started-setup.html + """ + if not ENABLED_FEATURES.torch_tensorrt_runtime: + raise RuntimeError( + "output_format='executorch' requires the Torch-TensorRT runtime " + "(torch_tensorrt_runtime). Reinstall torch_tensorrt with the runtime extension." 
+ ) + try: + from executorch.exir import to_edge_transform_and_lower + except ImportError: + raise ImportError( + "ExecuTorch is not installed. Please install it to use output_format='executorch'. " + "See https://pytorch.org/executorch/stable/getting-started-setup.html" + ) + from torch_tensorrt.executorch import TensorRTPartitioner + + breakpoint() + edge_program = to_edge_transform_and_lower( + exp_program, + partitioner=[TensorRTPartitioner()], + ) + executorch_program = edge_program.to_executorch() + with open(file_path, "wb") as f: + executorch_program.write_to_file(f) + + def function_overload_with_kwargs( fn: Callable[..., Any], *args: Any, **kwargs: Any ) -> Any: diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index d77c0bf39f..91994ca49b 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -338,6 +338,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ + breakpoint() if self.engine is None: raise RuntimeError("Engine has not been setup yet.") @@ -354,7 +355,6 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: (i if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) for i in inputs ] - outputs: List[torch.Tensor] = torch.ops.tensorrt.execute_engine( list(input_tensors), self.engine ) diff --git a/py/torch_tensorrt/executorch/__init__.py b/py/torch_tensorrt/executorch/__init__.py new file mode 100644 index 0000000000..81aa088610 --- /dev/null +++ b/py/torch_tensorrt/executorch/__init__.py @@ -0,0 +1,9 @@ +# ExecuTorch backend for Torch-TensorRT: save/load .pte with TensorRT delegate. 
+ +from torch_tensorrt.executorch.backend import TensorRTBackend +from torch_tensorrt.executorch.partitioner import TensorRTPartitioner + +__all__ = [ + "TensorRTBackend", + "TensorRTPartitioner", +] diff --git a/py/torch_tensorrt/executorch/backend.py b/py/torch_tensorrt/executorch/backend.py new file mode 100644 index 0000000000..1c9ba4c615 --- /dev/null +++ b/py/torch_tensorrt/executorch/backend.py @@ -0,0 +1,85 @@ +# ExecuTorch TensorRT backend: serialize engine to same blob format as TRT runtime. + +import base64 +from typing import Any, List, final + +import torch +from executorch.exir.backend.backend_details import ( + BackendDetails, + CompileSpec, + PreprocessResult, +) +from torch.export.exported_program import ExportedProgram +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ( + ENGINE_IDX, + SERIALIZATION_LEN, +) +from torch_tensorrt.executorch.serialization import serialize_engine_info + + +def _get_engine_info_from_edge_program(edge_program: ExportedProgram) -> List[Any]: + """Extract engine info (list of strings/bytes) from the partition's execute_engine node.""" + gm = edge_program.graph_module + execute_engine_op = torch.ops.tensorrt.execute_engine.default + + for node in gm.graph.nodes: + if node.op != "call_function" or node.target is not execute_engine_op: + continue + if len(node.args) < 2: + continue + engine_arg = node.args[1] + if engine_arg.op == "get_attr": + val = getattr(gm, engine_arg.target, None) + if val is None: + raise RuntimeError( + f"Engine get_attr({engine_arg.target}) not found on partition module." + ) + if hasattr(val, "__getstate__"): + engine_info = val.__getstate__() + else: + engine_info = getattr(val, "engine_info", val) + if ( + isinstance(engine_info, (list, tuple)) + and len(engine_info) >= SERIALIZATION_LEN + ): + return list(engine_info) + raise RuntimeError( + f"Engine argument get_attr({engine_arg.target}) did not yield engine info list (len >= {SERIALIZATION_LEN})." 
+ ) + raise RuntimeError( + "TensorRT ExecuTorch backend expects execute_engine(inputs, engine) " + "where engine is a get_attr; cannot find engine." + ) + raise RuntimeError( + "TensorRT ExecuTorch backend: no execute_engine node found in partition." + ) + + +@final +class TensorRTBackend(BackendDetails): # type: ignore[misc] + """Backend that serializes TensorRT engine to the same blob format as the TRT runtime. + + The partition contains a single execute_engine node; we extract the engine + and metadata and encode them as a vector of strings (same layout as + core/runtime/runtime.h SerializedInfoIndex) so the same blob works for + both ExecuTorch and non-ExecuTorch TRT runtime. + """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + engine_info = _get_engine_info_from_edge_program(edge_program) + engine_info = list(engine_info) + serialized_engine = engine_info[ENGINE_IDX] + if isinstance(serialized_engine, str): + engine_info[ENGINE_IDX] = base64.b64decode( + serialized_engine.encode("utf-8") + ) + elif not isinstance(serialized_engine, (bytes, bytearray)): + engine_info[ENGINE_IDX] = bytes(serialized_engine) + if len(engine_info) > 7 and isinstance(engine_info[7], bytes): + engine_info[7] = engine_info[7].decode("utf-8", errors="replace") + blob = serialize_engine_info(engine_info) + return PreprocessResult(processed_bytes=blob) diff --git a/py/torch_tensorrt/executorch/operator_support.py b/py/torch_tensorrt/executorch/operator_support.py new file mode 100644 index 0000000000..32763665c2 --- /dev/null +++ b/py/torch_tensorrt/executorch/operator_support.py @@ -0,0 +1,26 @@ +# Operator support for ExecuTorch TensorRT partitioner: only execute_engine is supported. 
+ +from typing import Dict + +import torch +from torch.fx.passes.operator_support import OperatorSupportBase + + +class TensorRTOperatorSupport(OperatorSupportBase): # type: ignore[misc] + """Supports only torch.ops.tensorrt.execute_engine for partitioning. + + Used so that TRT-compiled graphs (which already contain execute_engine nodes) + are partitioned per engine; each partition is then lowered to TensorRTBackend + which serializes the engine to the same blob format as the TRT runtime. + """ + + def __init__(self) -> None: + super().__init__() + self._execute_engine_op = torch.ops.tensorrt.execute_engine.default + + def is_node_supported( + self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node + ) -> bool: + if node.op != "call_function": + return False + return node.target is self._execute_engine_op diff --git a/py/torch_tensorrt/executorch/partitioner.py b/py/torch_tensorrt/executorch/partitioner.py new file mode 100644 index 0000000000..9fcab9f709 --- /dev/null +++ b/py/torch_tensorrt/executorch/partitioner.py @@ -0,0 +1,63 @@ +# ExecuTorch partitioner: partition by execute_engine nodes. + +from typing import Callable, Dict, List, Optional, Tuple + +import torch +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.backend.utils import tag_constant_data +from torch.export import ExportedProgram +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch_tensorrt.executorch.backend import TensorRTBackend +from torch_tensorrt.executorch.operator_support import TensorRTOperatorSupport + + +class TensorRTPartitioner(Partitioner): # type: ignore[misc] + """Partitions the graph for TensorRT delegation. 
+ + Only nodes that are torch.ops.tensorrt.execute_engine are supported; + each such node becomes its own partition so the backend can serialize + the engine to the same format as the TRT runtime. + """ + + def __init__( + self, + compile_specs: Optional[List[CompileSpec]] = None, + ) -> None: + super().__init__() + self.compile_specs = compile_specs or [] + self.delegation_spec = DelegationSpec( + backend_id=TensorRTBackend.__name__, + compile_specs=self.compile_specs, + ) + + def partition(self, exported_program: ExportedProgram) -> PartitionResult: + capability_partitioner = CapabilityBasedPartitioner( + exported_program.graph_module, + TensorRTOperatorSupport(), + allows_single_node_partition=True, + ) + partition_list = capability_partitioner.propose_partitions() + + partition_tags: Dict[str, DelegationSpec] = {} + for partition in partition_list: + tag = f"tensorrt_{partition.id}" + for node in partition.nodes: + node.meta["delegation_tag"] = tag + partition_tags[tag] = self.delegation_spec + + tag_constant_data(exported_program) + + return PartitionResult( + tagged_exported_program=exported_program, + partition_tags=partition_tags, + ) + + def ops_to_not_decompose( + self, ep: ExportedProgram + ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + return ([], None) diff --git a/py/torch_tensorrt/executorch/serialization.py b/py/torch_tensorrt/executorch/serialization.py new file mode 100644 index 0000000000..742269973d --- /dev/null +++ b/py/torch_tensorrt/executorch/serialization.py @@ -0,0 +1,32 @@ +# Serialization for ExecuTorch TensorRT blob: same format as TRT runtime (vector of strings). +# Uses the same list format as TorchTensorRTModule._pack_engine_info, then encodes to bytes. +# Only valid when ENABLED_FEATURES.torch_tensorrt_runtime is True. 
+ +import struct +from typing import List, Union + +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import SERIALIZATION_LEN + + +def serialize_engine_info(engine_info: List[Union[str, bytes]]) -> bytes: + """Encode engine info list (same format as TorchTensorRTModule._pack_engine_info) to bytes. + + Takes the list produced by _pack_engine_info (or equivalent) and writes it in the + TRT runtime vector format: 4-byte count (SERIALIZATION_LEN), then for each + entry 4-byte length (LE) + raw bytes. C++ can deserialize to std::vector + and pass to TRTEngine(std::vector serialized_info). + """ + if len(engine_info) < SERIALIZATION_LEN: + engine_info = list(engine_info) + [""] * (SERIALIZATION_LEN - len(engine_info)) + parts: List[bytes] = [] + for i in range(SERIALIZATION_LEN): + raw = engine_info[i] + if isinstance(raw, str): + raw = raw.encode("utf-8") + elif raw is None: + raw = b"" + else: + raw = bytes(raw) + parts.append(struct.pack(" Date: Wed, 18 Mar 2026 11:54:13 -0700 Subject: [PATCH 2/7] test1 --- .../model_static_shape.py | 8 ++++- py/torch_tensorrt/_compile.py | 5 +-- .../dynamo/runtime/_TorchTensorRTModule.py | 2 +- .../runtime/meta_ops/register_meta_ops.py | 34 ++++++++++++++++--- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/examples/torchtrt_executorch_example/model_static_shape.py b/examples/torchtrt_executorch_example/model_static_shape.py index f94a7d3c0a..f6f9e9006d 100644 --- a/examples/torchtrt_executorch_example/model_static_shape.py +++ b/examples/torchtrt_executorch_example/model_static_shape.py @@ -54,8 +54,14 @@ def forward(self, x): # The TensorRT engine is serialized inside the .pte using the same blob format # as the Torch-TensorRT runtime (vector of strings), so one engine format for # both ExecuTorch and non-ExecuTorch deployment. + # Use retrace=False so the legacy exporter is used; the engine is then available + # when ExecuTorch's partitioner runs the graph. 
torch_tensorrt.save( - trt_gm, "model.pte", output_format="executorch", arg_inputs=example_input + trt_gm, + "model.pte", + output_format="executorch", + arg_inputs=example_input, + retrace=False, ) print("Saved model.pte successfully.") diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 3e25cce239..6491d91c1f 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1022,7 +1022,6 @@ def _extract_tensor(obj: Any) -> Any: "Provided model is a torch.fx.GraphModule without existing shape metadata and retrace is True, however no inputs specs were provided. " "Please provide valid torch.Tensors or torch_tensorrt.Input objects as inputs to retrace and save the model" ) - breakpoint() exp_program = torch.export.export( module, args=tuple(arg_tensors), @@ -1078,9 +1077,11 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: "ExecuTorch is not installed. Please install it to use output_format='executorch'. " "See https://pytorch.org/executorch/stable/getting-started-setup.html" ) + # Ensure execute_engine fake kernel is registered so partitioner can run + # when the engine is a CustomObjArgument (export placeholder). 
+ import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner - breakpoint() edge_program = to_edge_transform_and_lower( exp_program, partitioner=[TensorRTPartitioner()], diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 91994ca49b..2de6b10810 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -338,7 +338,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ - breakpoint() + if self.engine is None: raise RuntimeError("Engine has not been setup yet.") diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index e03c88153c..83ac0644b0 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -189,6 +189,22 @@ def fake_aten_cudnn_grid_sampler( return torch.empty(out_shape, dtype=input.dtype, device=input.device) +def _is_placeholder_engine(engine: Any) -> bool: + """True if engine is a placeholder (CustomObjArgument/FakeScriptObject) from export.""" + if engine is None: + return True + type_name = type(engine).__name__ + if type_name == "CustomObjArgument": + return True + if type_name == "FakeScriptObject": + return True + if hasattr(engine, "fake_val") and engine.fake_val is not None: + return True + if not hasattr(engine, "get_serialized_metadata"): + return True + return False + + @torch.library.register_fake("tensorrt::execute_engine") # type: ignore def fake_tensorrt_execute_engine( inputs: List[torch.Tensor], fake_trt_engine: Any @@ -196,13 +212,23 @@ def fake_tensorrt_execute_engine( """ Meta kernel for TensorRT engine execution. 
- Uses symbolic shape expressions captured at compile time to correctly infer - output shapes while preserving symbolic SymInt relationships. + When the engine is a placeholder (CustomObjArgument/FakeScriptObject from + torch.export/ExecuTorch), returns one fake output per input (same shape/dtype) + so partitioners can run without a real engine. Otherwise uses symbolic shape + expressions from metadata to infer output shapes. """ + if _is_placeholder_engine(fake_trt_engine): + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode(inputs) if inputs else None + if not inputs: + return [torch.empty(())] + if fake_mode is not None: + return [fake_mode.from_tensor(inputs[0])] + return [torch.empty_like(inputs[0])] metadata = None if hasattr(fake_trt_engine, "real_obj"): - # Wrapped C++ engine with real_obj trt_engine = fake_trt_engine.real_obj metadata = TorchTensorRTModule.decode_metadata( trt_engine.get_serialized_metadata() @@ -215,8 +241,6 @@ def fake_tensorrt_execute_engine( shape_info = metadata.get("inout_symexprs") if metadata else None if shape_info: - # Apply the symbolic shape expressions to create output fake tensors - # shape_info now contains both 'inputs' and 'outputs' keys return _apply_symbolic_shape_expressions(inputs, shape_info) else: raise RuntimeError( From c8b92ef381d9b2d862977f70f92838fbf5b25751 Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 15:16:56 -0700 Subject: [PATCH 3/7] test --- .gitignore | 1 + core/runtime/executorch/TensorRTBackend.cpp | 313 ++++++++++++++++++ core/runtime/executorch/TensorRTBackend.h | 39 +++ ...static_shape.py => export_static_shape.py} | 0 py/torch_tensorrt/_compile.py | 97 +++++- py/torch_tensorrt/dynamo/_compiler.py | 3 + py/torch_tensorrt/dynamo/_leaf_spec_compat.py | 60 ++++ .../runtime/meta_ops/register_meta_ops.py | 41 +++ py/torch_tensorrt/executorch/backend.py | 49 ++- .../executorch/operator_support.py | 14 +- setup.py | 2 +- 11 files changed, 580 insertions(+), 39 
deletions(-) create mode 100644 core/runtime/executorch/TensorRTBackend.cpp create mode 100644 core/runtime/executorch/TensorRTBackend.h rename examples/torchtrt_executorch_example/{model_static_shape.py => export_static_shape.py} (100%) create mode 100644 py/torch_tensorrt/dynamo/_leaf_spec_compat.py diff --git a/.gitignore b/.gitignore index f08d97d448..6a4c6eda74 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,4 @@ coverage.xml *.log *.pt2 examples/torchtrt_aoti_example/torchtrt_aoti_example +CLAUDE.md \ No newline at end of file diff --git a/core/runtime/executorch/TensorRTBackend.cpp b/core/runtime/executorch/TensorRTBackend.cpp new file mode 100644 index 0000000000..93f97dc8ce --- /dev/null +++ b/core/runtime/executorch/TensorRTBackend.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "core/runtime/executorch/TensorRTBackend.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "core/runtime/TRTEngine.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace executorch_backend { + +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BackendExecutionContext; +using ::executorch::runtime::BackendInitContext; +using ::executorch::runtime::CompileSpec; +using ::executorch::runtime::DelegateHandle; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::FreeableBuffer; +using ::executorch::runtime::MemoryAllocator; +using ::executorch::runtime::Result; +using ::executorch::runtime::Span; + +namespace { + +// --------------------------------------------------------------------------- +// Blob deserialization +// +// Wire format written by +// py/torch_tensorrt/executorch/serialization.py::serialize_engine_info() +// +// 
[uint32_t count (LE)] +// for each of `count` entries: +// [uint32_t len (LE)] [uint8_t data[len]] +// +// The resulting vector is passed directly to +// core::runtime::TRTEngine(std::vector serialized_info) +// which expects the 11-element list defined by SerializedInfoIndex in +// core/runtime/runtime.h +// --------------------------------------------------------------------------- +std::vector deserialize_engine_info(const void* data, size_t size) { + const uint8_t* ptr = static_cast(data); + const uint8_t* const end = ptr + size; + + if (ptr + sizeof(uint32_t) > end) { + return {}; + } + + uint32_t count = 0; + std::memcpy(&count, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + std::vector result; + result.reserve(count); + + for (uint32_t i = 0; i < count; ++i) { + if (ptr + sizeof(uint32_t) > end) { + return {}; + } + uint32_t len = 0; + std::memcpy(&len, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + if (ptr + len > end) { + return {}; + } + result.emplace_back(reinterpret_cast(ptr), len); + ptr += len; + } + + return result; +} + +// --------------------------------------------------------------------------- +// Build a nvinfer1::Dims from an ExecuTorch tensor's shape +// --------------------------------------------------------------------------- +nvinfer1::Dims to_trt_dims(const exec_aten::Tensor& t) { + nvinfer1::Dims dims{}; + dims.nbDims = t.dim(); + for (int d = 0; d < t.dim(); ++d) { + dims.d[d] = static_cast(t.size(d)); + } + return dims; +} + +} // namespace + +// --------------------------------------------------------------------------- +// is_available +// --------------------------------------------------------------------------- +bool TensorRTBackend::is_available() const { + return true; +} + +// --------------------------------------------------------------------------- +// init +// +// Deserializes the processed blob into a TRTEngine and returns it as the +// opaque DelegateHandle. 
The engine is placement-new'd into memory +// provided by the ExecuTorch MemoryAllocator so that ExecuTorch owns the +// lifetime; destroy() calls the destructor explicitly. +// --------------------------------------------------------------------------- +Result TensorRTBackend::init(BackendInitContext& context, FreeableBuffer* processed) const { + if (processed == nullptr || processed->data() == nullptr) { + ET_LOG(Error, "TensorRTBackend::init: null processed buffer"); + return Error::InvalidArgument; + } + + auto serialized_info = deserialize_engine_info(processed->data(), processed->size()); + + if (serialized_info.empty()) { + ET_LOG(Error, "TensorRTBackend::init: failed to deserialize engine blob"); + return Error::InvalidArgument; + } + + // Validate the vector length before handing to TRTEngine + // (verify_serialization_fmt throws on mismatch) + core::runtime::TRTEngine::verify_serialization_fmt(serialized_info); + + MemoryAllocator* allocator = context.get_runtime_allocator(); + if (allocator == nullptr) { + ET_LOG(Error, "TensorRTBackend::init: null runtime allocator"); + return Error::InvalidState; + } + + // Allocate raw storage for TRTEngine from ExecuTorch's arena + core::runtime::TRTEngine* engine = allocator->allocateInstance(); + if (engine == nullptr) { + ET_LOG(Error, "TensorRTBackend::init: allocateInstance failed"); + return Error::MemoryAllocationFailed; + } + + // Construct in-place; TRTEngine(std::vector) deserializes the + // engine bytes, builds the IRuntime/ICudaEngine/IExecutionContext, and + // populates in_binding_names / out_binding_names / num_io. 
+ new (engine) core::runtime::TRTEngine(std::move(serialized_info)); + + // Release the blob; we no longer need it + processed->Free(); + + ET_LOG( + Info, + "TensorRTBackend::init: engine '%s' ready (%zu inputs, %zu outputs)", + engine->name.c_str(), + engine->num_io.first, + engine->num_io.second); + + return static_cast(engine); +} + +// --------------------------------------------------------------------------- +// execute +// +// Binds the ExecuTorch input/output tensor data pointers directly to the +// TRT IExecutionContext and calls enqueueV3(). ExecuTorch pre-allocates +// all output tensors before calling execute(), so we only need to register +// their addresses; no separate output allocation is required. +// +// Args layout (mirroring the Python exporter): +// args[0 .. num_inputs-1] – input EValues +// args[num_inputs .. num_inputs+num_outputs-1] – output EValues +// --------------------------------------------------------------------------- +Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* handle, Span args) const { + (void)context; + + if (handle == nullptr) { + ET_LOG(Error, "TensorRTBackend::execute: null delegate handle"); + return Error::InvalidArgument; + } + + auto* engine = static_cast(handle); + + const size_t num_inputs = engine->num_io.first; + const size_t num_outputs = engine->num_io.second; + + if (args.size() < num_inputs + num_outputs) { + ET_LOG( + Error, "TensorRTBackend::execute: expected at least %zu args, got %zu", num_inputs + num_outputs, args.size()); + return Error::InvalidArgument; + } + + // IExecutionContext::enqueueV3 is not thread-safe; use the engine mutex + std::unique_lock lock(engine->mu); + + nvinfer1::IExecutionContext* ctx = engine->exec_ctx.get(); + + // ------------------------------------------------------------------ + // 1. 
Bind input shapes and addresses + // ------------------------------------------------------------------ + for (size_t i = 0; i < num_inputs; ++i) { + EValue* arg = args[i]; + if (arg == nullptr || !arg->isTensor()) { + ET_LOG(Error, "TensorRTBackend::execute: input %zu is not a tensor", i); + return Error::InvalidArgument; + } + + exec_aten::Tensor et_in = arg->toTensor(); + const std::string& name = engine->in_binding_names[i]; + nvinfer1::Dims dims = to_trt_dims(et_in); + + if (!ctx->setInputShape(name.c_str(), dims)) { + ET_LOG(Error, "TensorRTBackend::execute: setInputShape failed for '%s'", name.c_str()); + return Error::InvalidState; + } + + void* ptr = et_in.mutable_data_ptr(); + // TRT requires a non-null address even for 0-element tensors + static char placeholder[16] = {}; + if (ptr == nullptr || et_in.numel() == 0) { + ptr = placeholder; + } + + if (!ctx->setTensorAddress(name.c_str(), ptr)) { + ET_LOG(Error, "TensorRTBackend::execute: setTensorAddress failed for input '%s'", name.c_str()); + return Error::InvalidState; + } + } + + // ------------------------------------------------------------------ + // 2. Infer output shapes (requires all input shapes to be set first) + // ------------------------------------------------------------------ + { + const int32_t io_size = engine->cuda_engine->getNbIOTensors(); + std::vector unresolved(static_cast(io_size), nullptr); + const int32_t n_unresolved = ctx->inferShapes(io_size, unresolved.data()); + if (n_unresolved != 0) { + ET_LOG(Error, "TensorRTBackend::execute: inferShapes could not resolve %d tensor(s)", n_unresolved); + return Error::InvalidState; + } + } + + // ------------------------------------------------------------------ + // 3. 
Bind output addresses (ExecuTorch pre-allocates the buffers) + // ------------------------------------------------------------------ + for (size_t o = 0; o < num_outputs; ++o) { + EValue* arg = args[num_inputs + o]; + if (arg == nullptr || !arg->isTensor()) { + ET_LOG(Error, "TensorRTBackend::execute: output %zu is not a tensor", o); + return Error::InvalidArgument; + } + + exec_aten::Tensor et_out = arg->toTensor(); + const std::string& name = engine->out_binding_names[o]; + void* ptr = et_out.mutable_data_ptr(); + + if (!ctx->setTensorAddress(name.c_str(), ptr)) { + ET_LOG(Error, "TensorRTBackend::execute: setTensorAddress failed for output '%s'", name.c_str()); + return Error::InvalidState; + } + } + + // ------------------------------------------------------------------ + // 4. Enqueue inference on the current CUDA stream + // ------------------------------------------------------------------ + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(engine->device_info.id)); + + if (!ctx->enqueueV3(stream)) { + ET_LOG(Error, "TensorRTBackend::execute: enqueueV3 failed"); + return Error::InvalidState; + } + + // Synchronize so that outputs are visible to downstream ExecuTorch ops + cudaStreamSynchronize(stream); + + return Error::Ok; +} + +// --------------------------------------------------------------------------- +// destroy +// +// Explicitly destructs the TRTEngine. The underlying memory was allocated +// by ExecuTorch's MemoryAllocator and will be reclaimed by the arena. 
+// --------------------------------------------------------------------------- +void TensorRTBackend::destroy(DelegateHandle* handle) const { + if (handle != nullptr) { + static_cast(handle)->~TRTEngine(); + } +} + +} // namespace executorch_backend +} // namespace torch_tensorrt + +// --------------------------------------------------------------------------- +// Static registration – links the name "TensorRTBackend" used in the .pte +// file to this implementation at program startup. +// --------------------------------------------------------------------------- +namespace { + +torch_tensorrt::executorch_backend::TensorRTBackend& get_backend() { + static torch_tensorrt::executorch_backend::TensorRTBackend backend; + return backend; +} + +const ::executorch::runtime::Backend kBackendId{"TensorRTBackend", &get_backend()}; +const auto kRegistered = ::executorch::runtime::register_backend(kBackendId); + +} // namespace diff --git a/core/runtime/executorch/TensorRTBackend.h b/core/runtime/executorch/TensorRTBackend.h new file mode 100644 index 0000000000..3855942251 --- /dev/null +++ b/core/runtime/executorch/TensorRTBackend.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * + * ExecuTorch backend delegate that runs TensorRT engines serialized by + * torch_tensorrt. The processed blob must be in the vector-of-strings wire + * format produced by + * py/torch_tensorrt/executorch/serialization.py::serialize_engine_info() + * which maps 1-to-1 to the std::vector accepted by + * core/runtime/TRTEngine::TRTEngine(std::vector). 
+ */ +#pragma once + +#include + +namespace torch_tensorrt { +namespace executorch_backend { + +class TensorRTBackend final : public ::executorch::runtime::BackendInterface { + public: + bool is_available() const override; + + ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init( + ::executorch::runtime::BackendInitContext& context, + ::executorch::runtime::FreeableBuffer* processed, + ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec> compile_specs) const override; + + ::executorch::runtime::Error execute( + ::executorch::runtime::BackendExecutionContext& context, + ::executorch::runtime::DelegateHandle* handle, + ::executorch::runtime::Span<::executorch::runtime::EValue*> args) const override; + + void destroy(::executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace executorch_backend +} // namespace torch_tensorrt diff --git a/examples/torchtrt_executorch_example/model_static_shape.py b/examples/torchtrt_executorch_example/export_static_shape.py similarity index 100% rename from examples/torchtrt_executorch_example/model_static_shape.py rename to examples/torchtrt_executorch_example/export_static_shape.py diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 6491d91c1f..0b4470d3ef 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1057,13 +1057,95 @@ def _extract_tensor(obj: Any) -> Any: ) +def _replace_execute_engine_with_no_op(exp_program: Any) -> Any: + """Replace execute_engine nodes with no_op_placeholder_for_execute_engine. + + ExecuTorch's edge-lowering passes symbolically execute every node before + partitioning runs. The execute_engine schema requires a + ``__torch__.torch.classes.tensorrt.Engine`` argument, but after export the + engine is represented as a ``CustomObjArgument`` — causing a schema type + error inside the pass interpreter. 
+ + The no_op_placeholder op uses flat string arguments instead of a custom + class, so it passes through the edge passes without issue. The + TensorRTPartitioner and TensorRTBackend are updated to work with the + no-op form. + """ + import base64 + + import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 + + gm = exp_program.graph_module + execute_engine_op = torch.ops.tensorrt.execute_engine.default + no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default + + nodes_to_replace = [ + n + for n in gm.graph.nodes + if n.op == "call_function" and n.target is execute_engine_op + ] + for node in nodes_to_replace: + inputs_arg = node.args[0] + engine_node = node.args[1] + + if engine_node.op == "get_attr": + engine_obj = getattr(gm, engine_node.target, None) + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': get_attr target " + f"'{engine_node.target}' not found on graph module" + ) + elif engine_node.op == "placeholder": + # After torch.export, get_attr nodes for custom objects are lifted + # into placeholder inputs; the actual object lives in exp_program.constants. 
+ constants = getattr(exp_program, "constants", {}) + engine_obj = constants.get(engine_node.name) or constants.get( + engine_node.target + ) + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': placeholder engine node " + f"'{engine_node.name}' not found in exp_program.constants" + ) + else: + raise RuntimeError( + f"execute_engine node '{node.name}': expected engine arg to be " + f"a get_attr or placeholder node, got op='{engine_node.op}'" + ) + # Get engine info list via __getstate__ (same format as _pack_engine_info()) + from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX + + engine_info = list(engine_obj.__getstate__()) + engine_info = engine_info[0] + # Base64-encode the engine bytes, matching the reference cross-compile path + engine_bytes = engine_info[ENGINE_IDX] + if isinstance(engine_bytes, (bytes, bytearray)): + engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") + + with gm.graph.inserting_before(node): + no_op_node = gm.graph.call_function(no_op, (inputs_arg, *engine_info)) + no_op_node.meta = dict(node.meta) + node.replace_all_uses_with(no_op_node) + gm.graph.erase_node(node) + # Only erase get_attr engine nodes; placeholder nodes belong to the + # exported program's input signature and must not be removed here. + if engine_node.op == "get_attr": + gm.graph.erase_node(engine_node) + + gm.graph.eliminate_dead_code() + gm.graph.lint() + gm.recompile() + return exp_program + + def _save_as_executorch(exp_program: Any, file_path: str) -> None: """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. - Partitions the graph by torch.ops.tensorrt.execute_engine, serializes each engine - to the same blob format as the TRT runtime (vector of strings), and embeds it - in the .pte. Requires the ``executorch`` package and torch_tensorrt_runtime. 
See - https://pytorch.org/executorch/stable/getting-started-setup.html + Partitions the graph by torch.ops.tensorrt.no_op_placeholder_for_execute_engine + (execute_engine is pre-converted to avoid schema type errors in edge passes), + serializes each engine to the same blob format as the TRT runtime (vector of + strings), and embeds it in the .pte. Requires the ``executorch`` package and + torch_tensorrt_runtime. See https://pytorch.org/executorch/stable/getting-started-setup.html """ if not ENABLED_FEATURES.torch_tensorrt_runtime: raise RuntimeError( @@ -1077,11 +1159,14 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: "ExecuTorch is not installed. Please install it to use output_format='executorch'. " "See https://pytorch.org/executorch/stable/getting-started-setup.html" ) - # Ensure execute_engine fake kernel is registered so partitioner can run - # when the engine is a CustomObjArgument (export placeholder). import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner + # Replace execute_engine with no_op_placeholder before edge lowering so that + # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class + # schema check. 
+ exp_program = _replace_execute_engine_with_no_op(exp_program) + breakpoint() edge_program = to_edge_transform_and_lower( exp_program, partitioner=[TensorRTPartitioner()], diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index bc3cdc5721..117a87abc8 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -9,6 +9,9 @@ import torch from torch.export import ExportedProgram +from torch_tensorrt.dynamo._leaf_spec_compat import _apply_leaf_spec_patch + +_apply_leaf_spec_patch() from torch.fx.node import Target from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype diff --git a/py/torch_tensorrt/dynamo/_leaf_spec_compat.py b/py/torch_tensorrt/dynamo/_leaf_spec_compat.py new file mode 100644 index 0000000000..7bbd3a97ad --- /dev/null +++ b/py/torch_tensorrt/dynamo/_leaf_spec_compat.py @@ -0,0 +1,60 @@ +""" +Compatibility shim for a PyTorch 2.11 bug where ``LeafSpec`` (frozen dataclass +with ``slots=True``) inherits the ``type`` slot from ``TreeSpec`` but never +initialises it, leaving the slot empty. This causes + + AttributeError: 'LeafSpec' object has no attribute 'type' + +inside ``ExportedProgram.run_decompositions()`` when a model returns a single +tensor (i.e. the output pytree spec is a leaf rather than a list/tuple). + +The fix is applied once at import time and is a no-op on versions that already +set the attribute correctly. + +Upstream fix: https://github.com/pytorch/pytorch/issues/ +""" + +from __future__ import annotations + +import logging + +logger = logging.getLogger(__name__) + + +def _apply_leaf_spec_patch() -> None: + """Patch ``LeafSpec`` so its inherited ``type`` slot is always set to ``None``. + + Safe to call multiple times; the patch is idempotent. 
+ """ + try: + from torch.utils._pytree import _LEAF_SPEC, LeafSpec + except ImportError: + return # too old / too new, nothing to do + + # Check whether the bug is present on the singleton instance + try: + _ = _LEAF_SPEC.type # noqa: F841 + return # attribute accessible — no patch needed + except AttributeError: + pass + + logger.debug( + "torch_tensorrt: applying LeafSpec.type compatibility patch " + "(PyTorch bug: frozen-dataclass slot not initialised in subclass)" + ) + + # Fix the pre-existing singleton that all pytree leaf specs share + object.__setattr__(_LEAF_SPEC, "type", None) + object.__setattr__(_LEAF_SPEC, "_context", None) + object.__setattr__(_LEAF_SPEC, "_children", []) + + # Patch __post_init__ so any new LeafSpec() instances are also fixed + _orig_post_init = LeafSpec.__post_init__ + + def _post_init_with_type(self: LeafSpec) -> None: + _orig_post_init(self) + object.__setattr__(self, "type", None) + object.__setattr__(self, "_context", None) + object.__setattr__(self, "_children", []) + + LeafSpec.__post_init__ = _post_init_with_type diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 83ac0644b0..d33faa5b21 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -344,7 +344,48 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, serialized_require_output_allocator: str, + serialized_require_output_alocator_idx: str, ) -> List[torch.Tensor]: raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
) + + +@no_op_placeholder_for_execute_engine.register_fake # type: ignore +def fake_no_op_placeholder_for_execute_engine( + inputs: List[torch.Tensor], + abi_version: str, + name: str, + serialized_device_info: str, + serialized_engine: str, + serialized_in_binding_names: str, + serialized_out_binding_names: str, + serialized_hardware_compatible: str, + serialized_metadata: str, + serialized_target_platform: str, + serialized_require_output_allocator: str, + serialized_require_output_alocator_idx: str, +) -> List[torch.Tensor]: + """Fake kernel for no_op_placeholder_for_execute_engine. + + Parses serialized_metadata to derive output shapes, mirroring the + execute_engine fake kernel logic. + """ + if serialized_metadata: + try: + metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) + shape_info = metadata.get("inout_symexprs") if metadata else None + if shape_info: + return _apply_symbolic_shape_expressions(inputs, shape_info) + except Exception: + pass + + # Fallback: return one tensor with same shape/dtype as the first input + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode(inputs) if inputs else None + if not inputs: + return [torch.empty(())] + if fake_mode is not None: + return [fake_mode.from_tensor(inputs[0])] + return [torch.empty_like(inputs[0])] diff --git a/py/torch_tensorrt/executorch/backend.py b/py/torch_tensorrt/executorch/backend.py index 1c9ba4c615..b95cf55121 100644 --- a/py/torch_tensorrt/executorch/backend.py +++ b/py/torch_tensorrt/executorch/backend.py @@ -18,40 +18,37 @@ def _get_engine_info_from_edge_program(edge_program: ExportedProgram) -> List[Any]: - """Extract engine info (list of strings/bytes) from the partition's execute_engine node.""" + """Extract engine info (list of strings/bytes) from the partition's no_op_placeholder node. 
+ + Before calling to_edge_transform_and_lower, _save_as_executorch replaces + execute_engine nodes with no_op_placeholder_for_execute_engine whose args are + (inputs_tuple, abi_version, name, device, engine_b64, in_names, out_names, + hw_compat, metadata, platform, requires_oa). This function reads those flat + args back out and returns them as a list indexed by SerializedInfoIndex. + """ gm = edge_program.graph_module - execute_engine_op = torch.ops.tensorrt.execute_engine.default + no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default for node in gm.graph.nodes: - if node.op != "call_function" or node.target is not execute_engine_op: + if node.op != "call_function" or node.target is not no_op: continue + # args layout: (inputs_tuple, *engine_info_strings) + # engine_info_strings has SERIALIZATION_LEN - 1 entries (no RESOURCE_ALLOCATION_STRATEGY) if len(node.args) < 2: - continue - engine_arg = node.args[1] - if engine_arg.op == "get_attr": - val = getattr(gm, engine_arg.target, None) - if val is None: - raise RuntimeError( - f"Engine get_attr({engine_arg.target}) not found on partition module." - ) - if hasattr(val, "__getstate__"): - engine_info = val.__getstate__() - else: - engine_info = getattr(val, "engine_info", val) - if ( - isinstance(engine_info, (list, tuple)) - and len(engine_info) >= SERIALIZATION_LEN - ): - return list(engine_info) raise RuntimeError( - f"Engine argument get_attr({engine_arg.target}) did not yield engine info list (len >= {SERIALIZATION_LEN})." + f"no_op_placeholder node '{node.name}' has too few args: {len(node.args)}" ) - raise RuntimeError( - "TensorRT ExecuTorch backend expects execute_engine(inputs, engine) " - "where engine is a get_attr; cannot find engine." 
- ) + engine_info = list(node.args[1:]) + if len(engine_info) < SERIALIZATION_LEN - 1: + raise RuntimeError( + f"no_op_placeholder node '{node.name}' has {len(engine_info)} engine " + f"info args, expected at least {SERIALIZATION_LEN - 1}" + ) + return engine_info + raise RuntimeError( - "TensorRT ExecuTorch backend: no execute_engine node found in partition." + "TensorRT ExecuTorch backend: no no_op_placeholder_for_execute_engine " + "node found in partition." ) diff --git a/py/torch_tensorrt/executorch/operator_support.py b/py/torch_tensorrt/executorch/operator_support.py index 32763665c2..29991611f9 100644 --- a/py/torch_tensorrt/executorch/operator_support.py +++ b/py/torch_tensorrt/executorch/operator_support.py @@ -7,20 +7,22 @@ class TensorRTOperatorSupport(OperatorSupportBase): # type: ignore[misc] - """Supports only torch.ops.tensorrt.execute_engine for partitioning. + """Supports torch.ops.tensorrt.no_op_placeholder_for_execute_engine for partitioning. - Used so that TRT-compiled graphs (which already contain execute_engine nodes) - are partitioned per engine; each partition is then lowered to TensorRTBackend - which serializes the engine to the same blob format as the TRT runtime. + Prior to calling to_edge_transform_and_lower, _save_as_executorch replaces + execute_engine nodes with no_op_placeholder_for_execute_engine so that + ExecuTorch's edge-lowering passes (which symbolically execute every node) do + not trip on the Engine custom-class schema check. The partitioner therefore + targets the no-op placeholder instead of execute_engine directly. 
""" def __init__(self) -> None: super().__init__() - self._execute_engine_op = torch.ops.tensorrt.execute_engine.default + self._no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default def is_node_supported( self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node ) -> bool: if node.op != "call_function": return False - return node.target is self._execute_engine_op + return node.target is self._no_op diff --git a/setup.py b/setup.py index 72eab98bd5..6115328b47 100644 --- a/setup.py +++ b/setup.py @@ -787,7 +787,7 @@ def get_x86_64_requirements(base_requirements): if IS_DLFW_CI: return requirements else: - requirements = requirements + ["torch>=2.12.0.dev,<2.13.0"] + requirements = requirements + ["torch>=2.11.0,<2.12.0"] if USE_TRT_RTX: return requirements + [ "tensorrt_rtx>=1.3.0.35", From 29a12a9efb4b4479a8dc9b4c20a2bfcd9ad37dcf Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 15:42:52 -0700 Subject: [PATCH 4/7] test --- py/torch_tensorrt/_compile.py | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 0b4470d3ef..c68057a516 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1153,7 +1153,7 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: "(torch_tensorrt_runtime). Reinstall torch_tensorrt with the runtime extension." ) try: - from executorch.exir import to_edge_transform_and_lower + from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig except ImportError: raise ImportError( "ExecuTorch is not installed. Please install it to use output_format='executorch'. " @@ -1165,11 +1165,11 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: # Replace execute_engine with no_op_placeholder before edge lowering so that # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class # schema check. 
- exp_program = _replace_execute_engine_with_no_op(exp_program) - breakpoint() + # exp_program = _replace_execute_engine_with_no_op(exp_program) edge_program = to_edge_transform_and_lower( exp_program, partitioner=[TensorRTPartitioner()], + compile_config=EdgeCompileConfig(_check_ir_validity=False), ) executorch_program = edge_program.to_executorch() with open(file_path, "wb") as f: diff --git a/setup.py b/setup.py index 6115328b47..171f8f481b 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$") # CI_PIPELINE_ID is the environment variable set by DLFW ci build IS_DLFW_CI = os.environ.get("CI_PIPELINE_ID") is not None - +IS_DLFW_CI = True def get_root_dir() -> Path: return Path(__file__).parent.absolute() From ad77701c5a60f56a712105ab84c5226909e32c51 Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 15:53:18 -0700 Subject: [PATCH 5/7] test --- py/torch_tensorrt/_compile.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c68057a516..4d070650e7 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1138,7 +1138,7 @@ def _replace_execute_engine_with_no_op(exp_program: Any) -> Any: return exp_program -def _save_as_executorch(exp_program: Any, file_path: str) -> None: +def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. Partitions the graph by torch.ops.tensorrt.no_op_placeholder_for_execute_engine @@ -1161,14 +1161,18 @@ def _save_as_executorch(exp_program: Any, file_path: str) -> None: ) import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner + extra_partitioners = kwargs.get("partitioners", []) + partitioners = [TensorRTPartitioner()] + extra_partitioners + # DO I NEED THIS? 
# Replace execute_engine with no_op_placeholder before edge lowering so that # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class # schema check. # exp_program = _replace_execute_engine_with_no_op(exp_program) + edge_program = to_edge_transform_and_lower( exp_program, - partitioner=[TensorRTPartitioner()], + partitioner=partitioners, compile_config=EdgeCompileConfig(_check_ir_validity=False), ) executorch_program = edge_program.to_executorch() From 87ac93103171ebf3b91b8c6ec3e7f3451f3e954e Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 16:33:18 -0700 Subject: [PATCH 6/7] test --- py/torch_tensorrt/_compile.py | 92 +------------------ .../runtime/meta_ops/register_meta_ops.py | 50 ---------- py/torch_tensorrt/executorch/backend.py | 57 +++++++----- .../executorch/operator_support.py | 14 ++- 4 files changed, 42 insertions(+), 171 deletions(-) diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 4d070650e7..d0246710e2 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -1057,88 +1057,7 @@ def _extract_tensor(obj: Any) -> Any: ) -def _replace_execute_engine_with_no_op(exp_program: Any) -> Any: - """Replace execute_engine nodes with no_op_placeholder_for_execute_engine. - - ExecuTorch's edge-lowering passes symbolically execute every node before - partitioning runs. The execute_engine schema requires a - ``__torch__.torch.classes.tensorrt.Engine`` argument, but after export the - engine is represented as a ``CustomObjArgument`` — causing a schema type - error inside the pass interpreter. - - The no_op_placeholder op uses flat string arguments instead of a custom - class, so it passes through the edge passes without issue. The - TensorRTPartitioner and TensorRTBackend are updated to work with the - no-op form. 
- """ - import base64 - - import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 - - gm = exp_program.graph_module - execute_engine_op = torch.ops.tensorrt.execute_engine.default - no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default - - nodes_to_replace = [ - n - for n in gm.graph.nodes - if n.op == "call_function" and n.target is execute_engine_op - ] - for node in nodes_to_replace: - inputs_arg = node.args[0] - engine_node = node.args[1] - - if engine_node.op == "get_attr": - engine_obj = getattr(gm, engine_node.target, None) - if engine_obj is None: - raise RuntimeError( - f"execute_engine node '{node.name}': get_attr target " - f"'{engine_node.target}' not found on graph module" - ) - elif engine_node.op == "placeholder": - # After torch.export, get_attr nodes for custom objects are lifted - # into placeholder inputs; the actual object lives in exp_program.constants. - constants = getattr(exp_program, "constants", {}) - engine_obj = constants.get(engine_node.name) or constants.get( - engine_node.target - ) - if engine_obj is None: - raise RuntimeError( - f"execute_engine node '{node.name}': placeholder engine node " - f"'{engine_node.name}' not found in exp_program.constants" - ) - else: - raise RuntimeError( - f"execute_engine node '{node.name}': expected engine arg to be " - f"a get_attr or placeholder node, got op='{engine_node.op}'" - ) - # Get engine info list via __getstate__ (same format as _pack_engine_info()) - from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX - - engine_info = list(engine_obj.__getstate__()) - engine_info = engine_info[0] - # Base64-encode the engine bytes, matching the reference cross-compile path - engine_bytes = engine_info[ENGINE_IDX] - if isinstance(engine_bytes, (bytes, bytearray)): - engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8") - - with gm.graph.inserting_before(node): - no_op_node = gm.graph.call_function(no_op, (inputs_arg, 
*engine_info)) - no_op_node.meta = dict(node.meta) - node.replace_all_uses_with(no_op_node) - gm.graph.erase_node(node) - # Only erase get_attr engine nodes; placeholder nodes belong to the - # exported program's input signature and must not be removed here. - if engine_node.op == "get_attr": - gm.graph.erase_node(engine_node) - - gm.graph.eliminate_dead_code() - gm.graph.lint() - gm.recompile() - return exp_program - - -def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: +def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None: """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file. Partitions the graph by torch.ops.tensorrt.no_op_placeholder_for_execute_engine @@ -1153,7 +1072,7 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: "(torch_tensorrt_runtime). Reinstall torch_tensorrt with the runtime extension." ) try: - from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig + from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower except ImportError: raise ImportError( "ExecuTorch is not installed. Please install it to use output_format='executorch'. " @@ -1161,15 +1080,10 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs) -> None: ) import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops # noqa: F401 from torch_tensorrt.executorch import TensorRTPartitioner + extra_partitioners = kwargs.get("partitioners", []) partitioners = [TensorRTPartitioner()] + extra_partitioners - # DO I NEED THIS? - # Replace execute_engine with no_op_placeholder before edge lowering so that - # ExecuTorch's symbolic-execution passes don't trip on the Engine custom-class - # schema check. 
- # exp_program = _replace_execute_engine_with_no_op(exp_program) - edge_program = to_edge_transform_and_lower( exp_program, partitioner=partitioners, diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index d33faa5b21..78a517e6fc 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -217,15 +217,6 @@ def fake_tensorrt_execute_engine( so partitioners can run without a real engine. Otherwise uses symbolic shape expressions from metadata to infer output shapes. """ - if _is_placeholder_engine(fake_trt_engine): - from torch._guards import detect_fake_mode - - fake_mode = detect_fake_mode(inputs) if inputs else None - if not inputs: - return [torch.empty(())] - if fake_mode is not None: - return [fake_mode.from_tensor(inputs[0])] - return [torch.empty_like(inputs[0])] metadata = None if hasattr(fake_trt_engine, "real_obj"): @@ -239,7 +230,6 @@ def fake_tensorrt_execute_engine( ) shape_info = metadata.get("inout_symexprs") if metadata else None - if shape_info: return _apply_symbolic_shape_expressions(inputs, shape_info) else: @@ -349,43 +339,3 @@ def no_op_placeholder_for_execute_engine( raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
) - - -@no_op_placeholder_for_execute_engine.register_fake # type: ignore -def fake_no_op_placeholder_for_execute_engine( - inputs: List[torch.Tensor], - abi_version: str, - name: str, - serialized_device_info: str, - serialized_engine: str, - serialized_in_binding_names: str, - serialized_out_binding_names: str, - serialized_hardware_compatible: str, - serialized_metadata: str, - serialized_target_platform: str, - serialized_require_output_allocator: str, - serialized_require_output_alocator_idx: str, -) -> List[torch.Tensor]: - """Fake kernel for no_op_placeholder_for_execute_engine. - - Parses serialized_metadata to derive output shapes, mirroring the - execute_engine fake kernel logic. - """ - if serialized_metadata: - try: - metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) - shape_info = metadata.get("inout_symexprs") if metadata else None - if shape_info: - return _apply_symbolic_shape_expressions(inputs, shape_info) - except Exception: - pass - - # Fallback: return one tensor with same shape/dtype as the first input - from torch._guards import detect_fake_mode - - fake_mode = detect_fake_mode(inputs) if inputs else None - if not inputs: - return [torch.empty(())] - if fake_mode is not None: - return [fake_mode.from_tensor(inputs[0])] - return [torch.empty_like(inputs[0])] diff --git a/py/torch_tensorrt/executorch/backend.py b/py/torch_tensorrt/executorch/backend.py index b95cf55121..5ed8a01893 100644 --- a/py/torch_tensorrt/executorch/backend.py +++ b/py/torch_tensorrt/executorch/backend.py @@ -10,45 +10,54 @@ PreprocessResult, ) from torch.export.exported_program import ExportedProgram -from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ( - ENGINE_IDX, - SERIALIZATION_LEN, -) +from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX from torch_tensorrt.executorch.serialization import serialize_engine_info def _get_engine_info_from_edge_program(edge_program: ExportedProgram) -> List[Any]: - """Extract engine 
info (list of strings/bytes) from the partition's no_op_placeholder node. + """Extract engine info (list of strings/bytes) from the partition's execute_engine node. - Before calling to_edge_transform_and_lower, _save_as_executorch replaces - execute_engine nodes with no_op_placeholder_for_execute_engine whose args are - (inputs_tuple, abi_version, name, device, engine_b64, in_names, out_names, - hw_compat, metadata, platform, requires_oa). This function reads those flat - args back out and returns them as a list indexed by SerializedInfoIndex. + The partition contains a single execute_engine node whose second argument is + either a get_attr node (engine on the graph module) or a placeholder node + (engine lifted into edge_program.constants by torch.export). Either way, + the engine object's __getstate__() returns the SERIALIZATION_LEN-item list + used by the TRT runtime blob format. """ gm = edge_program.graph_module - no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default + execute_engine_op = torch.ops.tensorrt.execute_engine.default for node in gm.graph.nodes: - if node.op != "call_function" or node.target is not no_op: + if node.op != "call_function" or node.target is not execute_engine_op: continue - # args layout: (inputs_tuple, *engine_info_strings) - # engine_info_strings has SERIALIZATION_LEN - 1 entries (no RESOURCE_ALLOCATION_STRATEGY) - if len(node.args) < 2: - raise RuntimeError( - f"no_op_placeholder node '{node.name}' has too few args: {len(node.args)}" + + engine_node = node.args[1] + if engine_node.op == "get_attr": + engine_obj = getattr(gm, engine_node.target, None) + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': get_attr target " + f"'{engine_node.target}' not found on graph module" + ) + elif engine_node.op == "placeholder": + constants = getattr(edge_program, "constants", {}) + engine_obj = constants.get(engine_node.name) or constants.get( + engine_node.target ) - engine_info = 
list(node.args[1:]) - if len(engine_info) < SERIALIZATION_LEN - 1: + if engine_obj is None: + raise RuntimeError( + f"execute_engine node '{node.name}': placeholder engine " + f"'{engine_node.name}' not found in edge_program.constants" + ) + else: raise RuntimeError( - f"no_op_placeholder node '{node.name}' has {len(engine_info)} engine " - f"info args, expected at least {SERIALIZATION_LEN - 1}" + f"execute_engine node '{node.name}': unexpected engine arg op " + f"'{engine_node.op}'" ) - return engine_info + + return list(engine_obj.__getstate__()) raise RuntimeError( - "TensorRT ExecuTorch backend: no no_op_placeholder_for_execute_engine " - "node found in partition." + "TensorRT ExecuTorch backend: no execute_engine node found in partition." ) diff --git a/py/torch_tensorrt/executorch/operator_support.py b/py/torch_tensorrt/executorch/operator_support.py index 29991611f9..32763665c2 100644 --- a/py/torch_tensorrt/executorch/operator_support.py +++ b/py/torch_tensorrt/executorch/operator_support.py @@ -7,22 +7,20 @@ class TensorRTOperatorSupport(OperatorSupportBase): # type: ignore[misc] - """Supports torch.ops.tensorrt.no_op_placeholder_for_execute_engine for partitioning. + """Supports only torch.ops.tensorrt.execute_engine for partitioning. - Prior to calling to_edge_transform_and_lower, _save_as_executorch replaces - execute_engine nodes with no_op_placeholder_for_execute_engine so that - ExecuTorch's edge-lowering passes (which symbolically execute every node) do - not trip on the Engine custom-class schema check. The partitioner therefore - targets the no-op placeholder instead of execute_engine directly. + Used so that TRT-compiled graphs (which already contain execute_engine nodes) + are partitioned per engine; each partition is then lowered to TensorRTBackend + which serializes the engine to the same blob format as the TRT runtime. 
""" def __init__(self) -> None: super().__init__() - self._no_op = torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default + self._execute_engine_op = torch.ops.tensorrt.execute_engine.default def is_node_supported( self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node ) -> bool: if node.op != "call_function": return False - return node.target is self._no_op + return node.target is self._execute_engine_op From faac9094a7468ee406884482992936525a016f5b Mon Sep 17 00:00:00 2001 From: Lan Luo Date: Tue, 24 Mar 2026 16:40:13 -0700 Subject: [PATCH 7/7] test --- py/torch_tensorrt/dynamo/_compiler.py | 3 +++ .../dynamo/runtime/_TorchTensorRTModule.py | 2 +- .../runtime/meta_ops/register_meta_ops.py | 27 +++++-------------- setup.py | 4 ++- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 117a87abc8..79cd025925 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -9,6 +9,9 @@ import torch from torch.export import ExportedProgram + +# TODO: remove this in future, this is just for test executorch which uses torch 2.11 which has a bug in the leaf spec compat +# the bug has been fixed in the torch 2.12 in the upstream. 
from torch_tensorrt.dynamo._leaf_spec_compat import _apply_leaf_spec_patch _apply_leaf_spec_patch() diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 2de6b10810..d77c0bf39f 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -338,7 +338,6 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: Returns: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ - if self.engine is None: raise RuntimeError("Engine has not been setup yet.") @@ -355,6 +354,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: (i if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) for i in inputs ] + outputs: List[torch.Tensor] = torch.ops.tensorrt.execute_engine( list(input_tensors), self.engine ) diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index 78a517e6fc..e03c88153c 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -189,22 +189,6 @@ def fake_aten_cudnn_grid_sampler( return torch.empty(out_shape, dtype=input.dtype, device=input.device) -def _is_placeholder_engine(engine: Any) -> bool: - """True if engine is a placeholder (CustomObjArgument/FakeScriptObject) from export.""" - if engine is None: - return True - type_name = type(engine).__name__ - if type_name == "CustomObjArgument": - return True - if type_name == "FakeScriptObject": - return True - if hasattr(engine, "fake_val") and engine.fake_val is not None: - return True - if not hasattr(engine, "get_serialized_metadata"): - return True - return False - - @torch.library.register_fake("tensorrt::execute_engine") # type: ignore def fake_tensorrt_execute_engine( inputs: List[torch.Tensor], 
fake_trt_engine: Any @@ -212,14 +196,13 @@ def fake_tensorrt_execute_engine( """ Meta kernel for TensorRT engine execution. - When the engine is a placeholder (CustomObjArgument/FakeScriptObject from - torch.export/ExecuTorch), returns one fake output per input (same shape/dtype) - so partitioners can run without a real engine. Otherwise uses symbolic shape - expressions from metadata to infer output shapes. + Uses symbolic shape expressions captured at compile time to correctly infer + output shapes while preserving symbolic SymInt relationships. """ metadata = None if hasattr(fake_trt_engine, "real_obj"): + # Wrapped C++ engine with real_obj trt_engine = fake_trt_engine.real_obj metadata = TorchTensorRTModule.decode_metadata( trt_engine.get_serialized_metadata() @@ -230,7 +213,10 @@ def fake_tensorrt_execute_engine( ) shape_info = metadata.get("inout_symexprs") if metadata else None + if shape_info: + # Apply the symbolic shape expressions to create output fake tensors + # shape_info now contains both 'inputs' and 'outputs' keys return _apply_symbolic_shape_expressions(inputs, shape_info) else: raise RuntimeError( @@ -334,7 +320,6 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, serialized_require_output_allocator: str, - serialized_require_output_alocator_idx: str, ) -> List[torch.Tensor]: raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
diff --git a/setup.py b/setup.py
index 171f8f481b..7445818cd9 100644
--- a/setup.py
+++ b/setup.py
@@ -35,8 +35,10 @@
 LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")
 # CI_PIPELINE_ID is the environment variable set by DLFW ci build
 IS_DLFW_CI = os.environ.get("CI_PIPELINE_ID") is not None
+# TODO: remove this in the future; it is only for testing ExecuTorch, which tries to use whatever torch version ExecuTorch itself pins
 IS_DLFW_CI = True
+
 
 def get_root_dir() -> Path:
     return Path(__file__).parent.absolute()
 
@@ -787,7 +789,7 @@ def get_x86_64_requirements(base_requirements):
     if IS_DLFW_CI:
         return requirements
     else:
-        requirements = requirements + ["torch>=2.11.0,<2.12.0"]
+        requirements = requirements + ["torch>=2.12.0.dev,<2.13.0"]
    if USE_TRT_RTX:
         return requirements + [
             "tensorrt_rtx>=1.3.0.35",