From 3112a4e884917026394d3f873bb748159885f7c0 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Sun, 1 Feb 2026 22:51:59 +0000 Subject: [PATCH 1/5] Integrate Automated QDQ placement tool - part 3.3 Signed-off-by: Will Guo --- .../onnx/quantization/autotune/__main__.py | 310 +++++++++++++++ .../onnx/quantization/autotune/workflows.py | 376 ++++++++++++++++++ .../onnx/quantization/autotune/test_config.py | 104 +++++ 3 files changed, 790 insertions(+) create mode 100644 modelopt/onnx/quantization/autotune/__main__.py create mode 100644 modelopt/onnx/quantization/autotune/workflows.py create mode 100644 tests/unit/onnx/quantization/autotune/test_config.py diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py new file mode 100644 index 000000000..ea2f99856 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Command-line interface for ONNX Q/DQ autotuning.""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) + +DEFAULT_OUTPUT_DIR = "./autotuner_output" +DEFAULT_NUM_SCHEMES = 30 +DEFAULT_QUANT_TYPE = "int8" +DEFAULT_DQ_DTYPE = "float32" +DEFAULT_TIMING_CACHE = "/tmp/trtexec_timing.cache" # nosec B108 +DEFAULT_WARMUP_RUNS = 5 +DEFAULT_TIMING_RUNS = 20 + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. + + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def log_benchmark_config(args): + """Log TensorRT benchmark configuration for transparency. + + Logs timing cache path, warmup/timing run counts, and any custom + plugin libraries that will be loaded. + + Args: + args: Parsed command-line arguments with benchmark configuration + """ + logger.info("Initializing TensorRT benchmark") + logger.info(f" Timing cache: {args.timing_cache}") + logger.info(f" Warmup runs: {args.warmup_runs}") + logger.info(f" Timing runs: {args.timing_runs}") + if args.plugin_libraries: + logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") + if hasattr(args, "trtexec_benchmark_args") and args.trtexec_benchmark_args: + logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") + + +def run_autotune(args=None) -> int: + """Execute the complete pattern-based Q/DQ autotuning workflow. 
+ + This function orchestrates the entire optimization process: + 1. Parses command-line arguments (if not provided) + 2. Validates input paths (model, baseline, output directory) + 3. Initializes TensorRT benchmark instance + 4. Runs pattern-based region autotuning workflow + 5. Handles interruptions gracefully with state preservation + + Args: + args: Optional parsed command-line arguments. If None, parses sys.argv. + + Returns: + Exit code: + - 0: Success + - 1: Autotuning failed (exception occurred) + - 130: Interrupted by user (Ctrl+C) + """ + if args is None: + args = _get_autotune_parser().parse_args() + + model_path = validate_file_path(args.onnx_path, "Model file") + validate_file_path(args.qdq_baseline, "QDQ baseline model") + output_dir = Path(args.output) + + log_benchmark_config(args) + trtexec_args = getattr(args, "trtexec_benchmark_args", None) + benchmark_instance = init_benchmark_instance( + use_trtexec=args.use_trtexec, + plugin_libraries=args.plugin_libraries, + timing_cache_file=args.timing_cache, + warmup_runs=args.warmup_runs, + timing_runs=args.timing_runs, + trtexec_args=trtexec_args, + ) + + if benchmark_instance is None: + logger.error("Failed to initialize TensorRT benchmark") + return 1 + + logger.info("Autotuning Mode: Pattern-Based") + + try: + node_filter_list = None + if args.node_filter_list: + filter_file = validate_file_path(args.node_filter_list, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() + for line in f + if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + + region_pattern_autotuning_workflow( + model_path=str(model_path), + output_dir=output_dir, + num_schemes_per_region=args.num_schemes, + pattern_cache_file=args.pattern_cache_file, + state_file=args.state_file, + quant_type=args.quant_type, + default_dq_dtype=args.default_dq_dtype, + qdq_baseline_model=args.qdq_baseline, + node_filter_list=node_filter_list, + verbose=args.verbose, + ) + + logger.info("\n" + "=" * 70) + logger.info("✓ Autotuning completed successfully!") + logger.info(f"✓ Results: {output_dir}") + logger.info("=" * 70) + return 0 + + except KeyboardInterrupt: + logger.warning("\nInterrupted by user") + state_file = args.state_file or output_dir / "autotuner_state.yaml" + logger.info(f"Progress saved to: {state_file}") + return 130 + + except Exception as e: + logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) + return 1 + + +def _get_autotune_parser() -> argparse.ArgumentParser: + """Create and configure the command-line argument parser.""" + parser = argparse.ArgumentParser( + prog="modelopt.onnx.quantization.autotune", + description="ONNX Q/DQ Autotuning with TensorRT", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx + + # Import patterns from QDQ baseline model + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --qdq_baseline baseline.onnx + + # Use pattern cache for warm-start + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml + + # Full example with all options + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --schemes_per_region 50 \\ + --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ + --quant_type int8 --verbose + """, + ) + + # Model and Output + io_group = 
parser.add_argument_group("Model and Output")
+    io_group.add_argument(
+        "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file"
+    )
+    io_group.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default=DEFAULT_OUTPUT_DIR,
+        help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})",
+    )
+
+    # Autotuning Strategy
+    strategy_group = parser.add_argument_group("Autotuning Strategy")
+    strategy_group.add_argument(
+        "--schemes_per_region",
+        "-s",
+        type=int,
+        default=DEFAULT_NUM_SCHEMES,
+        dest="num_schemes",
+        help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})",
+    )
+    strategy_group.add_argument(
+        "--pattern_cache",
+        type=str,
+        default=None,
+        dest="pattern_cache_file",
+        help="Path to pattern cache YAML for warm-start (optional)",
+    )
+    strategy_group.add_argument(
+        "--qdq_baseline",
+        type=str,
+        default=None,
+        help="Path to QDQ baseline ONNX model to import quantization patterns (optional)",
+    )
+    strategy_group.add_argument(
+        "--state_file",
+        type=str,
+        default=None,
+        help="State file path for resume capability (default: <output>/autotuner_state.yaml)",
+    )
+    strategy_group.add_argument(
+        "--node_filter_list",
+        type=str,
+        default=None,
+        help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). "
+        "Regions without any matching nodes are skipped during autotuning.",
+    )
+
+    # Quantization
+    quant_group = parser.add_argument_group("Quantization")
+    quant_group.add_argument(
+        "--quant_type",
+        type=str,
+        default=DEFAULT_QUANT_TYPE,
+        choices=["int8", "fp8"],
+        help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})",
+    )
+    quant_group.add_argument(
+        "--default_dq_dtype",
+        type=str,
+        default=DEFAULT_DQ_DTYPE,
+        choices=["float16", "float32", "bfloat16"],
+        help=f"Default DQ output dtype when it cannot be deduced (default: {DEFAULT_DQ_DTYPE})",
+    )
+
+    # TensorRT Benchmark
+    trt_group = parser.add_argument_group("TensorRT Benchmark")
+    trt_group.add_argument(
+        "--use_trtexec",
+        action="store_true",
+        help="Use trtexec for benchmarking (default: False)",
+        default=False,
+    )
+    trt_group.add_argument(
+        "--timing_cache",
+        type=str,
+        default=DEFAULT_TIMING_CACHE,
+        help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})",
+    )
+    trt_group.add_argument(
+        "--warmup_runs",
+        type=int,
+        default=DEFAULT_WARMUP_RUNS,
+        help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})",
+    )
+    trt_group.add_argument(
+        "--timing_runs",
+        type=int,
+        default=DEFAULT_TIMING_RUNS,
+        help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})",
+    )
+    trt_group.add_argument(
+        "--plugin_libraries",
+        "--plugins",
+        type=str,
+        nargs="+",
+        default=None,
+        dest="plugin_libraries",
+        help="TensorRT plugin libraries (.so files) to load (optional, space-separated)",
+    )
+    trt_group.add_argument(
+        "--trtexec_benchmark_args",
+        type=str,
+        default=None,
+        help="Additional command-line arguments to pass to trtexec as a single quoted string. 
" + "Example: --trtexec_benchmark_args '--fp16 --workspace=4096 --verbose'", + ) + + # Logging + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") + + return parser + + +if __name__ == "__main__": + sys.exit(run_autotune()) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py new file mode 100644 index 000000000..17ae3aa7e --- /dev/null +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -0,0 +1,376 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ONNX Q/DQ Autotuning Workflows. + +This module provides high-level workflow functions for automated Q/DQ (Quantization/Dequantization) +optimization of ONNX models using pattern-based region analysis and TensorRT performance measurement. +""" + +import fnmatch +from pathlib import Path + +import onnx + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.autotuner import QDQAutotuner +from modelopt.onnx.quantization.autotune.benchmark import TensorRTPyBenchmark, TrtExecBenchmark +from modelopt.onnx.quantization.autotune.common import Config, PatternCache +from modelopt.onnx.quantization.qdq_utils import get_quantized_tensors + +_benchmark_instance = None + + +def benchmark_onnx_model( + model_path: str | bytes, log_file: str | None = None, flush_timing_cache: bool = False +) -> float: + """Benchmark ONNX model inference latency using TensorRT Python API. + + Args: + model_path: Path to ONNX model file, or bytes containing serialized model protobuf + log_file: Optional path to save detailed TensorRT build and benchmark logs + (default: None, no logging) + flush_timing_cache: If True, flushes TensorRT timing cache before building engine. + Useful for periodic cache refresh (default: False) + + Returns: + Measured median inference latency in milliseconds. + Returns float('inf') on failure (invalid model, build error, etc.) 
+ + Raises: + No exceptions raised - errors are caught and logged, returning float('inf') + """ + global _benchmark_instance + + if _benchmark_instance is None: + logger.error("Benchmark instance not initialized") + return float("inf") + + try: + latency = _benchmark_instance.run( + model_path, log_file=log_file, flush_timing_cache=flush_timing_cache + ) + + if latency == float("inf"): + if isinstance(model_path, bytes): + logger.warning("Benchmark failed for model bytes") + else: + logger.warning(f"Benchmark failed: {model_path}") + return float("inf") + + logger.debug(f"Benchmark result: {latency:.2f} ms") + return latency + + except Exception as e: + logger.error(f"Benchmark error: {e}", exc_info=True) + return float("inf") + + +def init_benchmark_instance( + use_trtexec: bool = False, + plugin_libraries: list[str] | None = None, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 20, + trtexec_args: str | None = None, +): + """Initialize global TensorRT benchmark instance for model performance measurement. + + Args: + use_trtexec: Whether to use trtexec for benchmarking. + plugin_libraries: List of paths to TensorRT plugin shared libraries (.so files). + These plugins will be loaded by trtexec or TensorRT Python API during engine building. + If None, no custom plugins are loaded. + timing_cache_file: Path to TensorRT timing cache file for faster engine builds. + If None, uses default "trtexec_timing.cache" (default: None) + warmup_runs: Number of warmup inference iterations before measurement. + Allows GPU to reach stable performance state (default: 5) + timing_runs: Number of timed inference iterations for latency measurement. + Higher values give more stable median (default: 20) + trtexec_args: Additional command-line arguments to pass to trtexec as a string (only used if use_trtexec=True). + Example: '--fp16 --workspace=4096 --verbose' + """ + global _benchmark_instance + try: + if use_trtexec: + _benchmark_instance = TrtExecBenchmark( + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + plugin_libraries=plugin_libraries, + trtexec_args=trtexec_args, + ) + logger.info("Trtexec benchmark initialized") + else: + _benchmark_instance = TensorRTPyBenchmark( + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + plugin_libraries=plugin_libraries, + ) + logger.info("TensorRT Python API benchmark initialized") + logger.debug( + f"Settings: warmup={warmup_runs}, timing={timing_runs}, " + f"cache={timing_cache_file or 'trtexec_timing.cache'}, plugin_libraries={plugin_libraries}" + ) + return _benchmark_instance + except Exception as e: + logger.error(f"TensorRT initialization failed: {e}", exc_info=True) + return None + + +def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: + """Check if any node in the region matches any of the filter patterns. 
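+
+    Matching uses ``fnmatch``-style wildcards, e.g.::
+
+        >>> import fnmatch
+        >>> fnmatch.fnmatch("encoder/layer_0/attention/MatMul", "*/attention/*")
+        True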
+
+    Args:
+        region: Region object to check
+        graph: ONNX graph (graphsurgeon) containing node information
+        filter_patterns: List of wildcard patterns to match against node names
+
+    Returns:
+        True if at least one node in the region matches any pattern, False otherwise
+    """
+    if not filter_patterns:
+        return True
+
+    node_indices = region.get_all_nodes_recursive()
+
+    for node_idx in node_indices:
+        if node_idx < len(graph.nodes):
+            node_name = graph.nodes[node_idx].name
+            for pattern in filter_patterns:
+                if fnmatch.fnmatch(node_name, pattern):
+                    return True
+
+    return False
+
+
+def region_pattern_autotuning_workflow(
+    model_path: str,
+    output_dir: Path,
+    num_schemes_per_region: int = 30,
+    pattern_cache_file: str | None = None,
+    state_file: str | None = None,
+    quant_type: str = "int8",
+    default_dq_dtype: str = "float32",
+    qdq_baseline_model: str | None = None,
+    node_filter_list: list[str] | None = None,
+    verbose: bool = False,
+) -> QDQAutotuner:
+    """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model.
+
+    This workflow uses pattern-based region optimization to efficiently find optimal
+    Q/DQ insertion points. The key insight: regions with identical structural patterns
+    can share the same Q/DQ scheme. When a best scheme is found for a pattern, it
+    automatically applies to all regions matching that pattern, making optimization
+    both efficient and consistent.
+
+    Automatically discovers regions, generates and tests Q/DQ insertion schemes,
+    and exports the optimized model. Supports incremental state saving for crash recovery
+    and pattern cache-based warm-start.
+
+    **Workflow Steps:**
+    1. Load model and initialize autotuner with automatic hierarchical region discovery
+    2. Resume from checkpoint if state file exists (crash recovery)
+    3. Load pattern cache if provided (warm-start with known-good schemes)
+    4. Import Q/DQ patterns from baseline model if provided (transfer learning)
+    5. Measure baseline performance without Q/DQ insertions
+    6. For each discovered region pattern:
+       a. Generate Q/DQ insertion schemes (pattern-relative)
+       b. Build TensorRT engine and measure latency for each scheme
+       c. Select best scheme for this pattern (applies to all matching regions)
+       d. Save checkpoint and intermediate model
+    7. Export final optimized model with best Q/DQ scheme for each pattern
+
+    Args:
+        model_path: Path to ONNX model file to optimize
+        output_dir: Directory for output files (state, logs, models). Created if it doesn't exist.
+        num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern.
+            Higher values explore more configurations but take longer (default: 30)
+        pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes
+            from previous runs. Enables warm-start optimization (default: None)
+        state_file: Optional path to state file for checkpoint/resume. If None, automatically
+            uses <output_dir>/autotuner_state.yaml (default: None)
+        quant_type: Quantization data type - "int8" (default) or "fp8"
+        default_dq_dtype: Default DQ output dtype when it cannot be deduced (default: "float32")
+        qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided,
+            extracts Q/DQ insertion patterns and adds them to pattern cache
+            for warm-start (default: None)
+        node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. 
Regions + without any matching nodes are skipped during autotuning (default: None) + verbose: Enable verbose logging in Config for detailed autotuner output (default: False) + + Returns: + QDQAutotuner instance after autotuning + """ + output_dir.mkdir(parents=True, exist_ok=True) + logs_dir = output_dir / "logs" + logs_dir.mkdir(exist_ok=True) + models_dir = output_dir / "region_models" + models_dir.mkdir(exist_ok=True) + + if state_file is None: + state_file = str(output_dir / "autotuner_state.yaml") + state_path = Path(state_file) + + logger.info(f"Loading model: {model_path}") + model = onnx.load(model_path) + + pattern_cache = None + if pattern_cache_file: + pattern_cache_path = Path(pattern_cache_file) + if pattern_cache_path.exists(): + pattern_cache = PatternCache.load(str(pattern_cache_path)) + logger.info( + f"Loaded pattern cache: {pattern_cache.num_patterns} patterns, " + f"{pattern_cache.total_schemes} schemes" + ) + else: + logger.warning(f"Pattern cache not found: {pattern_cache_file}") + + logger.info( + f"Initializing autotuner (quant_type={quant_type}, default_dq_dtype={default_dq_dtype})" + ) + config = Config( + default_quant_type=quant_type, + default_dq_dtype=default_dq_dtype, + verbose=verbose, + ) + + autotuner = QDQAutotuner(model) + autotuner.initialize(config, pattern_cache) + + if state_path.exists(): + logger.info(f"Resuming from checkpoint: {state_path}") + autotuner.load_state(str(state_path)) + else: + logger.info("Starting new autotuning session") + + if qdq_baseline_model: + qdq_baseline_path = Path(qdq_baseline_model) + if qdq_baseline_path.exists(): + logger.info(f"Importing patterns from QDQ baseline: {qdq_baseline_model}") + qdq_model = onnx.load(str(qdq_baseline_path)) + quantized_tensors = get_quantized_tensors(qdq_model) + logger.debug(f"Found {len(quantized_tensors)} quantized tensors in baseline") + autotuner.import_insertion_points(quantized_tensors) + logger.info("Pattern import complete") + else: + logger.warning(f"QDQ baseline not found: {qdq_baseline_model}") + + regions = autotuner.regions + logger.info(f"Ready to profile {len(regions)} regions") + + if autotuner.baseline_latency_ms is None: + logger.info("Measuring baseline (no Q/DQ)") + baseline_path = output_dir / "baseline.onnx" + autotuner.export_onnx(str(baseline_path), insert_qdq=False) + baseline_log = logs_dir / "baseline.log" + baseline_latency = benchmark_onnx_model(str(baseline_path), str(baseline_log)) + autotuner.submit(baseline_latency) + logger.info(f"Baseline: {baseline_latency:.2f} ms") + else: + baseline_latency = autotuner.baseline_latency_ms + logger.info(f"Using baseline from checkpoint: {baseline_latency:.2f} ms") + + logger.info(f"Starting region profiling ({num_schemes_per_region} schemes per region)") + + iteration_count = 0 + + for region_idx, region in enumerate(regions): + logger.info( + f"Region {region_idx + 1}/{len(regions)} (ID={region.id}, level={region.get_level()})" + ) + + if node_filter_list and not _region_matches_filter( + region, autotuner.graph, node_filter_list + ): + logger.info(" Skipping (no nodes match filter patterns)") + continue + + commit = region_idx > 0 + autotuner.set_profile_region(region, commit=commit) + + if autotuner.current_profile_pattern_schemes is None: + logger.info(" Skipping (already profiled)") + continue + + schemes_tested = 0 + for scheme_num in range(num_schemes_per_region): + iteration_count += 1 + scheme_idx = autotuner.generate() + + if scheme_idx == -1: + logger.debug(f" Stopping at scheme {scheme_num + 1} (no more 
unique schemes)") + break + + schemes_tested += 1 + model_bytes = autotuner.export_onnx(None, insert_qdq=True) + test_log = logs_dir / f"region_{region.id}_scheme_{scheme_idx}.log" + flush_timing_cache = (iteration_count % 10) == 0 + latency = benchmark_onnx_model( + model_bytes, str(test_log), flush_timing_cache=flush_timing_cache + ) + + autotuner.submit(latency, success=(latency != float("inf"))) + + ps = autotuner.current_profile_pattern_schemes + if ps and ps.schemes: + best_scheme = ps.best_scheme + if best_scheme and best_scheme.latency_ms < float("inf") and baseline_latency > 0: + speedup = baseline_latency / best_scheme.latency_ms + logger.info( + f" Tested {schemes_tested} schemes: " + f"best {best_scheme.latency_ms:.2f} ms ({speedup:.3f}x speedup)" + ) + else: + logger.info(f" Tested {schemes_tested} schemes: no valid measurements") + else: + logger.info(f" Tested {schemes_tested} schemes") + + region_model_path = models_dir / f"region_{region.id}_level_{region.get_level()}.onnx" + autotuner.export_onnx(str(region_model_path), insert_qdq=True, best=True) + logger.debug(f" Saved best model: {region_model_path.name}") + + # Save state after each region (incremental, crash recovery) + autotuner.save_state(str(state_path)) + logger.debug(" Checkpoint saved") + + # Commit final region + autotuner.set_profile_region(None, commit=True) + + logger.info("Exporting final optimized model") + final_model_path = output_dir / "optimized_final.onnx" + autotuner.export_onnx(str(final_model_path), insert_qdq=True) + final_log = logs_dir / "final.log" + final_latency = benchmark_onnx_model(str(final_model_path), str(final_log)) + + if final_latency > 0 and final_latency != float("inf"): + speedup = baseline_latency / final_latency + logger.info( + f"Results: {baseline_latency:.2f} ms → {final_latency:.2f} ms ({speedup:.3f}x speedup)" + ) + else: + logger.info(f"Results: {baseline_latency:.2f} ms → failed (invalid measurement)") + + autotuner.save_state(str(state_path)) + + logger.info("Autotuning complete") + logger.info(f" Final model: {final_model_path}") + logger.info(f" State: {state_path}") + logger.debug(f" Logs: {logs_dir}") + logger.debug(f" Region models: {models_dir}") + + return autotuner diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py new file mode 100644 index 000000000..ed4840e3e --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for the Config class in the autotuner. + +Tests configuration parameter validation and defaults. 
+""" + +import os +import sys +import unittest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from modelopt.onnx.quantization.autotune.common import Config + + +class TestConfig(unittest.TestCase): + """Test Config class functionality.""" + + def test_default_values(self): + """Test that Config has correct default values.""" + config = Config() + + # Logging + assert not config.verbose + + # Performance thresholds + + # Q/DQ defaults + assert config.default_q_scale == 0.1 + assert config.default_q_zero_point == 0 + assert config.default_quant_type == "int8" + + # Region builder settings + assert config.maximum_sequence_region_size == 10 + assert config.minimum_topdown_search_size == 10 + + # Scheme generation parameters + assert config.top_percent_to_mutate == 0.1 + assert config.minimum_schemes_to_mutate == 10 + assert config.maximum_mutations == 3 + assert config.maximum_generation_attempts == 100 + + # Pattern cache parameters + assert config.pattern_cache_minimum_distance == 4 + assert config.pattern_cache_max_entries_per_pattern == 32 + + def test_custom_values(self): + """Test creating Config with custom values.""" + config = Config( + verbose=True, + default_q_scale=0.05, + default_q_zero_point=128, + default_quant_type="fp8", + maximum_sequence_region_size=20, + ) + + assert config.verbose + assert config.default_q_scale == 0.05 + assert config.default_q_zero_point == 128 + assert config.default_quant_type == "fp8" + assert config.maximum_sequence_region_size == 20 + + def test_region_size_validation(self): + """Test that region size parameters are positive.""" + config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) + assert config.maximum_sequence_region_size > 0 + assert config.minimum_topdown_search_size > 0 + + def test_genetic_algorithm_params(self): + """Test genetic algorithm parameters.""" + config = Config( + top_percent_to_mutate=0.2, + minimum_schemes_to_mutate=2, + maximum_mutations=5, + maximum_generation_attempts=50, + ) + + assert config.top_percent_to_mutate == 0.2 + assert config.minimum_schemes_to_mutate == 2 + assert config.maximum_mutations == 5 + assert config.maximum_generation_attempts == 50 + + def test_pattern_cache_params(self): + """Test pattern cache parameters.""" + config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) + + assert config.pattern_cache_minimum_distance == 3 + assert config.pattern_cache_max_entries_per_pattern == 10 From e3ad6da0a17987447789f75f9b4a32c163d475eb Mon Sep 17 00:00:00 2001 From: Will Guo Date: Mon, 9 Feb 2026 08:42:25 +0000 Subject: [PATCH 2/5] remove unused statements Signed-off-by: Will Guo --- tests/unit/onnx/quantization/autotune/test_config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py index ed4840e3e..025cdb94e 100644 --- a/tests/unit/onnx/quantization/autotune/test_config.py +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -19,13 +19,8 @@ Tests configuration parameter validation and defaults. 
""" -import os -import sys import unittest -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from modelopt.onnx.quantization.autotune.common import Config From 94ef42380987cf139915d5f079c52115dd656a2e Mon Sep 17 00:00:00 2001 From: Will Guo Date: Tue, 10 Feb 2026 03:25:54 +0000 Subject: [PATCH 3/5] resolve comments Signed-off-by: Will Guo --- .../onnx/quantization/autotune/__main__.py | 28 +++---- modelopt/onnx/quantization/autotune/common.py | 74 +++++++++++++++++++ 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index ea2f99856..4dab990ac 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -78,18 +77,14 @@ def log_benchmark_config(args): logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") -def run_autotune(args=None) -> int: +def run_autotune() -> int: """Execute the complete pattern-based Q/DQ autotuning workflow. - This function orchestrates the entire optimization process: - 1. Parses command-line arguments (if not provided) - 2. Validates input paths (model, baseline, output directory) - 3. Initializes TensorRT benchmark instance - 4. Runs pattern-based region autotuning workflow - 5. Handles interruptions gracefully with state preservation - - Args: - args: Optional parsed command-line arguments. If None, parses sys.argv. + Parses command-line arguments, then: + 1. Validates input paths (model, baseline, output directory) + 2. Initializes TensorRT benchmark instance + 3. Runs pattern-based region autotuning workflow + 4. 
Handles interruptions gracefully with state preservation
 
     Returns:
         Exit code:
@@ -97,12 +92,10 @@ def run_autotune(args=None) -> int:
         - 130: Interrupted by user (Ctrl+C)
     """
-    if args is None:
-        args = _get_autotune_parser().parse_args()
-
+    args = _get_autotune_parser().parse_args()
     model_path = validate_file_path(args.onnx_path, "Model file")
     validate_file_path(args.qdq_baseline, "QDQ baseline model")
-    output_dir = Path(args.output)
+    output_dir = Path(args.output_dir)
 
     log_benchmark_config(args)
     trtexec_args = getattr(args, "trtexec_benchmark_args", None)
@@ -196,10 +189,11 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
         "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file"
     )
     io_group.add_argument(
-        "--output",
+        "--output_dir",
         "-o",
         type=str,
         default=DEFAULT_OUTPUT_DIR,
+        dest="output_dir",
         help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})",
     )
 
@@ -230,7 +224,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
         "--state_file",
         type=str,
         default=None,
-        help="State file path for resume capability (default: <output>/autotuner_state.yaml)",
+        help="State file path for resume capability (default: <output_dir>/autotuner_state.yaml)",
     )
     strategy_group.add_argument(
         "--node_filter_list",
diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py
index a8929315a..25794d162 100644
--- a/modelopt/onnx/quantization/autotune/common.py
+++ b/modelopt/onnx/quantization/autotune/common.py
@@ -315,3 +315,77 @@ def __str__(self) -> str:
             f"region_output_insertions={len(self.region_outputs)}, "
             f"latency={self.latency_ms:.3f}ms{error_str})"
         )
+
+
+@dataclass
+class Config:
+    """Configuration parameters for QDQ autotuning.
+
+    Controls the autotuning process including performance requirements, quantization
+    parameters, region building, scheme generation, and pattern cache behavior.
+
+    Attributes:
+        # Logging
+        verbose: Enable detailed logging of autotuning progress (default: False)
+
+        # Performance Requirements
+        performance_threshold: Minimum speedup ratio to accept a scheme.
+            1.0 = no improvement required, 1.02 = 2% improvement (default: 1.02)
+
+        # Quantization Parameters
+        default_q_scale: Default scale parameter for Q/DQ nodes. Controls quantization
+            granularity. Typical range: 0.01-0.1 (default: 0.1)
+        default_q_zero_point: Default zero-point: 0 for signed int8, 128 for uint8 (default: 0)
+        default_quant_type: Quantization type for Q/DQ nodes. Options: "int8" (default), "fp8"
+        default_dq_dtype: Default DQ output dtype when it cannot be deduced (default: "float32")
+
+        # Region Builder Settings
+        maximum_sequence_region_size: Maximum number of nodes in a sequence region during
+            top-down refinement. Prevents overly large merged regions (default: 10)
+        minimum_topdown_search_size: Minimum number of nodes in a region to trigger
+            top-down search during region building (default: 10)
+
+        # Scheme Generation Settings
+        top_percent_to_mutate: Top percentage of best schemes to use as mutation seeds
+            during scheme generation. 
Range: 0.0-1.0 (default: 0.1 = top 10%) + minimum_schemes_to_mutate: Minimum number of schemes to keep as mutation seeds, + even if top_percent_to_mutate results in fewer (default: 10) + maximum_mutations: Maximum number of mutations to apply to a single scheme + during generation (default: 3) + maximum_generation_attempts: Maximum attempts to generate a unique new scheme + before giving up (default: 100) + + # Pattern Cache Settings + pattern_cache_minimum_distance: Minimum edit distance required between schemes in cache. + When adding schemes, if a scheme is too similar (distance < minimum_distance) + to an existing scheme, only the better-performing one is kept (default: 4) + pattern_cache_max_entries_per_pattern: Maximum number of schemes to keep per pattern + in pattern cache. Only the top N best-performing schemes are kept for each pattern. + Use 0 to keep all schemes (default: 32) + """ + + # Logging + verbose: bool = False + + # Performance Requirements + performance_threshold: float = 1.02 + + # Quantization Parameters + default_q_scale: float = 0.1 + default_q_zero_point: int = 0 + default_quant_type: str = "int8" + default_dq_dtype: str = "float32" + + # Region Builder Settings + maximum_sequence_region_size: int = 10 + minimum_topdown_search_size: int = 10 + + # Scheme Generation Settings + top_percent_to_mutate: float = 0.1 + minimum_schemes_to_mutate: int = 10 + maximum_mutations: int = 3 + maximum_generation_attempts: int = 100 + + # Pattern Cache Settings + pattern_cache_minimum_distance: int = 4 + pattern_cache_max_entries_per_pattern: int = 32 From ebc60873081a7a55dc85aae21aee06d76f932c12 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Wed, 11 Feb 2026 13:50:27 +0000 Subject: [PATCH 4/5] add pattern scheme classes Signed-off-by: Will Guo --- modelopt/onnx/quantization/autotune/common.py | 531 +++++++++++++++++- .../onnx/quantization/autotune/test_config.py | 4 +- 2 files changed, 531 insertions(+), 4 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 25794d162..bddcd40a5 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -18,13 +18,17 @@ import hashlib from dataclasses import dataclass, field from enum import Enum -from typing import Any +from typing import Any, Optional + +import onnx_graphsurgeon as gs +import yaml from modelopt.onnx.logging_config import logger from modelopt.onnx.quantization.autotune.insertion_points import ( ChildRegionInputInsertionPoint, ChildRegionOutputInsertionPoint, NodeInputInsertionPoint, + ResolvedInsertionPoint, ) @@ -317,6 +321,531 @@ def __str__(self) -> str: ) +@dataclass +class PatternSchemes: + """Collection of Q/DQ insertion schemes for a single pattern. + + Manages multiple InsertionScheme candidates for a region pattern, tracking + their performance and identifying the best-performing configuration. This + enables pattern-based optimization where all regions with the same structure + use the same Q/DQ insertion strategy. + + **Workflow:** + 1. Pattern is identified from region structure + 2. Multiple schemes are generated and tested + 3. Each scheme is measured (latency_ms) + 4. Best scheme is selected (lowest latency) + 5. 
Best scheme is applied to all matching regions + + **Best Scheme Selection:** + - Automatically identifies scheme with lowest latency + - Excludes schemes with errors (error=True) + - Schemes with latency_ms = inf are considered unmeasured + - best_scheme property provides easy access to optimal configuration + + **Attributes:** + pattern: RegionPattern defining the structural signature + schemes: List of InsertionScheme candidates with measurements + """ + + pattern: Optional["RegionPattern"] = None # Structural pattern signature + schemes: list[InsertionScheme] = field(default_factory=list) # Candidate schemes + + @property + def pattern_signature(self) -> str: + """Get the pattern signature string.""" + return self.pattern.signature if self.pattern else "" + + @property + def pattern_size(self) -> int: + """Get the pattern size (total node count).""" + return self.pattern.size if self.pattern else 0 + + @property + def best_scheme_index(self) -> int: + """Get index of the best performing scheme (lowest latency). + + Scans all schemes to find the one with minimum latency_ms, + excluding schemes with errors. + If no schemes exist or all have errors, returns -1. + + Returns: + Index of best scheme (without errors), or -1 if no valid schemes available + """ + if len(self.schemes) == 0: + return -1 + min_idx, min_latency = -1, float("inf") + for idx, scheme in enumerate(self.schemes): + if not scheme.error and scheme.latency_ms < min_latency: + min_idx = idx + min_latency = scheme.latency_ms + return min_idx + + @property + def best_scheme(self) -> InsertionScheme | None: + """Get the best performing scheme (lowest latency). + + Convenience property for accessing the optimal scheme directly + without needing to look up by index. Excludes schemes with errors. + + Returns: + InsertionScheme with lowest latency (excluding error schemes), + or None if no valid schemes exist + """ + index = self.best_scheme_index + if index < 0 or index >= len(self.schemes): + return None + return self.schemes[index] + + @property + def num_schemes(self) -> int: + """Get total number of schemes.""" + return len(self.schemes) + + @property + def has_schemes(self) -> bool: + """Check if any schemes have been added.""" + return len(self.schemes) > 0 + + def add_scheme(self, scheme: InsertionScheme) -> None: + """Add a scheme to the collection. + + Args: + scheme: InsertionScheme to add + """ + self.schemes.append(scheme) + + def get_measured_schemes(self) -> list[InsertionScheme]: + """Get schemes that have been measured (finite latency). + + Returns: + List of schemes with performance measurements (excludes unmeasured schemes with inf latency) + """ + return [s for s in self.schemes if s.latency_ms != float("inf")] + + def get_valid_schemes(self) -> list[InsertionScheme]: + """Get schemes without errors. + + Returns: + List of schemes that completed successfully without errors + """ + return [s for s in self.schemes if not s.error] + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for serialization. + + Note: Excludes runtime objects like pattern (RegionPattern). + Only serializes metadata and schemes. + """ + return { + "pattern_signature": self.pattern_signature, + "pattern_size": self.pattern_size, + "schemes": [scheme.to_dict() for scheme in self.schemes], + } + + @classmethod + def from_dict( + cls, data: dict[str, Any], pattern: Optional["RegionPattern"] = None + ) -> "PatternSchemes": + """Create PatternSchemes from serialized dictionary. 
+ + Reconstructs the pattern schemes collection from saved data. The + RegionPattern object must be provided separately since it's not + serialized (it's a runtime object computed from the graph). + + If no pattern is provided, creates a minimal RegionPattern from the + saved signature and size for signature matching purposes. + + Args: + data: Dictionary containing 'pattern_signature', 'pattern_size', + and 'schemes' keys + pattern: RegionPattern object to associate (must match signature). + If None, creates minimal pattern from saved data. + + Returns: + Reconstructed PatternSchemes instance + """ + # Import here to avoid circular dependency at runtime + from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern + + ps = cls() + + # If no pattern provided, create minimal one from saved data + if pattern is None and "pattern_signature" in data: + pattern = RegionPattern( + signature=data["pattern_signature"], size=data.get("pattern_size", 0) + ) + + ps.pattern = pattern + + ps.schemes = [ + InsertionScheme.from_dict(scheme_data) for scheme_data in data.get("schemes", []) + ] + + return ps + + def __str__(self) -> str: + """String representation for debugging.""" + best_latency = self.best_scheme.latency_ms if self.best_scheme else 0.0 + return ( + f"PatternSchemes(pattern='{self.pattern_signature[:40]}...', " + f"schemes={self.num_schemes}, best_latency={best_latency:.3f}ms)" + ) + + +@dataclass +class PatternCache: + """Pattern cache containing best-performing schemes for patterns with automatic eviction. + + Stores a collection of PatternSchemes that can be used as seeds for autotuning. + Each PatternSchemes contains high-performing insertion schemes for a specific + pattern signature. The cache automatically evicts non-performant schemes based on: + - Error status (schemes with errors are evicted) + - Duplicate schemes (only better-performing duplicate is kept) + - Similarity (similar schemes where only better-performing one is kept) + - Count limit (only top N best schemes are kept per pattern) + """ + + pattern_schemes: list[PatternSchemes] = field(default_factory=list) + minimum_distance: int = 4 # Minimum distance between schemes in cache + max_entries_per_pattern: int = 32 # Maximum number of schemes per pattern (0 = no limit) + + def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: + """Add PatternSchemes to pattern cache with automatic eviction of non-performant entries. + + Merges new schemes with existing schemes for the same pattern, automatically + evicting schemes that are non-performant based on multiple criteria. 
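+        In order, eviction: drops schemes flagged with errors, deduplicates by
+        scheme hash (keeping the lower-latency copy), drops schemes closer than
+        ``minimum_distance`` to a better-performing scheme, and finally keeps
+        only the top ``max_entries_per_pattern`` schemes by latency.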
+ + Args: + pattern_schemes: PatternSchemes to add to the cache + """ + if not pattern_schemes or not pattern_schemes.pattern: + return + + pattern_sig = pattern_schemes.pattern_signature + + # Find existing PatternSchemes for this pattern + existing_idx = None + for idx, ps in enumerate(self.pattern_schemes): + if ps.pattern_signature == pattern_sig: + existing_idx = idx + break + + # Collect all schemes (existing + new) + all_schemes = list(pattern_schemes.schemes) + if existing_idx is not None: + all_schemes.extend(self.pattern_schemes[existing_idx].schemes) + + # Filter out schemes with errors and deduplicate by hash + valid_schemes = [s for s in all_schemes if not s.error] + unique_schemes = {} + for scheme in valid_schemes: + scheme_hash = scheme.hash + if ( + scheme_hash not in unique_schemes + or scheme.latency_ms < unique_schemes[scheme_hash].latency_ms + ): + unique_schemes[scheme_hash] = scheme + + # Sort by latency to get best schemes + sorted_schemes = sorted(unique_schemes.values(), key=lambda s: s.latency_ms) + + # Apply distance-based filtering if minimum_distance > 0 + if self.minimum_distance > 0: + filtered_schemes = [] + for scheme in sorted_schemes: + # Check if this scheme is too similar to any already-filtered scheme + too_similar = False + for existing_scheme in filtered_schemes: + distance = scheme.distance(existing_scheme) + if distance < self.minimum_distance: + # Schemes are too similar, keep the better one + if scheme.latency_ms < existing_scheme.latency_ms: + # New scheme is better, remove existing and add new + filtered_schemes.remove(existing_scheme) + break + else: + # Existing scheme is better, skip new one + too_similar = True + break + + if not too_similar: + filtered_schemes.append(scheme) + + sorted_schemes = filtered_schemes + + # Apply count limit if max_entries_per_pattern > 0 + # Keep only the top N best-performing schemes per pattern + if self.max_entries_per_pattern > 0: + sorted_schemes = sorted_schemes[: self.max_entries_per_pattern] + + # Create PatternSchemes with all schemes that passed the eviction criteria + result = PatternSchemes(pattern=pattern_schemes.pattern) + result.schemes = sorted_schemes + + # Replace existing or append new + if existing_idx is not None: + self.pattern_schemes[existing_idx] = result + else: + self.pattern_schemes.append(result) + + def get_pattern_schemes(self, pattern_signature: str) -> PatternSchemes | None: + """Get PatternSchemes for a specific pattern signature. + + Args: + pattern_signature: Pattern signature to lookup + + Returns: + PatternSchemes if found, None otherwise + """ + for ps in self.pattern_schemes: + if ps.pattern_signature == pattern_signature: + return ps + return None + + def has_pattern(self, pattern_signature: str) -> bool: + """Check if pattern cache contains a specific pattern. + + Args: + pattern_signature: Pattern signature to check + + Returns: + True if pattern exists in pattern cache + """ + return any(ps.pattern_signature == pattern_signature for ps in self.pattern_schemes) + + def add_pattern_from_region( + self, region: Region, graph: gs.Graph, quantized_tensors: set[str] + ) -> None: + """Build and add a pattern cache entry from a region in a quantized model. + + Analyzes a region from an already-quantized model to extract its Q/DQ + insertion scheme. This allows capturing known-good quantization strategies + from existing models and using them as seeds for autotuning. 
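+        Each candidate point from the region's full insertion scheme is probed
+        individually: the point is resolved to concrete tensor names, and it is
+        kept only if at least one resolved tensor appears in ``quantized_tensors``.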
+ + Args: + region: Region from the quantized model to analyze + graph: ONNX graph containing the region + quantized_tensors: Set of tensor names that have Q/DQ nodes + + Example: + >>> cache = PatternCache() + >>> for region in all_regions: + ... cache.add_pattern_from_region(region, graph, quantized_tensors) + >>> cache.save("learned_patterns.yaml") + """ + # Import here to avoid circular dependency at runtime + from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern + + # Create pattern from region + pattern = RegionPattern.from_region(region, graph) + # Track insertion points + scheme = InsertionScheme( + node_inputs=[], + child_region_inputs=[], + region_outputs=[], + latency_ms=float("inf"), + error=False, + ) + # Analyze node inputs + full_insertion_scheme = pattern.get_full_insertion_scheme(region, graph) + for point in full_insertion_scheme.node_inputs: + temp_scheme = InsertionScheme( + node_inputs=[point], + child_region_inputs=[], + region_outputs=[], + latency_ms=float("inf"), + error=False, + ) + temp_insertion_points: list[ResolvedInsertionPoint] = pattern.matches( + region, graph, temp_scheme + ) + temp_tensor_names = {tensor.tensor_name for tensor in temp_insertion_points} + if len(temp_tensor_names.intersection(quantized_tensors)) > 0: + scheme.node_inputs.append(point) + # Analyze region boundaries (for COMPOSITE regions) + if region.type == RegionType.COMPOSITE: + for point in full_insertion_scheme.child_region_inputs: + temp_scheme = InsertionScheme( + node_inputs=[], + child_region_inputs=[point], + region_outputs=[], + latency_ms=float("inf"), + error=False, + ) + temp_insertion_points: list[ResolvedInsertionPoint] = pattern.matches( + region, graph, temp_scheme + ) + temp_tensor_names = {tensor.tensor_name for tensor in temp_insertion_points} + if len(temp_tensor_names.intersection(quantized_tensors)) > 0: + scheme.child_region_inputs.append(point) + # Analyze region outputs + for point in full_insertion_scheme.region_outputs: + temp_scheme = InsertionScheme( + node_inputs=[], + child_region_inputs=[], + region_outputs=[point], + latency_ms=float("inf"), + error=False, + ) + temp_insertion_points: list[ResolvedInsertionPoint] = pattern.matches( + region, graph, temp_scheme + ) + temp_tensor_names = {tensor.tensor_name for tensor in temp_insertion_points} + if len(temp_tensor_names.intersection(quantized_tensors)) > 0: + scheme.region_outputs.append(point) + # Add pattern and scheme to pattern cache + pattern_schemes = PatternSchemes(pattern=pattern, schemes=[scheme]) + self.add_pattern_schemes(pattern_schemes) + num_points = ( + len(scheme.node_inputs) + len(scheme.child_region_inputs) + len(scheme.region_outputs) + ) + logger.debug( + f"Added pattern from region {region.id} with {num_points} insertion points" + ) + # Add patterns from child regions + if region.type == RegionType.COMPOSITE: + for child_region in region.get_children(): + self.add_pattern_from_region(child_region, graph, quantized_tensors) + + @property + def num_patterns(self) -> int: + """Get number of patterns in pattern cache.""" + return len(self.pattern_schemes) + + @property + def total_schemes(self) -> int: + """Get total number of schemes across all patterns.""" + return sum(ps.num_schemes for ps in self.pattern_schemes) + + def get_all_pattern_signatures(self) -> list[str]: + """Get list of all pattern signatures in pattern cache. 
+ + Returns: + List of pattern signature strings + """ + return [ps.pattern_signature for ps in self.pattern_schemes] + + def clear(self) -> None: + """Clear all pattern cache data.""" + self.pattern_schemes.clear() + + def merge(self, other: "PatternCache", prefer_existing: bool = True) -> None: + """Merge another PatternCache into this one. + + Args: + other: PatternCache to merge + prefer_existing: If True, keep existing patterns when there's a conflict. + If False, overwrite with other's patterns. + """ + for schemes in other.pattern_schemes: + if not self.has_pattern(schemes.pattern_signature) or not prefer_existing: + self.add_pattern_schemes(schemes) + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for serialization. + + Returns: + Dictionary with 'minimum_distance', 'max_entries_per_pattern', and 'pattern_schemes' keys + """ + return { + "minimum_distance": self.minimum_distance, + "max_entries_per_pattern": self.max_entries_per_pattern, + "pattern_schemes": [ps.to_dict() for ps in self.pattern_schemes], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "PatternCache": + """Create PatternCache from serialized dictionary. + + Note: RegionPattern objects are not restored (they're runtime objects). + Only pattern signatures and scheme data are loaded. + + Args: + data: Dictionary containing pattern cache data + + Returns: + Reconstructed PatternCache instance + """ + cache = cls( + minimum_distance=data.get("minimum_distance", 4), + max_entries_per_pattern=data.get("max_entries_per_pattern", 32), + ) + + for ps_data in data.get("pattern_schemes", []): + # Create PatternSchemes without pattern object (pattern=None) + ps = PatternSchemes.from_dict(ps_data, pattern=None) + cache.pattern_schemes.append(ps) + + return cache + + def save(self, output_path: str) -> None: + """Save pattern cache to a YAML file. + + Serializes all pattern schemes and their insertion points to a YAML file + that can be loaded later for seeded autotuning. The format matches the + autotuner state file format for consistency. + + Args: + output_path: File path where the YAML pattern cache file will be written + """ + state = self.to_dict() + + with open(output_path, "w") as f: + yaml.dump(state, f, default_flow_style=False, sort_keys=False) + + logger.info( + f"Saved pattern cache → {output_path} ({self.num_patterns} patterns, " + f"{self.total_schemes} schemes)" + ) + logger.debug( + f"Cache settings: min_distance={self.minimum_distance}, " + f"max_per_pattern={self.max_entries_per_pattern}" + ) + + @classmethod + def load(cls, input_path: str) -> "PatternCache": + """Load pattern cache from a YAML file. + + Reads a previously saved pattern cache file and reconstructs all pattern + schemes. The loaded pattern cache can be used to seed autotuning with + known-good insertion schemes. 
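+
+        Example (a sketch; assumes ``patterns.yaml`` was written earlier by ``save()``):
+            >>> cache = PatternCache.load("patterns.yaml")
+            >>> print(cache.num_patterns, cache.total_schemes)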
+
+        Args:
+            input_path: File path to the YAML pattern cache file to load
+
+        Returns:
+            PatternCache instance with all pattern schemes loaded
+
+        Raises:
+            FileNotFoundError: If the input_path doesn't exist
+        """
+        with open(input_path) as f:
+            state = yaml.safe_load(f)
+
+        cache = cls.from_dict(state)
+
+        logger.info(
+            f"Loaded pattern cache from {input_path} ({cache.num_patterns} patterns, "
+            f"{cache.total_schemes} schemes)"
+        )
+        logger.debug(
+            f"Cache settings: min_distance={cache.minimum_distance}, "
+            f"max_per_pattern={cache.max_entries_per_pattern}"
+        )
+
+        return cache
+
+    def __str__(self) -> str:
+        """String representation for debugging."""
+        return (
+            f"PatternCache(patterns={self.num_patterns}, "
+            f"schemes={self.total_schemes}, "
+            f"minimum_distance={self.minimum_distance}, "
+            f"max_entries_per_pattern={self.max_entries_per_pattern})"
+        )
+
+
 @dataclass
 class Config:
     """Configuration parameters for QDQ autotuning.
diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py
index 025cdb94e..9ec99d65d 100644
--- a/tests/unit/onnx/quantization/autotune/test_config.py
+++ b/tests/unit/onnx/quantization/autotune/test_config.py
@@ -19,12 +19,10 @@
 Tests configuration parameter validation and defaults.
 """
 
-import unittest
-
 from modelopt.onnx.quantization.autotune.common import Config
 
 
-class TestConfig(unittest.TestCase):
+class TestConfig:
     """Test Config class functionality."""
 
     def test_default_values(self):

From 0414b81adb9ce18596642945ceb76d12145cd257 Mon Sep 17 00:00:00 2001
From: Will Guo
Date: Thu, 12 Feb 2026 00:34:22 +0000
Subject: [PATCH 5/5] add test for workflow

Signed-off-by: Will Guo
---
 .../quantization/autotune/test_workflow.py    | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 tests/unit/onnx/quantization/autotune/test_workflow.py

diff --git a/tests/unit/onnx/quantization/autotune/test_workflow.py b/tests/unit/onnx/quantization/autotune/test_workflow.py
new file mode 100644
index 000000000..7aadb3d2f
--- /dev/null
+++ b/tests/unit/onnx/quantization/autotune/test_workflow.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+from pathlib import Path
+
+import onnx
+import pytest
+
+from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec
+import models as _test_models
+
+from modelopt.onnx.quantization.autotune.workflows import (
+    init_benchmark_instance,
+    region_pattern_autotuning_workflow,
+)
+
+@pytest.fixture
+def simple_conv_model():
+    """Simple ONNX model: Input -> Conv -> Relu -> Output. 
Created via models.py."""
+    return _test_models._create_simple_conv_onnx_model()
+
+@pytest.mark.parametrize("use_trtexec", [True, False])
+def test_export_quantized_model(use_trtexec, simple_conv_model):
+    """Test exporting quantized model with Q/DQ."""
+    if use_trtexec:
+        skip_if_no_trtexec()
+    else:
+        skip_if_no_tensorrt()
+
+    with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f:
+        baseline_model_path = f.name
+
+    # Save baseline model
+    onnx.save(simple_conv_model, baseline_model_path)
+
+    output_dir = baseline_model_path.removesuffix(".onnx")
+    output_path = output_dir + ".quant.onnx"
+
+    try:
+        init_benchmark_instance(use_trtexec=use_trtexec)
+        autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir))
+
+        # Export model with Q/DQ insertion
+        autotuner.export_onnx(output_path, insert_qdq=True)
+
+        # Verify file was created
+        assert os.path.exists(output_path)
+
+        # Verify it's a valid ONNX model
+        exported_model = onnx.load(output_path)
+        assert exported_model is not None
+
+        # Verify that it contains Q/DQ nodes
+        qdq_nodes = [
+            n
+            for n in exported_model.graph.node
+            if n.op_type in ["QuantizeLinear", "DequantizeLinear"]
+        ]
+        assert qdq_nodes, "Q/DQ nodes not found in quantized model"
+
+        print("✓ QDQAutotuner export quantized model")
+    finally:
+        os.unlink(baseline_model_path)
+        if os.path.exists(output_path):
+            os.unlink(output_path)