From 3112a4e884917026394d3f873bb748159885f7c0 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Sun, 1 Feb 2026 22:51:59 +0000 Subject: [PATCH 1/5] Integrate Automated QDQ placement tool - part 3.3 Signed-off-by: Will Guo --- .../onnx/quantization/autotune/__main__.py | 310 +++++++++++++++ .../onnx/quantization/autotune/workflows.py | 376 ++++++++++++++++++ .../onnx/quantization/autotune/test_config.py | 104 +++++ 3 files changed, 790 insertions(+) create mode 100644 modelopt/onnx/quantization/autotune/__main__.py create mode 100644 modelopt/onnx/quantization/autotune/workflows.py create mode 100644 tests/unit/onnx/quantization/autotune/test_config.py diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py new file mode 100644 index 000000000..ea2f99856 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Command-line interface for ONNX Q/DQ autotuning.""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) + +DEFAULT_OUTPUT_DIR = "./autotuner_output" +DEFAULT_NUM_SCHEMES = 30 +DEFAULT_QUANT_TYPE = "int8" +DEFAULT_DQ_DTYPE = "float32" +DEFAULT_TIMING_CACHE = "/tmp/trtexec_timing.cache" # nosec B108 +DEFAULT_WARMUP_RUNS = 5 +DEFAULT_TIMING_RUNS = 20 + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. + + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def log_benchmark_config(args): + """Log TensorRT benchmark configuration for transparency. + + Logs timing cache path, warmup/timing run counts, and any custom + plugin libraries that will be loaded. + + Args: + args: Parsed command-line arguments with benchmark configuration + """ + logger.info("Initializing TensorRT benchmark") + logger.info(f" Timing cache: {args.timing_cache}") + logger.info(f" Warmup runs: {args.warmup_runs}") + logger.info(f" Timing runs: {args.timing_runs}") + if args.plugin_libraries: + logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") + if hasattr(args, "trtexec_benchmark_args") and args.trtexec_benchmark_args: + logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") + + +def run_autotune(args=None) -> int: + """Execute the complete pattern-based Q/DQ autotuning workflow. 
+ + This function orchestrates the entire optimization process: + 1. Parses command-line arguments (if not provided) + 2. Validates input paths (model, baseline, output directory) + 3. Initializes TensorRT benchmark instance + 4. Runs pattern-based region autotuning workflow + 5. Handles interruptions gracefully with state preservation + + Args: + args: Optional parsed command-line arguments. If None, parses sys.argv. + + Returns: + Exit code: + - 0: Success + - 1: Autotuning failed (exception occurred) + - 130: Interrupted by user (Ctrl+C) + """ + if args is None: + args = _get_autotune_parser().parse_args() + + model_path = validate_file_path(args.onnx_path, "Model file") + validate_file_path(args.qdq_baseline, "QDQ baseline model") + output_dir = Path(args.output) + + log_benchmark_config(args) + trtexec_args = getattr(args, "trtexec_benchmark_args", None) + benchmark_instance = init_benchmark_instance( + use_trtexec=args.use_trtexec, + plugin_libraries=args.plugin_libraries, + timing_cache_file=args.timing_cache, + warmup_runs=args.warmup_runs, + timing_runs=args.timing_runs, + trtexec_args=trtexec_args, + ) + + if benchmark_instance is None: + logger.error("Failed to initialize TensorRT benchmark") + return 1 + + logger.info("Autotuning Mode: Pattern-Based") + + try: + node_filter_list = None + if args.node_filter_list: + filter_file = validate_file_path(args.node_filter_list, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() + for line in f + if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + + region_pattern_autotuning_workflow( + model_path=str(model_path), + output_dir=output_dir, + num_schemes_per_region=args.num_schemes, + pattern_cache_file=args.pattern_cache_file, + state_file=args.state_file, + quant_type=args.quant_type, + default_dq_dtype=args.default_dq_dtype, + qdq_baseline_model=args.qdq_baseline, + node_filter_list=node_filter_list, + verbose=args.verbose, + ) + + logger.info("\n" + "=" * 70) + logger.info("✓ Autotuning completed successfully!") + logger.info(f"✓ Results: {output_dir}") + logger.info("=" * 70) + return 0 + + except KeyboardInterrupt: + logger.warning("\nInterrupted by user") + state_file = args.state_file or output_dir / "autotuner_state.yaml" + logger.info(f"Progress saved to: {state_file}") + return 130 + + except Exception as e: + logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) + return 1 + + +def _get_autotune_parser() -> argparse.ArgumentParser: + """Create and configure the command-line argument parser.""" + parser = argparse.ArgumentParser( + prog="modelopt.onnx.quantization.autotune", + description="ONNX Q/DQ Autotuning with TensorRT", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx + + # Import patterns from QDQ baseline model + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --qdq_baseline baseline.onnx + + # Use pattern cache for warm-start + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml + + # Full example with all options + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --schemes_per_region 50 \\ + --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ + --quant_type int8 --verbose + """, + ) + + # Model and Output + io_group = 
parser.add_argument_group("Model and Output")
+    io_group.add_argument(
+        "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file"
+    )
+    io_group.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default=DEFAULT_OUTPUT_DIR,
+        help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})",
+    )
+
+    # Autotuning Strategy
+    strategy_group = parser.add_argument_group("Autotuning Strategy")
+    strategy_group.add_argument(
+        "--schemes_per_region",
+        "-s",
+        type=int,
+        default=DEFAULT_NUM_SCHEMES,
+        dest="num_schemes",
+        help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})",
+    )
+    strategy_group.add_argument(
+        "--pattern_cache",
+        type=str,
+        default=None,
+        dest="pattern_cache_file",
+        help="Path to pattern cache YAML for warm-start (optional)",
+    )
+    strategy_group.add_argument(
+        "--qdq_baseline",
+        type=str,
+        default=None,
+        help="Path to QDQ baseline ONNX model to import quantization patterns (optional)",
+    )
+    strategy_group.add_argument(
+        "--state_file",
+        type=str,
+        default=None,
+        help="State file path for resume capability (default: <output>/autotuner_state.yaml)",
+    )
+    strategy_group.add_argument(
+        "--node_filter_list",
+        type=str,
+        default=None,
+        help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). "
+        "Regions without any matching nodes are skipped during autotuning.",
+    )
+
+    # Quantization
+    quant_group = parser.add_argument_group("Quantization")
+    quant_group.add_argument(
+        "--quant_type",
+        type=str,
+        default=DEFAULT_QUANT_TYPE,
+        choices=["int8", "fp8"],
+        help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})",
+    )
+    quant_group.add_argument(
+        "--default_dq_dtype",
+        type=str,
+        default=DEFAULT_DQ_DTYPE,
+        choices=["float16", "float32", "bfloat16"],
+        help=f"Default DQ output dtype when it cannot be deduced (default: {DEFAULT_DQ_DTYPE})",
+    )
+
+    # TensorRT Benchmark
+    trt_group = parser.add_argument_group("TensorRT Benchmark")
+    trt_group.add_argument(
+        "--use_trtexec",
+        action="store_true",
+        help="Use trtexec for benchmarking (default: False)",
+        default=False,
+    )
+    trt_group.add_argument(
+        "--timing_cache",
+        type=str,
+        default=DEFAULT_TIMING_CACHE,
+        help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})",
+    )
+    trt_group.add_argument(
+        "--warmup_runs",
+        type=int,
+        default=DEFAULT_WARMUP_RUNS,
+        help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})",
+    )
+    trt_group.add_argument(
+        "--timing_runs",
+        type=int,
+        default=DEFAULT_TIMING_RUNS,
+        help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})",
+    )
+    trt_group.add_argument(
+        "--plugin_libraries",
+        "--plugins",
+        type=str,
+        nargs="+",
+        default=None,
+        dest="plugin_libraries",
+        help="TensorRT plugin libraries (.so files) to load (optional, space-separated)",
+    )
+    trt_group.add_argument(
+        "--trtexec_benchmark_args",
+        type=str,
+        default=None,
+        help="Additional command-line arguments to pass to trtexec as a single quoted string. 
" + "Example: --trtexec_benchmark_args '--fp16 --workspace=4096 --verbose'", + ) + + # Logging + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") + + return parser + + +if __name__ == "__main__": + sys.exit(run_autotune()) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py new file mode 100644 index 000000000..17ae3aa7e --- /dev/null +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -0,0 +1,376 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ONNX Q/DQ Autotuning Workflows. + +This module provides high-level workflow functions for automated Q/DQ (Quantization/Dequantization) +optimization of ONNX models using pattern-based region analysis and TensorRT performance measurement. +""" + +import fnmatch +from pathlib import Path + +import onnx + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.autotuner import QDQAutotuner +from modelopt.onnx.quantization.autotune.benchmark import TensorRTPyBenchmark, TrtExecBenchmark +from modelopt.onnx.quantization.autotune.common import Config, PatternCache +from modelopt.onnx.quantization.qdq_utils import get_quantized_tensors + +_benchmark_instance = None + + +def benchmark_onnx_model( + model_path: str | bytes, log_file: str | None = None, flush_timing_cache: bool = False +) -> float: + """Benchmark ONNX model inference latency using TensorRT Python API. + + Args: + model_path: Path to ONNX model file, or bytes containing serialized model protobuf + log_file: Optional path to save detailed TensorRT build and benchmark logs + (default: None, no logging) + flush_timing_cache: If True, flushes TensorRT timing cache before building engine. + Useful for periodic cache refresh (default: False) + + Returns: + Measured median inference latency in milliseconds. + Returns float('inf') on failure (invalid model, build error, etc.) 
+ + Raises: + No exceptions raised - errors are caught and logged, returning float('inf') + """ + global _benchmark_instance + + if _benchmark_instance is None: + logger.error("Benchmark instance not initialized") + return float("inf") + + try: + latency = _benchmark_instance.run( + model_path, log_file=log_file, flush_timing_cache=flush_timing_cache + ) + + if latency == float("inf"): + if isinstance(model_path, bytes): + logger.warning("Benchmark failed for model bytes") + else: + logger.warning(f"Benchmark failed: {model_path}") + return float("inf") + + logger.debug(f"Benchmark result: {latency:.2f} ms") + return latency + + except Exception as e: + logger.error(f"Benchmark error: {e}", exc_info=True) + return float("inf") + + +def init_benchmark_instance( + use_trtexec: bool = False, + plugin_libraries: list[str] | None = None, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 20, + trtexec_args: str | None = None, +): + """Initialize global TensorRT benchmark instance for model performance measurement. + + Args: + use_trtexec: Whether to use trtexec for benchmarking. + plugin_libraries: List of paths to TensorRT plugin shared libraries (.so files). + These plugins will be loaded by trtexec or TensorRT Python API during engine building. + If None, no custom plugins are loaded. + timing_cache_file: Path to TensorRT timing cache file for faster engine builds. + If None, uses default "trtexec_timing.cache" (default: None) + warmup_runs: Number of warmup inference iterations before measurement. + Allows GPU to reach stable performance state (default: 5) + timing_runs: Number of timed inference iterations for latency measurement. + Higher values give more stable median (default: 20) + trtexec_args: Additional command-line arguments to pass to trtexec as a string (only used if use_trtexec=True). + Example: '--fp16 --workspace=4096 --verbose' + """ + global _benchmark_instance + try: + if use_trtexec: + _benchmark_instance = TrtExecBenchmark( + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + plugin_libraries=plugin_libraries, + trtexec_args=trtexec_args, + ) + logger.info("Trtexec benchmark initialized") + else: + _benchmark_instance = TensorRTPyBenchmark( + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + plugin_libraries=plugin_libraries, + ) + logger.info("TensorRT Python API benchmark initialized") + logger.debug( + f"Settings: warmup={warmup_runs}, timing={timing_runs}, " + f"cache={timing_cache_file or 'trtexec_timing.cache'}, plugin_libraries={plugin_libraries}" + ) + return _benchmark_instance + except Exception as e: + logger.error(f"TensorRT initialization failed: {e}", exc_info=True) + return None + + +def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: + """Check if any node in the region matches any of the filter patterns. 
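+
+    Matching uses ``fnmatch``-style wildcards, e.g.::
+
+        >>> import fnmatch
+        >>> fnmatch.fnmatch("encoder/layer_0/attention/MatMul", "*/attention/*")
+        True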
+
+    Args:
+        region: Region object to check
+        graph: ONNX graph (graphsurgeon) containing node information
+        filter_patterns: List of wildcard patterns to match against node names
+
+    Returns:
+        True if at least one node in the region matches any pattern, False otherwise
+    """
+    if not filter_patterns:
+        return True
+
+    node_indices = region.get_all_nodes_recursive()
+
+    for node_idx in node_indices:
+        if node_idx < len(graph.nodes):
+            node_name = graph.nodes[node_idx].name
+            for pattern in filter_patterns:
+                if fnmatch.fnmatch(node_name, pattern):
+                    return True
+
+    return False
+
+
+def region_pattern_autotuning_workflow(
+    model_path: str,
+    output_dir: Path,
+    num_schemes_per_region: int = 30,
+    pattern_cache_file: str | None = None,
+    state_file: str | None = None,
+    quant_type: str = "int8",
+    default_dq_dtype: str = "float32",
+    qdq_baseline_model: str | None = None,
+    node_filter_list: list[str] | None = None,
+    verbose: bool = False,
+) -> QDQAutotuner:
+    """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model.
+
+    This workflow uses pattern-based region optimization to efficiently find optimal
+    Q/DQ insertion points. The key insight: regions with identical structural patterns
+    can share the same Q/DQ scheme. When a best scheme is found for a pattern, it
+    automatically applies to all regions matching that pattern, making optimization
+    both efficient and consistent.
+
+    Automatically discovers regions, generates and tests Q/DQ insertion schemes,
+    and exports the optimized model. Supports incremental state saving for crash recovery
+    and pattern cache-based warm-start.
+
+    **Workflow Steps:**
+    1. Load model and initialize autotuner with automatic hierarchical region discovery
+    2. Resume from checkpoint if state file exists (crash recovery)
+    3. Load pattern cache if provided (warm-start with known-good schemes)
+    4. Import Q/DQ patterns from baseline model if provided (transfer learning)
+    5. Measure baseline performance without Q/DQ insertions
+    6. For each discovered region pattern:
+       a. Generate Q/DQ insertion schemes (pattern-relative)
+       b. Build TensorRT engine and measure latency for each scheme
+       c. Select best scheme for this pattern (applies to all matching regions)
+       d. Save checkpoint and intermediate model
+    7. Export final optimized model with best Q/DQ scheme for each pattern
+
+    Args:
+        model_path: Path to ONNX model file to optimize
+        output_dir: Directory for output files (state, logs, models). Created if it doesn't exist.
+        num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern.
+            Higher values explore more configurations but take longer (default: 30)
+        pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes
+            from previous runs. Enables warm-start optimization (default: None)
+        state_file: Optional path to state file for checkpoint/resume. If None, automatically
+            uses <output_dir>/autotuner_state.yaml (default: None)
+        quant_type: Quantization data type - "int8" (default) or "fp8"
+        default_dq_dtype: Default DQ output dtype when it cannot be deduced (default: "float32")
+        qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided,
+            extracts Q/DQ insertion patterns and adds them to pattern cache
+            for warm-start (default: None)
+        node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. 
Regions + without any matching nodes are skipped during autotuning (default: None) + verbose: Enable verbose logging in Config for detailed autotuner output (default: False) + + Returns: + QDQAutotuner instance after autotuning + """ + output_dir.mkdir(parents=True, exist_ok=True) + logs_dir = output_dir / "logs" + logs_dir.mkdir(exist_ok=True) + models_dir = output_dir / "region_models" + models_dir.mkdir(exist_ok=True) + + if state_file is None: + state_file = str(output_dir / "autotuner_state.yaml") + state_path = Path(state_file) + + logger.info(f"Loading model: {model_path}") + model = onnx.load(model_path) + + pattern_cache = None + if pattern_cache_file: + pattern_cache_path = Path(pattern_cache_file) + if pattern_cache_path.exists(): + pattern_cache = PatternCache.load(str(pattern_cache_path)) + logger.info( + f"Loaded pattern cache: {pattern_cache.num_patterns} patterns, " + f"{pattern_cache.total_schemes} schemes" + ) + else: + logger.warning(f"Pattern cache not found: {pattern_cache_file}") + + logger.info( + f"Initializing autotuner (quant_type={quant_type}, default_dq_dtype={default_dq_dtype})" + ) + config = Config( + default_quant_type=quant_type, + default_dq_dtype=default_dq_dtype, + verbose=verbose, + ) + + autotuner = QDQAutotuner(model) + autotuner.initialize(config, pattern_cache) + + if state_path.exists(): + logger.info(f"Resuming from checkpoint: {state_path}") + autotuner.load_state(str(state_path)) + else: + logger.info("Starting new autotuning session") + + if qdq_baseline_model: + qdq_baseline_path = Path(qdq_baseline_model) + if qdq_baseline_path.exists(): + logger.info(f"Importing patterns from QDQ baseline: {qdq_baseline_model}") + qdq_model = onnx.load(str(qdq_baseline_path)) + quantized_tensors = get_quantized_tensors(qdq_model) + logger.debug(f"Found {len(quantized_tensors)} quantized tensors in baseline") + autotuner.import_insertion_points(quantized_tensors) + logger.info("Pattern import complete") + else: + logger.warning(f"QDQ baseline not found: {qdq_baseline_model}") + + regions = autotuner.regions + logger.info(f"Ready to profile {len(regions)} regions") + + if autotuner.baseline_latency_ms is None: + logger.info("Measuring baseline (no Q/DQ)") + baseline_path = output_dir / "baseline.onnx" + autotuner.export_onnx(str(baseline_path), insert_qdq=False) + baseline_log = logs_dir / "baseline.log" + baseline_latency = benchmark_onnx_model(str(baseline_path), str(baseline_log)) + autotuner.submit(baseline_latency) + logger.info(f"Baseline: {baseline_latency:.2f} ms") + else: + baseline_latency = autotuner.baseline_latency_ms + logger.info(f"Using baseline from checkpoint: {baseline_latency:.2f} ms") + + logger.info(f"Starting region profiling ({num_schemes_per_region} schemes per region)") + + iteration_count = 0 + + for region_idx, region in enumerate(regions): + logger.info( + f"Region {region_idx + 1}/{len(regions)} (ID={region.id}, level={region.get_level()})" + ) + + if node_filter_list and not _region_matches_filter( + region, autotuner.graph, node_filter_list + ): + logger.info(" Skipping (no nodes match filter patterns)") + continue + + commit = region_idx > 0 + autotuner.set_profile_region(region, commit=commit) + + if autotuner.current_profile_pattern_schemes is None: + logger.info(" Skipping (already profiled)") + continue + + schemes_tested = 0 + for scheme_num in range(num_schemes_per_region): + iteration_count += 1 + scheme_idx = autotuner.generate() + + if scheme_idx == -1: + logger.debug(f" Stopping at scheme {scheme_num + 1} (no more 
unique schemes)") + break + + schemes_tested += 1 + model_bytes = autotuner.export_onnx(None, insert_qdq=True) + test_log = logs_dir / f"region_{region.id}_scheme_{scheme_idx}.log" + flush_timing_cache = (iteration_count % 10) == 0 + latency = benchmark_onnx_model( + model_bytes, str(test_log), flush_timing_cache=flush_timing_cache + ) + + autotuner.submit(latency, success=(latency != float("inf"))) + + ps = autotuner.current_profile_pattern_schemes + if ps and ps.schemes: + best_scheme = ps.best_scheme + if best_scheme and best_scheme.latency_ms < float("inf") and baseline_latency > 0: + speedup = baseline_latency / best_scheme.latency_ms + logger.info( + f" Tested {schemes_tested} schemes: " + f"best {best_scheme.latency_ms:.2f} ms ({speedup:.3f}x speedup)" + ) + else: + logger.info(f" Tested {schemes_tested} schemes: no valid measurements") + else: + logger.info(f" Tested {schemes_tested} schemes") + + region_model_path = models_dir / f"region_{region.id}_level_{region.get_level()}.onnx" + autotuner.export_onnx(str(region_model_path), insert_qdq=True, best=True) + logger.debug(f" Saved best model: {region_model_path.name}") + + # Save state after each region (incremental, crash recovery) + autotuner.save_state(str(state_path)) + logger.debug(" Checkpoint saved") + + # Commit final region + autotuner.set_profile_region(None, commit=True) + + logger.info("Exporting final optimized model") + final_model_path = output_dir / "optimized_final.onnx" + autotuner.export_onnx(str(final_model_path), insert_qdq=True) + final_log = logs_dir / "final.log" + final_latency = benchmark_onnx_model(str(final_model_path), str(final_log)) + + if final_latency > 0 and final_latency != float("inf"): + speedup = baseline_latency / final_latency + logger.info( + f"Results: {baseline_latency:.2f} ms → {final_latency:.2f} ms ({speedup:.3f}x speedup)" + ) + else: + logger.info(f"Results: {baseline_latency:.2f} ms → failed (invalid measurement)") + + autotuner.save_state(str(state_path)) + + logger.info("Autotuning complete") + logger.info(f" Final model: {final_model_path}") + logger.info(f" State: {state_path}") + logger.debug(f" Logs: {logs_dir}") + logger.debug(f" Region models: {models_dir}") + + return autotuner diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py new file mode 100644 index 000000000..ed4840e3e --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for the Config class in the autotuner. + +Tests configuration parameter validation and defaults. 
+""" + +import os +import sys +import unittest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from modelopt.onnx.quantization.autotune.common import Config + + +class TestConfig(unittest.TestCase): + """Test Config class functionality.""" + + def test_default_values(self): + """Test that Config has correct default values.""" + config = Config() + + # Logging + assert not config.verbose + + # Performance thresholds + + # Q/DQ defaults + assert config.default_q_scale == 0.1 + assert config.default_q_zero_point == 0 + assert config.default_quant_type == "int8" + + # Region builder settings + assert config.maximum_sequence_region_size == 10 + assert config.minimum_topdown_search_size == 10 + + # Scheme generation parameters + assert config.top_percent_to_mutate == 0.1 + assert config.minimum_schemes_to_mutate == 10 + assert config.maximum_mutations == 3 + assert config.maximum_generation_attempts == 100 + + # Pattern cache parameters + assert config.pattern_cache_minimum_distance == 4 + assert config.pattern_cache_max_entries_per_pattern == 32 + + def test_custom_values(self): + """Test creating Config with custom values.""" + config = Config( + verbose=True, + default_q_scale=0.05, + default_q_zero_point=128, + default_quant_type="fp8", + maximum_sequence_region_size=20, + ) + + assert config.verbose + assert config.default_q_scale == 0.05 + assert config.default_q_zero_point == 128 + assert config.default_quant_type == "fp8" + assert config.maximum_sequence_region_size == 20 + + def test_region_size_validation(self): + """Test that region size parameters are positive.""" + config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) + assert config.maximum_sequence_region_size > 0 + assert config.minimum_topdown_search_size > 0 + + def test_genetic_algorithm_params(self): + """Test genetic algorithm parameters.""" + config = Config( + top_percent_to_mutate=0.2, + minimum_schemes_to_mutate=2, + maximum_mutations=5, + maximum_generation_attempts=50, + ) + + assert config.top_percent_to_mutate == 0.2 + assert config.minimum_schemes_to_mutate == 2 + assert config.maximum_mutations == 5 + assert config.maximum_generation_attempts == 50 + + def test_pattern_cache_params(self): + """Test pattern cache parameters.""" + config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) + + assert config.pattern_cache_minimum_distance == 3 + assert config.pattern_cache_max_entries_per_pattern == 10 From e3ad6da0a17987447789f75f9b4a32c163d475eb Mon Sep 17 00:00:00 2001 From: Will Guo Date: Mon, 9 Feb 2026 08:42:25 +0000 Subject: [PATCH 2/5] remove unused statements Signed-off-by: Will Guo --- tests/unit/onnx/quantization/autotune/test_config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py index ed4840e3e..025cdb94e 100644 --- a/tests/unit/onnx/quantization/autotune/test_config.py +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -19,13 +19,8 @@ Tests configuration parameter validation and defaults. 
""" -import os -import sys import unittest -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from modelopt.onnx.quantization.autotune.common import Config From 94ef42380987cf139915d5f079c52115dd656a2e Mon Sep 17 00:00:00 2001 From: Will Guo Date: Tue, 10 Feb 2026 03:25:54 +0000 Subject: [PATCH 3/5] resolve comments Signed-off-by: Will Guo --- .../onnx/quantization/autotune/__main__.py | 28 +++---- modelopt/onnx/quantization/autotune/common.py | 74 +++++++++++++++++++ 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index ea2f99856..4dab990ac 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -78,18 +77,14 @@ def log_benchmark_config(args): logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") -def run_autotune(args=None) -> int: +def run_autotune() -> int: """Execute the complete pattern-based Q/DQ autotuning workflow. - This function orchestrates the entire optimization process: - 1. Parses command-line arguments (if not provided) - 2. Validates input paths (model, baseline, output directory) - 3. Initializes TensorRT benchmark instance - 4. Runs pattern-based region autotuning workflow - 5. Handles interruptions gracefully with state preservation - - Args: - args: Optional parsed command-line arguments. If None, parses sys.argv. + Parses command-line arguments, then: + 1. Validates input paths (model, baseline, output directory) + 2. Initializes TensorRT benchmark instance + 3. Runs pattern-based region autotuning workflow + 4. 
Handles interruptions gracefully with state preservation
 
     Returns:
         Exit code:
@@ -97,12 +92,10 @@ def run_autotune(args=None) -> int:
         - 130: Interrupted by user (Ctrl+C)
     """
-    if args is None:
-        args = _get_autotune_parser().parse_args()
-
+    args = _get_autotune_parser().parse_args()
     model_path = validate_file_path(args.onnx_path, "Model file")
     validate_file_path(args.qdq_baseline, "QDQ baseline model")
-    output_dir = Path(args.output)
+    output_dir = Path(args.output_dir)
 
     log_benchmark_config(args)
     trtexec_args = getattr(args, "trtexec_benchmark_args", None)
@@ -196,10 +189,11 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
         "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file"
     )
     io_group.add_argument(
-        "--output",
+        "--output_dir",
         "-o",
         type=str,
         default=DEFAULT_OUTPUT_DIR,
+        dest="output_dir",
         help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})",
     )
 
@@ -230,7 +224,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
         "--state_file",
         type=str,
         default=None,
-        help="State file path for resume capability (default: <output>/autotuner_state.yaml)",
+        help="State file path for resume capability (default: <output_dir>/autotuner_state.yaml)",
     )
     strategy_group.add_argument(
         "--node_filter_list",
diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py
index a8929315a..25794d162 100644
--- a/modelopt/onnx/quantization/autotune/common.py
+++ b/modelopt/onnx/quantization/autotune/common.py
@@ -315,3 +315,77 @@ def __str__(self) -> str:
             f"region_output_insertions={len(self.region_outputs)}, "
             f"latency={self.latency_ms:.3f}ms{error_str})"
         )
+
+
+@dataclass
+class Config:
+    """Configuration parameters for QDQ autotuning.
+
+    Controls the autotuning process including performance requirements, quantization
+    parameters, region building, scheme generation, and pattern cache behavior.
+
+    Attributes:
+        # Logging
+        verbose: Enable detailed logging of autotuning progress (default: False)
+
+        # Performance Requirements
+        performance_threshold: Minimum speedup ratio to accept a scheme.
+            1.0 = no improvement required, 1.02 = 2% improvement (default: 1.02)
+
+        # Quantization Parameters
+        default_q_scale: Default scale parameter for Q/DQ nodes. Controls quantization
+            granularity. Typical range: 0.01-0.1 (default: 0.1)
+        default_q_zero_point: Default zero-point: 0 for signed int8, 128 for uint8 (default: 0)
+        default_quant_type: Quantization type for Q/DQ nodes. Options: "int8" (default), "fp8"
+        default_dq_dtype: Default DQ output dtype when it cannot be deduced (default: "float32")
+
+        # Region Builder Settings
+        maximum_sequence_region_size: Maximum number of nodes in a sequence region during
+            top-down refinement. Prevents overly large merged regions (default: 10)
+        minimum_topdown_search_size: Minimum number of nodes in a region to trigger
+            top-down search during region building (default: 10)
+
+        # Scheme Generation Settings
+        top_percent_to_mutate: Top percentage of best schemes to use as mutation seeds
+            during scheme generation. 
Range: 0.0-1.0 (default: 0.1 = top 10%) + minimum_schemes_to_mutate: Minimum number of schemes to keep as mutation seeds, + even if top_percent_to_mutate results in fewer (default: 10) + maximum_mutations: Maximum number of mutations to apply to a single scheme + during generation (default: 3) + maximum_generation_attempts: Maximum attempts to generate a unique new scheme + before giving up (default: 100) + + # Pattern Cache Settings + pattern_cache_minimum_distance: Minimum edit distance required between schemes in cache. + When adding schemes, if a scheme is too similar (distance < minimum_distance) + to an existing scheme, only the better-performing one is kept (default: 4) + pattern_cache_max_entries_per_pattern: Maximum number of schemes to keep per pattern + in pattern cache. Only the top N best-performing schemes are kept for each pattern. + Use 0 to keep all schemes (default: 32) + """ + + # Logging + verbose: bool = False + + # Performance Requirements + performance_threshold: float = 1.02 + + # Quantization Parameters + default_q_scale: float = 0.1 + default_q_zero_point: int = 0 + default_quant_type: str = "int8" + default_dq_dtype: str = "float32" + + # Region Builder Settings + maximum_sequence_region_size: int = 10 + minimum_topdown_search_size: int = 10 + + # Scheme Generation Settings + top_percent_to_mutate: float = 0.1 + minimum_schemes_to_mutate: int = 10 + maximum_mutations: int = 3 + maximum_generation_attempts: int = 100 + + # Pattern Cache Settings + pattern_cache_minimum_distance: int = 4 + pattern_cache_max_entries_per_pattern: int = 32 From ebc60873081a7a55dc85aae21aee06d76f932c12 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Wed, 11 Feb 2026 13:50:27 +0000 Subject: [PATCH 4/5] add pattern scheme classes Signed-off-by: Will Guo --- modelopt/onnx/quantization/autotune/common.py | 531 +++++++++++++++++- .../onnx/quantization/autotune/test_config.py | 4 +- 2 files changed, 531 insertions(+), 4 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 25794d162..bddcd40a5 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -18,13 +18,17 @@ import hashlib from dataclasses import dataclass, field from enum import Enum -from typing import Any +from typing import Any, Optional + +import onnx_graphsurgeon as gs +import yaml from modelopt.onnx.logging_config import logger from modelopt.onnx.quantization.autotune.insertion_points import ( ChildRegionInputInsertionPoint, ChildRegionOutputInsertionPoint, NodeInputInsertionPoint, + ResolvedInsertionPoint, ) @@ -317,6 +321,531 @@ def __str__(self) -> str: ) +@dataclass +class PatternSchemes: + """Collection of Q/DQ insertion schemes for a single pattern. + + Manages multiple InsertionScheme candidates for a region pattern, tracking + their performance and identifying the best-performing configuration. This + enables pattern-based optimization where all regions with the same structure + use the same Q/DQ insertion strategy. + + **Workflow:** + 1. Pattern is identified from region structure + 2. Multiple schemes are generated and tested + 3. Each scheme is measured (latency_ms) + 4. Best scheme is selected (lowest latency) + 5. 
Best scheme is applied to all matching regions + + **Best Scheme Selection:** + - Automatically identifies scheme with lowest latency + - Excludes schemes with errors (error=True) + - Schemes with latency_ms = inf are considered unmeasured + - best_scheme property provides easy access to optimal configuration + + **Attributes:** + pattern: RegionPattern defining the structural signature + schemes: List of InsertionScheme candidates with measurements + """ + + pattern: Optional["RegionPattern"] = None # Structural pattern signature + schemes: list[InsertionScheme] = field(default_factory=list) # Candidate schemes + + @property + def pattern_signature(self) -> str: + """Get the pattern signature string.""" + return self.pattern.signature if self.pattern else "" + + @property + def pattern_size(self) -> int: + """Get the pattern size (total node count).""" + return self.pattern.size if self.pattern else 0 + + @property + def best_scheme_index(self) -> int: + """Get index of the best performing scheme (lowest latency). + + Scans all schemes to find the one with minimum latency_ms, + excluding schemes with errors. + If no schemes exist or all have errors, returns -1. + + Returns: + Index of best scheme (without errors), or -1 if no valid schemes available + """ + if len(self.schemes) == 0: + return -1 + min_idx, min_latency = -1, float("inf") + for idx, scheme in enumerate(self.schemes): + if not scheme.error and scheme.latency_ms < min_latency: + min_idx = idx + min_latency = scheme.latency_ms + return min_idx + + @property + def best_scheme(self) -> InsertionScheme | None: + """Get the best performing scheme (lowest latency). + + Convenience property for accessing the optimal scheme directly + without needing to look up by index. Excludes schemes with errors. + + Returns: + InsertionScheme with lowest latency (excluding error schemes), + or None if no valid schemes exist + """ + index = self.best_scheme_index + if index < 0 or index >= len(self.schemes): + return None + return self.schemes[index] + + @property + def num_schemes(self) -> int: + """Get total number of schemes.""" + return len(self.schemes) + + @property + def has_schemes(self) -> bool: + """Check if any schemes have been added.""" + return len(self.schemes) > 0 + + def add_scheme(self, scheme: InsertionScheme) -> None: + """Add a scheme to the collection. + + Args: + scheme: InsertionScheme to add + """ + self.schemes.append(scheme) + + def get_measured_schemes(self) -> list[InsertionScheme]: + """Get schemes that have been measured (finite latency). + + Returns: + List of schemes with performance measurements (excludes unmeasured schemes with inf latency) + """ + return [s for s in self.schemes if s.latency_ms != float("inf")] + + def get_valid_schemes(self) -> list[InsertionScheme]: + """Get schemes without errors. + + Returns: + List of schemes that completed successfully without errors + """ + return [s for s in self.schemes if not s.error] + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for serialization. + + Note: Excludes runtime objects like pattern (RegionPattern). + Only serializes metadata and schemes. + """ + return { + "pattern_signature": self.pattern_signature, + "pattern_size": self.pattern_size, + "schemes": [scheme.to_dict() for scheme in self.schemes], + } + + @classmethod + def from_dict( + cls, data: dict[str, Any], pattern: Optional["RegionPattern"] = None + ) -> "PatternSchemes": + """Create PatternSchemes from serialized dictionary. 
+ + Reconstructs the pattern schemes collection from saved data. The + RegionPattern object must be provided separately since it's not + serialized (it's a runtime object computed from the graph). + + If no pattern is provided, creates a minimal RegionPattern from the + saved signature and size for signature matching purposes. + + Args: + data: Dictionary containing 'pattern_signature', 'pattern_size', + and 'schemes' keys + pattern: RegionPattern object to associate (must match signature). + If None, creates minimal pattern from saved data. + + Returns: + Reconstructed PatternSchemes instance + """ + # Import here to avoid circular dependency at runtime + from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern + + ps = cls() + + # If no pattern provided, create minimal one from saved data + if pattern is None and "pattern_signature" in data: + pattern = RegionPattern( + signature=data["pattern_signature"], size=data.get("pattern_size", 0) + ) + + ps.pattern = pattern + + ps.schemes = [ + InsertionScheme.from_dict(scheme_data) for scheme_data in data.get("schemes", []) + ] + + return ps + + def __str__(self) -> str: + """String representation for debugging.""" + best_latency = self.best_scheme.latency_ms if self.best_scheme else 0.0 + return ( + f"PatternSchemes(pattern='{self.pattern_signature[:40]}...', " + f"schemes={self.num_schemes}, best_latency={best_latency:.3f}ms)" + ) + + +@dataclass +class PatternCache: + """Pattern cache containing best-performing schemes for patterns with automatic eviction. + + Stores a collection of PatternSchemes that can be used as seeds for autotuning. + Each PatternSchemes contains high-performing insertion schemes for a specific + pattern signature. The cache automatically evicts non-performant schemes based on: + - Error status (schemes with errors are evicted) + - Duplicate schemes (only better-performing duplicate is kept) + - Similarity (similar schemes where only better-performing one is kept) + - Count limit (only top N best schemes are kept per pattern) + """ + + pattern_schemes: list[PatternSchemes] = field(default_factory=list) + minimum_distance: int = 4 # Minimum distance between schemes in cache + max_entries_per_pattern: int = 32 # Maximum number of schemes per pattern (0 = no limit) + + def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: + """Add PatternSchemes to pattern cache with automatic eviction of non-performant entries. + + Merges new schemes with existing schemes for the same pattern, automatically + evicting schemes that are non-performant based on multiple criteria. 
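+        In order, eviction: drops schemes flagged with errors, deduplicates by
+        scheme hash (keeping the lower-latency copy), drops schemes closer than
+        ``minimum_distance`` to a better-performing scheme, and finally keeps
+        only the top ``max_entries_per_pattern`` schemes by latency.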
+ + Args: + pattern_schemes: PatternSchemes to add to the cache + """ + if not pattern_schemes or not pattern_schemes.pattern: + return + + pattern_sig = pattern_schemes.pattern_signature + + # Find existing PatternSchemes for this pattern + existing_idx = None + for idx, ps in enumerate(self.pattern_schemes): + if ps.pattern_signature == pattern_sig: + existing_idx = idx + break + + # Collect all schemes (existing + new) + all_schemes = list(pattern_schemes.schemes) + if existing_idx is not None: + all_schemes.extend(self.pattern_schemes[existing_idx].schemes) + + # Filter out schemes with errors and deduplicate by hash + valid_schemes = [s for s in all_schemes if not s.error] + unique_schemes = {} + for scheme in valid_schemes: + scheme_hash = scheme.hash + if ( + scheme_hash not in unique_schemes + or scheme.latency_ms < unique_schemes[scheme_hash].latency_ms + ): + unique_schemes[scheme_hash] = scheme + + # Sort by latency to get best schemes + sorted_schemes = sorted(unique_schemes.values(), key=lambda s: s.latency_ms) + + # Apply distance-based filtering if minimum_distance > 0 + if self.minimum_distance > 0: + filtered_schemes = [] + for scheme in sorted_schemes: + # Check if this scheme is too similar to any already-filtered scheme + too_similar = False + for existing_scheme in filtered_schemes: + distance = scheme.distance(existing_scheme) + if distance < self.minimum_distance: + # Schemes are too similar, keep the better one + if scheme.latency_ms < existing_scheme.latency_ms: + # New scheme is better, remove existing and add new + filtered_schemes.remove(existing_scheme) + break + else: + # Existing scheme is better, skip new one + too_similar = True + break + + if not too_similar: + filtered_schemes.append(scheme) + + sorted_schemes = filtered_schemes + + # Apply count limit if max_entries_per_pattern > 0 + # Keep only the top N best-performing schemes per pattern + if self.max_entries_per_pattern > 0: + sorted_schemes = sorted_schemes[: self.max_entries_per_pattern] + + # Create PatternSchemes with all schemes that passed the eviction criteria + result = PatternSchemes(pattern=pattern_schemes.pattern) + result.schemes = sorted_schemes + + # Replace existing or append new + if existing_idx is not None: + self.pattern_schemes[existing_idx] = result + else: + self.pattern_schemes.append(result) + + def get_pattern_schemes(self, pattern_signature: str) -> PatternSchemes | None: + """Get PatternSchemes for a specific pattern signature. + + Args: + pattern_signature: Pattern signature to lookup + + Returns: + PatternSchemes if found, None otherwise + """ + for ps in self.pattern_schemes: + if ps.pattern_signature == pattern_signature: + return ps + return None + + def has_pattern(self, pattern_signature: str) -> bool: + """Check if pattern cache contains a specific pattern. + + Args: + pattern_signature: Pattern signature to check + + Returns: + True if pattern exists in pattern cache + """ + return any(ps.pattern_signature == pattern_signature for ps in self.pattern_schemes) + + def add_pattern_from_region( + self, region: Region, graph: gs.Graph, quantized_tensors: set[str] + ) -> None: + """Build and add a pattern cache entry from a region in a quantized model. + + Analyzes a region from an already-quantized model to extract its Q/DQ + insertion scheme. This allows capturing known-good quantization strategies + from existing models and using them as seeds for autotuning. 
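+        Each candidate point from the region's full insertion scheme is probed
+        individually: the point is resolved to concrete tensor names, and it is
+        kept only if at least one resolved tensor appears in ``quantized_tensors``.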
+ + Args: + region: Region from the quantized model to analyze + graph: ONNX graph containing the region + quantized_tensors: Set of tensor names that have Q/DQ nodes + + Example: + >>> cache = PatternCache() + >>> for region in all_regions: + ... cache.add_pattern_from_region(region, graph, quantized_tensors) + >>> cache.save("learned_patterns.yaml") + """ + # Import here to avoid circular dependency at runtime + from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern + + # Create pattern from region + pattern = RegionPattern.from_region(region, graph) + # Track insertion points + scheme = InsertionScheme( + node_inputs=[], + child_region_inputs=[], + region_outputs=[], + latency_ms=float("inf"), + error=False, + ) + # Analyze node inputs + full_insertion_scheme = pattern.get_full_insertion_scheme(region, graph) + for point in full_insertion_scheme.node_inputs: + temp_scheme = InsertionScheme( + node_inputs=[point], + child_region_inputs=[], + region_outputs=[], + latency_ms=float("inf"), + error=False, + ) + temp_insertion_points: list[ResolvedInsertionPoint] = pattern.matches( + region, graph, temp_scheme + ) + temp_tensor_names = {tensor.tensor_name for tensor in temp_insertion_points} + if len(temp_tensor_names.intersection(quantized_tensors)) > 0: + scheme.node_inputs.append(point) + # Analyze region boundaries (for COMPOSITE regions) + if region.type == RegionType.COMPOSITE: + for point in full_insertion_scheme.child_region_inputs: + temp_scheme = InsertionScheme( + node_inputs=[], + child_region_inputs=[point], + region_outputs=[], + latency_ms=float("inf"), + error=False, + ) + temp_insertion_points: list[ResolvedInsertionPoint] = pattern.matches( + region, graph, temp_scheme + ) + temp_tensor_names = {tensor.tensor_name for tensor in temp_insertion_points} + if len(temp_tensor_names.intersection(quantized_tensors)) > 0: + scheme.child_region_inputs.append(point) + # Analyze region outputs + for point in full_insertion_scheme.region_outputs: + temp_scheme = InsertionScheme( + node_inputs=[], + child_region_inputs=[], + region_outputs=[point], + latency_ms=float("inf"), + error=False, + ) + temp_insertion_points: list[ResolvedInsertionPoint] = pattern.matches( + region, graph, temp_scheme + ) + temp_tensor_names = {tensor.tensor_name for tensor in temp_insertion_points} + if len(temp_tensor_names.intersection(quantized_tensors)) > 0: + scheme.region_outputs.append(point) + # Add pattern and scheme to pattern cache + pattern_schemes = PatternSchemes(pattern=pattern, schemes=[scheme]) + self.add_pattern_schemes(pattern_schemes) + num_points = ( + len(scheme.node_inputs) + len(scheme.child_region_inputs) + len(scheme.region_outputs) + ) + logger.debug( + f"Added pattern from region {region.id} with {num_points} insertion points" + ) + # Add patterns from child regions + if region.type == RegionType.COMPOSITE: + for child_region in region.get_children(): + self.add_pattern_from_region(child_region, graph, quantized_tensors) + + @property + def num_patterns(self) -> int: + """Get number of patterns in pattern cache.""" + return len(self.pattern_schemes) + + @property + def total_schemes(self) -> int: + """Get total number of schemes across all patterns.""" + return sum(ps.num_schemes for ps in self.pattern_schemes) + + def get_all_pattern_signatures(self) -> list[str]: + """Get list of all pattern signatures in pattern cache. 
+ + Returns: + List of pattern signature strings + """ + return [ps.pattern_signature for ps in self.pattern_schemes] + + def clear(self) -> None: + """Clear all pattern cache data.""" + self.pattern_schemes.clear() + + def merge(self, other: "PatternCache", prefer_existing: bool = True) -> None: + """Merge another PatternCache into this one. + + Args: + other: PatternCache to merge + prefer_existing: If True, keep existing patterns when there's a conflict. + If False, overwrite with other's patterns. + """ + for schemes in other.pattern_schemes: + if not self.has_pattern(schemes.pattern_signature) or not prefer_existing: + self.add_pattern_schemes(schemes) + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for serialization. + + Returns: + Dictionary with 'minimum_distance', 'max_entries_per_pattern', and 'pattern_schemes' keys + """ + return { + "minimum_distance": self.minimum_distance, + "max_entries_per_pattern": self.max_entries_per_pattern, + "pattern_schemes": [ps.to_dict() for ps in self.pattern_schemes], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "PatternCache": + """Create PatternCache from serialized dictionary. + + Note: RegionPattern objects are not restored (they're runtime objects). + Only pattern signatures and scheme data are loaded. + + Args: + data: Dictionary containing pattern cache data + + Returns: + Reconstructed PatternCache instance + """ + cache = cls( + minimum_distance=data.get("minimum_distance", 4), + max_entries_per_pattern=data.get("max_entries_per_pattern", 32), + ) + + for ps_data in data.get("pattern_schemes", []): + # Create PatternSchemes without pattern object (pattern=None) + ps = PatternSchemes.from_dict(ps_data, pattern=None) + cache.pattern_schemes.append(ps) + + return cache + + def save(self, output_path: str) -> None: + """Save pattern cache to a YAML file. + + Serializes all pattern schemes and their insertion points to a YAML file + that can be loaded later for seeded autotuning. The format matches the + autotuner state file format for consistency. + + Args: + output_path: File path where the YAML pattern cache file will be written + """ + state = self.to_dict() + + with open(output_path, "w") as f: + yaml.dump(state, f, default_flow_style=False, sort_keys=False) + + logger.info( + f"Saved pattern cache → {output_path} ({self.num_patterns} patterns, " + f"{self.total_schemes} schemes)" + ) + logger.debug( + f"Cache settings: min_distance={self.minimum_distance}, " + f"max_per_pattern={self.max_entries_per_pattern}" + ) + + @classmethod + def load(cls, input_path: str) -> "PatternCache": + """Load pattern cache from a YAML file. + + Reads a previously saved pattern cache file and reconstructs all pattern + schemes. The loaded pattern cache can be used to seed autotuning with + known-good insertion schemes. 
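+
+        Example (a sketch; assumes ``patterns.yaml`` was written earlier by ``save()``):
+            >>> cache = PatternCache.load("patterns.yaml")
+            >>> print(cache.num_patterns, cache.total_schemes)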
+
+        Args:
+            input_path: File path to the YAML pattern cache file to load
+
+        Returns:
+            PatternCache instance with all pattern schemes loaded
+
+        Raises:
+            FileNotFoundError: If the input_path doesn't exist
+        """
+        with open(input_path) as f:
+            state = yaml.safe_load(f)
+
+        cache = cls.from_dict(state)
+
+        logger.info(
+            f"Loaded pattern cache from {input_path} ({cache.num_patterns} patterns, "
+            f"{cache.total_schemes} schemes)"
+        )
+        logger.debug(
+            f"Cache settings: min_distance={cache.minimum_distance}, "
+            f"max_per_pattern={cache.max_entries_per_pattern}"
+        )
+
+        return cache
+
+    def __str__(self) -> str:
+        """String representation for debugging."""
+        return (
+            f"PatternCache(patterns={self.num_patterns}, "
+            f"schemes={self.total_schemes}, "
+            f"minimum_distance={self.minimum_distance}, "
+            f"max_entries_per_pattern={self.max_entries_per_pattern})"
+        )
+
+
 @dataclass
 class Config:
     """Configuration parameters for QDQ autotuning.
diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py
index 025cdb94e..9ec99d65d 100644
--- a/tests/unit/onnx/quantization/autotune/test_config.py
+++ b/tests/unit/onnx/quantization/autotune/test_config.py
@@ -19,12 +19,10 @@
 Tests configuration parameter validation and defaults.
 """
 
-import unittest
-
 from modelopt.onnx.quantization.autotune.common import Config
 
 
-class TestConfig(unittest.TestCase):
+class TestConfig:
     """Test Config class functionality."""
 
     def test_default_values(self):

From 0414b81adb9ce18596642945ceb76d12145cd257 Mon Sep 17 00:00:00 2001
From: Will Guo
Date: Thu, 12 Feb 2026 00:34:22 +0000
Subject: [PATCH 5/5] add test for workflow

Signed-off-by: Will Guo
---
 .../quantization/autotune/test_workflow.py    | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 tests/unit/onnx/quantization/autotune/test_workflow.py

diff --git a/tests/unit/onnx/quantization/autotune/test_workflow.py b/tests/unit/onnx/quantization/autotune/test_workflow.py
new file mode 100644
index 000000000..7aadb3d2f
--- /dev/null
+++ b/tests/unit/onnx/quantization/autotune/test_workflow.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+from pathlib import Path
+
+import onnx
+import pytest
+
+from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec
+import models as _test_models
+
+from modelopt.onnx.quantization.autotune.workflows import (
+    init_benchmark_instance,
+    region_pattern_autotuning_workflow,
+)
+
+@pytest.fixture
+def simple_conv_model():
+    """Simple ONNX model: Input -> Conv -> Relu -> Output. 
Created via models.py."""
+    return _test_models._create_simple_conv_onnx_model()
+
+@pytest.mark.parametrize("use_trtexec", [True, False])
+def test_export_quantized_model(use_trtexec, simple_conv_model):
+    """Test exporting quantized model with Q/DQ."""
+    if use_trtexec:
+        skip_if_no_trtexec()
+    else:
+        skip_if_no_tensorrt()
+
+    with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f:
+        baseline_model_path = f.name
+
+    # Save baseline model
+    onnx.save(simple_conv_model, baseline_model_path)
+
+    output_dir = baseline_model_path.removesuffix(".onnx")
+    output_path = output_dir + ".quant.onnx"
+
+    try:
+        init_benchmark_instance(use_trtexec=use_trtexec)
+        autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir))
+
+        # Export model with Q/DQ insertion
+        autotuner.export_onnx(output_path, insert_qdq=True)
+
+        # Verify file was created
+        assert os.path.exists(output_path)
+
+        # Verify it's a valid ONNX model
+        exported_model = onnx.load(output_path)
+        assert exported_model is not None
+
+        # Verify that it contains Q/DQ nodes
+        qdq_nodes = [
+            n
+            for n in exported_model.graph.node
+            if n.op_type in ["QuantizeLinear", "DequantizeLinear"]
+        ]
+        assert qdq_nodes, "Q/DQ nodes not found in quantized model"
+
+        print("✓ QDQAutotuner export quantized model")
+    finally:
+        os.unlink(baseline_model_path)
+        if os.path.exists(output_path):
+            os.unlink(output_path)