From e17027ad52054fb670fe5d7a4aa5b8c58b2a9f4e Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 2 Feb 2026 15:15:40 -0800
Subject: [PATCH 1/3] Begin Cythonization of _program.py

- Rename _program.py to _program.pyx
- Convert Program to cdef class with _program.pxd declarations
- Extract _MembersNeededForFinalize to module-level _ProgramMNFF
  (nested classes not allowed in cdef class)
- Add __repr__ method to Program
- Keep ProgramOptions as @dataclass (unchanged)
- Keep weakref.finalize pattern for handle cleanup
---
 cuda_core/cuda/core/_program.pxd              | 12 +++++
 .../cuda/core/{_program.py => _program.pyx}   | 46 ++++++++++---------
 2 files changed, 37 insertions(+), 21 deletions(-)
 create mode 100644 cuda_core/cuda/core/_program.pxd
 rename cuda_core/cuda/core/{_program.py => _program.pyx} (97%)
diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
new file mode 100644
index 0000000000..7dc89cf87d
--- /dev/null
+++ b/cuda_core/cuda/core/_program.pxd
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+cdef class Program:
+    cdef:
+        object _mnff
+        str _backend
+        object _linker  # Linker (not yet Cythonized)
+        object _options  # ProgramOptions
+        object __weakref__
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.pyx
similarity index 97%
rename from cuda_core/cuda/core/_program.py
rename to cuda_core/cuda/core/_program.pyx
index 1ef1aa51f5..45e5441cac 100644
--- a/cuda_core/cuda/core/_program.py
+++ b/cuda_core/cuda/core/_program.pyx
@@ -631,7 +631,27 @@ def __repr__(self):
 ProgramHandleT = Union["cuda.bindings.nvrtc.nvrtcProgram", LinkerHandleT]
 
 
-class Program:
+class _ProgramMNFF:
+    """Members needed for finalization: the program handle and backend used to release it."""
+
+    __slots__ = "handle", "backend"
+
+    def __init__(self, program_obj, handle, backend):
+        self.handle = handle
+        self.backend = backend
+        weakref.finalize(program_obj, self.close)
+
+    def close(self):
+        if self.handle is not None:
+            if self.backend == "NVRTC":
+                handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
+            elif self.backend == "NVVM":
+                nvvm = _get_nvvm_module()
+                nvvm.destroy_program(self.handle)
+            self.handle = None
+
+
+cdef class Program:
     """Represent a compilation machinery to process programs into
     :obj:`~_module.ObjectCode`.
 
@@ -650,27 +670,8 @@ class Program:
         See :obj:`ProgramOptions` for more information.
     """
 
-    class _MembersNeededForFinalize:
-        __slots__ = "handle", "backend"
-
-        def __init__(self, program_obj, handle, backend):
-            self.handle = handle
-            self.backend = backend
-            weakref.finalize(program_obj, self.close)
-
-        def close(self):
-            if self.handle is not None:
-                if self.backend == "NVRTC":
-                    handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
-                elif self.backend == "NVVM":
-                    nvvm = _get_nvvm_module()
-                    nvvm.destroy_program(self.handle)
-                self.handle = None
-
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
-
     def __init__(self, code, code_type, options: ProgramOptions = None):
-        self._mnff = Program._MembersNeededForFinalize(self, None, None)
+        self._mnff = _ProgramMNFF(self, None, None)
 
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
         code_type = code_type.lower()
@@ -858,3 +859,6 @@ def handle(self) -> ProgramHandleT:
             handle, call ``int(Program.handle)``.
         """
         return self._mnff.handle
+
+    def __repr__(self) -> str:
+        return f"<Program backend='{self._backend}'>"

From 2f47e9ee08da9585be80636751dbf9c65d7bb314 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 2 Feb 2026 15:23:38 -0800
Subject: [PATCH 2/3] Extract Program helpers to module-level cdef functions

- Move _translate_program_options to Program_translate_options (cdef)
- Move _can_load_generated_ptx to Program_can_load_generated_ptx (cdef)
- Remove unused TYPE_CHECKING import block
- Follow _memory/_buffer.pyx helper function patterns
---
 cuda_core/cuda/core/_program.pyx | 65 +++++++++++++++++---------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 45e5441cac..4d2eccd893 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -7,12 +7,9 @@ from __future__ import annotations
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
+from typing import Union
 from warnings import warn
 
-if TYPE_CHECKING:
-    import cuda.bindings
-
 from cuda.core._device import Device
 from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
 from cuda.core._module import ObjectCode
@@ -689,7 +686,7 @@ cdef class Program:
         elif code_type == "ptx":
             assert_type(code, str)
             self._linker = Linker(
-                ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
+                ObjectCode._init(code.encode(), code_type), options=Program_translate_options(options)
             )
             self._backend = self._linker.backend
 
@@ -711,36 +708,12 @@ cdef class Program:
             assert code_type not in supported_code_types, f"{code_type=}"
             raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
 
-    def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
-        return LinkerOptions(
-            name=options.name,
-            arch=options.arch,
-            max_register_count=options.max_register_count,
-            time=options.time,
-            link_time_optimization=options.link_time_optimization,
-            debug=options.debug,
-            lineinfo=options.lineinfo,
-            ftz=options.ftz,
-            prec_div=options.prec_div,
-            prec_sqrt=options.prec_sqrt,
-            fma=options.fma,
-            split_compile=options.split_compile,
-            ptxas_options=options.ptxas_options,
-            no_cache=options.no_cache,
-        )
-
     def close(self):
         """Destroy this program."""
         if self._linker:
             self._linker.close()
         self._mnff.close()
 
-    @staticmethod
-    def _can_load_generated_ptx():
-        driver_ver = handle_return(driver.cuDriverGetVersion())
-        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
-        return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
-
     def compile(self, target_type, name_expressions=(), logs=None):
         """Compile the program with a specific compilation type.
 
@@ -768,7 +741,7 @@ cdef class Program:
             raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})')
 
         if self._backend == "NVRTC":
-            if target_type == "ptx" and not self._can_load_generated_ptx():
+            if target_type == "ptx" and not Program_can_load_generated_ptx():
                 warn(
                     "The CUDA driver version is older than the backend version. "
                     "The generated ptx will not be loadable by the current driver.",
@@ -862,3 +835,35 @@ cdef class Program:
 
     def __repr__(self) -> str:
         return f"<Program backend='{self._backend}'>"
+
+
+# =============================================================================
+# Helper functions
+# =============================================================================
+
+
+cdef bint Program_can_load_generated_ptx():
+    """Check if the driver can load PTX generated by the current NVRTC version."""
+    driver_ver = handle_return(driver.cuDriverGetVersion())
+    nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
+    return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
+
+
+cdef object Program_translate_options(object options):
+    """Translate ProgramOptions to LinkerOptions for PTX compilation."""
+    return LinkerOptions(
+        name=options.name,
+        arch=options.arch,
+        max_register_count=options.max_register_count,
+        time=options.time,
+        link_time_optimization=options.link_time_optimization,
+        debug=options.debug,
+        lineinfo=options.lineinfo,
+        ftz=options.ftz,
+        prec_div=options.prec_div,
+        prec_sqrt=options.prec_sqrt,
+        fma=options.fma,
+        split_compile=options.split_compile,
+        ptxas_options=options.ptxas_options,
+        no_cache=options.no_cache,
+    )

From 85dbbb5918fa6c7422c71d581ecda59bca94adda Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 2 Feb 2026 16:54:21 -0800
Subject: [PATCH 3/3] Complete Cythonization of _program.py

- Reorganize file structure per developer guide (principal class first)
- Add module docstring, __all__, type alias section
- Factor long methods into cdef inline helpers
- Add proper exception specs to cdef functions
- Fix docstrings (use :class: refs, public paths)
- Add type annotations to public methods
- Inline _nvvm_exception_manager (single use)
- Remove Union import, use | syntax
- Add public Program.driver_can_load_nvrtc_ptx_output() API
- Update tests to use new public API

Closes #1082
---
 cuda_core/cuda/core/_program.pxd |    2 +-
 cuda_core/cuda/core/_program.pyx | 1037 ++++++++++++++++--------------
 cuda_core/tests/test_module.py   |    4 +-
 3 files changed, 544 insertions(+), 499 deletions(-)

diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
index 7dc89cf87d..444257f1e4 100644
--- a/cuda_core/cuda/core/_program.pxd
+++ b/cuda_core/cuda/core/_program.pxd
@@ -7,6 +7,6 @@ cdef class Program:
     cdef:
         object _mnff
         str _backend
-        object _linker  # Linker (not yet Cythonized)
+        object _linker  # Linker
         object _options  # ProgramOptions
         object __weakref__
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 4d2eccd893..79a3cd4f7f 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -1,15 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+"""Compilation machinery for CUDA programs.
+
+This module provides :class:`Program` for compiling source code into
+:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration.
+"""
 
 from __future__ import annotations
 
 import weakref
-from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Union
 from warnings import warn
 
+from cuda.bindings import driver, nvrtc
 from cuda.core._device import Device
 from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
 from cuda.core._module import ObjectCode
@@ -18,115 +22,127 @@ from cuda.core._utils.cuda_utils import (
     CUDAError,
     _handle_boolean_option,
     check_or_create_options,
-    driver,
     get_binding_version,
     handle_return,
     is_nested_sequence,
     is_sequence,
-    nvrtc,
 )
 
+__all__ = ["Program", "ProgramOptions"]
 
-@contextmanager
-def _nvvm_exception_manager(self):
-    """
-    Taken from _linker.py
-    """
-    try:
-        yield
-    except Exception as e:
-        error_log = ""
-        if hasattr(self, "_mnff"):
-            try:
-                nvvm = _get_nvvm_module()
-                logsize = nvvm.get_program_log_size(self._mnff.handle)
-                if logsize > 1:
-                    log = bytearray(logsize)
-                    nvvm.get_program_log(self._mnff.handle, log)
-                    error_log = log.decode("utf-8", errors="backslashreplace")
-            except Exception:
-                error_log = ""
-        # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
-        # unfortunately we are still supporting Python 3.10...
-        e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:])
-        raise e
+ProgramHandleT = nvrtc.nvrtcProgram | LinkerHandleT
+"""Type alias for program handle types across different backends."""
 
 
-_nvvm_module = None
-_nvvm_import_attempted = False
+# =============================================================================
+# Principal Class
+# =============================================================================
 
 
-def _get_nvvm_module():
-    """
-    Handles the import of NVVM module with version and availability checks.
-    NVVM bindings were added in cuda-bindings 12.9.0, so we need to handle cases where:
-    1. cuda.bindings is not new enough (< 12.9.0)
-    2. libnvvm is not found in the Python environment
+cdef class Program:
+    """Represent a compilation machinery to process programs into
+    :class:`~cuda.core.ObjectCode`.
 
-    Returns:
-        The nvvm module if available and working
+    This object provides a unified interface to multiple underlying
+    compiler libraries. Compilation support is enabled for a wide
+    range of code types and compilation types.
 
-    Raises:
-        RuntimeError: If NVVM is not available due to version or library issues
+    Parameters
+    ----------
+    code : str | bytes | bytearray
+        The source code to compile. For C++ and PTX, must be a string.
+        For NVVM IR, can be str, bytes, or bytearray.
+    code_type : str
+        The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``.
+    options : :class:`ProgramOptions`, optional
+        Options to customize the compilation process.
     """
-    global _nvvm_module, _nvvm_import_attempted
 
-    if _nvvm_import_attempted:
-        if _nvvm_module is None:
-            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
-        return _nvvm_module
+    def __init__(self, code: str | bytes | bytearray, code_type: str, options: ProgramOptions | None = None):
+        Program_init(self, code, code_type, options)
 
-    _nvvm_import_attempted = True
+    def close(self):
+        """Destroy this program."""
+        if self._linker:
+            self._linker.close()
+        self._mnff.close()
 
-    try:
-        version = get_binding_version()
-        if version < (12, 9):
-            raise RuntimeError(
-                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
-                "Please update cuda-bindings to use NVVM features."
-            )
+    def compile(
+        self, target_type: str, name_expressions: tuple | list = (), logs = None
+    ) -> ObjectCode:
+        """Compile the program to the specified target type.
 
-        from cuda.bindings import nvvm
-        from cuda.bindings._internal.nvvm import _inspect_function_pointer
+        Parameters
+        ----------
+        target_type : str
+            The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``.
+        name_expressions : tuple | list, optional
+            Sequence of name expressions to make accessible in the compiled code.
+            Used for template instantiation and similar cases.
+        logs : object, optional
+            Object with a ``write`` method to receive compilation logs.
 
-        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
-            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
+        Returns
+        -------
+        :class:`~cuda.core.ObjectCode`
+            The compiled object code.
+        """
+        return Program_compile(self, target_type, name_expressions, logs)
 
-        _nvvm_module = nvvm
-        return _nvvm_module
+    @property
+    def backend(self) -> str:
+        """Return this Program instance's underlying backend."""
+        return self._backend
 
-    except RuntimeError as e:
-        _nvvm_module = None
-        raise e
+    @property
+    def handle(self) -> ProgramHandleT:
+        """Return the underlying handle object.
 
+        .. note::
 
-def _process_define_macro_inner(formatted_options, macro):
-    if isinstance(macro, str):
-        formatted_options.append(f"--define-macro={macro}")
-        return True
-    if isinstance(macro, tuple):
-        if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
-            raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
-        formatted_options.append(f"--define-macro={macro[0]}={macro[1]}")
-        return True
-    return False
+           The type of the returned object depends on the backend.
 
+        .. caution::
 
-def _process_define_macro(formatted_options, macro):
-    union_type = "Union[str, tuple[str, str]]"
-    if _process_define_macro_inner(formatted_options, macro):
-        return
-    if is_nested_sequence(macro):
-        for seq_macro in macro:
-            if not _process_define_macro_inner(formatted_options, seq_macro):
-                raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
-        return
-    raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Program.handle)``.
+        """
+        return self._mnff.handle
+
+    @staticmethod
+    def driver_can_load_nvrtc_ptx_output() -> bool:
+        """Check if the CUDA driver can load PTX generated by NVRTC.
+
+        NVRTC generates PTX targeting a specific CUDA version. If the installed
+        driver is older than the NVRTC version, it may not be able to load the
+        generated PTX.
+
+        Returns
+        -------
+        bool
+            True if the driver version is new enough to load PTX generated
+            by the current NVRTC version, False otherwise.
+
+        Examples
+        --------
+        >>> if Program.driver_can_load_nvrtc_ptx_output():
+        ...     obj = program.compile("ptx")
+        ...     kernel = obj.get_kernel("my_kernel")
+        """
+        return _can_load_generated_ptx()
+
+    def __repr__(self) -> str:
+        return f"<Program backend='{self._backend}'>"
+
+
+# =============================================================================
+# Other Public Classes
+# =============================================================================
 
 
 @dataclass
 class ProgramOptions:
-    """Customizable options for configuring `Program`.
+    """Customizable options for configuring :class:`Program`.
 
     Attributes
     ----------
@@ -149,7 +165,7 @@ class ProgramOptions:
         Generate line-number information.
         Default: False
     device_code_optimize : bool, optional
-        Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation
+        Enable device code optimization. When specified along with '-G', enables limited debug information generation
         for optimized device code.
         Default: None
     ptxas_options : Union[str, list[str]], optional
@@ -351,238 +367,10 @@ class ProgramOptions:
             self.arch = f"sm_{Device().arch}"
 
     def _prepare_nvrtc_options(self) -> list[bytes]:
-        # Build NVRTC-specific options
-        options = [f"-arch={self.arch}"]
-        if self.relocatable_device_code is not None:
-            options.append(f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}")
-        if self.extensible_whole_program is not None and self.extensible_whole_program:
-            options.append("--extensible-whole-program")
-        if self.debug is not None and self.debug:
-            options.append("--device-debug")
-        if self.lineinfo is not None and self.lineinfo:
-            options.append("--generate-line-info")
-        if self.device_code_optimize is not None and self.device_code_optimize:
-            options.append("--dopt=on")
-        if self.ptxas_options is not None:
-            opt_name = "--ptxas-options"
-            if isinstance(self.ptxas_options, str):
-                options.append(f"{opt_name}={self.ptxas_options}")
-            elif is_sequence(self.ptxas_options):
-                for opt_value in self.ptxas_options:
-                    options.append(f"{opt_name}={opt_value}")
-        if self.max_register_count is not None:
-            options.append(f"--maxrregcount={self.max_register_count}")
-        if self.ftz is not None:
-            options.append(f"--ftz={_handle_boolean_option(self.ftz)}")
-        if self.prec_sqrt is not None:
-            options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}")
-        if self.prec_div is not None:
-            options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}")
-        if self.fma is not None:
-            options.append(f"--fmad={_handle_boolean_option(self.fma)}")
-        if self.use_fast_math is not None and self.use_fast_math:
-            options.append("--use_fast_math")
-        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
-            options.append("--extra-device-vectorization")
-        if self.link_time_optimization is not None and self.link_time_optimization:
-            options.append("--dlink-time-opt")
-        if self.gen_opt_lto is not None and self.gen_opt_lto:
-            options.append("--gen-opt-lto")
-        if self.define_macro is not None:
-            _process_define_macro(options, self.define_macro)
-        if self.undefine_macro is not None:
-            if isinstance(self.undefine_macro, str):
-                options.append(f"--undefine-macro={self.undefine_macro}")
-            elif is_sequence(self.undefine_macro):
-                for macro in self.undefine_macro:
-                    options.append(f"--undefine-macro={macro}")
-        if self.include_path is not None:
-            if isinstance(self.include_path, str):
-                options.append(f"--include-path={self.include_path}")
-            elif is_sequence(self.include_path):
-                for path in self.include_path:
-                    options.append(f"--include-path={path}")
-        if self.pre_include is not None:
-            if isinstance(self.pre_include, str):
-                options.append(f"--pre-include={self.pre_include}")
-            elif is_sequence(self.pre_include):
-                for header in self.pre_include:
-                    options.append(f"--pre-include={header}")
-        if self.no_source_include is not None and self.no_source_include:
-            options.append("--no-source-include")
-        if self.std is not None:
-            options.append(f"--std={self.std}")
-        if self.builtin_move_forward is not None:
-            options.append(f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}")
-        if self.builtin_initializer_list is not None:
-            options.append(f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}")
-        if self.disable_warnings is not None and self.disable_warnings:
-            options.append("--disable-warnings")
-        if self.restrict is not None and self.restrict:
-            options.append("--restrict")
-        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
-            options.append("--device-as-default-execution-space")
-        if self.device_int128 is not None and self.device_int128:
-            options.append("--device-int128")
-        if self.device_float128 is not None and self.device_float128:
-            options.append("--device-float128")
-        if self.optimization_info is not None:
-            options.append(f"--optimization-info={self.optimization_info}")
-        if self.no_display_error_number is not None and self.no_display_error_number:
-            options.append("--no-display-error-number")
-        if self.diag_error is not None:
-            if isinstance(self.diag_error, int):
-                options.append(f"--diag-error={self.diag_error}")
-            elif is_sequence(self.diag_error):
-                for error in self.diag_error:
-                    options.append(f"--diag-error={error}")
-        if self.diag_suppress is not None:
-            if isinstance(self.diag_suppress, int):
-                options.append(f"--diag-suppress={self.diag_suppress}")
-            elif is_sequence(self.diag_suppress):
-                for suppress in self.diag_suppress:
-                    options.append(f"--diag-suppress={suppress}")
-        if self.diag_warn is not None:
-            if isinstance(self.diag_warn, int):
-                options.append(f"--diag-warn={self.diag_warn}")
-            elif is_sequence(self.diag_warn):
-                for warn in self.diag_warn:
-                    options.append(f"--diag-warn={warn}")
-        if self.brief_diagnostics is not None:
-            options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}")
-        if self.time is not None:
-            options.append(f"--time={self.time}")
-        if self.split_compile is not None:
-            options.append(f"--split-compile={self.split_compile}")
-        if self.fdevice_syntax_only is not None and self.fdevice_syntax_only:
-            options.append("--fdevice-syntax-only")
-        if self.minimal is not None and self.minimal:
-            options.append("--minimal")
-        if self.no_cache is not None and self.no_cache:
-            options.append("--no-cache")
-        if self.fdevice_time_trace is not None:
-            options.append(f"--fdevice-time-trace={self.fdevice_time_trace}")
-        if self.frandom_seed is not None:
-            options.append(f"--frandom-seed={self.frandom_seed}")
-        if self.ofast_compile is not None:
-            options.append(f"--Ofast-compile={self.ofast_compile}")
-        # PCH options (CUDA 12.8+)
-        if self.pch is not None and self.pch:
-            options.append("--pch")
-        if self.create_pch is not None:
-            options.append(f"--create-pch={self.create_pch}")
-        if self.use_pch is not None:
-            options.append(f"--use-pch={self.use_pch}")
-        if self.pch_dir is not None:
-            options.append(f"--pch-dir={self.pch_dir}")
-        if self.pch_verbose is not None:
-            options.append(f"--pch-verbose={_handle_boolean_option(self.pch_verbose)}")
-        if self.pch_messages is not None:
-            options.append(f"--pch-messages={_handle_boolean_option(self.pch_messages)}")
-        if self.instantiate_templates_in_pch is not None:
-            options.append(
-                f"--instantiate-templates-in-pch={_handle_boolean_option(self.instantiate_templates_in_pch)}"
-            )
-        if self.numba_debug:
-            options.append("--numba-debug")
-        return [o.encode() for o in options]
+        return _prepare_nvrtc_options_impl(self)
 
     def _prepare_nvvm_options(self, as_bytes: bool = True) -> list[bytes] | list[str]:
-        options = []
-
-        # Options supported by NVVM
-        assert self.arch is not None
-        arch = self.arch
-        if arch.startswith("sm_"):
-            arch = f"compute_{arch[3:]}"
-        options.append(f"-arch={arch}")
-        if self.debug is not None and self.debug:
-            options.append("-g")
-        if self.device_code_optimize is False:
-            options.append("-opt=0")
-        elif self.device_code_optimize is True:
-            options.append("-opt=3")
-        # NVVM uses 0/1 instead of true/false for boolean options
-        if self.ftz is not None:
-            options.append(f"-ftz={'1' if self.ftz else '0'}")
-        if self.prec_sqrt is not None:
-            options.append(f"-prec-sqrt={'1' if self.prec_sqrt else '0'}")
-        if self.prec_div is not None:
-            options.append(f"-prec-div={'1' if self.prec_div else '0'}")
-        if self.fma is not None:
-            options.append(f"-fma={'1' if self.fma else '0'}")
-
-        # Check for unsupported options and raise error if they are set
-        unsupported = []
-        if self.relocatable_device_code is not None:
-            unsupported.append("relocatable_device_code")
-        if self.extensible_whole_program is not None and self.extensible_whole_program:
-            unsupported.append("extensible_whole_program")
-        if self.lineinfo is not None and self.lineinfo:
-            unsupported.append("lineinfo")
-        if self.ptxas_options is not None:
-            unsupported.append("ptxas_options")
-        if self.max_register_count is not None:
-            unsupported.append("max_register_count")
-        if self.use_fast_math is not None and self.use_fast_math:
-            unsupported.append("use_fast_math")
-        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
-            unsupported.append("extra_device_vectorization")
-        if self.gen_opt_lto is not None and self.gen_opt_lto:
-            unsupported.append("gen_opt_lto")
-        if self.define_macro is not None:
-            unsupported.append("define_macro")
-        if self.undefine_macro is not None:
-            unsupported.append("undefine_macro")
-        if self.include_path is not None:
-            unsupported.append("include_path")
-        if self.pre_include is not None:
-            unsupported.append("pre_include")
-        if self.no_source_include is not None and self.no_source_include:
-            unsupported.append("no_source_include")
-        if self.std is not None:
-            unsupported.append("std")
-        if self.builtin_move_forward is not None:
-            unsupported.append("builtin_move_forward")
-        if self.builtin_initializer_list is not None:
-            unsupported.append("builtin_initializer_list")
-        if self.disable_warnings is not None and self.disable_warnings:
-            unsupported.append("disable_warnings")
-        if self.restrict is not None and self.restrict:
-            unsupported.append("restrict")
-        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
-            unsupported.append("device_as_default_execution_space")
-        if self.device_int128 is not None and self.device_int128:
-            unsupported.append("device_int128")
-        if self.optimization_info is not None:
-            unsupported.append("optimization_info")
-        if self.no_display_error_number is not None and self.no_display_error_number:
-            unsupported.append("no_display_error_number")
-        if self.diag_error is not None:
-            unsupported.append("diag_error")
-        if self.diag_suppress is not None:
-            unsupported.append("diag_suppress")
-        if self.diag_warn is not None:
-            unsupported.append("diag_warn")
-        if self.brief_diagnostics is not None:
-            unsupported.append("brief_diagnostics")
-        if self.time is not None:
-            unsupported.append("time")
-        if self.split_compile is not None:
-            unsupported.append("split_compile")
-        if self.fdevice_syntax_only is not None and self.fdevice_syntax_only:
-            unsupported.append("fdevice_syntax_only")
-        if self.minimal is not None and self.minimal:
-            unsupported.append("minimal")
-        if self.numba_debug is not None and self.numba_debug:
-            unsupported.append("numba_debug")
-        if unsupported:
-            raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}")
-
-        if as_bytes:
-            return [o.encode() for o in options]
-        else:
-            return options
+        return _prepare_nvvm_options_impl(self, as_bytes)
 
     def as_bytes(self, backend: str) -> list[bytes]:
         """Convert program options to bytes format for the specified backend.
@@ -625,7 +413,13 @@ class ProgramOptions:
         return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
 
 
-ProgramHandleT = Union["cuda.bindings.nvrtc.nvrtcProgram", LinkerHandleT]
+# =============================================================================
+# Private Classes and Helper Functions
+# =============================================================================
+
+# Module-level state for NVVM lazy loading (both names are declared ``global`` in _get_nvvm_module)
+cdef object _nvvm_module = None
+cdef bint _nvvm_import_attempted = False
 
 
 class _ProgramMNFF:
@@ -648,208 +442,73 @@ class _ProgramMNFF:
             self.handle = None
 
 
-cdef class Program:
-    """Represent a compilation machinery to process programs into
-    :obj:`~_module.ObjectCode`.
-
-    This object provides a unified interface to multiple underlying
-    compiler libraries. Compilation support is enabled for a wide
-    range of code types and compilation types.
-
-    Parameters
-    ----------
-    code : Any
-        String of the CUDA Runtime Compilation program.
-    code_type : Any
-        String of the code type. Currently ``"ptx"``, ``"c++"``, and ``"nvvm"`` are supported.
-    options : ProgramOptions, optional
-        A ProgramOptions object to customize the compilation process.
-        See :obj:`ProgramOptions` for more information.
-    """
-
-    def __init__(self, code, code_type, options: ProgramOptions = None):
-        self._mnff = _ProgramMNFF(self, None, None)
-
-        self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
-        code_type = code_type.lower()
-
-        if code_type == "c++":
-            assert_type(code, str)
-            # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-
-            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
-            self._mnff.backend = "NVRTC"
-            self._backend = "NVRTC"
-            self._linker = None
-
-        elif code_type == "ptx":
-            assert_type(code, str)
-            self._linker = Linker(
-                ObjectCode._init(code.encode(), code_type), options=Program_translate_options(options)
-            )
-            self._backend = self._linker.backend
-
-        elif code_type == "nvvm":
-            if isinstance(code, str):
-                code = code.encode("utf-8")
-            elif not isinstance(code, (bytes, bytearray)):
-                raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
-
-            nvvm = _get_nvvm_module()
-            self._mnff.handle = nvvm.create_program()
-            self._mnff.backend = "NVVM"
-            nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
-            self._backend = "NVVM"
-            self._linker = None
-
-        else:
-            supported_code_types = ("c++", "ptx", "nvvm")
-            assert code_type not in supported_code_types, f"{code_type=}"
-            raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
-
-    def close(self):
-        """Destroy this program."""
-        if self._linker:
-            self._linker.close()
-        self._mnff.close()
-
-    def compile(self, target_type, name_expressions=(), logs=None):
-        """Compile the program with a specific compilation type.
+def _get_nvvm_module():
+    """Return ``cuda.bindings.nvvm``, imported on first use after version and libnvvm availability checks."""
+    global _nvvm_module, _nvvm_import_attempted
 
-        Parameters
-        ----------
-        target_type : Any
-            String of the targeted compilation type.
-            Supported options are "ptx", "cubin" and "ltoir".
-        name_expressions : Union[list, tuple], optional
-            List of explicit name expressions to become accessible.
-            (Default to no expressions)
-        logs : Any, optional
-            Object with a write method to receive the logs generated
-            from compilation.
-            (Default to no logs)
+    if _nvvm_import_attempted:
+        if _nvvm_module is None:
+            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
+        return _nvvm_module
 
-        Returns
-        -------
-        :obj:`~_module.ObjectCode`
-            Newly created code object.
+    _nvvm_import_attempted = True  # recorded before the checks so that a failed attempt is not retried
 
-        """
-        supported_target_types = ("ptx", "cubin", "ltoir")
-        if target_type not in supported_target_types:
-            raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})')
-
-        if self._backend == "NVRTC":
-            if target_type == "ptx" and not Program_can_load_generated_ptx():
-                warn(
-                    "The CUDA driver version is older than the backend version. "
-                    "The generated ptx will not be loadable by the current driver.",
-                    stacklevel=1,
-                    category=RuntimeWarning,
-                )
-            if name_expressions:
-                for n in name_expressions:
-                    handle_return(
-                        nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()),
-                        handle=self._mnff.handle,
-                    )
-            options = self._options.as_bytes("nvrtc")
-            handle_return(
-                nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options),
-                handle=self._mnff.handle,
+    try:
+        version = get_binding_version()
+        if version < (12, 9):
+            raise RuntimeError(
+                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
+                "Please update cuda-bindings to use NVVM features."
             )
 
-            size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")
-            comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")
-            size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle)
-            data = b" " * size
-            handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle)
-
-            symbol_mapping = {}
-            if name_expressions:
-                for n in name_expressions:
-                    symbol_mapping[n] = handle_return(
-                        nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle
-                    )
-
-            if logs is not None:
-                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle)
-                if logsize > 1:
-                    log = b" " * logsize
-                    handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
-                    logs.write(log.decode("utf-8", errors="backslashreplace"))
-
-            return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name)
-
-        elif self._backend == "NVVM":
-            if target_type not in ("ptx", "ltoir"):
-                raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"')
-
-            # TODO: flip to True when NVIDIA/cuda-python#1354 is resolved and CUDA 12 is dropped
-            nvvm_options = self._options._prepare_nvvm_options(as_bytes=False)
-            if target_type == "ltoir" and "-gen-lto" not in nvvm_options:
-                nvvm_options.append("-gen-lto")
-            nvvm = _get_nvvm_module()
-            with _nvvm_exception_manager(self):
-                nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-                nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-
-            size = nvvm.get_compiled_result_size(self._mnff.handle)
-            data = bytearray(size)
-            nvvm.get_compiled_result(self._mnff.handle, data)
-
-            if logs is not None:
-                logsize = nvvm.get_program_log_size(self._mnff.handle)
-                if logsize > 1:
-                    log = bytearray(logsize)
-                    nvvm.get_program_log(self._mnff.handle, log)
-                    logs.write(log.decode("utf-8", errors="backslashreplace"))
-
-            return ObjectCode._init(data, target_type, name=self._options.name)
-
-        supported_backends = ("nvJitLink", "driver")
-        if self._backend not in supported_backends:
-            raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})')
-        return self._linker.link(target_type)
-
-    @property
-    def backend(self) -> str:
-        """Return this Program instance's underlying backend."""
-        return self._backend
-
-    @property
-    def handle(self) -> ProgramHandleT:
-        """Return the underlying handle object.
+        from cuda.bindings import nvvm
+        from cuda.bindings._internal.nvvm import _inspect_function_pointer
 
-        .. note::
+        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
+            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
 
-           The type of the returned object depends on the backend.
+        _nvvm_module = nvvm
+        return _nvvm_module
 
-        .. caution::
+    except RuntimeError as e:
+        _nvvm_module = None
+        raise e
 
-            This handle is a Python object. To get the memory address of the underlying C
-            handle, call ``int(Program.handle)``.
-        """
-        return self._mnff.handle
 
-    def __repr__(self) -> str:
-        return f"<Program backend='{self._backend}'>"
+cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
+    """Append one ``--define-macro`` option for a str or 2-tuple of str; return False for any other type."""
+    if isinstance(macro, str):
+        options.append(f"--define-macro={macro}")
+        return True
+    if isinstance(macro, tuple):
+        if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
+            raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
+        options.append(f"--define-macro={macro[0]}={macro[1]}")  # emitted as NAME=VALUE
+        return True
+    return False  # neither str nor tuple: the caller decides how to report it
 
 
-# =============================================================================
-# Helper functions
-# =============================================================================
+cdef inline void _process_define_macro(list options, object macro) except *:
+    """Append ``--define-macro`` options for one macro or a nested sequence of macros; else raise RuntimeError."""
+    union_type = "Union[str, tuple[str, str]]"
+    if _process_define_macro_inner(options, macro):
+        return
+    if is_nested_sequence(macro):  # helper defined elsewhere; presumably true for a list/tuple of macros - confirm
+        for seq_macro in macro:
+            if not _process_define_macro_inner(options, seq_macro):
+                raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
+        return
+    raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
 
 
-cdef bint Program_can_load_generated_ptx():
+cdef inline bint _can_load_generated_ptx() except? -1:  # "except? -1" lets exceptions raised in the body propagate
     """Check if the driver can load PTX generated by the current NVRTC version."""
     driver_ver = handle_return(driver.cuDriverGetVersion())
     nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
     return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
 
 
-cdef object Program_translate_options(object options):
+cdef inline object _translate_program_options(object options):
     """Translate ProgramOptions to LinkerOptions for PTX compilation."""
     return LinkerOptions(
         name=options.name,
@@ -867,3 +526,389 @@ cdef object Program_translate_options(object options):
         ptxas_options=options.ptxas_options,
         no_cache=options.no_cache,
     )
+
+
+cdef inline int Program_init(Program self, object code, str code_type, object options) except -1:
+    """Set up ``self`` for "c++" (NVRTC), "ptx" (Linker) or "nvvm" (NVVM) code; returns 0 on success."""
+    self._mnff = _ProgramMNFF(self, None, None)
+    self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
+    code_type = code_type.lower()  # code types are matched case-insensitively
+
+    if code_type == "c++":
+        assert_type(code, str)
+        # TODO: support pre-loaded headers & include names
+        # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+        self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
+        self._mnff.backend = "NVRTC"
+        self._backend = "NVRTC"
+        self._linker = None
+
+    elif code_type == "ptx":
+        assert_type(code, str)
+        self._linker = Linker(
+            ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
+        )
+        self._backend = self._linker.backend  # backend name is taken from the Linker instance
+
+    elif code_type == "nvvm":
+        if isinstance(code, str):
+            code = code.encode("utf-8")
+        elif not isinstance(code, (bytes, bytearray)):
+            raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
+
+        nvvm = _get_nvvm_module()
+        self._mnff.handle = nvvm.create_program()
+        self._mnff.backend = "NVVM"
+        nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+        self._backend = "NVVM"
+        self._linker = None
+
+    else:
+        supported_code_types = ("c++", "ptx", "nvvm")
+        assert code_type not in supported_code_types, f"{code_type=}"  # guards against a missing branch above
+        raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
+
+    return 0
+
+
+cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
+    """Compile via NVRTC into an ObjectCode, mapping each name expression to its lowered name."""
+    if target_type == "ptx" and not _can_load_generated_ptx():
+        warn(
+            "The CUDA driver version is older than the backend version. "
+            "The generated ptx will not be loadable by the current driver.",
+            stacklevel=2,
+            category=RuntimeWarning,
+        )
+
+    if name_expressions:
+        for n in name_expressions:
+            handle_return(
+                nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()),
+                handle=self._mnff.handle,
+            )
+
+    options = self._options.as_bytes("nvrtc")
+    handle_return(
+        nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options),
+        handle=self._mnff.handle,
+    )
+
+    size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")  # e.g. nvrtcGetPTXSize for "ptx"
+    comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")  # e.g. nvrtcGetPTX for "ptx"
+    size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle)
+    data = b" " * size  # placeholder buffer of the reported size, handed to comp_func
+    handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle)
+
+    symbol_mapping = {}
+    if name_expressions:
+        for n in name_expressions:
+            symbol_mapping[n] = handle_return(
+                nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle
+            )
+
+    if logs is not None:
+        logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle)
+        if logsize > 1:  # presumably a size of 1 is only the terminating NUL, i.e. an empty log - confirm
+            log = b" " * logsize
+            handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
+            logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+    return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name)
+
+
+cdef object Program_compile_nvvm(Program self, str target_type, object logs):
+    """Verify and compile via NVVM into an ObjectCode; on failure the NVVM program log is appended to the error."""
+    if target_type not in ("ptx", "ltoir"):
+        raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"')
+
+    # TODO: flip to True when NVIDIA/cuda-python#1354 is resolved and CUDA 12 is dropped
+    nvvm_options = self._options._prepare_nvvm_options(as_bytes=False)
+    if target_type == "ltoir" and "-gen-lto" not in nvvm_options:
+        nvvm_options.append("-gen-lto")
+
+    nvvm = _get_nvvm_module()
+    try:
+        nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
+        nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
+    except Exception as e:
+        # Append the NVVM program log to the error message. NOTE(review): assumes e.args[0] is a str - confirm
+        error_log = ""
+        try:
+            logsize = nvvm.get_program_log_size(self._mnff.handle)
+            if logsize > 1:
+                log = bytearray(logsize)
+                nvvm.get_program_log(self._mnff.handle, log)
+                error_log = log.decode("utf-8", errors="backslashreplace")
+        except Exception:
+            pass  # best effort: the original error is re-raised even if the log cannot be read
+        e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:])
+        raise
+
+    size = nvvm.get_compiled_result_size(self._mnff.handle)
+    data = bytearray(size)
+    nvvm.get_compiled_result(self._mnff.handle, data)
+
+    if logs is not None:
+        logsize = nvvm.get_program_log_size(self._mnff.handle)
+        if logsize > 1:
+            log = bytearray(logsize)
+            nvvm.get_program_log(self._mnff.handle, log)
+            logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+    return ObjectCode._init(data, target_type, name=self._options.name)
+
+
+cdef object Program_compile(Program self, str target_type, object name_expressions, object logs):
+    """Validate ``target_type`` and dispatch to the NVRTC, NVVM, or Linker path according to ``self._backend``."""
+    supported_target_types = ("ptx", "cubin", "ltoir")
+    if target_type not in supported_target_types:
+        raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})')
+
+    if self._backend == "NVRTC":
+        return Program_compile_nvrtc(self, target_type, name_expressions, logs)
+    elif self._backend == "NVVM":
+        return Program_compile_nvvm(self, target_type, logs)
+
+    # Linker backend (PTX code type); name_expressions and logs are not forwarded on this path
+    supported_backends = ("nvJitLink", "driver")
+    if self._backend not in supported_backends:
+        raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})')
+    return self._linker.link(target_type)
+
+
+cdef inline list _prepare_nvrtc_options_impl(object opts):
+    """Translate the configured fields of ``opts`` into NVRTC command-line options, returned encoded as bytes."""
+    options = [f"-arch={opts.arch}"]
+    if opts.relocatable_device_code is not None:
+        options.append(f"--relocatable-device-code={_handle_boolean_option(opts.relocatable_device_code)}")
+    if opts.extensible_whole_program is not None and opts.extensible_whole_program:
+        options.append("--extensible-whole-program")
+    if opts.debug is not None and opts.debug:
+        options.append("--device-debug")
+    if opts.lineinfo is not None and opts.lineinfo:
+        options.append("--generate-line-info")
+    if opts.device_code_optimize is not None and opts.device_code_optimize:
+        options.append("--dopt=on")
+    if opts.ptxas_options is not None:
+        opt_name = "--ptxas-options"
+        if isinstance(opts.ptxas_options, str):
+            options.append(f"{opt_name}={opts.ptxas_options}")
+        elif is_sequence(opts.ptxas_options):
+            for opt_value in opts.ptxas_options:
+                options.append(f"{opt_name}={opt_value}")
+    if opts.max_register_count is not None:
+        options.append(f"--maxrregcount={opts.max_register_count}")
+    if opts.ftz is not None:
+        options.append(f"--ftz={_handle_boolean_option(opts.ftz)}")
+    if opts.prec_sqrt is not None:
+        options.append(f"--prec-sqrt={_handle_boolean_option(opts.prec_sqrt)}")
+    if opts.prec_div is not None:
+        options.append(f"--prec-div={_handle_boolean_option(opts.prec_div)}")
+    if opts.fma is not None:
+        options.append(f"--fmad={_handle_boolean_option(opts.fma)}")
+    if opts.use_fast_math is not None and opts.use_fast_math:
+        options.append("--use_fast_math")
+    if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
+        options.append("--extra-device-vectorization")
+    if opts.link_time_optimization is not None and opts.link_time_optimization:
+        options.append("--dlink-time-opt")
+    if opts.gen_opt_lto is not None and opts.gen_opt_lto:
+        options.append("--gen-opt-lto")
+    if opts.define_macro is not None:
+        _process_define_macro(options, opts.define_macro)
+    if opts.undefine_macro is not None:
+        if isinstance(opts.undefine_macro, str):
+            options.append(f"--undefine-macro={opts.undefine_macro}")
+        elif is_sequence(opts.undefine_macro):
+            for macro in opts.undefine_macro:
+                options.append(f"--undefine-macro={macro}")
+    if opts.include_path is not None:
+        if isinstance(opts.include_path, str):
+            options.append(f"--include-path={opts.include_path}")
+        elif is_sequence(opts.include_path):
+            for path in opts.include_path:
+                options.append(f"--include-path={path}")
+    if opts.pre_include is not None:
+        if isinstance(opts.pre_include, str):
+            options.append(f"--pre-include={opts.pre_include}")
+        elif is_sequence(opts.pre_include):
+            for header in opts.pre_include:
+                options.append(f"--pre-include={header}")
+    if opts.no_source_include is not None and opts.no_source_include:
+        options.append("--no-source-include")
+    if opts.std is not None:
+        options.append(f"--std={opts.std}")
+    if opts.builtin_move_forward is not None:
+        options.append(f"--builtin-move-forward={_handle_boolean_option(opts.builtin_move_forward)}")
+    if opts.builtin_initializer_list is not None:
+        options.append(f"--builtin-initializer-list={_handle_boolean_option(opts.builtin_initializer_list)}")
+    if opts.disable_warnings is not None and opts.disable_warnings:
+        options.append("--disable-warnings")
+    if opts.restrict is not None and opts.restrict:
+        options.append("--restrict")
+    if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
+        options.append("--device-as-default-execution-space")
+    if opts.device_int128 is not None and opts.device_int128:
+        options.append("--device-int128")
+    if opts.device_float128 is not None and opts.device_float128:
+        options.append("--device-float128")
+    if opts.optimization_info is not None:
+        options.append(f"--optimization-info={opts.optimization_info}")
+    if opts.no_display_error_number is not None and opts.no_display_error_number:
+        options.append("--no-display-error-number")
+    if opts.diag_error is not None:
+        if isinstance(opts.diag_error, int):
+            options.append(f"--diag-error={opts.diag_error}")
+        elif is_sequence(opts.diag_error):
+            for error in opts.diag_error:
+                options.append(f"--diag-error={error}")
+    if opts.diag_suppress is not None:
+        if isinstance(opts.diag_suppress, int):
+            options.append(f"--diag-suppress={opts.diag_suppress}")
+        elif is_sequence(opts.diag_suppress):
+            for suppress in opts.diag_suppress:
+                options.append(f"--diag-suppress={suppress}")
+    if opts.diag_warn is not None:
+        if isinstance(opts.diag_warn, int):
+            options.append(f"--diag-warn={opts.diag_warn}")
+        elif is_sequence(opts.diag_warn):
+            for w in opts.diag_warn:
+                options.append(f"--diag-warn={w}")
+    if opts.brief_diagnostics is not None:
+        options.append(f"--brief-diagnostics={_handle_boolean_option(opts.brief_diagnostics)}")
+    if opts.time is not None:
+        options.append(f"--time={opts.time}")
+    if opts.split_compile is not None:
+        options.append(f"--split-compile={opts.split_compile}")
+    if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
+        options.append("--fdevice-syntax-only")
+    if opts.minimal is not None and opts.minimal:
+        options.append("--minimal")
+    if opts.no_cache is not None and opts.no_cache:
+        options.append("--no-cache")
+    if opts.fdevice_time_trace is not None:
+        options.append(f"--fdevice-time-trace={opts.fdevice_time_trace}")
+    if opts.frandom_seed is not None:
+        options.append(f"--frandom-seed={opts.frandom_seed}")
+    if opts.ofast_compile is not None:
+        options.append(f"--Ofast-compile={opts.ofast_compile}")
+    # PCH options (CUDA 12.8+)
+    if opts.pch is not None and opts.pch:
+        options.append("--pch")
+    if opts.create_pch is not None:
+        options.append(f"--create-pch={opts.create_pch}")
+    if opts.use_pch is not None:
+        options.append(f"--use-pch={opts.use_pch}")
+    if opts.pch_dir is not None:
+        options.append(f"--pch-dir={opts.pch_dir}")
+    if opts.pch_verbose is not None:
+        options.append(f"--pch-verbose={_handle_boolean_option(opts.pch_verbose)}")
+    if opts.pch_messages is not None:
+        options.append(f"--pch-messages={_handle_boolean_option(opts.pch_messages)}")
+    if opts.instantiate_templates_in_pch is not None:
+        options.append(
+            f"--instantiate-templates-in-pch={_handle_boolean_option(opts.instantiate_templates_in_pch)}"
+        )
+    if opts.numba_debug:
+        options.append("--numba-debug")
+    return [o.encode() for o in options]
+
+
+cdef inline object _prepare_nvvm_options_impl(object opts, bint as_bytes):
+    """Build NVVM options from ``opts`` (bytes if ``as_bytes``); raise CUDAError for fields NVVM does not support."""
+    options = []
+
+    # Options supported by NVVM
+    assert opts.arch is not None
+    arch = opts.arch
+    if arch.startswith("sm_"):  # an sm_XX arch is passed to NVVM as compute_XX
+        arch = f"compute_{arch[3:]}"
+    options.append(f"-arch={arch}")
+    if opts.debug is not None and opts.debug:
+        options.append("-g")
+    if opts.device_code_optimize is False:
+        options.append("-opt=0")
+    elif opts.device_code_optimize is True:
+        options.append("-opt=3")
+    # NVVM uses 0/1 instead of true/false for boolean options
+    if opts.ftz is not None:
+        options.append(f"-ftz={'1' if opts.ftz else '0'}")
+    if opts.prec_sqrt is not None:
+        options.append(f"-prec-sqrt={'1' if opts.prec_sqrt else '0'}")
+    if opts.prec_div is not None:
+        options.append(f"-prec-div={'1' if opts.prec_div else '0'}")
+    if opts.fma is not None:
+        options.append(f"-fma={'1' if opts.fma else '0'}")
+
+    # Check for unsupported options and raise error if they are set
+    unsupported = []
+    if opts.relocatable_device_code is not None:
+        unsupported.append("relocatable_device_code")
+    if opts.extensible_whole_program is not None and opts.extensible_whole_program:
+        unsupported.append("extensible_whole_program")
+    if opts.lineinfo is not None and opts.lineinfo:
+        unsupported.append("lineinfo")
+    if opts.ptxas_options is not None:
+        unsupported.append("ptxas_options")
+    if opts.max_register_count is not None:
+        unsupported.append("max_register_count")
+    if opts.use_fast_math is not None and opts.use_fast_math:
+        unsupported.append("use_fast_math")
+    if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
+        unsupported.append("extra_device_vectorization")
+    if opts.gen_opt_lto is not None and opts.gen_opt_lto:
+        unsupported.append("gen_opt_lto")
+    if opts.define_macro is not None:
+        unsupported.append("define_macro")
+    if opts.undefine_macro is not None:
+        unsupported.append("undefine_macro")
+    if opts.include_path is not None:
+        unsupported.append("include_path")
+    if opts.pre_include is not None:
+        unsupported.append("pre_include")
+    if opts.no_source_include is not None and opts.no_source_include:
+        unsupported.append("no_source_include")
+    if opts.std is not None:
+        unsupported.append("std")
+    if opts.builtin_move_forward is not None:
+        unsupported.append("builtin_move_forward")
+    if opts.builtin_initializer_list is not None:
+        unsupported.append("builtin_initializer_list")
+    if opts.disable_warnings is not None and opts.disable_warnings:
+        unsupported.append("disable_warnings")
+    if opts.restrict is not None and opts.restrict:
+        unsupported.append("restrict")
+    if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
+        unsupported.append("device_as_default_execution_space")
+    if opts.device_int128 is not None and opts.device_int128:
+        unsupported.append("device_int128")
+    if opts.optimization_info is not None:
+        unsupported.append("optimization_info")
+    if opts.no_display_error_number is not None and opts.no_display_error_number:
+        unsupported.append("no_display_error_number")
+    if opts.diag_error is not None:
+        unsupported.append("diag_error")
+    if opts.diag_suppress is not None:
+        unsupported.append("diag_suppress")
+    if opts.diag_warn is not None:
+        unsupported.append("diag_warn")
+    if opts.brief_diagnostics is not None:
+        unsupported.append("brief_diagnostics")
+    if opts.time is not None:
+        unsupported.append("time")
+    if opts.split_compile is not None:
+        unsupported.append("split_compile")
+    if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
+        unsupported.append("fdevice_syntax_only")
+    if opts.minimal is not None and opts.minimal:
+        unsupported.append("minimal")
+    if opts.numba_debug is not None and opts.numba_debug:
+        unsupported.append("numba_debug")
+    if unsupported:
+        raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}")
+
+    if as_bytes:
+        return [o.encode() for o in options]
+    else:
+        return options
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 72591b54d5..b53f14c45c 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -146,7 +146,7 @@ def test_object_code_load_ptx(get_saxpy_kernel_ptx):
     sym_map = mod.symbol_mapping
     mod_obj = ObjectCode.from_ptx(ptx, symbol_mapping=sym_map)
     assert mod.code == ptx
-    if not Program._can_load_generated_ptx():
+    if not Program.driver_can_load_nvrtc_ptx_output():
         pytest.skip("PTX version too new for current driver")
     mod_obj.get_kernel("saxpy<double>")  # force loading
 
@@ -160,7 +160,7 @@ def test_object_code_load_ptx_from_file(get_saxpy_kernel_ptx, tmp_path):
     mod_obj = ObjectCode.from_ptx(str(ptx_file), symbol_mapping=sym_map)
     assert mod_obj.code == str(ptx_file)
     assert mod_obj.code_type == "ptx"
-    if not Program._can_load_generated_ptx():
+    if not Program.driver_can_load_nvrtc_ptx_output():
         pytest.skip("PTX version too new for current driver")
     mod_obj.get_kernel("saxpy<double>")  # force loading