From 3fbed2412ccbda4b647d25d153cf84ec48806da3 Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Thu, 26 Feb 2026 10:58:04 +0000
Subject: [PATCH 1/9] fix: adapt cublas/cutlass code to torch 2.10 API

---
 src/sfast/csrc/operators/cublas/CUDABlas.cc   | 44 +++++++++++++++++++
 .../cutlass/cutlass_dual_linear_kernel.cu     |  4 ++
 2 files changed, 48 insertions(+)
diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc
index 800e7a4..974ac97 100644
--- a/src/sfast/csrc/operators/cublas/CUDABlas.cc
+++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc
@@ -226,7 +226,9 @@ cublasStatus_t cublasGemmStridedBatchedExFix(cublasHandle_t &handle,
 template <>
 void bgemm<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -239,7 +241,9 @@ void bgemm<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
 template <>
 void bgemm<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -252,7 +256,9 @@ void bgemm<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
 template <>
 void bgemm<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -267,7 +273,9 @@ void bgemm<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>))
 template <>
 void bgemm<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -282,7 +290,9 @@ void bgemm<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
 template <>
 void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -311,7 +321,11 @@ void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
 
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
   if (prop->major >= 5){
+  #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)
+    if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+  #else
     if (at::globalContext().allowFP16ReductionCuBLAS()) {
+  #endif
       at::Half falpha = alpha;
       at::Half fbeta = beta;
       TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedExFix(
@@ -350,7 +364,9 @@ void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
 template <>
 void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
   // See Note [Writing Nondeterministic Operations]
+  #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+  #endif
   BGEMM_CHECK_ARGVALUES(at::BFloat16);
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
@@ -383,7 +399,9 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
 template <>
 void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -396,7 +414,9 @@ void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
 template <>
 void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -410,7 +430,9 @@ void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
   template <>
   void gemm<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
     // See Note [Writing Nondeterministic Operations]
+  #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
     globalContext().alertCuBLASConfigNotDeterministic();
+  #endif
     cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
     cublasOperation_t opa = _cublasOpFromChar(transa);
     cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -427,7 +449,9 @@ void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
   template <>
   void gemm<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
     // See Note [Writing Nondeterministic Operations]
+  #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
     globalContext().alertCuBLASConfigNotDeterministic();
+  #endif
     cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
     cublasOperation_t opa = _cublasOpFromChar(transa);
     cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -443,7 +467,9 @@ void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
 template <>
 void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -490,12 +516,20 @@ void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
 #else
     cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
+#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)
+    if (at::globalContext().allowFP16ReductionCuBLAS() != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+#else
     if (!at::globalContext().allowFP16ReductionCuBLAS()) {
+#endif
       cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
     }
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
 #endif  // defined(CUDA_VERSION) && CUDA_VERSION < 11000
+#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)
+    if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+#else
     if (at::globalContext().allowFP16ReductionCuBLAS()) {
+#endif
       at::Half falpha = alpha;
       at::Half fbeta = beta;
       TORCH_CUDABLAS_CHECK(cublasGemmEx_(
@@ -606,7 +640,9 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
 template <>
 void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t opa = _cublasOpFromChar(transa);
   cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1126,7 +1162,9 @@ void trsmBatched<c10::complex<double>>(
   template <>
   void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
     // See Note [Writing Nondeterministic Operations]
+  #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
     globalContext().alertCuBLASConfigNotDeterministic();
+  #endif
     cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
     cublasOperation_t op = _cublasOpFromChar(trans);
     _cublasAdjustLdLevel2(m, n, &lda);
@@ -1145,7 +1183,9 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
   // loss still happens on TF32. So we disable it here.
   NoTF32Guard disable_tf32;
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t op = _cublasOpFromChar(trans);
   _cublasAdjustLdLevel2(m, n, &lda);
@@ -1160,7 +1200,9 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
 template <>
 void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t op = _cublasOpFromChar(trans);
   _cublasAdjustLdLevel2(m, n, &lda);
@@ -1175,7 +1217,9 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
   // loss still happens on TF32. So we disable it here.
   NoTF32Guard disable_tf32;
   // See Note [Writing Nondeterministic Operations]
+#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10))
   globalContext().alertCuBLASConfigNotDeterministic();
+#endif
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
   cublasOperation_t op = _cublasOpFromChar(trans);
   _cublasAdjustLdLevel2(m, n, &lda);
diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
index 4559b64..28c3899 100644
--- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
+++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
@@ -506,7 +506,11 @@ torch::Tensor cutlass_linear_geglu(const torch::Tensor &input,
       AT_DISPATCH_CASE(
           at::kHalf,
           [&] {
+#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)
+            if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+#else
             if (at::globalContext().allowFP16ReductionCuBLAS()) {
+#endif
               output = CutlassDualGemmLauncher<
                   at::Half, GemmGEGLUWrapper,
                   cutlass::epilogue::thread::GELU_taylor_fast,

From 7d2d750bf54cb1c12c070150bad749a4fd04723d Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Thu, 26 Feb 2026 14:03:32 +0000
Subject: [PATCH 2/9] fix: include torch/version.h to avoid undefined_as_0 error

---
 src/sfast/csrc/operators/cublas/CUDABlas.cc                    | 1 +
 src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc
index 974ac97..c9e8c75 100644
--- a/src/sfast/csrc/operators/cublas/CUDABlas.cc
+++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc
@@ -7,6 +7,7 @@
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/macros/Export.h>
 #include <c10/util/irange.h>
+#include <torch/version.h>
 
 // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
 // added bf16 support
diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
index 28c3899..1858dc6 100644
--- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
+++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
@@ -1,4 +1,5 @@
 #include <torch/extension.h>
+#include <torch/version.h>
 
 #include <c10/cuda/CUDAMathCompat.h>
 #include <c10/cuda/CUDAStream.h>

From 22b9454150ee8e81a32d91f7b3d413a2fefaca1f Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Thu, 26 Feb 2026 14:58:34 +0000
Subject: [PATCH 3/9] fix: update cublas BF16 reduction guard for torch 2.10

---
 src/sfast/csrc/operators/cublas/CUDABlas.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc
index c9e8c75..e74d30f 100644
--- a/src/sfast/csrc/operators/cublas/CUDABlas.cc
+++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc
@@ -654,7 +654,11 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
 #if TORCH_VERSION_MAJOR > 2 ||                                                 \
     (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 2)
   cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
+#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)
+  if (at::globalContext().allowBF16ReductionCuBLAS() != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+#else
   if (!at::globalContext().allowBF16ReductionCuBLAS()) {
+#endif
     cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
   }
   TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));

From 616944ee261b189f26a53d75e3e0a243fa095127 Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Thu, 26 Feb 2026 16:27:47 +0000
Subject: [PATCH 4/9] fix: version-gate BF16 reduction for torch 2.10 in
 cutlass kernel

---
 .../csrc/operators/cutlass/cutlass_dual_linear_kernel.cu      | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
index 1858dc6..45791fe 100644
--- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
+++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu
@@ -487,7 +487,11 @@ torch::Tensor cutlass_linear_geglu(const torch::Tensor &input,
   auto dispatch_bf16 = [&] {
 #if TORCH_VERSION_MAJOR > 2 ||                                                 \
     (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 2)
+    #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)
+    if (at::globalContext().allowBF16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+    #else
     if (at::globalContext().allowBF16ReductionCuBLAS()) {
+    #endif
       output =
           CutlassDualGemmLauncher<at::BFloat16, GemmGEGLUWrapper,
                                   cutlass::epilogue::thread::GELU_taylor_fast,

From 7040772f8dfbaee581622f3faac689d32828276d Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Tue, 17 Mar 2026 13:05:25 +0000
Subject: [PATCH 5/9] build: add python 3.13 and 3.14

---
 .github/workflows/wheels.yml | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index d82d7ae..e30d307 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -11,9 +11,15 @@ jobs:
         os:
           - ubuntu-22.04
           # - windows-2019
-        python: ['3.10', '3.11', '3.12']
-        torch_version: ['2.10.0']
+        # python: ['3.10', '3.11', '3.12', '3.13', '3.14']
+        python: ['3.13', '3.14']
+        torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0']
         cuda_short_version: ['126']
+        exclude:
+          - torch_version: '2.7.0'
+            python: '3.14'
+          - torch_version: '2.8.0'
+            python: '3.14'
 
     uses: ./.github/workflows/wheels_build.yml
     with:
@@ -28,13 +34,14 @@ jobs:
       fail-fast: false
       matrix:
         os: ['ubuntu-22.04']
-        python: ['3.10', '3.11', '3.12']
+        # python: ['3.10', '3.11', '3.12', '3.13', '3.14']
+        python: ['3.13', '3.14']
 
     uses: ./.github/workflows/wheels_build.yml
     with:
       os: ${{ matrix.os }}
       python: ${{ matrix.python }}
-      torch_version: '2.10.0'
+      torch_version: '2.9.0'
       cuda_short_version: '128'
       append_local_version: '0'  # 0 to disable local version suffix
 

From 3592a611b134dd9d3ac8a3d671f365cdeb555fca Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Tue, 17 Mar 2026 13:53:11 +0000
Subject: [PATCH 6/9] build: bump upload and download artifacts versions

---
 .github/workflows/wheels.yml       | 4 ++--
 .github/workflows/wheels_build.yml | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index e30d307..d6007ec 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -90,7 +90,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download all wheel artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           path: dist
 
@@ -101,7 +101,7 @@ jobs:
           ls -l consolidated_wheels
 
       - name: Upload consolidated wheels
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: built-wheels
           path: consolidated_wheels
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 8570da9..214a425 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -172,7 +172,7 @@ jobs:
           sudo apt autoremove -y
 
       - name: Recursive checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v5
         with:
           submodules: recursive
           path: "."
@@ -236,14 +236,14 @@ jobs:
 
       - name: Upload artifact (local build)
         if: ${{ inputs.append_local_version != '0' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }}
           path: dist/*.whl
 
       - name: Upload artifact (pypi build)
         if: ${{ inputs.append_local_version == '0' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ${{ inputs.os }}-py${{ inputs.python }}
           path: dist/*.whl

From 92bbb87eda7e2eb2f12b1f74b0285aed09d13362 Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Wed, 18 Mar 2026 13:04:59 +0000
Subject: [PATCH 7/9] build: restore build of full torch-python compatibility
 matrix

---
 .github/workflows/wheels.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index d6007ec..449b0b1 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -11,8 +11,7 @@ jobs:
         os:
           - ubuntu-22.04
           # - windows-2019
-        # python: ['3.10', '3.11', '3.12', '3.13', '3.14']
-        python: ['3.13', '3.14']
+        python: ['3.10', '3.11', '3.12', '3.13', '3.14']
         torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0']
         cuda_short_version: ['126']
         exclude:
@@ -34,8 +33,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ['ubuntu-22.04']
-        # python: ['3.10', '3.11', '3.12', '3.13', '3.14']
-        python: ['3.13', '3.14']
+        python: ['3.10', '3.11', '3.12', '3.13', '3.14']
 
     uses: ./.github/workflows/wheels_build.yml
     with:

From 1a13788b70b2471928a372026bed417bd07191d8 Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Sat, 4 Apr 2026 08:16:55 +0000
Subject: [PATCH 8/9] build: build local wheels for torch 2.11 only

---
 .github/workflows/wheels.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 449b0b1..0cd7684 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -12,7 +12,8 @@ jobs:
           - ubuntu-22.04
           # - windows-2019
         python: ['3.10', '3.11', '3.12', '3.13', '3.14']
-        torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0']
+        # torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0', '2.11.0']
+        torch_version: ['2.11.0']
         cuda_short_version: ['126']
         exclude:
           - torch_version: '2.7.0'

From 291228a62ec5562665adf5a18bc0be5b961e6b96 Mon Sep 17 00:00:00 2001
From: Gaspar Rochette <gaspar.rochette@pruna.ai>
Date: Sat, 4 Apr 2026 08:18:40 +0000
Subject: [PATCH 9/9] tmp: skip build-pypi

---
 .github/workflows/wheels.yml | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 0cd7684..198adab 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -28,21 +28,21 @@ jobs:
       torch_version: ${{ matrix.torch_version }}
       cuda_short_version: ${{ matrix.cuda_short_version }}
 
-  build-pypi:
-    # Single canonical build intended for PyPI: no local CUDA/torch suffix
-    strategy:
-      fail-fast: false
-      matrix:
-        os: ['ubuntu-22.04']
-        python: ['3.10', '3.11', '3.12', '3.13', '3.14']
-
-    uses: ./.github/workflows/wheels_build.yml
-    with:
-      os: ${{ matrix.os }}
-      python: ${{ matrix.python }}
-      torch_version: '2.9.0'
-      cuda_short_version: '128'
-      append_local_version: '0'  # 0 to disable local version suffix
+  # build-pypi:
+  #   # Single canonical build intended for PyPI: no local CUDA/torch suffix
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       os: ['ubuntu-22.04']
+  #       python: ['3.10', '3.11', '3.12', '3.13', '3.14']
+
+  #   uses: ./.github/workflows/wheels_build.yml
+  #   with:
+  #     os: ${{ matrix.os }}
+  #     python: ${{ matrix.python }}
+  #     torch_version: '2.9.0'
+  #     cuda_short_version: '128'
+  #     append_local_version: '0'  # 0 to disable local version suffix
 
   # publish to GitHub Release
   # gh_release:
@@ -85,7 +85,8 @@ jobs:
 
 
   consolidate-wheels:
-    needs: [build-local, build-pypi]
+    # needs: [build-local, build-pypi]
+    needs: [build-local]
     runs-on: ubuntu-latest
     steps:
       - name: Download all wheel artifacts