From 3fbed2412ccbda4b647d25d153cf84ec48806da3 Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Thu, 26 Feb 2026 10:58:04 +0000 Subject: [PATCH 1/9] fix: update cublas to torch 2.10 --- src/sfast/csrc/operators/cublas/CUDABlas.cc | 44 +++++++++++++++++++ .../cutlass/cutlass_dual_linear_kernel.cu | 4 ++ 2 files changed, 48 insertions(+) diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc index 800e7a4..974ac97 100644 --- a/src/sfast/csrc/operators/cublas/CUDABlas.cc +++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc @@ -226,7 +226,9 @@ cublasStatus_t cublasGemmStridedBatchedExFix(cublasHandle_t &handle, template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -239,7 +241,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -252,7 +256,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { template <> void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -267,7 +273,9 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) template <> void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -282,7 +290,9 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -311,7 +321,11 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5){ + #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + #else if (at::globalContext().allowFP16ReductionCuBLAS()) { + #endif at::Half falpha = alpha; at::Half fbeta = beta; TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedExFix( @@ -350,7 +364,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif BGEMM_CHECK_ARGVALUES(at::BFloat16); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -383,7 +399,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -396,7 +414,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -410,7 +430,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -427,7 +449,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -443,7 +467,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -490,12 +516,20 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); #else cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (!at::globalContext().allowFP16ReductionCuBLAS()) { +#endif cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); #endif // defined(CUDA_VERSION) && CUDA_VERSION < 11000 +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (at::globalContext().allowFP16ReductionCuBLAS()) { +#endif at::Half falpha = alpha; at::Half fbeta = beta; TORCH_CUDABLAS_CHECK(cublasGemmEx_( @@ -606,7 +640,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1126,7 +1162,9 @@ void trsmBatched>( template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -1145,7 +1183,9 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -1160,7 +1200,9 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -1175,7 +1217,9 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu index 4559b64..28c3899 100644 --- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu +++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu @@ -506,7 +506,11 @@ torch::Tensor cutlass_linear_geglu(const torch::Tensor &input, AT_DISPATCH_CASE( at::kHalf, [&] { +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (at::globalContext().allowFP16ReductionCuBLAS()) { +#endif output = CutlassDualGemmLauncher< at::Half, GemmGEGLUWrapper, cutlass::epilogue::thread::GELU_taylor_fast, From 7d2d750bf54cb1c12c070150bad749a4fd04723d Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Thu, 26 Feb 2026 14:03:32 +0000 Subject: [PATCH 2/9] fix: import torch version to avoid undefined_as_0 error --- src/sfast/csrc/operators/cublas/CUDABlas.cc | 1 + src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc index 974ac97..c9e8c75 100644 --- a/src/sfast/csrc/operators/cublas/CUDABlas.cc +++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc @@ -7,6 +7,7 @@ #include #include #include +#include // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also // added bf16 support diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu index 28c3899..1858dc6 100644 --- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu +++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu @@ -1,4 +1,5 @@ #include +#include #include #include From 22b9454150ee8e81a32d91f7b3d413a2fefaca1f Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Thu, 26 Feb 2026 14:58:34 +0000 Subject: [PATCH 3/9] fix: update cublas BF16 reduction guard for torch 2.10 --- src/sfast/csrc/operators/cublas/CUDABlas.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc index c9e8c75..e74d30f 100644 --- a/src/sfast/csrc/operators/cublas/CUDABlas.cc +++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc @@ -654,7 +654,11 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #if TORCH_VERSION_MAJOR > 2 || \ (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 2) cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowBF16ReductionCuBLAS() != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (!at::globalContext().allowBF16ReductionCuBLAS()) { +#endif cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); From 616944ee261b189f26a53d75e3e0a243fa095127 Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Thu, 26 Feb 2026 16:27:47 +0000 Subject: [PATCH 4/9] fix: version-gate BF16 reduction for torch 2.10 in cutlass kernel --- .../csrc/operators/cutlass/cutlass_dual_linear_kernel.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu index 1858dc6..45791fe 100644 --- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu +++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu @@ -487,7 +487,11 @@ torch::Tensor cutlass_linear_geglu(const torch::Tensor &input, auto dispatch_bf16 = [&] { #if TORCH_VERSION_MAJOR > 2 || \ (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 2) + #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowBF16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + #else if (at::globalContext().allowBF16ReductionCuBLAS()) { + #endif output = CutlassDualGemmLauncher Date: Tue, 17 Mar 2026 13:05:25 +0000 Subject: [PATCH 5/9] build: add python 3.13 and 3.14 --- .github/workflows/wheels.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index d82d7ae..e30d307 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -11,9 +11,15 @@ jobs: os: - ubuntu-22.04 # - windows-2019 - python: ['3.10', '3.11', '3.12'] - torch_version: ['2.10.0'] + # python: ['3.10', '3.11', '3.12', '3.13', '3.14'] + python: ['3.13', '3.14'] + torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0'] cuda_short_version: ['126'] + exclude: + - torch_version: '2.7.0' + python: '3.14' + - torch_version: '2.8.0' + python: '3.14' uses: ./.github/workflows/wheels_build.yml with: @@ -28,13 +34,14 @@ jobs: fail-fast: false matrix: os: ['ubuntu-22.04'] - python: ['3.10', '3.11', '3.12'] + # python: ['3.10', '3.11', '3.12', '3.13', '3.14'] + python: ['3.13', '3.14'] uses: ./.github/workflows/wheels_build.yml with: os: ${{ matrix.os }} python: ${{ matrix.python }} - torch_version: '2.10.0' + torch_version: '2.9.0' cuda_short_version: '128' append_local_version: '0' # 0 to disable local version suffix From 3592a611b134dd9d3ac8a3d671f365cdeb555fca Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Tue, 17 Mar 2026 13:53:11 +0000 Subject: [PATCH 6/9] build: bump upload and download artifacts versions --- .github/workflows/wheels.yml | 4 ++-- .github/workflows/wheels_build.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index e30d307..d6007ec 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -90,7 +90,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download all wheel artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: path: dist @@ -101,7 +101,7 @@ jobs: ls -l consolidated_wheels - name: Upload consolidated wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: built-wheels path: consolidated_wheels diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 8570da9..214a425 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -172,7 +172,7 @@ jobs: sudo apt autoremove -y - name: Recursive checkout - uses: actions/checkout@v3 + uses: actions/checkout@v5 with: submodules: recursive path: "." @@ -236,14 +236,14 @@ jobs: - name: Upload artifact (local build) if: ${{ inputs.append_local_version != '0' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }} path: dist/*.whl - name: Upload artifact (pypi build) if: ${{ inputs.append_local_version == '0' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.os }}-py${{ inputs.python }} path: dist/*.whl From 92bbb87eda7e2eb2f12b1f74b0285aed09d13362 Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Wed, 18 Mar 2026 13:04:59 +0000 Subject: [PATCH 7/9] build: restore build of full torch-python compatibility matrix --- .github/workflows/wheels.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index d6007ec..449b0b1 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -11,8 +11,7 @@ jobs: os: - ubuntu-22.04 # - windows-2019 - # python: ['3.10', '3.11', '3.12', '3.13', '3.14'] - python: ['3.13', '3.14'] + python: ['3.10', '3.11', '3.12', '3.13', '3.14'] torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0'] cuda_short_version: ['126'] exclude: @@ -34,8 +33,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-22.04'] - # python: ['3.10', '3.11', '3.12', '3.13', '3.14'] - python: ['3.13', '3.14'] + python: ['3.10', '3.11', '3.12', '3.13', '3.14'] uses: ./.github/workflows/wheels_build.yml with: From 1a13788b70b2471928a372026bed417bd07191d8 Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Sat, 4 Apr 2026 08:16:55 +0000 Subject: [PATCH 8/9] build: torch 2.11 wheels --- .github/workflows/wheels.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 449b0b1..0cd7684 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -12,7 +12,8 @@ jobs: - ubuntu-22.04 # - windows-2019 python: ['3.10', '3.11', '3.12', '3.13', '3.14'] - torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0'] + # torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0', '2.11.0'] + torch_version: ['2.11.0'] cuda_short_version: ['126'] exclude: - torch_version: '2.7.0' From 291228a62ec5562665adf5a18bc0be5b961e6b96 Mon Sep 17 00:00:00 2001 From: Gaspar Rochette Date: Sat, 4 Apr 2026 08:18:40 +0000 Subject: [PATCH 9/9] tmp: skip build-pypi --- .github/workflows/wheels.yml | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 0cd7684..198adab 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -28,21 +28,21 @@ jobs: torch_version: ${{ matrix.torch_version }} cuda_short_version: ${{ matrix.cuda_short_version }} - build-pypi: - # Single canonical build intended for PyPI: no local CUDA/torch suffix - strategy: - fail-fast: false - matrix: - os: ['ubuntu-22.04'] - python: ['3.10', '3.11', '3.12', '3.13', '3.14'] - - uses: ./.github/workflows/wheels_build.yml - with: - os: ${{ matrix.os }} - python: ${{ matrix.python }} - torch_version: '2.9.0' - cuda_short_version: '128' - append_local_version: '0' # 0 to disable local version suffix + # build-pypi: + # # Single canonical build intended for PyPI: no local CUDA/torch suffix + # strategy: + # fail-fast: false + # matrix: + # os: ['ubuntu-22.04'] + # python: ['3.10', '3.11', '3.12', '3.13', '3.14'] + + # uses: ./.github/workflows/wheels_build.yml + # with: + # os: ${{ matrix.os }} + # python: ${{ matrix.python }} + # torch_version: '2.9.0' + # cuda_short_version: '128' + # append_local_version: '0' # 0 to disable local version suffix # publish to GitHub Release # gh_release: @@ -85,7 +85,8 @@ jobs: consolidate-wheels: - needs: [build-local, build-pypi] + # needs: [build-local, build-pypi] + needs: [build-local] runs-on: ubuntu-latest steps: - name: Download all wheel artifacts