diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index d82d7ae..198adab 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -11,9 +11,15 @@ jobs: os: - ubuntu-22.04 # - windows-2019 - python: ['3.10', '3.11', '3.12'] - torch_version: ['2.10.0'] + python: ['3.10', '3.11', '3.12', '3.13', '3.14'] + # torch_version: ['2.7.0', '2.8.0', '2.9.0', '2.10.0', '2.11.0'] + torch_version: ['2.11.0'] cuda_short_version: ['126'] + exclude: + - torch_version: '2.7.0' + python: '3.14' + - torch_version: '2.8.0' + python: '3.14' uses: ./.github/workflows/wheels_build.yml with: @@ -22,21 +28,21 @@ jobs: torch_version: ${{ matrix.torch_version }} cuda_short_version: ${{ matrix.cuda_short_version }} - build-pypi: - # Single canonical build intended for PyPI: no local CUDA/torch suffix - strategy: - fail-fast: false - matrix: - os: ['ubuntu-22.04'] - python: ['3.10', '3.11', '3.12'] + # build-pypi: + # # Single canonical build intended for PyPI: no local CUDA/torch suffix + # strategy: + # fail-fast: false + # matrix: + # os: ['ubuntu-22.04'] + # python: ['3.10', '3.11', '3.12', '3.13', '3.14'] - uses: ./.github/workflows/wheels_build.yml - with: - os: ${{ matrix.os }} - python: ${{ matrix.python }} - torch_version: '2.10.0' - cuda_short_version: '128' - append_local_version: '0' # 0 to disable local version suffix + # uses: ./.github/workflows/wheels_build.yml + # with: + # os: ${{ matrix.os }} + # python: ${{ matrix.python }} + # torch_version: '2.9.0' + # cuda_short_version: '128' + # append_local_version: '0' # 0 to disable local version suffix # publish to GitHub Release # gh_release: @@ -79,11 +85,12 @@ jobs: consolidate-wheels: - needs: [build-local, build-pypi] + # needs: [build-local, build-pypi] + needs: [build-local] runs-on: ubuntu-latest steps: - name: Download all wheel artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: path: dist @@ -94,7 +101,7 @@ jobs: ls -l consolidated_wheels - name: Upload consolidated wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: built-wheels path: consolidated_wheels diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 8570da9..214a425 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -172,7 +172,7 @@ jobs: sudo apt autoremove -y - name: Recursive checkout - uses: actions/checkout@v3 + uses: actions/checkout@v5 with: submodules: recursive path: "." @@ -236,14 +236,14 @@ jobs: - name: Upload artifact (local build) if: ${{ inputs.append_local_version != '0' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }} path: dist/*.whl - name: Upload artifact (pypi build) if: ${{ inputs.append_local_version == '0' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.os }}-py${{ inputs.python }} path: dist/*.whl diff --git a/src/sfast/csrc/operators/cublas/CUDABlas.cc b/src/sfast/csrc/operators/cublas/CUDABlas.cc index 800e7a4..e74d30f 100644 --- a/src/sfast/csrc/operators/cublas/CUDABlas.cc +++ b/src/sfast/csrc/operators/cublas/CUDABlas.cc @@ -7,6 +7,7 @@ #include #include #include +#include // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also // added bf16 support @@ -226,7 +227,9 @@ cublasStatus_t cublasGemmStridedBatchedExFix(cublasHandle_t &handle, template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -239,7 +242,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -252,7 +257,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { template <> void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -267,7 +274,9 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) template <> void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -282,7 +291,9 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -311,7 +322,11 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5){ + #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + #else if (at::globalContext().allowFP16ReductionCuBLAS()) { + #endif at::Half falpha = alpha; at::Half fbeta = beta; TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedExFix( @@ -350,7 +365,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif BGEMM_CHECK_ARGVALUES(at::BFloat16); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -383,7 +400,9 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -396,7 +415,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -410,7 +431,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -427,7 +450,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -443,7 +468,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -490,12 +517,20 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); #else cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (!at::globalContext().allowFP16ReductionCuBLAS()) { +#endif cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); #endif // defined(CUDA_VERSION) && CUDA_VERSION < 11000 +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (at::globalContext().allowFP16ReductionCuBLAS()) { +#endif at::Half falpha = alpha; at::Half fbeta = beta; TORCH_CUDABLAS_CHECK(cublasGemmEx_( @@ -606,7 +641,9 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -617,7 +654,11 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #if TORCH_VERSION_MAJOR > 2 || \ (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 2) cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +#if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowBF16ReductionCuBLAS() != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (!at::globalContext().allowBF16ReductionCuBLAS()) { +#endif cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); @@ -1126,7 +1167,9 @@ void trsmBatched>( template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] + #if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); + #endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -1145,7 +1188,9 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -1160,7 +1205,9 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -1175,7 +1222,9 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] +#if !(TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10)) globalContext().alertCuBLASConfigNotDeterministic(); +#endif cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); diff --git a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu index 4559b64..45791fe 100644 --- a/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu +++ b/src/sfast/csrc/operators/cutlass/cutlass_dual_linear_kernel.cu @@ -1,4 +1,5 @@ #include +#include #include #include @@ -486,7 +487,11 @@ torch::Tensor cutlass_linear_geglu(const torch::Tensor &input, auto dispatch_bf16 = [&] { #if TORCH_VERSION_MAJOR > 2 || \ (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 2) + #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowBF16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + #else if (at::globalContext().allowBF16ReductionCuBLAS()) { + #endif output = CutlassDualGemmLauncher 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 10) + if (at::globalContext().allowFP16ReductionCuBLAS() == at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { +#else if (at::globalContext().allowFP16ReductionCuBLAS()) { +#endif output = CutlassDualGemmLauncher< at::Half, GemmGEGLUWrapper, cutlass::epilogue::thread::GELU_taylor_fast,