From ac2bc595293509e852e8aba03e8d1516f3d94655 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Tue, 24 Feb 2026 09:03:33 -0500 Subject: [PATCH 01/11] Initial support for copy! + CUDA --- Project.toml | 12 ++++++++++-- ext/StridedCUDAExt.jl | 16 ++++++++++++++++ test/cuda.jl | 25 +++++++++++++++++++++++++ test/runtests.jl | 39 ++++++++++++++++++++++++--------------- 4 files changed, 75 insertions(+), 17 deletions(-) create mode 100644 ext/StridedCUDAExt.jl create mode 100644 test/cuda.jl diff --git a/Project.toml b/Project.toml index f00886f..988d6f4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,15 +1,22 @@ name = "Strided" uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" -authors = ["Lukas Devos ", "Maarten Van Damme ", "Jutho Haegeman "] version = "2.3.2" +authors = ["Lukas Devos ", "Maarten Van Damme ", "Jutho Haegeman "] [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143" TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +StridedCUDAExt = "CUDA" + [compat] Aqua = "0.8" +CUDA = "5" LinearAlgebra = "1.6" Random = "1.6" StridedViews = "0.3.2,0.4" @@ -19,8 +26,9 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Random", "Aqua"] +test = ["Test", "Random", "Aqua", "CUDA"] diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl new file mode 100644 index 0000000..326ca7c --- /dev/null +++ b/ext/StridedCUDAExt.jl @@ -0,0 +1,16 @@ +module StridedCUDAExt + +using Strided, CUDA +import Strided: _mapreduce_fuse! + +ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} + +function Base.copy!(dst::StridedView{TD,ND,TAD,FD}, src::StridedView{TS,NS,TAS,FS}) where {TD, ND, TAD <: CuArray{TD}, FD<:ALL_FS, TS, NS, TAS <: CuArray{TS}, FS<:ALL_FS} + all_dst_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst))) + viewed_dst = view(parent(dst), all_dst_inds) + all_src_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src))) + viewed_src = view(parent(src), all_src_inds) + return map!(identity, viewed_dst, viewed_src) +end + +end diff --git a/test/cuda.jl b/test/cuda.jl new file mode 100644 index 0000000..e60e4b7 --- /dev/null +++ b/test/cuda.jl @@ -0,0 +1,25 @@ +for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) + @testset "Copy with CuStridedView: $T" begin + m1 = 32 + m2 = 16 + A1 = CUDA.randn(T, (m1, m2)) + A2 = similar(A1) + A1c = copy(A1) + A2c = copy(A2) + B1 = StridedView(A1c) + B2 = StridedView(A2c) + @test copy!(A2, A1) == copy!(B2, B1) + @test copy!(transpose(A2), transpose(A1)) == copy!(transpose(B2), transpose(B1)) + if T <: Complex + @test_broken copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1)) + @test_broken copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1)) + @test_broken copy!(A2, conj(A1)) == copy!(B2, conj(B1)) + @test_broken copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1)) + else + @test copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1)) + @test copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1)) + @test copy!(A2, conj(A1)) == copy!(B2, conj(B1)) + @test copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1)) + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index fc411cc..cf18c69 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,25 +3,34 @@ using LinearAlgebra using Random using Strided using Strided: StridedView +using CUDA +using Aqua Random.seed!(1234) -println("Base.Threads.nthreads() = $(Base.Threads.nthreads())") +is_buildkite = get(ENV, "BUILDKITE", "false") == "true" -println("Running tests single-threaded:") -Strided.disable_threads() -include("othertests.jl") -include("blasmultests.jl") +if !is_buildkite + println("Base.Threads.nthreads() = $(Base.Threads.nthreads())") -println("Running tests multi-threaded:") -Strided.enable_threads() -Strided.set_num_threads(Base.Threads.nthreads() + 1) -include("othertests.jl") -include("blasmultests.jl") + println("Running tests single-threaded:") + Strided.disable_threads() + include("othertests.jl") + include("blasmultests.jl") -Strided.enable_threaded_mul() -include("blasmultests.jl") -Strided.disable_threaded_mul() + println("Running tests multi-threaded:") + Strided.enable_threads() + Strided.set_num_threads(Base.Threads.nthreads() + 1) + include("othertests.jl") + include("blasmultests.jl") -using Aqua -Aqua.test_all(Strided; piracies = false) + Strided.enable_threaded_mul() + include("blasmultests.jl") + Strided.disable_threaded_mul() + + Aqua.test_all(Strided; piracies=false) +end + +if CUDA.functional() + include("cuda.jl") +end From 91c61b881abad02511ef9afd221e3ff794132b01 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Tue, 24 Feb 2026 14:36:56 -0500 Subject: [PATCH 02/11] Working extension --- ext/StridedCUDAExt.jl | 22 ++++++++++++++++++---- test/cuda.jl | 25 +++++++------------------ test/runtests.jl | 1 + 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl index 326ca7c..ab69704 100644 --- a/ext/StridedCUDAExt.jl +++ b/ext/StridedCUDAExt.jl @@ -5,12 +5,26 @@ import Strided: _mapreduce_fuse! ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -function Base.copy!(dst::StridedView{TD,ND,TAD,FD}, src::StridedView{TS,NS,TAS,FS}) where {TD, ND, TAD <: CuArray{TD}, FD<:ALL_FS, TS, NS, TAS <: CuArray{TS}, FS<:ALL_FS} - all_dst_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst))) +function Base.copy!(dst::StridedView{TD, ND, TAD, F}, src::StridedView{TS, NS, TAS, F}) where {TD, ND, TAD <: CuArray{TD}, F <: ALL_FS, TS, NS, TAS <: CuArray{TS}} + all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst))) viewed_dst = view(parent(dst), all_dst_inds) - all_src_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src))) + all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src))) viewed_src = view(parent(src), all_src_inds) - return map!(identity, viewed_dst, viewed_src) + map!(identity, viewed_dst, viewed_src) + return dst +end + +function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS} + all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst))) + viewed_dst = view(parent(dst), all_dst_inds) + all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src))) + viewed_src = view(parent(src), all_src_inds) + if FS <: typeof(conj) && FD <: typeof(identity) + map!(conj, viewed_dst, viewed_src) + elseif FD <: typeof(conj) && FS <: typeof(identity) + map!(conj, viewed_dst, viewed_src) + end + return dst end end diff --git a/test/cuda.jl b/test/cuda.jl index e60e4b7..e86b4eb 100644 --- a/test/cuda.jl +++ b/test/cuda.jl @@ -1,25 +1,14 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) - @testset "Copy with CuStridedView: $T" begin - m1 = 32 - m2 = 16 + m1 = 32 + m2 = 16 + @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint) A1 = CUDA.randn(T, (m1, m2)) A2 = similar(A1) A1c = copy(A1) A2c = copy(A2) - B1 = StridedView(A1c) - B2 = StridedView(A2c) - @test copy!(A2, A1) == copy!(B2, B1) - @test copy!(transpose(A2), transpose(A1)) == copy!(transpose(B2), transpose(B1)) - if T <: Complex - @test_broken copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1)) - @test_broken copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1)) - @test_broken copy!(A2, conj(A1)) == copy!(B2, conj(B1)) - @test_broken copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1)) - else - @test copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1)) - @test copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1)) - @test copy!(A2, conj(A1)) == copy!(B2, conj(B1)) - @test copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1)) - end + B1 = f1(StridedView(A1c)) + B2 = f2(StridedView(A2c)) + axes(f1(A1)) == axes(f2(A2)) || continue + @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1)) end end diff --git a/test/runtests.jl b/test/runtests.jl index cf18c69..fd0f715 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,7 @@ using Strided using Strided: StridedView using CUDA using Aqua +using CUDA: Adapt Random.seed!(1234) From 4e9edfaf518e07a76efc1048d9f8a323ec3207e9 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 25 Feb 2026 08:06:54 -0500 Subject: [PATCH 03/11] Refactor to use GPUArrays internals --- Project.toml | 6 +++++- ext/StridedCUDAExt.jl | 26 ++++++++------------------ ext/StridedGPUArraysExt.jl | 15 +++++++++++++++ test/runtests.jl | 2 +- 4 files changed, 29 insertions(+), 20 deletions(-) create mode 100644 ext/StridedGPUArraysExt.jl diff --git a/Project.toml b/Project.toml index 988d6f4..24a4060 100644 --- a/Project.toml +++ b/Project.toml @@ -9,14 +9,17 @@ StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143" TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" [weakdeps] +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] +StridedGPUArraysExt = "GPUArrays" StridedCUDAExt = "CUDA" [compat] Aqua = "0.8" CUDA = "5" +GPUArrays = "11.4.1" LinearAlgebra = "1.6" Random = "1.6" StridedViews = "0.3.2,0.4" @@ -27,8 +30,9 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Random", "Aqua", "CUDA"] +test = ["Test", "Random", "Aqua", "CUDA", "GPUArrays"] diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl index ab69704..aedd7da 100644 --- a/ext/StridedCUDAExt.jl +++ b/ext/StridedCUDAExt.jl @@ -1,29 +1,19 @@ module StridedCUDAExt using Strided, CUDA -import Strided: _mapreduce_fuse! +using CUDA: Adapt, KernelAdaptor +using CUDA: GPUArrays -ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} +const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -function Base.copy!(dst::StridedView{TD, ND, TAD, F}, src::StridedView{TS, NS, TAS, F}) where {TD, ND, TAD <: CuArray{TD}, F <: ALL_FS, TS, NS, TAS <: CuArray{TS}} - all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst))) - viewed_dst = view(parent(dst), all_dst_inds) - all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src))) - viewed_src = view(parent(src), all_src_inds) - map!(identity, viewed_dst, viewed_src) - return dst +function Adapt.adapt_storage(to::KernelAdaptor, xs::StridedView{T,N,TA,F}) where {T,N,TA<:CuArray{T},F <: ALL_FS} + return StridedView(Adapt.adapt(to, parent(xs)), xs.size, xs.strides, xs.offset, xs.op) end function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS} - all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst))) - viewed_dst = view(parent(dst), all_dst_inds) - all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src))) - viewed_src = view(parent(src), all_src_inds) - if FS <: typeof(conj) && FD <: typeof(identity) - map!(conj, viewed_dst, viewed_src) - elseif FD <: typeof(conj) && FS <: typeof(identity) - map!(conj, viewed_dst, viewed_src) - end + bc_style = Base.Broadcast.BroadcastStyle(TAS) + bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) + GPUArrays._copyto!(dst, bc) return dst end diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl new file mode 100644 index 0000000..409ad5e --- /dev/null +++ b/ext/StridedGPUArraysExt.jl @@ -0,0 +1,15 @@ +module StridedGPUArraysExt + +using Strided, GPUArrays +using GPUArrays: Adapt, KernelAbstractions + +ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} + +KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv)) + +function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} + raw_style = Base.Broadcast.BroadcastStyle(TA) + return typeof(raw_style)(Val(N)) # sets the dimensionality correctly +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index fd0f715..b596e72 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,8 +3,8 @@ using LinearAlgebra using Random using Strided using Strided: StridedView -using CUDA using Aqua +using CUDA, GPUArrays using CUDA: Adapt Random.seed!(1234) From 09d54210c42afb3a5e1c636468ccf8c79863fbf9 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 25 Feb 2026 08:56:53 -0500 Subject: [PATCH 04/11] Update CI and add BK --- .buildkite/pipeline.yml | 65 ++++++++++++++++++++++++++++++++++++++++ .github/workflows/ci.yml | 4 +-- 2 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 .buildkite/pipeline.yml diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 0000000..a420e6f --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,65 @@ +env: + SECRET_CODECOV_TOKEN: "EEXB5DS9rR3VXck1NzJougBwxy+3bGKAX9sq1hTwe+rvftmQzdnpy3MlJXLUXQXnBvjezhHZpt07nlG1p9Pi39bnUIddPJHJVVbtqjiGbVuAjVno2tcm8cvi/mYDPoJw7hs8G36IVDb3wklO9wAiO7vwO2br8LQOHMNZBTCUfkb30aT3e/yBnb2QiwNspKCvcd7XYpsmMy78Egdg219sfZ783fG/H7VHv0YzZThj+IAUhm8ftsPURHRmHk28wSdFGzwI2CX8nEx4LgtDhqa+JH84YajIiwWaFymfkw6phpSF3KQNlR53qRWUDD6hClhOizmYyQuZZ8TO5gnNDsrGLg==;U2FsdGVkX1/pfvZY/FJSU7D+DE+6I18s5BSfa63C+31RoDKiHqENegG4whXuxZ5a6YE0XegF8jOretp+E7FiyQ==" + +steps: + - label: "Julia v1 -- CUDA" + plugins: + - JuliaCI/julia#v1: + version: "1" + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + dirs: + - src + - ext + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 30 + + - label: "Julia LTS -- CUDA" + plugins: + - JuliaCI/julia#v1: + version: "1.10" # "lts" isn't valid + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + dirs: + - src + - ext + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 30 + + - label: "Julia v1 -- AMDGPU" + plugins: + - JuliaCI/julia#v1: + version: "1" + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + dirs: + - src + - ext + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 30 + + - label: "Julia LTS -- AMDGPU" + plugins: + - JuliaCI/julia#v1: + version: "1.10" # "lts" isn't valid + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + dirs: + - src + - ext + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 30 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22e2e79..5469526 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,6 @@ name: CI on: push: branches: - - 'master' - 'main' - 'release-' tags: '*' @@ -21,7 +20,6 @@ jobs: fail-fast: false matrix: version: - - '1.6' # previous LTS release - 'lts' # current LTS release - '1' # current stable release os: @@ -45,4 +43,4 @@ jobs: - uses: codecov/codecov-action@v5 with: file: lcov.info - \ No newline at end of file + From 9721b1eb9bafb70e4a32decccf172c3d900062b8 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 25 Feb 2026 09:03:22 -0500 Subject: [PATCH 05/11] Also trial AMD support --- Project.toml | 6 +++++- ext/StridedAMDGPUExt.jl | 16 ++++++++++++++++ ext/StridedCUDAExt.jl | 8 ++------ test/amd.jl | 14 ++++++++++++++ test/cuda.jl | 20 ++++++++++---------- test/runtests.jl | 9 ++++++--- 6 files changed, 53 insertions(+), 20 deletions(-) create mode 100644 ext/StridedAMDGPUExt.jl create mode 100644 test/amd.jl diff --git a/Project.toml b/Project.toml index 24a4060..d8ca727 100644 --- a/Project.toml +++ b/Project.toml @@ -9,14 +9,17 @@ StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143" TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] +StridedAMDGPUExt = "AMDGPU" StridedGPUArraysExt = "GPUArrays" StridedCUDAExt = "CUDA" [compat] +AMDGPU = "2" Aqua = "0.8" CUDA = "5" GPUArrays = "11.4.1" @@ -28,6 +31,7 @@ TupleTools = "1.6" julia = "1.6" [extras] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" @@ -35,4 +39,4 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Random", "Aqua", "CUDA", "GPUArrays"] +test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays"] diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl new file mode 100644 index 0000000..8ded5a9 --- /dev/null +++ b/ext/StridedAMDGPUExt.jl @@ -0,0 +1,16 @@ +module StridedAMDGPUExt + +using Strided, StridedViews, AMDGPU +using AMDGPU: Adapt +using AMDGPU: GPUArrays + +const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} + +function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS, NS, TAS <: ROCArray{TS}, FS <: ALL_FS} + bc_style = Base.Broadcast.BroadcastStyle(TAS) + bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) + GPUArrays._copyto!(dst, bc) + return dst +end + +end diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl index aedd7da..513836f 100644 --- a/ext/StridedCUDAExt.jl +++ b/ext/StridedCUDAExt.jl @@ -1,17 +1,13 @@ module StridedCUDAExt -using Strided, CUDA +using Strided, StridedViews, CUDA using CUDA: Adapt, KernelAdaptor using CUDA: GPUArrays const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -function Adapt.adapt_storage(to::KernelAdaptor, xs::StridedView{T,N,TA,F}) where {T,N,TA<:CuArray{T},F <: ALL_FS} - return StridedView(Adapt.adapt(to, parent(xs)), xs.size, xs.strides, xs.offset, xs.op) -end - function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS} - bc_style = Base.Broadcast.BroadcastStyle(TAS) + bc_style = Base.Broadcast.BroadcastStyle(TAS) bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) GPUArrays._copyto!(dst, bc) return dst diff --git a/test/amd.jl b/test/amd.jl new file mode 100644 index 0000000..b08b941 --- /dev/null +++ b/test/amd.jl @@ -0,0 +1,14 @@ +for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) + @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint) + for m1 in (0, 16, 32), m2 in (0, 16, 32) + A1 = AMDGPU.randn(T, (m1, m2)) + A2 = similar(A1) + A1c = copy(A1) + A2c = copy(A2) + B1 = f1(StridedView(A1c)) + B2 = f2(StridedView(A2c)) + axes(f1(A1)) == axes(f2(A2)) || continue + @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1)) + end + end +end diff --git a/test/cuda.jl b/test/cuda.jl index e86b4eb..695fec9 100644 --- a/test/cuda.jl +++ b/test/cuda.jl @@ -1,14 +1,14 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) - m1 = 32 - m2 = 16 @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint) - A1 = CUDA.randn(T, (m1, m2)) - A2 = similar(A1) - A1c = copy(A1) - A2c = copy(A2) - B1 = f1(StridedView(A1c)) - B2 = f2(StridedView(A2c)) - axes(f1(A1)) == axes(f2(A2)) || continue - @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1)) + for m1 in (0, 16, 32), m2 in (0, 16, 32) + A1 = CUDA.randn(T, (m1, m2)) + A2 = similar(A1) + A1c = copy(A1) + A2c = copy(A2) + B1 = f1(StridedView(A1c)) + B2 = f2(StridedView(A2c)) + axes(f1(A1)) == axes(f2(A2)) || continue + @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == CUDA.Adapt.adapt(Vector{T}, copy!(B2, B1)) + end end end diff --git a/test/runtests.jl b/test/runtests.jl index b596e72..3f9ee6f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,8 +4,7 @@ using Random using Strided using Strided: StridedView using Aqua -using CUDA, GPUArrays -using CUDA: Adapt +using AMDGPU, CUDA, GPUArrays Random.seed!(1234) @@ -29,9 +28,13 @@ if !is_buildkite include("blasmultests.jl") Strided.disable_threaded_mul() - Aqua.test_all(Strided; piracies=false) + Aqua.test_all(Strided; piracies = false) end if CUDA.functional() include("cuda.jl") end + +if AMDGPU.functional() + include("amd.jl") +end From 3326f5a9aa9bd2a37901d57795062b8529c6fd33 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 26 Feb 2026 05:20:22 -0500 Subject: [PATCH 06/11] Workaround for size zero --- test/amd.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/amd.jl b/test/amd.jl index b08b941..8e6d18f 100644 --- a/test/amd.jl +++ b/test/amd.jl @@ -1,7 +1,11 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint) for m1 in (0, 16, 32), m2 in (0, 16, 32) - A1 = AMDGPU.randn(T, (m1, m2)) + if iszero(m1 * m2) + A1 = AMDGPU.ROCMatrix{T}(undef, (m1, m2)) + else + A1 = AMDGPU.randn(T, (m1, m2)) + end A2 = similar(A1) A1c = copy(A1) A2c = copy(A2) From c63b8705961f54ca34df6e00fa90c7370852581d Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 26 Feb 2026 06:00:45 -0500 Subject: [PATCH 07/11] Update StridedViews dep --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d8ca727..f694518 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ CUDA = "5" GPUArrays = "11.4.1" LinearAlgebra = "1.6" Random = "1.6" -StridedViews = "0.3.2,0.4" +StridedViews = "0.4.4" Test = "1.6" TupleTools = "1.6" julia = "1.6" From 6ec9797855bccceb5a2dd410f5395a054d0646b4 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 26 Feb 2026 07:15:23 -0500 Subject: [PATCH 08/11] Fix AMD test --- test/amd.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/amd.jl b/test/amd.jl index 8e6d18f..48cd446 100644 --- a/test/amd.jl +++ b/test/amd.jl @@ -12,7 +12,7 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) B1 = f1(StridedView(A1c)) B2 = f2(StridedView(A2c)) axes(f1(A1)) == axes(f2(A2)) || continue - @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1)) + @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == AMDGPU.Adapt.adapt(Vector{T}, copy!(B2, B1)) end end end From d3769f618f584ca6a3e209225d8eebccfdc1bd10 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 26 Feb 2026 07:36:24 -0500 Subject: [PATCH 09/11] Another AMD fix --- test/amd.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/amd.jl b/test/amd.jl index 48cd446..fc77c49 100644 --- a/test/amd.jl +++ b/test/amd.jl @@ -4,7 +4,7 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) if iszero(m1 * m2) A1 = AMDGPU.ROCMatrix{T}(undef, (m1, m2)) else - A1 = AMDGPU.randn(T, (m1, m2)) + A1 = ROCMatrix(randn(T, (m1, m2))) end A2 = similar(A1) A1c = copy(A1) From 59c3e921d32e94231b50de35fcf3e3d227cf1df5 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 26 Feb 2026 09:24:29 -0500 Subject: [PATCH 10/11] Bump StridedViews version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f694518..55aca8e 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ CUDA = "5" GPUArrays = "11.4.1" LinearAlgebra = "1.6" Random = "1.6" -StridedViews = "0.4.4" +StridedViews = "0.4.5" Test = "1.6" TupleTools = "1.6" julia = "1.6" From 6814a52dd73e85d35a248ad45f64c817fb92d421 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 26 Feb 2026 10:15:41 -0500 Subject: [PATCH 11/11] Add Number restriction --- ext/StridedAMDGPUExt.jl | 2 +- ext/StridedCUDAExt.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl index 8ded5a9..6fa3c40 100644 --- a/ext/StridedAMDGPUExt.jl +++ b/ext/StridedAMDGPUExt.jl @@ -6,7 +6,7 @@ using AMDGPU: GPUArrays const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS, NS, TAS <: ROCArray{TS}, FS <: ALL_FS} +function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: ROCArray{TS}, FS <: ALL_FS} bc_style = Base.Broadcast.BroadcastStyle(TAS) bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) GPUArrays._copyto!(dst, bc) diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl index 513836f..ec0abfd 100644 --- a/ext/StridedCUDAExt.jl +++ b/ext/StridedCUDAExt.jl @@ -6,7 +6,7 @@ using CUDA: GPUArrays const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS} +function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: CuArray{TS}, FS <: ALL_FS} bc_style = Base.Broadcast.BroadcastStyle(TAS) bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) GPUArrays._copyto!(dst, bc)