From ac2bc595293509e852e8aba03e8d1516f3d94655 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 24 Feb 2026 09:03:33 -0500
Subject: [PATCH 01/11] Initial support for copy! + CUDA

---
 Project.toml          | 12 ++++++++++--
 ext/StridedCUDAExt.jl | 16 ++++++++++++++++
 test/cuda.jl          | 25 +++++++++++++++++++++++++
 test/runtests.jl      | 39 ++++++++++++++++++++++++---------------
 4 files changed, 75 insertions(+), 17 deletions(-)
 create mode 100644 ext/StridedCUDAExt.jl
 create mode 100644 test/cuda.jl

diff --git a/Project.toml b/Project.toml
index f00886f..988d6f4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,15 +1,22 @@
 name = "Strided"
 uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
-authors = ["Lukas Devos <lukas.devos@ugent.be>", "Maarten Van Damme <maartenvd1994@gmail.com>", "Jutho Haegeman <jutho.haegeman@ugent.be>"]
 version = "2.3.2"
+authors = ["Lukas Devos <lukas.devos@ugent.be>", "Maarten Van Damme <maartenvd1994@gmail.com>", "Jutho Haegeman <jutho.haegeman@ugent.be>"]
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+StridedCUDAExt = "CUDA"
+
 [compat]
 Aqua = "0.8"
+CUDA = "5"
 LinearAlgebra = "1.6"
 Random = "1.6"
 StridedViews = "0.3.2,0.4"
@@ -19,8 +26,9 @@ julia = "1.6"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua"]
+test = ["Test", "Random", "Aqua", "CUDA"]
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
new file mode 100644
index 0000000..326ca7c
--- /dev/null
+++ b/ext/StridedCUDAExt.jl
@@ -0,0 +1,16 @@
+module StridedCUDAExt
+
+using Strided, CUDA
+import Strided: _mapreduce_fuse!
+
+ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+function Base.copy!(dst::StridedView{TD,ND,TAD,FD}, src::StridedView{TS,NS,TAS,FS}) where {TD, ND, TAD <: CuArray{TD}, FD<:ALL_FS, TS, NS, TAS <: CuArray{TS}, FS<:ALL_FS}
+    all_dst_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst)))
+    viewed_dst = view(parent(dst), all_dst_inds)
+    all_src_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src)))
+    viewed_src = view(parent(src), all_src_inds)
+    return map!(identity, viewed_dst, viewed_src)
+end
+
+end
diff --git a/test/cuda.jl b/test/cuda.jl
new file mode 100644
index 0000000..e60e4b7
--- /dev/null
+++ b/test/cuda.jl
@@ -0,0 +1,25 @@
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    @testset "Copy with CuStridedView: $T" begin
+        m1 = 32
+        m2 = 16
+        A1 = CUDA.randn(T, (m1, m2))
+        A2 = similar(A1)
+        A1c = copy(A1)
+        A2c = copy(A2)
+        B1 = StridedView(A1c)
+        B2 = StridedView(A2c)
+        @test copy!(A2, A1) == copy!(B2, B1)
+        @test copy!(transpose(A2), transpose(A1)) == copy!(transpose(B2), transpose(B1))
+        if T <: Complex
+            @test_broken copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1))
+            @test_broken copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1))
+            @test_broken copy!(A2, conj(A1)) == copy!(B2, conj(B1))
+            @test_broken copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1))
+        else
+            @test copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1))
+            @test copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1))
+            @test copy!(A2, conj(A1)) == copy!(B2, conj(B1))
+            @test copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1))
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fc411cc..cf18c69 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,25 +3,34 @@ using LinearAlgebra
 using Random
 using Strided
 using Strided: StridedView
+using CUDA
+using Aqua
 
 Random.seed!(1234)
 
-println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
+is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
 
-println("Running tests single-threaded:")
-Strided.disable_threads()
-include("othertests.jl")
-include("blasmultests.jl")
+if !is_buildkite
+    println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
 
-println("Running tests multi-threaded:")
-Strided.enable_threads()
-Strided.set_num_threads(Base.Threads.nthreads() + 1)
-include("othertests.jl")
-include("blasmultests.jl")
+    println("Running tests single-threaded:")
+    Strided.disable_threads()
+    include("othertests.jl")
+    include("blasmultests.jl")
 
-Strided.enable_threaded_mul()
-include("blasmultests.jl")
-Strided.disable_threaded_mul()
+    println("Running tests multi-threaded:")
+    Strided.enable_threads()
+    Strided.set_num_threads(Base.Threads.nthreads() + 1)
+    include("othertests.jl")
+    include("blasmultests.jl")
 
-using Aqua
-Aqua.test_all(Strided; piracies = false)
+    Strided.enable_threaded_mul()
+    include("blasmultests.jl")
+    Strided.disable_threaded_mul()
+
+    Aqua.test_all(Strided; piracies=false)
+end
+
+if CUDA.functional()
+    include("cuda.jl")
+end

From 91c61b881abad02511ef9afd221e3ff794132b01 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 24 Feb 2026 14:36:56 -0500
Subject: [PATCH 02/11] Working extension

---
 ext/StridedCUDAExt.jl | 22 ++++++++++++++++++----
 test/cuda.jl          | 25 +++++++------------------
 test/runtests.jl      |  1 +
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
index 326ca7c..ab69704 100644
--- a/ext/StridedCUDAExt.jl
+++ b/ext/StridedCUDAExt.jl
@@ -5,12 +5,26 @@ import Strided: _mapreduce_fuse!
 
 ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-function Base.copy!(dst::StridedView{TD,ND,TAD,FD}, src::StridedView{TS,NS,TAS,FS}) where {TD, ND, TAD <: CuArray{TD}, FD<:ALL_FS, TS, NS, TAS <: CuArray{TS}, FS<:ALL_FS}
-    all_dst_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst)))
+function Base.copy!(dst::StridedView{TD, ND, TAD, F}, src::StridedView{TS, NS, TAS, F}) where {TD, ND, TAD <: CuArray{TD}, F <: ALL_FS, TS, NS, TAS <: CuArray{TS}}
+    all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst)))
     viewed_dst = view(parent(dst), all_dst_inds)
-    all_src_inds = map(ix->Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src)))
+    all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src)))
     viewed_src = view(parent(src), all_src_inds)
-    return map!(identity, viewed_dst, viewed_src)
+    map!(identity, viewed_dst, viewed_src)
+    return dst
+end
+
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
+    all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst)))
+    viewed_dst = view(parent(dst), all_dst_inds)
+    all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src)))
+    viewed_src = view(parent(src), all_src_inds)
+    if FS <: typeof(conj) && FD <: typeof(identity)
+        map!(conj, viewed_dst, viewed_src)
+    elseif FD <: typeof(conj) && FS <: typeof(identity)
+        map!(conj, viewed_dst, viewed_src)
+    end
+    return dst
 end
 
 end
diff --git a/test/cuda.jl b/test/cuda.jl
index e60e4b7..e86b4eb 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -1,25 +1,14 @@
 for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
-    @testset "Copy with CuStridedView: $T" begin
-        m1 = 32
-        m2 = 16
+    m1 = 32
+    m2 = 16
+    @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
         A1 = CUDA.randn(T, (m1, m2))
         A2 = similar(A1)
         A1c = copy(A1)
         A2c = copy(A2)
-        B1 = StridedView(A1c)
-        B2 = StridedView(A2c)
-        @test copy!(A2, A1) == copy!(B2, B1)
-        @test copy!(transpose(A2), transpose(A1)) == copy!(transpose(B2), transpose(B1))
-        if T <: Complex
-            @test_broken copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1))
-            @test_broken copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1))
-            @test_broken copy!(A2, conj(A1)) == copy!(B2, conj(B1))
-            @test_broken copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1))
-        else
-            @test copy!(transpose(A2), adjoint(A1)) == copy!(transpose(B2), adjoint(B1))
-            @test copy!(adjoint(A2), adjoint(A1)) == copy!(adjoint(B2), adjoint(B1))
-            @test copy!(A2, conj(A1)) == copy!(B2, conj(B1))
-            @test copy!(conj(A2), conj(A1)) == copy!(conj(B2), conj(B1))
-        end
+        B1 = f1(StridedView(A1c))
+        B2 = f2(StridedView(A2c))
+        axes(f1(A1)) == axes(f2(A2)) || continue
+        @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1))
     end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index cf18c69..fd0f715 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,6 +5,7 @@ using Strided
 using Strided: StridedView
 using CUDA
 using Aqua
+using CUDA: Adapt
 
 Random.seed!(1234)
 

From 4e9edfaf518e07a76efc1048d9f8a323ec3207e9 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Wed, 25 Feb 2026 08:06:54 -0500
Subject: [PATCH 03/11] Refactor to use GPUArrays internals

---
 Project.toml               |  6 +++++-
 ext/StridedCUDAExt.jl      | 26 ++++++++------------------
 ext/StridedGPUArraysExt.jl | 15 +++++++++++++++
 test/runtests.jl           |  2 +-
 4 files changed, 29 insertions(+), 20 deletions(-)
 create mode 100644 ext/StridedGPUArraysExt.jl

diff --git a/Project.toml b/Project.toml
index 988d6f4..24a4060 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,14 +9,17 @@ StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [weakdeps]
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
+StridedGPUArraysExt = "GPUArrays"
 StridedCUDAExt = "CUDA"
 
 [compat]
 Aqua = "0.8"
 CUDA = "5"
+GPUArrays = "11.4.1"
 LinearAlgebra = "1.6"
 Random = "1.6"
 StridedViews = "0.3.2,0.4"
@@ -27,8 +30,9 @@ julia = "1.6"
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua", "CUDA"]
+test = ["Test", "Random", "Aqua", "CUDA", "GPUArrays"]
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
index ab69704..aedd7da 100644
--- a/ext/StridedCUDAExt.jl
+++ b/ext/StridedCUDAExt.jl
@@ -1,29 +1,19 @@
 module StridedCUDAExt
 
 using Strided, CUDA
-import Strided: _mapreduce_fuse!
+using CUDA: Adapt, KernelAdaptor
+using CUDA: GPUArrays
 
-ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-function Base.copy!(dst::StridedView{TD, ND, TAD, F}, src::StridedView{TS, NS, TAS, F}) where {TD, ND, TAD <: CuArray{TD}, F <: ALL_FS, TS, NS, TAS <: CuArray{TS}}
-    all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst)))
-    viewed_dst = view(parent(dst), all_dst_inds)
-    all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src)))
-    viewed_src = view(parent(src), all_src_inds)
-    map!(identity, viewed_dst, viewed_src)
-    return dst
+function Adapt.adapt_storage(to::KernelAdaptor, xs::StridedView{T,N,TA,F}) where {T,N,TA<:CuArray{T},F <: ALL_FS}
+    return StridedView(Adapt.adapt(to, parent(xs)), xs.size, xs.strides, xs.offset, xs.op)
 end
 
 function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
-    all_dst_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), dst.strides), CartesianIndices(size(dst)))
-    viewed_dst = view(parent(dst), all_dst_inds)
-    all_src_inds = map(ix -> Strided.StridedViews._computeind(Tuple(ix), src.strides), CartesianIndices(size(src)))
-    viewed_src = view(parent(src), all_src_inds)
-    if FS <: typeof(conj) && FD <: typeof(identity)
-        map!(conj, viewed_dst, viewed_src)
-    elseif FD <: typeof(conj) && FS <: typeof(identity)
-        map!(conj, viewed_dst, viewed_src)
-    end
+    bc_style = Base.Broadcast.BroadcastStyle(TAS) 
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
+    GPUArrays._copyto!(dst, bc)
     return dst
 end
 
diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
new file mode 100644
index 0000000..409ad5e
--- /dev/null
+++ b/ext/StridedGPUArraysExt.jl
@@ -0,0 +1,15 @@
+module StridedGPUArraysExt
+
+using Strided, GPUArrays
+using GPUArrays: Adapt, KernelAbstractions
+# Function types adjoint/conj/identity/transpose; `const` keeps this global type-stable.
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+# A StridedView over a GPU array reports the KernelAbstractions backend of its parent.
+KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv))
+# NOTE(review): dispatches on a StridedView instance; Base.Broadcast looks styles up by type (`BroadcastStyle(typeof(x))`) -- confirm intended.
+function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}}
+    raw_style = Base.Broadcast.BroadcastStyle(TA)
+    return typeof(raw_style)(Val(N)) # sets the dimensionality correctly
+end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fd0f715..b596e72 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,8 +3,8 @@ using LinearAlgebra
 using Random
 using Strided
 using Strided: StridedView
-using CUDA
 using Aqua
+using CUDA, GPUArrays
 using CUDA: Adapt
 
 Random.seed!(1234)

From 09d54210c42afb3a5e1c636468ccf8c79863fbf9 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Wed, 25 Feb 2026 08:56:53 -0500
Subject: [PATCH 04/11] Update CI and add Buildkite pipeline

---
 .buildkite/pipeline.yml  | 65 ++++++++++++++++++++++++++++++++++++++++
 .github/workflows/ci.yml |  4 +--
 2 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 .buildkite/pipeline.yml

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..a420e6f
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,65 @@
+env:
+  SECRET_CODECOV_TOKEN: "EEXB5DS9rR3VXck1NzJougBwxy+3bGKAX9sq1hTwe+rvftmQzdnpy3MlJXLUXQXnBvjezhHZpt07nlG1p9Pi39bnUIddPJHJVVbtqjiGbVuAjVno2tcm8cvi/mYDPoJw7hs8G36IVDb3wklO9wAiO7vwO2br8LQOHMNZBTCUfkb30aT3e/yBnb2QiwNspKCvcd7XYpsmMy78Egdg219sfZ783fG/H7VHv0YzZThj+IAUhm8ftsPURHRmHk28wSdFGzwI2CX8nEx4LgtDhqa+JH84YajIiwWaFymfkw6phpSF3KQNlR53qRWUDD6hClhOizmYyQuZZ8TO5gnNDsrGLg==;U2FsdGVkX1/pfvZY/FJSU7D+DE+6I18s5BSfa63C+31RoDKiHqENegG4whXuxZ5a6YE0XegF8jOretp+E7FiyQ=="
+
+steps:
+  - label: "Julia v1 -- CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+  
+  - label: "Julia v1 -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22e2e79..5469526 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,6 @@ name: CI
 on:
   push:
     branches:
-      - 'master'
       - 'main'
       - 'release-'
     tags: '*'
@@ -21,7 +20,6 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6' # previous LTS release
           - 'lts' # current LTS release
           - '1' # current stable release
         os:
@@ -45,4 +43,4 @@ jobs:
       - uses: codecov/codecov-action@v5
         with:
           file: lcov.info
-        
\ No newline at end of file
+        

From 9721b1eb9bafb70e4a32decccf172c3d900062b8 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Wed, 25 Feb 2026 09:03:22 -0500
Subject: [PATCH 05/11] Also trial AMD support

---
 Project.toml            |  6 +++++-
 ext/StridedAMDGPUExt.jl | 16 ++++++++++++++++
 ext/StridedCUDAExt.jl   |  8 ++------
 test/amd.jl             | 14 ++++++++++++++
 test/cuda.jl            | 20 ++++++++++----------
 test/runtests.jl        |  9 ++++++---
 6 files changed, 53 insertions(+), 20 deletions(-)
 create mode 100644 ext/StridedAMDGPUExt.jl
 create mode 100644 test/amd.jl

diff --git a/Project.toml b/Project.toml
index 24a4060..d8ca727 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,14 +9,17 @@ StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
+StridedAMDGPUExt = "AMDGPU"
 StridedGPUArraysExt = "GPUArrays"
 StridedCUDAExt = "CUDA"
 
 [compat]
+AMDGPU = "2"
 Aqua = "0.8"
 CUDA = "5"
 GPUArrays = "11.4.1"
@@ -28,6 +31,7 @@ TupleTools = "1.6"
 julia = "1.6"
 
 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
@@ -35,4 +39,4 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua", "CUDA", "GPUArrays"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays"]
diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl
new file mode 100644
index 0000000..8ded5a9
--- /dev/null
+++ b/ext/StridedAMDGPUExt.jl
@@ -0,0 +1,16 @@
+module StridedAMDGPUExt
+
+using Strided, StridedViews, AMDGPU
+using AMDGPU: Adapt
+using AMDGPU: GPUArrays
+
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
+
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS, NS, TAS <: ROCArray{TS}, FS <: ALL_FS}
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
+    GPUArrays._copyto!(dst, bc)
+    return dst
+end
+
+end
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
index aedd7da..513836f 100644
--- a/ext/StridedCUDAExt.jl
+++ b/ext/StridedCUDAExt.jl
@@ -1,17 +1,13 @@
 module StridedCUDAExt
 
-using Strided, CUDA
+using Strided, StridedViews, CUDA
 using CUDA: Adapt, KernelAdaptor
 using CUDA: GPUArrays
 
 const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-function Adapt.adapt_storage(to::KernelAdaptor, xs::StridedView{T,N,TA,F}) where {T,N,TA<:CuArray{T},F <: ALL_FS}
-    return StridedView(Adapt.adapt(to, parent(xs)), xs.size, xs.strides, xs.offset, xs.op)
-end
-
 function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
-    bc_style = Base.Broadcast.BroadcastStyle(TAS) 
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
     bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
     GPUArrays._copyto!(dst, bc)
     return dst
diff --git a/test/amd.jl b/test/amd.jl
new file mode 100644
index 0000000..b08b941
--- /dev/null
+++ b/test/amd.jl
@@ -0,0 +1,14 @@
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)
+            A1 = AMDGPU.randn(T, (m1, m2))
+            A2 = similar(A1)
+            A1c = copy(A1)
+            A2c = copy(A2)
+            B1 = f1(StridedView(A1c))
+            B2 = f2(StridedView(A2c))
+            axes(f1(A1)) == axes(f2(A2)) || continue
+            @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1))
+        end
+    end
+end
diff --git a/test/cuda.jl b/test/cuda.jl
index e86b4eb..695fec9 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -1,14 +1,14 @@
 for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
-    m1 = 32
-    m2 = 16
     @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
-        A1 = CUDA.randn(T, (m1, m2))
-        A2 = similar(A1)
-        A1c = copy(A1)
-        A2c = copy(A2)
-        B1 = f1(StridedView(A1c))
-        B2 = f2(StridedView(A2c))
-        axes(f1(A1)) == axes(f2(A2)) || continue
-        @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1))
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)
+            A1 = CUDA.randn(T, (m1, m2))
+            A2 = similar(A1)
+            A1c = copy(A1)
+            A2c = copy(A2)
+            B1 = f1(StridedView(A1c))
+            B2 = f2(StridedView(A2c))
+            axes(f1(A1)) == axes(f2(A2)) || continue
+            @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == CUDA.Adapt.adapt(Vector{T}, copy!(B2, B1))
+        end
     end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index b596e72..3f9ee6f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,8 +4,7 @@ using Random
 using Strided
 using Strided: StridedView
 using Aqua
-using CUDA, GPUArrays
-using CUDA: Adapt
+using AMDGPU, CUDA, GPUArrays
 
 Random.seed!(1234)
 
@@ -29,9 +28,13 @@ if !is_buildkite
     include("blasmultests.jl")
     Strided.disable_threaded_mul()
 
-    Aqua.test_all(Strided; piracies=false)
+    Aqua.test_all(Strided; piracies = false)
 end
 
 if CUDA.functional()
     include("cuda.jl")
 end
+
+if AMDGPU.functional()
+    include("amd.jl")
+end

From 3326f5a9aa9bd2a37901d57795062b8529c6fd33 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Feb 2026 05:20:22 -0500
Subject: [PATCH 06/11] Workaround for size zero

---
 test/amd.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/amd.jl b/test/amd.jl
index b08b941..8e6d18f 100644
--- a/test/amd.jl
+++ b/test/amd.jl
@@ -1,7 +1,11 @@
 for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
     @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
         for m1 in (0, 16, 32), m2 in (0, 16, 32)
-            A1 = AMDGPU.randn(T, (m1, m2))
+            if iszero(m1 * m2)
+                A1 = AMDGPU.ROCMatrix{T}(undef, (m1, m2))
+            else
+                A1 = AMDGPU.randn(T, (m1, m2))
+            end
             A2 = similar(A1)
             A1c = copy(A1)
             A2c = copy(A2)

From c63b8705961f54ca34df6e00fa90c7370852581d Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Feb 2026 06:00:45 -0500
Subject: [PATCH 07/11] Update StridedViews dep

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index d8ca727..f694518 100644
--- a/Project.toml
+++ b/Project.toml
@@ -25,7 +25,7 @@ CUDA = "5"
 GPUArrays = "11.4.1"
 LinearAlgebra = "1.6"
 Random = "1.6"
-StridedViews = "0.3.2,0.4"
+StridedViews = "0.4.4"
 Test = "1.6"
 TupleTools = "1.6"
 julia = "1.6"

From 6ec9797855bccceb5a2dd410f5395a054d0646b4 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Feb 2026 07:15:23 -0500
Subject: [PATCH 08/11] Fix AMD test

---
 test/amd.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/amd.jl b/test/amd.jl
index 8e6d18f..48cd446 100644
--- a/test/amd.jl
+++ b/test/amd.jl
@@ -12,7 +12,7 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
             B1 = f1(StridedView(A1c))
             B2 = f2(StridedView(A2c))
             axes(f1(A1)) == axes(f2(A2)) || continue
-            @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1))
+            @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == AMDGPU.Adapt.adapt(Vector{T}, copy!(B2, B1))
         end
     end
 end

From d3769f618f584ca6a3e209225d8eebccfdc1bd10 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Feb 2026 07:36:24 -0500
Subject: [PATCH 09/11] Another AMD fix

---
 test/amd.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/amd.jl b/test/amd.jl
index 48cd446..fc77c49 100644
--- a/test/amd.jl
+++ b/test/amd.jl
@@ -4,7 +4,7 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
             if iszero(m1 * m2)
                 A1 = AMDGPU.ROCMatrix{T}(undef, (m1, m2))
             else
-                A1 = AMDGPU.randn(T, (m1, m2))
+                A1 = ROCMatrix(randn(T, (m1, m2)))
             end
             A2 = similar(A1)
             A1c = copy(A1)

From 59c3e921d32e94231b50de35fcf3e3d227cf1df5 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Feb 2026 09:24:29 -0500
Subject: [PATCH 10/11] Bump StridedViews version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f694518..55aca8e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -25,7 +25,7 @@ CUDA = "5"
 GPUArrays = "11.4.1"
 LinearAlgebra = "1.6"
 Random = "1.6"
-StridedViews = "0.4.4"
+StridedViews = "0.4.5"
 Test = "1.6"
 TupleTools = "1.6"
 julia = "1.6"

From 6814a52dd73e85d35a248ad45f64c817fb92d421 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Feb 2026 10:15:41 -0500
Subject: [PATCH 11/11] Add Number restriction and axes check

copy! now rejects mismatched axes on the host, as Base.copy! does, instead of
handing the GPU broadcast kernel a source that does not cover axes(dst).
---
 ext/StridedAMDGPUExt.jl | 3 ++-
 ext/StridedCUDAExt.jl   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl
index 8ded5a9..6fa3c40 100644
--- a/ext/StridedAMDGPUExt.jl
+++ b/ext/StridedAMDGPUExt.jl
@@ -6,7 +6,8 @@ using AMDGPU: GPUArrays
 
 const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS, NS, TAS <: ROCArray{TS}, FS <: ALL_FS}
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: ROCArray{TS}, FS <: ALL_FS}
+    axes(dst) == axes(src) || throw(ArgumentError("arrays must have the same axes for copy!"))
     bc_style = Base.Broadcast.BroadcastStyle(TAS)
     bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
     GPUArrays._copyto!(dst, bc)
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
index 513836f..ec0abfd 100644
--- a/ext/StridedCUDAExt.jl
+++ b/ext/StridedCUDAExt.jl
@@ -6,7 +6,8 @@ using CUDA: GPUArrays
 
 const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
+    axes(dst) == axes(src) || throw(ArgumentError("arrays must have the same axes for copy!"))
     bc_style = Base.Broadcast.BroadcastStyle(TAS)
     bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
     GPUArrays._copyto!(dst, bc)