Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"

[compat]
Expand Down
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ function main()
"Devices" => "api/devices.md",
"Streams" => "api/streams.md",
"Kernel Programming" => "api/kernel_programming.md",
"Graphs" => "api/graphs.md",
"Exceptions" => "api/exceptions.md",
"Memory" => "api/memory.md",
"Host-Call" => "api/hostcall.md",
Expand Down
62 changes: 62 additions & 0 deletions docs/src/api/graphs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Graphs

[Graphs](https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/hipgraph.html)
allow capturing GPU kernels and executing them as one unit, reducing host overhead.

Simple operations can be captured as is:

```@example graph-1
using AMDGPU

f!(o) = o .+= one(eltype(o))

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f!(z)
@assert sum(z) == 16

AMDGPU.launch(graph)
@assert sum(z) == 16 * 2
```

However, if your code contains more complex flow, it requires more preparation:
- code **must not** result in a hostcall invocation.
- if code contains mallocs and their respective frees, it can be captured and relaunched as is.
- if code contains **only** allocations (without freeing them), allocations must be cached with `GPUArrays.@cached` beforehand (see example below).
- other unsupported operations (e.g. RNG initialization) must be performed beforehand as well.
- updating a graph does not update allocated pointers; only (re)instantiation is supported in such cases.

```@example graph-2
using AMDGPU, GPUArrays

function f(o)
x = AMDGPU.rand(Float32, size(o))
y = AMDGPU.rand(Float32, size(o))
o .+= sin.(x) * cos.(y) .+ 1f0
return
end

cache = GPUArrays.AllocCache()
z = AMDGPU.zeros(Float32, 256, 256)
N = 10

# Execute function normally and cache all allocations.
GPUArrays.@cached cache f(z)

# Capture graph using AllocCache to avoid capturing malloc/free calls.
graph = GPUArrays.@cached cache AMDGPU.@captured f(z)

# Allocations cache must be kept alive while executing graph.
for i in 1:N
AMDGPU.launch(graph)
end
AMDGPU.synchronize()
```

```@docs
AMDGPU.capture
AMDGPU.@captured
AMDGPU.instantiate
AMDGPU.update
AMDGPU.is_capturing
AMDGPU.launch
```
9 changes: 4 additions & 5 deletions docs/src/tutorials/profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

## rocprof

[rocprofv2](https://github.com/ROCm/rocprofiler?tab=readme-ov-file#rocprofiler-v2)
allows profiling both HSA & HIP API calls (rocprof being deprecated).
[rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html)
allows profiling both HSA & HIP API calls.

Let's profile simple copying kernel saved in `profile.jl` file:
```julia
Expand Down Expand Up @@ -39,11 +39,10 @@ main(2^24)
### Profiling problematic code

```bash
ENABLE_JITPROFILING=1 rocprofv2 --plugin perfetto --hip-trace --hsa-trace --kernel-trace -o prof julia ./profile.jl
ENABLE_JITPROFILING=1 rocprofv3 --output-directory ./profiling --output-format pftrace --hip-trace --hsa-trace --kernel-trace -- julia ./profile.jl
```

This will produce `prof_output.pftrace` file which can be visualized
using [Perfetto UI](https://ui.perfetto.dev/).
This will produce `.pftrace` file which can be visualized using [Perfetto UI](https://ui.perfetto.dev/).

![image](../assets/profile_1.png)

Expand Down
2 changes: 2 additions & 0 deletions src/hip/HIP.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module HIP
export HIPError, devices, device_synchronize, default_stream
export HIPGraph, HIPGraphExec, @captured, capture, instantiate, update, is_capturing, launch

using CEnum

Expand Down Expand Up @@ -90,6 +91,7 @@ include("stream.jl")
include("event.jl")
include("pool.jl")
include("module.jl")
include("graph.jl")

"""
Blocks until all kernels on all streams have completed.
Expand Down
171 changes: 171 additions & 0 deletions src/hip/graph.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
"""
instantiate(graph::HIPGraph)::HIPGraphExec

Instantiate captured graph making it executable with [`launch`](@ref).
"""
instantiate

"""
capture(f::Function; flags = hipStreamCaptureModeGlobal, throw_error::Bool = true)::Union{Nothing, HIPGraph}

Capture fiven function `f` to a graph.
If successful, returns a captured graph that needs to be [`instantiate`](@ref)'d to obtain executable graph.
"""
capture

# Unchecked variant of `hipStreamEndCapture`: returns the raw `hipError_t`
# status instead of throwing, so the caller (`capture`) can handle expected
# statuses such as `hipErrorStreamCaptureInvalidated` itself.
function unchecked_hipStreamEndCapture(stream, pGraph)
    AMDGPU.prepare_state()
    @gcsafe_ccall(libhip.hipStreamEndCapture(stream::hipStream_t, pGraph::Ptr{hipGraph_t})::hipError_t)
end

# A HIP graph: a recorded set of GPU operations that can be instantiated
# into a `HIPGraphExec` and launched as a single unit.
mutable struct HIPGraph
    handle::hipGraph_t

    # Create an empty graph.
    # NOTE(review): `flags` defaults to `hipStreamCaptureModeGlobal` (a stream
    # capture mode enum) but is forwarded to `hipGraphCreate`, whose flags
    # parameter is a graph-creation flag — this only works because the enum
    # value is 0; confirm the default is intended.
    function HIPGraph(flags = hipStreamCaptureModeGlobal)
        handle_ref = Ref{hipGraph_t}()
        hipGraphCreate(handle_ref, flags)

        obj = new(handle_ref[])
        # Release the underlying HIP graph once the wrapper is GC'd.
        finalizer(obj) do obj
            hipGraphDestroy(obj)
        end
        return obj
    end

    # Defined inside the struct body so it can call `new`; `global` exposes it
    # as `HIP.capture` instead of adding a constructor method.
    global function capture(f::Function; flags = hipStreamCaptureModeGlobal, throw_error::Bool = true)::Union{Nothing, HIPGraph}
        # Disable GC while capturing (a collection could enqueue frees on the
        # capturing stream and invalidate the capture); previous GC state is
        # restored right after capture ends.
        gc_state = GC.enable(false)
        stream = AMDGPU.stream()
        try
            hipStreamBeginCapture(stream, flags)
            f()
        finally
            # End capture via the unchecked wrapper so an expected
            # "capture invalidated" status can be handled here rather than
            # thrown by the checked ccall.
            handle_ref = Ref{hipGraph_t}()
            st = unchecked_hipStreamEndCapture(stream, handle_ref)
            GC.enable(gc_state)

            if st == hipErrorStreamCaptureInvalidated && !throw_error
                return nothing
            elseif st != hipSuccess
                throw(HIPError(st))
            end

            # NOTE: `return` inside `finally` deliberately overrides (swallows)
            # any exception thrown by `f()` once end-capture succeeded.
            obj = new(handle_ref[])
            finalizer(hipGraphDestroy, obj)
            return obj
        end
        return nothing # unreachable: the `finally` block always returns
    end
end

Base.unsafe_convert(::Type{hipGraph_t}, graph::HIPGraph) = graph.handle

# An executable (instantiated) HIP graph, ready to be passed to `launch`.
mutable struct HIPGraphExec
    handle::hipGraphExec_t

    # Defined inside the struct body so it can call `new`; `global` exposes it
    # as `HIP.instantiate` instead of adding a constructor method.
    global function instantiate(graph::HIPGraph)
        handle_ref = Ref{hipGraphExec_t}()
        # Instantiate with flags = 0 (no special instantiation behavior).
        hipGraphInstantiateWithFlags(handle_ref, graph, 0)
        obj = new(handle_ref[])

        # Release the underlying executable graph once the wrapper is GC'd.
        finalizer(obj) do obj
            hipGraphExecDestroy(obj)
        end
        return obj
    end
end

Base.unsafe_convert(::Type{hipGraphExec_t}, exec::HIPGraphExec) = exec.handle

"""
launch(exec::HIPGraphExec, stream::HIPStream = AMDGPU.stream())

Launch executable graph on a given stream.
"""
function launch(exec::HIPGraphExec, stream::HIPStream = AMDGPU.stream())
hipGraphLaunch(exec, stream)
end

"""
update(exec::HIPGraphExec, graph::HIPGraph; throw_error::Bool = true)::Bool

Given executable graph, perform update with graph.
Return `true` if successful, `false` otherwise.

If `throw_error=false` allows avoiding throwing an exception if update was not successful.
"""
function update(exec::HIPGraphExec, graph::HIPGraph; throw_error::Bool = true)::Bool
error_node = Ref{hipGraphNode_t}()
update_res_ref = Ref{hipGraphExecUpdateResult}()
hipGraphExecUpdate(exec, graph, error_node, update_res_ref)

update_res = update_res_ref[]
if update_res != hipGraphExecUpdateSuccess
throw_error && error("Failed to update HIPGraphExec: `$(update_res)`.")
return false
end
return true
end

# Query the capture state of `stream`.
# Returns a named tuple `(; status, id)` where `id` is the capture sequence
# identifier while a capture is active, and `nothing` otherwise.
function capture_status(stream::HIPStream)
    st = Ref{hipStreamCaptureStatus}()
    seq_id = Ref{Culonglong}()
    hipStreamGetCaptureInfo(stream, st, seq_id)
    active = st[] == hipStreamCaptureStatusActive
    return (status=st[], id=active ? seq_id[] : nothing)
end

"""
is_capturing(stream::HIPStream = AMDGPU.stream())::Bool

For a given `stream` check if capturing for a graph is performed.
"""
function is_capturing(stream::HIPStream = AMDGPU.stream())::Bool
capture_status(stream).status == hipStreamCaptureStatusActive
end

"""
graph = AMDGPU.@captured begin
# code to capture in a graph.
end

Macro to capture a given expression in a graph & execute it.
Returns captured graph, that can be relaunched with [`launch`](@ref) or updated with [`update`](@ref).

If capture fails (e.g. due to JIT), attempts recovery, compilation and re-capture.
"""
macro captured(ex)
quote
executed = false
GC.enable(false)
graph = try
capture(; throw_error=false) do
$(esc(ex))
end
finally
GC.enable(true)
end

if graph === nothing
# If the capture failed, this may have been due to JIT compilation.
# execute the body out of capture, and try capturing again.
$(esc(ex))

# Don't tolerate capture failures now so that the user will be informed.
GC.enable(false)
graph = try
capture() do
$(esc(ex))
end
catch
rethrow()
finally
GC.enable(true)
end
executed = true
end

exec = instantiate(graph)
executed || launch(exec)
exec
end
end
5 changes: 4 additions & 1 deletion src/hip/module.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ mutable struct HIPModule
handle::hipModule_t

function HIPModule(data)
device_synchronize()
# During stream capture no GPU work is actually executing, so syncing
# would call hipStreamQuery on a capturing stream, which returns
# hipErrorStreamCaptureUnsupported and invalidates the capture.
is_capturing() || device_synchronize()

mod_ref = Ref{hipModule_t}()
hipModuleLoadData(mod_ref, data)
Expand Down
7 changes: 4 additions & 3 deletions src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -409,9 +409,10 @@ mutable struct Managed{M}
const mem::M
stream::HIPStream
dirty::Bool
captured::Bool

function Managed(mem; stream=AMDGPU.stream(), dirty=true)
new{typeof(mem)}(mem, stream, dirty)
function Managed(mem; stream=AMDGPU.stream(), dirty=true, captured=false)
new{typeof(mem)}(mem, stream, dirty, captured)
end
end

Expand Down Expand Up @@ -472,7 +473,7 @@ function pool_alloc(::Type{B}, bytesize) where B
maybe_collect()
time = Base.@elapsed begin
s = AMDGPU.stream()
managed = Managed(B(bytesize; stream=s); stream=s)
managed = Managed(B(bytesize; stream=s); stream=s, captured=AMDGPU.is_capturing())
end

Base.@atomic alloc_stats.alloc_count += 1
Expand Down
72 changes: 72 additions & 0 deletions test/core/graph_tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
using Test
using AMDGPU
using GPUArrays

@testset "HIP Graphs" begin
@testset "+1" begin
f!(o) = o .+= one(eltype(o))

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f!(z)
@test sum(z) == 16

AMDGPU.launch(graph)
@test sum(z) == 16 * 2
AMDGPU.launch(graph)
@test sum(z) == 16 * 3
end

@testset "malloc/free" begin
function f!(o)
x = AMDGPU.ones(eltype(o), size(o))
o .+= x .+ one(eltype(o))
AMDGPU.unsafe_free!(x)
end

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f!(z)
@test sum(z) == 32

AMDGPU.launch(graph)
@test sum(z) == 32 * 2
AMDGPU.launch(graph)
@test sum(z) == 32 * 3
end

@testset "only malloc + alloc cache" begin
function f!(o)
x = AMDGPU.ones(eltype(o), size(o))
y = AMDGPU.ones(eltype(o), size(o))
o .+= (x * y) .+ one(eltype(o))
end

z = AMDGPU.zeros(Int, 4, 4)
cache = GPUArrays.AllocCache()
# Pre-populate alloc cache, to avoid malloc calls during capture.
GPUArrays.@cached cache f!(z)
# Capture with alloc cache.
graph = GPUArrays.@cached cache AMDGPU.@captured f!(z)
@test sum(z) == length(z) * 5 * 2

AMDGPU.launch(graph)
@test sum(z) == length(z) * 5 * 3
AMDGPU.launch(graph)
@test sum(z) == length(z) * 5 * 4
end

@testset "Update graph" begin
f1!(o) = o .+= one(eltype(o))
f2!(o) = o .+= eltype(o)(2)

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f1!(z)
@test sum(z) == 16

g_new = AMDGPU.capture() do
f2!(z)
end
@test AMDGPU.update(graph, g_new)
AMDGPU.launch(graph)
@test sum(z) == 16 * 3
end
end
Loading