Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"

[compat]
Expand Down
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ function main()
"Devices" => "api/devices.md",
"Streams" => "api/streams.md",
"Kernel Programming" => "api/kernel_programming.md",
"Graphs" => "api/graphs.md",
"Exceptions" => "api/exceptions.md",
"Memory" => "api/memory.md",
"Host-Call" => "api/hostcall.md",
Expand Down
62 changes: 62 additions & 0 deletions docs/src/api/graphs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Graphs

[Graphs](https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_runtime_api/hipgraph.html)
allow capturing GPU kernels and executing them as one unit, reducing host overhead.

Simple operations can be captured as is:

```@example graph-1
using AMDGPU

f!(o) = o .+= one(eltype(o))

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f!(z)
@assert sum(z) == 16

AMDGPU.launch(graph)
@assert sum(z) == 16 * 2
```

However, if your code contains more complex flow, it requires more preparation:
- code **must not** result in a hostcall invocation.
- if code contains mallocs and their respective frees, it can be captured and relaunched as is.
- if code contains **only** allocations (without freeing them), allocations must be cached with `GPUArrays.@cached` beforehand (see example below).
- other unsupported operations (e.g. RNG initialization) must be performed beforehand as well.
- updating a graph does not update allocated pointers; only (re)instantiation is supported in such cases.

```@example graph-2
using AMDGPU, GPUArrays

function f(o)
x = AMDGPU.rand(Float32, size(o))
y = AMDGPU.rand(Float32, size(o))
o .+= sin.(x) * cos.(y) .+ 1f0
return
end

cache = GPUArrays.AllocCache()
z = AMDGPU.zeros(Float32, 256, 256)
N = 10

# Execute function normally and cache all allocations.
GPUArrays.@cached cache f(z)

# Capture graph using AllocCache to avoid capturing malloc/free calls.
graph = GPUArrays.@cached cache AMDGPU.@captured f(z)

# Allocations cache must be kept alive while executing graph.
for i in 1:N
AMDGPU.launch(graph)
end
AMDGPU.synchronize()
```

```@docs
AMDGPU.capture
AMDGPU.@captured
AMDGPU.instantiate
AMDGPU.update
AMDGPU.is_capturing
AMDGPU.launch
```
9 changes: 4 additions & 5 deletions docs/src/tutorials/profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

## rocprof

[rocprofv2](https://github.com/ROCm/rocprofiler?tab=readme-ov-file#rocprofiler-v2)
allows profiling both HSA & HIP API calls (rocprof being deprecated).
[rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html)
allows profiling both HSA & HIP API calls.

Let's profile simple copying kernel saved in `profile.jl` file:
```julia
Expand Down Expand Up @@ -39,11 +39,10 @@ main(2^24)
### Profiling problematic code

```bash
ENABLE_JITPROFILING=1 rocprofv2 --plugin perfetto --hip-trace --hsa-trace --kernel-trace -o prof julia ./profile.jl
ENABLE_JITPROFILING=1 rocprofv3 --output-directory ./profiling --output-format pftrace --hip-trace --hsa-trace --kernel-trace -- julia ./profile.jl
```

This will produce `prof_output.pftrace` file which can be visualized
using [Perfetto UI](https://ui.perfetto.dev/).
This will produce `.pftrace` file which can be visualized using [Perfetto UI](https://ui.perfetto.dev/).

![image](../assets/profile_1.png)

Expand Down
2 changes: 2 additions & 0 deletions src/hip/HIP.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module HIP
export HIPError, devices, device_synchronize, default_stream
export HIPGraph, HIPGraphExec, @captured, capture, instantiate, update, is_capturing, launch

using CEnum

Expand Down Expand Up @@ -90,6 +91,7 @@ include("stream.jl")
include("event.jl")
include("pool.jl")
include("module.jl")
include("graph.jl")

"""
Blocks until all kernels on all streams have completed.
Expand Down
171 changes: 171 additions & 0 deletions src/hip/graph.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
"""
instantiate(graph::HIPGraph)::HIPGraphExec

Instantiate captured graph making it executable with [`launch`](@ref).
"""
instantiate

"""
capture(f::Function; flags = hipStreamCaptureModeGlobal, throw_error::Bool = true)::Union{Nothing, HIPGraph}

Capture fiven function `f` to a graph.
If successful, returns a captured graph that needs to be [`instantiate`](@ref)'d to obtain executable graph.
"""
capture

# Unchecked variant of `hipStreamEndCapture`: returns the raw `hipError_t`
# status instead of throwing, so the caller (`capture`) can handle expected
# statuses such as `hipErrorStreamCaptureInvalidated` itself.
function unchecked_hipStreamEndCapture(stream, pGraph)
    AMDGPU.prepare_state()
    @gcsafe_ccall(libhip.hipStreamEndCapture(stream::hipStream_t, pGraph::Ptr{hipGraph_t})::hipError_t)
end

# A HIP graph: a recorded set of GPU operations that can be instantiated
# into a `HIPGraphExec` and launched as a single unit.
mutable struct HIPGraph
    handle::hipGraph_t

    # Create an empty graph.
    # NOTE(review): `flags` defaults to `hipStreamCaptureModeGlobal` (a stream
    # capture mode enum) but is forwarded to `hipGraphCreate`, whose flags
    # parameter is a graph-creation flag — this only works because the enum
    # value is 0; confirm the default is intended.
    function HIPGraph(flags = hipStreamCaptureModeGlobal)
        handle_ref = Ref{hipGraph_t}()
        hipGraphCreate(handle_ref, flags)

        obj = new(handle_ref[])
        # Release the underlying HIP graph once the wrapper is GC'd.
        finalizer(obj) do obj
            hipGraphDestroy(obj)
        end
        return obj
    end

    # Defined inside the struct body so it can call `new`; `global` exposes it
    # as `HIP.capture` instead of adding a constructor method.
    global function capture(f::Function; flags = hipStreamCaptureModeGlobal, throw_error::Bool = true)::Union{Nothing, HIPGraph}
        # Disable GC while capturing (a collection could enqueue frees on the
        # capturing stream and invalidate the capture); previous GC state is
        # restored right after capture ends.
        gc_state = GC.enable(false)
        stream = AMDGPU.stream()
        try
            hipStreamBeginCapture(stream, flags)
            f()
        finally
            # End capture via the unchecked wrapper so an expected
            # "capture invalidated" status can be handled here rather than
            # thrown by the checked ccall.
            handle_ref = Ref{hipGraph_t}()
            st = unchecked_hipStreamEndCapture(stream, handle_ref)
            GC.enable(gc_state)

            if st == hipErrorStreamCaptureInvalidated && !throw_error
                return nothing
            elseif st != hipSuccess
                throw(HIPError(st))
            end

            # NOTE: `return` inside `finally` deliberately overrides (swallows)
            # any exception thrown by `f()` once end-capture succeeded.
            obj = new(handle_ref[])
            finalizer(hipGraphDestroy, obj)
            return obj
        end
        return nothing # unreachable: the `finally` block always returns
    end
end

Base.unsafe_convert(::Type{hipGraph_t}, graph::HIPGraph) = graph.handle

# An executable (instantiated) HIP graph, ready to be passed to `launch`.
mutable struct HIPGraphExec
    handle::hipGraphExec_t

    # Defined inside the struct body so it can call `new`; `global` exposes it
    # as `HIP.instantiate` instead of adding a constructor method.
    global function instantiate(graph::HIPGraph)
        handle_ref = Ref{hipGraphExec_t}()
        # Instantiate with flags = 0 (no special instantiation behavior).
        hipGraphInstantiateWithFlags(handle_ref, graph, 0)
        obj = new(handle_ref[])

        # Release the underlying executable graph once the wrapper is GC'd.
        finalizer(obj) do obj
            hipGraphExecDestroy(obj)
        end
        return obj
    end
end

Base.unsafe_convert(::Type{hipGraphExec_t}, exec::HIPGraphExec) = exec.handle

"""
launch(exec::HIPGraphExec, stream::HIPStream = AMDGPU.stream())

Launch executable graph on a given stream.
"""
function launch(exec::HIPGraphExec, stream::HIPStream = AMDGPU.stream())
hipGraphLaunch(exec, stream)
end

"""
update(exec::HIPGraphExec, graph::HIPGraph; throw_error::Bool = true)::Bool

Given executable graph, perform update with graph.
Return `true` if successful, `false` otherwise.

If `throw_error=false` allows avoiding throwing an exception if update was not successful.
"""
function update(exec::HIPGraphExec, graph::HIPGraph; throw_error::Bool = true)::Bool
error_node = Ref{hipGraphNode_t}()
update_res_ref = Ref{hipGraphExecUpdateResult}()
hipGraphExecUpdate(exec, graph, error_node, update_res_ref)

update_res = update_res_ref[]
if update_res != hipGraphExecUpdateSuccess
throw_error && error("Failed to update HIPGraphExec: `$(update_res)`.")
return false
end
return true
end

# Query the capture state of `stream`.
# Returns a named tuple `(; status, id)` where `id` is the capture sequence
# identifier while a capture is active, and `nothing` otherwise.
function capture_status(stream::HIPStream)
    st = Ref{hipStreamCaptureStatus}()
    seq_id = Ref{Culonglong}()
    hipStreamGetCaptureInfo(stream, st, seq_id)
    active = st[] == hipStreamCaptureStatusActive
    return (status=st[], id=active ? seq_id[] : nothing)
end

"""
is_capturing(stream::HIPStream = AMDGPU.stream())::Bool

For a given `stream` check if capturing for a graph is performed.
"""
function is_capturing(stream::HIPStream = AMDGPU.stream())::Bool
capture_status(stream).status == hipStreamCaptureStatusActive
end

"""
graph = AMDGPU.@captured begin
# code to capture in a graph.
end

Macro to capture a given expression in a graph & execute it.
Returns captured graph, that can be relaunched with [`launch`](@ref) or updated with [`update`](@ref).

If capture fails (e.g. due to JIT), attempts recovery, compilation and re-capture.
"""
macro captured(ex)
quote
executed = false
GC.enable(false)
graph = try
capture(; throw_error=false) do
$(esc(ex))
end
finally
GC.enable(true)
end

if graph === nothing
# If the capture failed, this may have been due to JIT compilation.
# execute the body out of capture, and try capturing again.
$(esc(ex))

# Don't tolerate capture failures now so that the user will be informed.
GC.enable(false)
graph = try
capture() do
$(esc(ex))
end
catch
rethrow()
finally
GC.enable(true)
end
executed = true
end

exec = instantiate(graph)
executed || launch(exec)
exec
end
end
5 changes: 4 additions & 1 deletion src/hip/module.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ mutable struct HIPModule
handle::hipModule_t

function HIPModule(data)
device_synchronize()
# During stream capture no GPU work is actually executing, so syncing
# would call hipStreamQuery on a capturing stream, which returns
# hipErrorStreamCaptureUnsupported and invalidates the capture.
is_capturing() || device_synchronize()

mod_ref = Ref{hipModule_t}()
hipModuleLoadData(mod_ref, data)
Expand Down
7 changes: 4 additions & 3 deletions src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -409,9 +409,10 @@ mutable struct Managed{M}
const mem::M
stream::HIPStream
dirty::Bool
captured::Bool

function Managed(mem; stream=AMDGPU.stream(), dirty=true)
new{typeof(mem)}(mem, stream, dirty)
function Managed(mem; stream=AMDGPU.stream(), dirty=true, captured=false)
new{typeof(mem)}(mem, stream, dirty, captured)
end
end

Expand Down Expand Up @@ -472,7 +473,7 @@ function pool_alloc(::Type{B}, bytesize) where B
maybe_collect()
time = Base.@elapsed begin
s = AMDGPU.stream()
managed = Managed(B(bytesize; stream=s); stream=s)
managed = Managed(B(bytesize; stream=s); stream=s, captured=AMDGPU.is_capturing())
end

Base.@atomic alloc_stats.alloc_count += 1
Expand Down
72 changes: 72 additions & 0 deletions test/core/graph_tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
using Test
using AMDGPU
using GPUArrays

@testset "HIP Graphs" begin
@testset "+1" begin
f!(o) = o .+= one(eltype(o))

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f!(z)
@test sum(z) == 16

AMDGPU.launch(graph)
@test sum(z) == 16 * 2
AMDGPU.launch(graph)
@test sum(z) == 16 * 3
end

@testset "malloc/free" begin
function f!(o)
x = AMDGPU.ones(eltype(o), size(o))
o .+= x .+ one(eltype(o))
AMDGPU.unsafe_free!(x)
end

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f!(z)
@test sum(z) == 32

AMDGPU.launch(graph)
@test sum(z) == 32 * 2
AMDGPU.launch(graph)
@test sum(z) == 32 * 3
end

@testset "only malloc + alloc cache" begin
function f!(o)
x = AMDGPU.ones(eltype(o), size(o))
y = AMDGPU.ones(eltype(o), size(o))
o .+= (x * y) .+ one(eltype(o))
end

z = AMDGPU.zeros(Int, 4, 4)
cache = GPUArrays.AllocCache()
# Pre-populate alloc cache, to avoid malloc calls during capture.
GPUArrays.@cached cache f!(z)
# Capture with alloc cache.
graph = GPUArrays.@cached cache AMDGPU.@captured f!(z)
@test sum(z) == length(z) * 5 * 2

AMDGPU.launch(graph)
@test sum(z) == length(z) * 5 * 3
AMDGPU.launch(graph)
@test sum(z) == length(z) * 5 * 4
end

@testset "Update graph" begin
f1!(o) = o .+= one(eltype(o))
f2!(o) = o .+= eltype(o)(2)

z = AMDGPU.zeros(Int, 4, 4)
graph = AMDGPU.@captured f1!(z)
@test sum(z) == 16

g_new = AMDGPU.capture() do
f2!(z)
end
@test AMDGPU.update(graph, g_new)
AMDGPU.launch(graph)
@test sum(z) == 16 * 3
end
end
Loading