Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace blocking cl.launch by asynchronous cl.call. #220

Merged
merged 1 commit into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Breaking changes:
- The `cl.info` method, and the `getindex` overloading to access properties of OpenCL
objects, have been replaced by `getproperty` overloading on the objects themselves
(e.g., `cl.info(dev, :name)` and `dev[:name]` are now simply `dev.name`).
- The blocking `cl.launch` has been replaced by a nonblocking `cl.call`, while also removing
the `getindex`-overloading shorthand.


New features:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ c_buff = cl.Buffer(Float32, length(a), :w)
p = cl.Program(source=sum_kernel) |> cl.build!
k = cl.Kernel(p, "sum")

cl.launch(k, size(a), nothing, a_buff, b_buff, c_buff)
cl.call(k, a_buff, b_buff, c_buff; global_size=size(a))

r = cl.read(c_buff)

Expand Down
19 changes: 7 additions & 12 deletions examples/hands_on_opencl/ex04/vadd_chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,20 +75,15 @@ d_f = cl.Buffer(Float32, LENGTH, :w)
vadd = cl.Kernel(program, "vadd")

# execute the kernel over the entire range of 1d input
# calling `queue` is blocking, it accepts the kernel, global / local work sizes,
# calling `cl.call` is asynchronous, it accepts the kernel, global / local work sizes,
# and the kernel's arguments.

# here we call the kernel with work size set to the number of elements and a local
# work size of nothing. This enables the opencl runtime to optimize the local size
# for simple kernels
cl.launch(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH))

# an alternative syntax is to create an partial function to call
# by julia's getindex syntax for Kernel types.
# here the queue, global_size, and (optional) local_size are passed in which
# returns a partial function with these parameters set.
vadd[size(h_e)](d_e, d_c, d_d, UInt32(LENGTH))
vadd[size(h_g)](d_g, d_d, d_f, UInt32(LENGTH))
# here we call the kernel with work size set to the number of elements and no local
# work size. This enables the opencl runtime to optimize the local size for simple
# kernels
cl.call(vadd, d_a, d_b, d_c, UInt32(LENGTH); global_size=size(h_a))
cl.call(vadd, d_e, d_c, d_d, UInt32(LENGTH); global_size=size(h_e))
cl.call(vadd, d_g, d_d, d_f, UInt32(LENGTH); global_size=size(h_g))

# copy back the results from the compute device
# copy!(queue, dst, src) follows same interface as julia's built in copy!
Expand Down
10 changes: 4 additions & 6 deletions examples/hands_on_opencl/ex07/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,8 @@ mmul = cl.Kernel(prg, "mmul")
for i in 1:COUNT
fill!(h_C, 0.0)
cl.queue!(:profile) do
evt = cl.launch(mmul, (Ndim, Mdim), nothing,
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c; global_size=(Ndim, Mdim))
# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
cl.copy!(h_C, d_c)
Expand Down Expand Up @@ -152,9 +151,8 @@ else

for i in 1:COUNT
fill!(h_C, 0.0)
evt = cl.launch(mmul, (Ndim,), (ORDER,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c; global_size=Ndim, local_size=ORDER)
# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
cl.copy!(h_C, d_c)
Expand Down
16 changes: 7 additions & 9 deletions examples/hands_on_opencl/ex08/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,8 @@ mmul = cl.Kernel(prg, "mmul")
for i in 1:COUNT
fill!(h_C, 0.0)
cl.queue!(:profile) do
evt = cl.launch(mmul, (Ndim, Mdim), nothing,
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c; global_size=(Ndim, Mdim))
# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
cl.copy!(h_C, d_c)
Expand All @@ -126,9 +125,8 @@ mmul = cl.Kernel(prg, "mmul")
for i in 1:COUNT
fill!(h_C, 0.0)
cl.queue!(:profile) do
evt = cl.launch(mmul, (Ndim,), (ORDER ÷ 16,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
evt = cl.call(mmul,Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c; global_size=Ndim, local_size=(ORDER ÷ 16))
# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
cl.copy!(h_C, d_c)
Expand Down Expand Up @@ -186,9 +184,9 @@ for i in 1:COUNT
localmem1 = cl.LocalMem(Float32, blocksize^2)
localmem2 = cl.LocalMem(Float32, blocksize^2)
cl.queue!(:profile) do
evt = cl.launch(mmul, (Ndim,), (ORDER ÷ 16,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c, localmem1, localmem2)
evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c, localmem1, localmem2;
global_size=Ndim, local_size=(ORDER ÷ 16))
# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
cl.copy!(h_C, d_c)
Expand Down
4 changes: 2 additions & 2 deletions examples/hands_on_opencl/ex09/pi_ocl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ global_size = (nwork_groups * work_group_size,)
local_size = (work_group_size,)
localmem = cl.LocalMem(Float32, work_group_size)

cl.launch(pi_kernel, global_size, local_size,
Int32(niters), Float32(step_size), localmem, d_partial_sums)
cl.call(pi_kernel, Int32(niters), Float32(step_size), localmem,
d_partial_sums; global_size, local_size)

cl.copy!(h_psum, d_partial_sums)

Expand Down
4 changes: 2 additions & 2 deletions examples/hands_on_opencl/exA/pi_vocl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ global_size = (nwork_groups * work_group_size,)
local_size = (work_group_size,)
localmem = cl.LocalMem(Float32, work_group_size)

cl.launch(pi_kernel, global_size, local_size,
Int32(niters), Float32(step_size), localmem, d_partial_sums)
cl.call(pi_kernel, Int32(niters), Float32(step_size), localmem, d_partial_sums;
global_size, local_size)

cl.copy!(h_psum, d_partial_sums)

Expand Down
2 changes: 1 addition & 1 deletion examples/notebooks/julia_set_fractal.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@
" prg = cl.Program(source=julia_source) |> cl.build!\n",
" k = cl.Kernel(prg, \"julia\")\n",
"\n",
" cl.launch(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n",
" cl.call(k, q_buff, o_buff, UInt16(maxiter); global_size=length(q))\n",
" cl.copy!(out, o_buff)\n",
"\n",
" return out\n",
Expand Down
2 changes: 1 addition & 1 deletion examples/notebooks/mandelbrot_fractal.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
" prg = cl.Program(source=mandel_source) |> cl.build!\n",
"\n",
" k = cl.Kernel(prg, \"mandelbrot\")\n",
" cl.launch(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n",
" cl.call(k, q_buff, o_buff, UInt16(maxiter); global_size=length(q))\n",
"\n",
" cl.copy!(out, o_buff)\n",
"\n",
Expand Down
55 changes: 13 additions & 42 deletions lib/kernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -219,47 +219,8 @@ function set_args!(k::Kernel, args...)
end
end

# produce a cl.call thunk with kernel queue, global/local sizes
Base.getindex(k::Kernel, args...) = begin
if length(args) < 1 || length(args) > 2
throw(ArgumentError("kernel must be called with a global size as arguments"))
end
if !(isa(args[1], Dims)) || length(args[1]) > 3
throw(ArgumentError("kernel global size must be of Dims type (dim <= 3)"))
end
if length(args) == 2 && (!(isa(args[2], Dims)) || length(args[2]) > 3)
throw(ArgumentError("kernel local size must be of Dims type (dim <= 3)"))
end
global_size = args[1]
local_size = length(args) == 2 ? args[2] : nothing
# TODO: we cannot pass keywords in anon functions yet, return kernel call thunk
return (args...) -> launch(k, global_size, local_size, args...)
end

# blocking kernel call that finishes queue
# XXX: shouldn't be blocking!
function launch(k::Kernel, global_work_size, local_work_size,
args...; global_work_offset=nothing,
wait_on::Union{Nothing,Vector{Event}}=nothing)
set_args!(k, args...)
evt = enqueue_kernel(k,
global_work_size,
local_work_size,
global_work_offset=global_work_offset,
wait_on=wait_on)
finish(queue())
return evt
end

function enqueue_kernel(k::Kernel, global_work_size)
enqueue_kernel(k, global_work_size, nothing)
end

function enqueue_kernel(k::Kernel,
global_work_size,
local_work_size;
global_work_offset=nothing,
wait_on::Union{Nothing,Vector{Event}}=nothing)
function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing;
global_work_offset=nothing, wait_on::Vector{Event}=Event[])
max_work_dim = device().max_work_item_dims
work_dim = length(global_work_size)
if work_dim > max_work_dim
Expand All @@ -282,6 +243,8 @@ function enqueue_kernel(k::Kernel,
for (i, o) in enumerate(global_work_offset)
goffset[i] = o
end
else
# null global offset means (0, 0, 0)
end

lsize = C_NULL
Expand All @@ -296,9 +259,11 @@ function enqueue_kernel(k::Kernel,
for (i, s) in enumerate(local_work_size)
lsize[i] = s
end
else
# null local size means OpenCL decides
end

if wait_on !== nothing
if !isempty(wait_on)
n_events = cl_uint(length(wait_on))
wait_event_ids = [evt.id for evt in wait_on]
else
Expand All @@ -312,6 +277,12 @@ function enqueue_kernel(k::Kernel,
return Event(ret_event[], retain=false)
end

function call(k::Kernel, args...; global_size=(1,), local_size=nothing,
global_work_offset=nothing, wait_on::Vector{Event}=Event[])
set_args!(k, args...)
enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on)
end

function enqueue_task(k::Kernel; wait_for=nothing)
n_evts = 0
evt_ids = C_NULL
Expand Down
6 changes: 3 additions & 3 deletions test/behaviour.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
prg = cl.Program(source=hello_world_kernel) |> cl.build!
kern = cl.Kernel(prg, "hello")

cl.launch(kern, str_len, nothing, out_buf)
cl.call(kern, out_buf; global_size=str_len)
h = cl.read(out_buf)

@test hello_world_str == GC.@preserve h unsafe_string(pointer(h))
Expand Down Expand Up @@ -213,7 +213,7 @@ end
R_buf = cl.Buffer(Float32, length(X), :w)

global_size = size(X)
cl.launch(part3, global_size, nothing, X_buf, Y_buf, R_buf, P_buf)
cl.call(part3, X_buf, Y_buf, R_buf, P_buf; global_size, local_size=nothing)

r = cl.read(R_buf)
@test all(x -> x == 13.5, r)
Expand Down Expand Up @@ -251,7 +251,7 @@ end

P = MutableParams(0.5, 10.0)
P_buf = cl.Buffer(Float32, 2, :w)
cl.launch(part3, 1, nothing, P_buf, P)
cl.call(part3, P_buf, P)

r = cl.read(P_buf)

Expand Down
2 changes: 1 addition & 1 deletion test/context.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
# try
# p = cl.Program(source = empty_kernel) |> cl.build!
# k = cl.Kernel(p, "test")
# cl.launch(k, 1, 10000000)
# cl.call(k; global_size=1, local_size=10000000)
# catch
# end
# end
Expand Down
24 changes: 7 additions & 17 deletions test/kernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,33 +105,23 @@
k = cl.Kernel(p, "test")

# dimensions must be the same size
@test_throws ArgumentError cl.launch(k, (1,), (1,1), d_buff)
@test_throws ArgumentError cl.launch(k, (1,1), (1,), d_buff)
@test_throws ArgumentError cl.call(k, d_buff; global_size=(1,), local_size=(1,1))
@test_throws ArgumentError cl.call(k, d_buff; global_size=(1,1), local_size=(1,))

# dimensions are bounded
max_work_dim = cl.device().max_work_item_dims
bad = tuple([1 for _ in 1:(max_work_dim + 1)])
@test_throws MethodError cl.launch(k, bad, d_buff)

# devices have finite work sizes
@test_throws MethodError cl.launch(k, (typemax(Int),), d_buff)

# blocking call to kernel finishes cmd queue
cl.launch(k, 1, 1, d_buff)

# calls are asynchronous, but cl.read blocks
cl.call(k, d_buff)
r = cl.read(d_buff)
@test r[1] == 2

# alternative kernel call syntax
k[(1,), (1,)](d_buff)
r = cl.read(d_buff)
@test r[1] == 3

# enqueue task is an alias for calling
# a kernel with a global/local size of 1
evt = cl.enqueue_task(k)
r = cl.read(d_buff)
@test r[1] == 4
@test r[1] == 3
end

@testset "packed structures" begin
Expand All @@ -150,7 +140,7 @@
structkernel = cl.Kernel(prg, "structest")
out = cl.Buffer(Float32, 2, :w)
bstruct = (1, Int32(4))
structkernel[(1,)](out, bstruct)
cl.call(structkernel, out, bstruct)
r = cl.read(out)
@test r == [1f0, 4f0]
end
Expand Down Expand Up @@ -185,7 +175,7 @@
structkernel = cl.Kernel(prg, "structest")
out = cl.Buffer(Float32, 4, :w)
astruct = CLTestStruct((1f0, 2f0, 3f0), nothing, 22f0)
structkernel[(1,)](out, astruct)
cl.call(structkernel, out, astruct)
r = cl.read(out)
@test r == [1f0, 2f0, 3f0, 22f0]
end
Expand Down