From 7b22dfbb7718f9d205d47402c6850a13d137e072 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Sep 2024 10:54:58 +0200 Subject: [PATCH] Replace blocking cl.launch by asynchronous cl.call. --- NEWS.md | 2 + README.md | 2 +- examples/hands_on_opencl/ex04/vadd_chain.jl | 19 +++---- examples/hands_on_opencl/ex07/matmul.jl | 10 ++-- examples/hands_on_opencl/ex08/matmul.jl | 16 +++--- examples/hands_on_opencl/ex09/pi_ocl.jl | 4 +- examples/hands_on_opencl/exA/pi_vocl.jl | 4 +- examples/notebooks/julia_set_fractal.ipynb | 2 +- examples/notebooks/mandelbrot_fractal.ipynb | 2 +- lib/kernel.jl | 55 +++++---------------- test/behaviour.jl | 6 +-- test/context.jl | 2 +- test/kernel.jl | 24 +++------ 13 files changed, 51 insertions(+), 97 deletions(-) diff --git a/NEWS.md b/NEWS.md index 82cb1dc..c433c33 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ Breaking changes: - The `cl.info` method, and the `getindex` overloading to access properties of OpenCL objects, have been replaced by `getproperty` overloading on the objects themselves (e.g., `cl.info(dev, :name)` and `dev[:name]` are now simply `dev.name`). +- The blocking `cl.launch` has been replaced by a nonblocking `cl.call`, while also removing + the `getindex`-overloading shorthand. New features: diff --git a/README.md b/README.md index 1c81d77..8b906ac 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ c_buff = cl.Buffer(Float32, length(a), :w) p = cl.Program(source=sum_kernel) |> cl.build! 
k = cl.Kernel(p, "sum") -cl.launch(k, size(a), nothing, a_buff, b_buff, c_buff) +cl.call(k, a_buff, b_buff, c_buff; global_size=size(a)) r = cl.read(c_buff) diff --git a/examples/hands_on_opencl/ex04/vadd_chain.jl b/examples/hands_on_opencl/ex04/vadd_chain.jl index cc3c94e..68e2fe6 100644 --- a/examples/hands_on_opencl/ex04/vadd_chain.jl +++ b/examples/hands_on_opencl/ex04/vadd_chain.jl @@ -75,20 +75,15 @@ d_f = cl.Buffer(Float32, LENGTH, :w) vadd = cl.Kernel(program, "vadd") # execute the kernel over the entire range of 1d input -# calling `queue` is blocking, it accepts the kernel, global / local work sizes, +# calling `cl.call` is asynchronous, it accepts the kernel, global / local work sizes, # the the kernel's arguments. -# here we call the kernel with work size set to the number of elements and a local -# work size of nothing. This enables the opencl runtime to optimize the local size -# for simple kernels -cl.launch(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH)) - -# an alternative syntax is to create an partial function to call -# by julia's getindex syntax for Kernel types. -# here the queue, global_size, and (optional) local_size are passed in which -# returns a partial function with these parameters set. -vadd[size(h_e)](d_e, d_c, d_d, UInt32(LENGTH)) -vadd[size(h_g)](d_g, d_d, d_f, UInt32(LENGTH)) +# here we call the kernel with work size set to the number of elements and no local +# work size. This enables the opencl runtime to optimize the local size for simple +# kernels +cl.call(vadd, d_a, d_b, d_c, UInt32(LENGTH); global_size=size(h_a)) +cl.call(vadd, d_e, d_c, d_d, UInt32(LENGTH); global_size=size(h_e)) +cl.call(vadd, d_g, d_d, d_f, UInt32(LENGTH); global_size=size(h_g)) # copy back the results from the compute device # copy!(queue, dst, src) follows same interface as julia's built in copy!
diff --git a/examples/hands_on_opencl/ex07/matmul.jl b/examples/hands_on_opencl/ex07/matmul.jl index e2ecb7e..0e421f0 100644 --- a/examples/hands_on_opencl/ex07/matmul.jl +++ b/examples/hands_on_opencl/ex07/matmul.jl @@ -103,9 +103,8 @@ mmul = cl.Kernel(prg, "mmul") for i in 1:COUNT fill!(h_C, 0.0) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim, Mdim), nothing, - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=(Ndim, Mdim)) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) @@ -152,9 +151,8 @@ else for i in 1:COUNT fill!(h_C, 0.0) - evt = cl.launch(mmul, (Ndim,), (ORDER,), - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=Ndim, local_size=ORDER) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) diff --git a/examples/hands_on_opencl/ex08/matmul.jl b/examples/hands_on_opencl/ex08/matmul.jl index e5c9564..44b83aa 100644 --- a/examples/hands_on_opencl/ex08/matmul.jl +++ b/examples/hands_on_opencl/ex08/matmul.jl @@ -103,9 +103,8 @@ mmul = cl.Kernel(prg, "mmul") for i in 1:COUNT fill!(h_C, 0.0) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim, Mdim), nothing, - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=(Ndim, Mdim)) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) @@ -126,9 +125,8 @@ mmul = cl.Kernel(prg, "mmul") for i in 1:COUNT fill!(h_C, 0.0) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim,), (ORDER ÷ 16,), - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=Ndim, local_size=(ORDER ÷ 16)) # profiling events are measured in ns
run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) @@ -186,9 +184,9 @@ for i in 1:COUNT localmem1 = cl.LocalMem(Float32, blocksize^2) localmem2 = cl.LocalMem(Float32, blocksize^2) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim,), (ORDER ÷ 16,), - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c, localmem1, localmem2) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c, localmem1, localmem2; + global_size=Ndim, local_size=(ORDER ÷ 16)) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) diff --git a/examples/hands_on_opencl/ex09/pi_ocl.jl b/examples/hands_on_opencl/ex09/pi_ocl.jl index eee2fc0..341e02a 100644 --- a/examples/hands_on_opencl/ex09/pi_ocl.jl +++ b/examples/hands_on_opencl/ex09/pi_ocl.jl @@ -68,8 +68,8 @@ global_size = (nwork_groups * work_group_size,) local_size = (work_group_size,) localmem = cl.LocalMem(Float32, work_group_size) -cl.launch(pi_kernel, global_size, local_size, - Int32(niters), Float32(step_size), localmem, d_partial_sums) +cl.call(pi_kernel, Int32(niters), Float32(step_size), localmem, + d_partial_sums; global_size, local_size) cl.copy!(h_psum, d_partial_sums) diff --git a/examples/hands_on_opencl/exA/pi_vocl.jl b/examples/hands_on_opencl/exA/pi_vocl.jl index 779d2ca..7b640c2 100644 --- a/examples/hands_on_opencl/exA/pi_vocl.jl +++ b/examples/hands_on_opencl/exA/pi_vocl.jl @@ -107,8 +107,8 @@ global_size = (nwork_groups * work_group_size,) local_size = (work_group_size,) localmem = cl.LocalMem(Float32, work_group_size) -cl.launch(pi_kernel, global_size, local_size, - Int32(niters), Float32(step_size), localmem, d_partial_sums) +cl.call(pi_kernel, Int32(niters), Float32(step_size), localmem, d_partial_sums; + global_size, local_size) cl.copy!(h_psum, d_partial_sums) diff --git a/examples/notebooks/julia_set_fractal.ipynb b/examples/notebooks/julia_set_fractal.ipynb index a019e85..f4861f3 100644 --- a/examples/notebooks/julia_set_fractal.ipynb +++ 
b/examples/notebooks/julia_set_fractal.ipynb @@ -306,7 +306,7 @@ " prg = cl.Program(source=julia_source) |> cl.build!\n", " k = cl.Kernel(prg, \"julia\")\n", "\n", - " cl.launch(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n", + " cl.call(k, q_buff, o_buff, UInt16(maxiter); global_size=length(q))\n", " cl.copy!(out, o_buff)\n", "\n", " return out\n", diff --git a/examples/notebooks/mandelbrot_fractal.ipynb b/examples/notebooks/mandelbrot_fractal.ipynb index b92e5b5..599680c 100644 --- a/examples/notebooks/mandelbrot_fractal.ipynb +++ b/examples/notebooks/mandelbrot_fractal.ipynb @@ -76,7 +76,7 @@ " prg = cl.Program(source=mandel_source) |> cl.build!\n", "\n", " k = cl.Kernel(prg, \"mandelbrot\")\n", - " cl.launch(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n", + " cl.call(k, q_buff, o_buff, UInt16(maxiter); global_size=length(q))\n", "\n", " cl.copy!(out, o_buff)\n", "\n", diff --git a/lib/kernel.jl b/lib/kernel.jl index 29656fe..48f54b8 100644 --- a/lib/kernel.jl +++ b/lib/kernel.jl @@ -219,47 +219,8 @@ function set_args!(k::Kernel, args...) end end -# produce a cl.call thunk with kernel queue, global/local sizes -Base.getindex(k::Kernel, args...) = begin - if length(args) < 1 || length(args) > 2 - throw(ArgumentError("kernel must be called with a global size as arguments")) - end - if !(isa(args[1], Dims)) || length(args[1]) > 3 - throw(ArgumentError("kernel global size must be of Dims type (dim <= 3)")) - end - if length(args) == 2 && (!(isa(args[2], Dims)) || length(args[2]) > 3) - throw(ArgumentError("kernel local size must be of Dims type (dim <= 3)")) - end - global_size = args[1] - local_size = length(args) == 2 ? args[2] : nothing - # TODO: we cannot pass keywords in anon functions yet, return kernel call thunk - return (args...) -> launch(k, global_size, local_size, args...) -end - -# blocking kernel call that finishes queue -# XXX: shouldn't be blocking! 
-function launch(k::Kernel, global_work_size, local_work_size, - args...; global_work_offset=nothing, - wait_on::Union{Nothing,Vector{Event}}=nothing) - set_args!(k, args...) - evt = enqueue_kernel(k, - global_work_size, - local_work_size, - global_work_offset=global_work_offset, - wait_on=wait_on) - finish(queue()) - return evt -end - -function enqueue_kernel(k::Kernel, global_work_size) - enqueue_kernel(k, global_work_size, nothing) -end - -function enqueue_kernel(k::Kernel, - global_work_size, - local_work_size; - global_work_offset=nothing, - wait_on::Union{Nothing,Vector{Event}}=nothing) +function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; + global_work_offset=nothing, wait_on::Vector{Event}=Event[]) max_work_dim = device().max_work_item_dims work_dim = length(global_work_size) if work_dim > max_work_dim @@ -282,6 +243,8 @@ function enqueue_kernel(k::Kernel, for (i, o) in enumerate(global_work_offset) goffset[i] = o end + else + # null global offset means (0, 0, 0) end lsize = C_NULL @@ -296,9 +259,11 @@ function enqueue_kernel(k::Kernel, for (i, s) in enumerate(local_work_size) lsize[i] = s end + else + # null local size means OpenCL decides end - if wait_on !== nothing + if !isempty(wait_on) n_events = cl_uint(length(wait_on)) wait_event_ids = [evt.id for evt in wait_on] else @@ -312,6 +277,12 @@ function enqueue_kernel(k::Kernel, return Event(ret_event[], retain=false) end +function call(k::Kernel, args...; global_size=(1,), local_size=nothing, + global_work_offset=nothing, wait_on::Vector{Event}=Event[]) + set_args!(k, args...) + enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on) +end + function enqueue_task(k::Kernel; wait_for=nothing) n_evts = 0 evt_ids = C_NULL diff --git a/test/behaviour.jl b/test/behaviour.jl index a76ae63..cbd0874 100644 --- a/test/behaviour.jl +++ b/test/behaviour.jl @@ -18,7 +18,7 @@ prg = cl.Program(source=hello_world_kernel) |> cl.build! 
kern = cl.Kernel(prg, "hello") - cl.launch(kern, str_len, nothing, out_buf) + cl.call(kern, out_buf; global_size=str_len) h = cl.read(out_buf) @test hello_world_str == GC.@preserve h unsafe_string(pointer(h)) @@ -213,7 +213,7 @@ end R_buf = cl.Buffer(Float32, length(X), :w) global_size = size(X) - cl.launch(part3, global_size, nothing, X_buf, Y_buf, R_buf, P_buf) + cl.call(part3, X_buf, Y_buf, R_buf, P_buf; global_size, local_size=nothing) r = cl.read(R_buf) @test all(x -> x == 13.5, r) @@ -251,7 +251,7 @@ end P = MutableParams(0.5, 10.0) P_buf = cl.Buffer(Float32, 2, :w) - cl.launch(part3, 1, nothing, P_buf, P) + cl.call(part3, P_buf, P) r = cl.read(P_buf) diff --git a/test/context.jl b/test/context.jl index 49be7f8..5f682cb 100644 --- a/test/context.jl +++ b/test/context.jl @@ -36,7 +36,7 @@ # try # p = cl.Program(source = empty_kernel) |> cl.build! # k = cl.Kernel(p, "test") - # cl.launch(k, 1, 10000000) + # cl.call(k; global_size=1, local_size=10000000) # catch # end # end diff --git a/test/kernel.jl b/test/kernel.jl index edfc521..b4a6128 100644 --- a/test/kernel.jl +++ b/test/kernel.jl @@ -105,33 +105,23 @@ k = cl.Kernel(p, "test") # dimensions must be the same size - @test_throws ArgumentError cl.launch(k, (1,), (1,1), d_buff) - @test_throws ArgumentError cl.launch(k, (1,1), (1,), d_buff) + @test_throws ArgumentError cl.call(k, d_buff; global_size=(1,), local_size=(1,1)) + @test_throws ArgumentError cl.call(k, d_buff; global_size=(1,1), local_size=(1,)) # dimensions are bounded max_work_dim = cl.device().max_work_item_dims bad = tuple([1 for _ in 1:(max_work_dim + 1)]) - @test_throws MethodError cl.launch(k, bad, d_buff) - - # devices have finite work sizes - @test_throws MethodError cl.launch(k, (typemax(Int),), d_buff) - - # blocking call to kernel finishes cmd queue - cl.launch(k, 1, 1, d_buff) + # calls are asynchronous, but cl.read blocks + cl.call(k, d_buff) r = cl.read(d_buff) @test r[1] == 2 - # alternative kernel call syntax - k[(1,), (1,)](d_buff) 
- r = cl.read(d_buff) - @test r[1] == 3 - # enqueue task is an alias for calling # a kernel with a global/local size of 1 evt = cl.enqueue_task(k) r = cl.read(d_buff) - @test r[1] == 4 + @test r[1] == 3 end @testset "packed structures" begin @@ -150,7 +140,7 @@ structkernel = cl.Kernel(prg, "structest") out = cl.Buffer(Float32, 2, :w) bstruct = (1, Int32(4)) - structkernel[(1,)](out, bstruct) + cl.call(structkernel, out, bstruct) r = cl.read(out) @test r == [1f0, 4f0] end @@ -185,7 +175,7 @@ structkernel = cl.Kernel(prg, "structest") out = cl.Buffer(Float32, 4, :w) astruct = CLTestStruct((1f0, 2f0, 3f0), nothing, 22f0) - structkernel[(1,)](out, astruct) + cl.call(structkernel, out, astruct) r = cl.read(out) @test r == [1f0, 2f0, 3f0, 22f0] end