From 7b22dfbb7718f9d205d47402c6850a13d137e072 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Sep 2024 10:54:58 +0200 Subject: [PATCH] Replace blocking cl.launch by asynchronous cl.call. --- NEWS.md | 2 + README.md | 2 +- examples/hands_on_opencl/ex04/vadd_chain.jl | 19 +++---- examples/hands_on_opencl/ex07/matmul.jl | 10 ++-- examples/hands_on_opencl/ex08/matmul.jl | 16 +++--- examples/hands_on_opencl/ex09/pi_ocl.jl | 4 +- examples/hands_on_opencl/exA/pi_vocl.jl | 4 +- examples/notebooks/julia_set_fractal.ipynb | 2 +- examples/notebooks/mandelbrot_fractal.ipynb | 2 +- lib/kernel.jl | 55 +++++---------------- test/behaviour.jl | 6 +-- test/context.jl | 2 +- test/kernel.jl | 24 +++------ 13 files changed, 51 insertions(+), 97 deletions(-) diff --git a/NEWS.md b/NEWS.md index 82cb1dc..c433c33 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ Breaking changes: - The `cl.info` method, and the `getindex` overloading to access properties of OpenCL objects, have been replaced by `getproperty` overloading on the objects themselves (e.g., `cl.info(dev, :name)` and `dev[:name]` are now simply `dev.name`). +- The blocking `cl.launch` has been replaced by a nonblocking `cl.call`, while also removing + the `getindex`-overloading shorthand. New features: diff --git a/README.md b/README.md index 1c81d77..8b906ac 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ c_buff = cl.Buffer(Float32, length(a), :w) p = cl.Program(source=sum_kernel) |> cl.build! 
k = cl.Kernel(p, "sum") -cl.launch(k, size(a), nothing, a_buff, b_buff, c_buff) +cl.call(k, a_buff, b_buff, c_buff; global_size=size(a)) r = cl.read(c_buff) diff --git a/examples/hands_on_opencl/ex04/vadd_chain.jl b/examples/hands_on_opencl/ex04/vadd_chain.jl index cc3c94e..68e2fe6 100644 --- a/examples/hands_on_opencl/ex04/vadd_chain.jl +++ b/examples/hands_on_opencl/ex04/vadd_chain.jl @@ -75,20 +75,15 @@ d_f = cl.Buffer(Float32, LENGTH, :w) vadd = cl.Kernel(program, "vadd") # execute the kernel over the entire range of 1d input -# calling `queue` is blocking, it accepts the kernel, global / local work sizes, +# calling `cl.call` is asynchronous, it accepts the kernel, global / local work sizes, # the the kernel's arguments. -# here we call the kernel with work size set to the number of elements and a local -# work size of nothing. This enables the opencl runtime to optimize the local size -# for simple kernels -cl.launch(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH)) - -# an alternative syntax is to create an partial function to call -# by julia's getindex syntax for Kernel types. -# here the queue, global_size, and (optional) local_size are passed in which -# returns a partial function with these parameters set. -vadd[size(h_e)](d_e, d_c, d_d, UInt32(LENGTH)) -vadd[size(h_g)](d_g, d_d, d_f, UInt32(LENGTH)) +# here we call the kernel with work size set to the number of elements and no local +# work size. This enables the opencl runtime to optimize the local size for simple +# kernels +cl.call(vadd, d_a, d_b, d_c, UInt32(LENGTH); global_size=size(h_a)) +cl.call(vadd, d_e, d_c, d_d, UInt32(LENGTH); global_size=size(h_e)) +cl.call(vadd, d_g, d_d, d_f, UInt32(LENGTH); global_size=size(h_g)) # copy back the results from the compute device # copy!(queue, dst, src) follows same interface as julia's built in copy!
diff --git a/examples/hands_on_opencl/ex07/matmul.jl b/examples/hands_on_opencl/ex07/matmul.jl index e2ecb7e..0e421f0 100644 --- a/examples/hands_on_opencl/ex07/matmul.jl +++ b/examples/hands_on_opencl/ex07/matmul.jl @@ -103,9 +103,8 @@ mmul = cl.Kernel(prg, "mmul") for i in 1:COUNT fill!(h_C, 0.0) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim, Mdim), nothing, - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=(Ndim, Mdim)) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) @@ -152,9 +151,8 @@ else for i in 1:COUNT fill!(h_C, 0.0) - evt = cl.launch(mmul, (Ndim,), (ORDER,), - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=Ndim, local_size=ORDER) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) diff --git a/examples/hands_on_opencl/ex08/matmul.jl b/examples/hands_on_opencl/ex08/matmul.jl index e5c9564..44b83aa 100644 --- a/examples/hands_on_opencl/ex08/matmul.jl +++ b/examples/hands_on_opencl/ex08/matmul.jl @@ -103,9 +103,8 @@ mmul = cl.Kernel(prg, "mmul") for i in 1:COUNT fill!(h_C, 0.0) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim, Mdim), nothing, - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=(Ndim, Mdim)) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) @@ -126,9 +125,8 @@ mmul = cl.Kernel(prg, "mmul") for i in 1:COUNT fill!(h_C, 0.0) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim,), (ORDER ÷ 16,), - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c; global_size=Ndim, local_size=(ORDER ÷ 16)) # profiling events are measured in ns
run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) @@ -186,9 +184,9 @@ for i in 1:COUNT localmem1 = cl.LocalMem(Float32, blocksize^2) localmem2 = cl.LocalMem(Float32, blocksize^2) cl.queue!(:profile) do - evt = cl.launch(mmul, (Ndim,), (ORDER ÷ 16,), - Int32(Mdim), Int32(Ndim), Int32(Pdim), - d_a, d_b, d_c, localmem1, localmem2) + evt = cl.call(mmul, Int32(Mdim), Int32(Ndim), Int32(Pdim), + d_a, d_b, d_c, localmem1, localmem2; + global_size=Ndim, local_size=(ORDER ÷ 16)) # profiling events are measured in ns run_time = evt.profile_duration / 1e9 cl.copy!(h_C, d_c) diff --git a/examples/hands_on_opencl/ex09/pi_ocl.jl b/examples/hands_on_opencl/ex09/pi_ocl.jl index eee2fc0..341e02a 100644 --- a/examples/hands_on_opencl/ex09/pi_ocl.jl +++ b/examples/hands_on_opencl/ex09/pi_ocl.jl @@ -68,8 +68,8 @@ global_size = (nwork_groups * work_group_size,) local_size = (work_group_size,) localmem = cl.LocalMem(Float32, work_group_size) -cl.launch(pi_kernel, global_size, local_size, - Int32(niters), Float32(step_size), localmem, d_partial_sums) +cl.call(pi_kernel, Int32(niters), Float32(step_size), localmem, + d_partial_sums; global_size, local_size) cl.copy!(h_psum, d_partial_sums) diff --git a/examples/hands_on_opencl/exA/pi_vocl.jl b/examples/hands_on_opencl/exA/pi_vocl.jl index 779d2ca..7b640c2 100644 --- a/examples/hands_on_opencl/exA/pi_vocl.jl +++ b/examples/hands_on_opencl/exA/pi_vocl.jl @@ -107,8 +107,8 @@ global_size = (nwork_groups * work_group_size,) local_size = (work_group_size,) localmem = cl.LocalMem(Float32, work_group_size) -cl.launch(pi_kernel, global_size, local_size, - Int32(niters), Float32(step_size), localmem, d_partial_sums) +cl.call(pi_kernel, Int32(niters), Float32(step_size), localmem, d_partial_sums; + global_size, local_size) cl.copy!(h_psum, d_partial_sums) diff --git a/examples/notebooks/julia_set_fractal.ipynb b/examples/notebooks/julia_set_fractal.ipynb index a019e85..f4861f3 100644 --- a/examples/notebooks/julia_set_fractal.ipynb +++ 
b/examples/notebooks/julia_set_fractal.ipynb @@ -306,7 +306,7 @@ " prg = cl.Program(source=julia_source) |> cl.build!\n", " k = cl.Kernel(prg, \"julia\")\n", "\n", - " cl.launch(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n", + " cl.call(k, q_buff, o_buff, UInt16(maxiter); global_size=length(q))\n", " cl.copy!(out, o_buff)\n", "\n", " return out\n", diff --git a/examples/notebooks/mandelbrot_fractal.ipynb b/examples/notebooks/mandelbrot_fractal.ipynb index b92e5b5..599680c 100644 --- a/examples/notebooks/mandelbrot_fractal.ipynb +++ b/examples/notebooks/mandelbrot_fractal.ipynb @@ -76,7 +76,7 @@ " prg = cl.Program(source=mandel_source) |> cl.build!\n", "\n", " k = cl.Kernel(prg, \"mandelbrot\")\n", - " cl.launch(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n", + " cl.call(k, q_buff, o_buff, UInt16(maxiter); global_size=length(q))\n", "\n", " cl.copy!(out, o_buff)\n", "\n", diff --git a/lib/kernel.jl b/lib/kernel.jl index 29656fe..48f54b8 100644 --- a/lib/kernel.jl +++ b/lib/kernel.jl @@ -219,47 +219,8 @@ function set_args!(k::Kernel, args...) end end -# produce a cl.call thunk with kernel queue, global/local sizes -Base.getindex(k::Kernel, args...) = begin - if length(args) < 1 || length(args) > 2 - throw(ArgumentError("kernel must be called with a global size as arguments")) - end - if !(isa(args[1], Dims)) || length(args[1]) > 3 - throw(ArgumentError("kernel global size must be of Dims type (dim <= 3)")) - end - if length(args) == 2 && (!(isa(args[2], Dims)) || length(args[2]) > 3) - throw(ArgumentError("kernel local size must be of Dims type (dim <= 3)")) - end - global_size = args[1] - local_size = length(args) == 2 ? args[2] : nothing - # TODO: we cannot pass keywords in anon functions yet, return kernel call thunk - return (args...) -> launch(k, global_size, local_size, args...) -end - -# blocking kernel call that finishes queue -# XXX: shouldn't be blocking! 
-function launch(k::Kernel, global_work_size, local_work_size, - args...; global_work_offset=nothing, - wait_on::Union{Nothing,Vector{Event}}=nothing) - set_args!(k, args...) - evt = enqueue_kernel(k, - global_work_size, - local_work_size, - global_work_offset=global_work_offset, - wait_on=wait_on) - finish(queue()) - return evt -end - -function enqueue_kernel(k::Kernel, global_work_size) - enqueue_kernel(k, global_work_size, nothing) -end - -function enqueue_kernel(k::Kernel, - global_work_size, - local_work_size; - global_work_offset=nothing, - wait_on::Union{Nothing,Vector{Event}}=nothing) +function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; + global_work_offset=nothing, wait_on::Vector{Event}=Event[]) max_work_dim = device().max_work_item_dims work_dim = length(global_work_size) if work_dim > max_work_dim @@ -282,6 +243,8 @@ function enqueue_kernel(k::Kernel, for (i, o) in enumerate(global_work_offset) goffset[i] = o end + else + # null global offset means (0, 0, 0) end lsize = C_NULL @@ -296,9 +259,11 @@ function enqueue_kernel(k::Kernel, for (i, s) in enumerate(local_work_size) lsize[i] = s end + else + # null local size means OpenCL decides end - if wait_on !== nothing + if !isempty(wait_on) n_events = cl_uint(length(wait_on)) wait_event_ids = [evt.id for evt in wait_on] else @@ -312,6 +277,12 @@ function enqueue_kernel(k::Kernel, return Event(ret_event[], retain=false) end +function call(k::Kernel, args...; global_size=(1,), local_size=nothing, + global_work_offset=nothing, wait_on::Vector{Event}=Event[]) + set_args!(k, args...) + enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on) +end + function enqueue_task(k::Kernel; wait_for=nothing) n_evts = 0 evt_ids = C_NULL diff --git a/test/behaviour.jl b/test/behaviour.jl index a76ae63..cbd0874 100644 --- a/test/behaviour.jl +++ b/test/behaviour.jl @@ -18,7 +18,7 @@ prg = cl.Program(source=hello_world_kernel) |> cl.build! 
kern = cl.Kernel(prg, "hello") - cl.launch(kern, str_len, nothing, out_buf) + cl.call(kern, out_buf; global_size=str_len) h = cl.read(out_buf) @test hello_world_str == GC.@preserve h unsafe_string(pointer(h)) @@ -213,7 +213,7 @@ end R_buf = cl.Buffer(Float32, length(X), :w) global_size = size(X) - cl.launch(part3, global_size, nothing, X_buf, Y_buf, R_buf, P_buf) + cl.call(part3, X_buf, Y_buf, R_buf, P_buf; global_size, local_size=nothing) r = cl.read(R_buf) @test all(x -> x == 13.5, r) @@ -251,7 +251,7 @@ end P = MutableParams(0.5, 10.0) P_buf = cl.Buffer(Float32, 2, :w) - cl.launch(part3, 1, nothing, P_buf, P) + cl.call(part3, P_buf, P) r = cl.read(P_buf) diff --git a/test/context.jl b/test/context.jl index 49be7f8..5f682cb 100644 --- a/test/context.jl +++ b/test/context.jl @@ -36,7 +36,7 @@ # try # p = cl.Program(source = empty_kernel) |> cl.build! # k = cl.Kernel(p, "test") - # cl.launch(k, 1, 10000000) + # cl.call(k; global_size=1, local_size=10000000) # catch # end # end diff --git a/test/kernel.jl b/test/kernel.jl index edfc521..b4a6128 100644 --- a/test/kernel.jl +++ b/test/kernel.jl @@ -105,33 +105,23 @@ k = cl.Kernel(p, "test") # dimensions must be the same size - @test_throws ArgumentError cl.launch(k, (1,), (1,1), d_buff) - @test_throws ArgumentError cl.launch(k, (1,1), (1,), d_buff) + @test_throws ArgumentError cl.call(k, d_buff; global_size=(1,), local_size=(1,1)) + @test_throws ArgumentError cl.call(k, d_buff; global_size=(1,1), local_size=(1,)) # dimensions are bounded max_work_dim = cl.device().max_work_item_dims bad = tuple([1 for _ in 1:(max_work_dim + 1)]) - @test_throws MethodError cl.launch(k, bad, d_buff) - - # devices have finite work sizes - @test_throws MethodError cl.launch(k, (typemax(Int),), d_buff) - - # blocking call to kernel finishes cmd queue - cl.launch(k, 1, 1, d_buff) + # calls are asynchronous, but cl.read blocks + cl.call(k, d_buff) r = cl.read(d_buff) @test r[1] == 2 - # alternative kernel call syntax - k[(1,), (1,)](d_buff) 
- r = cl.read(d_buff) - @test r[1] == 3 - # enqueue task is an alias for calling # a kernel with a global/local size of 1 evt = cl.enqueue_task(k) r = cl.read(d_buff) - @test r[1] == 4 + @test r[1] == 3 end @testset "packed structures" begin @@ -150,7 +140,7 @@ structkernel = cl.Kernel(prg, "structest") out = cl.Buffer(Float32, 2, :w) bstruct = (1, Int32(4)) - structkernel[(1,)](out, bstruct) + cl.call(structkernel, out, bstruct) r = cl.read(out) @test r == [1f0, 4f0] end @@ -185,7 +175,7 @@ structkernel = cl.Kernel(prg, "structest") out = cl.Buffer(Float32, 4, :w) astruct = CLTestStruct((1f0, 2f0, 3f0), nothing, 22f0) - structkernel[(1,)](out, astruct) + cl.call(structkernel, out, astruct) r = cl.read(out) @test r == [1f0, 2f0, 3f0, 22f0] end