diff --git a/OpenCL_Ext.txt b/OpenCL_Ext.txt index 99d8e1a6..0fe4b83c 100644 --- a/OpenCL_Ext.txt +++ b/OpenCL_Ext.txt @@ -109,6 +109,7 @@ include::ext/cl_khr_subgroup_rotate.asciidoc[] include::ext/cl_khr_work_group_uniform_arithmetic.asciidoc[] include::ext/cl_khr_command_buffer_mutable_dispatch.asciidoc[] +include::ext/cl_khr_command_buffer_multi_device.asciidoc[] // NOTE: To keep meaningful section numbers, new // extension documents should be added above here! diff --git a/config/rouge/lib/rouge/lexers/opencl.rb b/config/rouge/lib/rouge/lexers/opencl.rb index e9f37a74..69d22503 100644 --- a/config/rouge/lib/rouge/lexers/opencl.rb +++ b/config/rouge/lib/rouge/lexers/opencl.rb @@ -154,9 +154,6 @@ class OpenCL < Cpp cl_command_buffer_state_khr cl_command_buffer_properties_khr cl_ndrange_kernel_command_properties_khr - cl_mutable_command_khr - cl_mutable_command_info_khr - cl_mutable_base_config_khr cl_dx9_surface_info_khr cl_motion_estimation_desc_intel cl_mem_ext_host_ptr @@ -179,13 +176,14 @@ class OpenCL < Cpp cl_device_integer_dot_product_acceleration_properties_khr cl_command_buffer_khr cl_mutable_command_khr + cl_mutable_command_info_khr cl_command_buffer_structure_type_khr cl_mutable_base_config_khr - cl_mutable_command_info_khr cl_mutable_dispatch_arg_khr cl_mutable_dispatch_config_khr cl_mutable_dispatch_exec_info_khr cl_mutable_dispatch_fields_khr + cl_platform_command_buffer_capabilities_khr ) # Here are some interesting tokens diff --git a/ext/cl_khr_command_buffer_multi_device.asciidoc b/ext/cl_khr_command_buffer_multi_device.asciidoc new file mode 100644 index 00000000..caa176a5 --- /dev/null +++ b/ext/cl_khr_command_buffer_multi_device.asciidoc @@ -0,0 +1,760 @@ +// Copyright 2018-2023 The Khronos Group. This work is licensed under a +// Creative Commons Attribution 4.0 International License; see +// http://creativecommons.org/licenses/by/4.0/ + +[[cl_khr_command_buffer_multi_device]] +== Command Buffers - Multiple Devices (Provisional) + +This extension enables users to record commands across multiple queues in the same command-buffer, +providing execution of heterogeneous task graphs from command-queues associated with different devices. + +=== General Information + +==== Name Strings + +`cl_khr_command_buffer_multi_device` + +==== Version History + +[cols="1,1,3",options="header",] +|==== +| *Date* | *Version* | *Description* +| 2023-04-14 | 0.9.0 | First assigned version (provisional). +|==== + +==== Dependencies + +This extension requires the `cl_khr_command_buffer` extension version 0.9.0. + +==== Contributors + +Ewan Crawford, Codeplay Software Ltd. + +Gordon Brown, Codeplay Software Ltd. + +Kenneth Benzie, Codeplay Software Ltd. + +Alastair Murray, Codeplay Software Ltd. + +Jack Frankland, Codeplay Software Ltd. + +Balaji Calidas, Qualcomm Technologies Inc. + +Joshua Kelly, Qualcomm Technologies, Inc. + +Kevin Petit, Arm Ltd. + +Aharon Abramson, Intel. + +Ben Ashbaugh, Intel. + +Boaz Ouriel, Intel. + +Pekka Jääskeläinen, Tampere University + +Jan Solanti, Tampere University + +Nikhil Joshi, NVIDIA + +James Price, Google + + +=== Overview + +The `cl_khr_command_buffer` extension separates command construction from +enqueue by providing a mechanism to record a set of commands which can then be +repeatedly enqueued. However, the commands in a command-buffer can +only be recorded to a single command-queue specified on command-buffer creation. + +`cl_khr_command_buffer_multi_device` extends the scope of a command-buffer to +allow commands to be recorded across multiple queues in the same command-buffer, +providing execution of heterogeneous task graphs from command-queues associated +with different devices. + +The ability for a user to deep copy an existing command-buffer so that the +commands target a different device is also made possible by +`cl_khr_command_buffer_multi_device`. Depending on platform support the mapping +of commands to the new target device can be done either explicitly by the user, +or automatically by the OpenCL runtime. + +=== New Types + +Bitfield for querying command-buffer capabilities of an OpenCL Platform with +{clGetPlatformInfo}, see the +<>: +[source,opencl] +---- +typedef cl_bitfield cl_platform_command_buffer_capabilities_khr +---- + +=== New API Functions + +[source,opencl] +---- +cl_command_buffer_khr clRemapCommandBufferKHR( + cl_command_buffer_khr command_buffer, + cl_bool automatic, + cl_uint num_queues, + const cl_command_queue* queues, + cl_uint num_handles, + const cl_mutable_command_khr* handles, + cl_mutable_command_khr* handles_ret, + cl_int* errcode_ret); +---- + +=== New API Enums + +Enums for querying device command-buffer capabilities with +{clGetDeviceInfo}, see the +<>: + +[source,opencl] +---- +// Accepted values for the param_name parameter to clGetDeviceInfo +CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR 0x12AB +CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR 0x12AC + +// Bits for cl_device_command_buffer_capabilities_khr bitfield +CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR (0x1 << 4) + +// Bits for cl_command_buffer_flags_khr +CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR (0x1 << 2) +---- + +Enums for querying platform command-buffer capabilities with +{clGetPlatformInfo}, see the +<>: + +[source,opencl] +---- +// Accepted values for the param_name parameter to clGetPlatformInfo +CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR 0x0908 + +// Bits for cl_platform_command_buffer_capabilities_khr bitfield +CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR (0x1 << 0) +CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR (0x1 << 1) +CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR (0x1 << 2) +---- + +=== Modifications to section 4.1 of the OpenCL API Specification + +Add to *Table 3*, _Platform Queries_, + +[[cl_khr_command_buffer_multi_device-platform-queries]] +[cols="1,1,4",options="header"] +|==== +| Platform Info +| Return Type +| Description + +| {CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR} +| {cl_platform_command_buffer_capabilities_khr_TYPE} +| Describes platform command-buffer capabilities, encoded as bits in a bitfield. + Supported capabilities are: + + {CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR} - Platform supports the ability + to synchronize all commands in a command-buffer using sync-points, irrespective + of the queue the individual commands are recorded to. + + {CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR} - Platform supports the ability + to create a deep copy of an existing command-buffer with the commands + explicitly remapped to different, potentially <>, + queues. + + {CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR} - Platform supports the + ability to create a remapped command-buffer where the mapping of commands to + queues is done by the OpenCL runtime in a way it determines as optimal. If + {CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR} is reported, + {CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR} must also be reported. + +|==== + +=== Modifications to section 4.2 of the OpenCL API Specification + +Add {CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR} and +{CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR} rows to *Table 5*, _Device Queries_, +of section 4.2. Also, add additional text to the +{CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR} row: + +[[cl_khr_command_buffer_multi_device-device-queries]] +[cols="1,1,4",options="header"] +|==== +| {cl_device_info_TYPE} +| Return Type +| Description + +| {CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR} +| {cl_device_command_buffer_capabilities_khr_TYPE} +| Describes device command-buffer capabilities, encoded as bits in a bitfield. + Supported capabilities are: + + {CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR} Device supports the ability + to record commands to more than one command-queue associated with _device_ in + a single command-buffer. + +| {CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR} +| {cl_uint_TYPE} +| Return the number of devices _device_ can use device-side synchronization + with. + +| {CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR} +| {cl_device_id_TYPE}[] +| Return the list of devices and sub-devices _device_ can use device-side + synchronization with. A device should list itself only if it has native + support for synchronizing commands. + +|==== + +=== Modifications to section 5.11 of the OpenCL API Specification + +Add additional wording to the description column of *Table 36*, _Event Object +Queries_: + +{CL_EVENT_COMMAND_QUEUE} - For events returned by a command-buffer enqueue +operation to multiple command-queues, `NULL` is returned. + +{CL_EVENT_COMMAND_EXECUTION_STATUS} - For events returned by a command-buffer +enqueue operation to multiple command-queues the semantics of execution status +is as follows: + + * {CL_QUEUED} - Command-buffer has been enqueued across the command-queues. + + * {CL_SUBMITTED} - Commands from the command-buffer have been submitted by + the host to any device associated with one of the command-queues. + + * {CL_RUNNING} - Any command from the command-buffer has started execution on + a device. + + * {CL_COMPLETE} - All commands have completed on all devices. + +=== Modifications to section 5.14 of the OpenCL API Specification + +==== Query Updates + +Add additional wording to description column of *Table 38*, _Event Profiling +Queries_: + +* {CL_PROFILING_COMMAND_QUEUED} - For events returned by a command-buffer +enqueue operation to multiple command-queues, the host time when the +command-buffer has been enqueued across the command-queues is used. + +* {CL_PROFILING_COMMAND_SUBMIT} - For events returned by a command-buffer +enqueue operation to multiple command-queues, the host time is used when +command-buffer commands have been submitted to any command-queue. + +* {CL_PROFILING_COMMAND_START} - For events returned by a command-buffer +enqueue operation to multiple command-queues, the host time is used when +any device starts executing a command-buffer command. + +* {CL_PROFILING_COMMAND_END} - For events returned by a command-buffer +enqueue operation to multiple command-queues, the host time is used when +the last command-buffer command finishes execution on any device. + +* {CL_PROFILING_COMMAND_COMPLETE} - For events returned by a command-buffer +enqueue operation to multiple command-queues, the host time is used when the +command-buffer has completed execution across all command-queues. + +[NOTE] +==== +If no reliable device timer sources are available to inform the host side, +or parallel runtime scheduling makes it impossible to identify a first/last +command, then an implementation may fallback to reporting +{CL_PROFILING_COMMAND_SUBMIT} and {CL_PROFILING_COMMAND_COMPLETE} for +{CL_PROFILING_COMMAND_START} and {CL_PROFILING_COMMAND_END} respectively. +==== + +==== Error Updates + +Extend the wording defining the {CL_PROFILING_INFO_NOT_AVAILABLE} error return +code from {clGetEventProfilingInfo} to append the following sentence: + +* If _event_ was created from a call to {clEnqueueCommandBufferKHR}, + {CL_PROFILING_INFO_NOT_AVAILABLE} is returned if all the queues passed + do not have {CL_QUEUE_PROFILING_ENABLE} set. + +=== Modifications to Section 5.X - Command Buffers of the OpenCL API Specification + +==== Additional Section 5.X Introduction Text + +A command-buffer can contain commands recorded to the queues of different +devices if a vendor provides support for inter-device {cl_sync_point_khr_TYPE} +synchronization. This feature is reported either through +{CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR}, which informs the user what devices can +synchronize with each other natively on the device-side, or through +{CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR}, which allows synchronization +between all devices in a platform, falling back to host-side synchronization +when device-side synchronization isn't available. These two mechanisms are +referred to as **device-side sync** and **universal sync** respectively. + +If these mechanisms don't report that more than one device can be used in a +command-buffer, it will still be possible to perform multiple queue recording in a +command-buffer if the {CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR} +capability is reported for a device. However, with this capability all the +queues commands are recorded to must target the same device. + +Commands recorded to different command-queues in the same command-buffer may be +executed concurrently to each other unless synchronized explicitly with +sync-points. Ordering of other commands submitted to the same command-queues as +used to enqueue a command-buffer is the responsibility of the programmer. A +command-buffer enqueue spanning multiple queues can return an event to use for +synchronization, which will complete once all commands in the command-buffer +have completed. If ordering restrictions are required, this event (or +command-queue barriers) may be used by the user to synchronize the +command-buffer enqueue with regular commands, or another command-buffer enqueue. + +==== Add new section "Section 5.X.Y - Remapping Command Buffers" + +Platforms reporting the {CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR} capability +support generating a deep copy of a command-buffer with its commands remapped to a +list of command-queues that are potentially <> with the queues +used to create the command-buffer. That is, the remapped command-buffer can +execute on queues that differ in terms of properties and/or associated device +from the original command-buffer queues. + +This functionality is invoked through a new synchronous entry-point +{clRemapCommandBufferKHR} which takes a list of queues to which the commands +should now target. It then returns a command-buffer containing the same +commands as the original, with the same command dependencies, but targeting +different queues. A list of command handles may also be passed to the +entry-point, which allows handles to the equivalent commands in the remapped +command-buffer to be returned by an output parameter. + +Device properties restrict remapping possibilities, as existing commands +can have a configuration which is not supported by another device, and so +remapping may fail with an error relating to this incompatibility. Examples +of command configurations which can introduce incompatibilities when trying to +map to a new device are: + +* Program language features used in a kernel not supported by the new device. +* ND-Range configuration, e.g exceeds new the device max work-group size. +* Misalignment of sub-buffers based on minimum alignment of new device. + +In additional to this functionality, platforms reporting +{CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR} allow the user to create a +remapped command-buffer where the mapping of queues to commands is determined by +the OpenCL runtime in a way it determines as optimal. This is particularly +useful in hot plugging environments where devices may appear and disappear +during runtime. + +The function +include::{generated}/api/protos/clRemapCommandBufferKHR.txt[] + +Creates a deep copy of the input command-buffer with the copied commands +remapped to target the passed command-queues. The returned command-buffer +has the same state as the input command-buffer, unless the input +command-buffer is in the <> state, in which case the returned +command-buffer has state <>. + +_command_buffer_ Specifies the command-buffer to create a remapped deep copy of. + +_automatic_ Indicates if the remapping is done explicitly by the user, or +automatically by the OpenCL runtime. If _automatic_ is {CL_FALSE}, then each +element of _queues_ will replace the queue used on _command_buffer_ creation at +the same index. If {CL_TRUE} and {CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR} +is supported, then the OpenCL runtime will decide in a way it determines optimal +which of the elements in _queues_ each command in the returned command-buffer +will be associated with. + +_num_queues_ The number of command-queues listed in _queues_, must not be 0. + +_queues_ A pointer to an ordered list of command-queues for the returned +command-buffer to target, must be a non-`NULL` value. + +_num_handles_ The number of command handles passed in both _handles_ and +_handles_ret_ lists, may be 0. + +_handles_ An ordered list of handles belonging to _command_buffer_ to create +remapped copies of, may be `NULL`. + +_handles_ret_ Returns an ordered list of handles where each handle is equivalent +to the handle at the same index in _handles_, but belonging to the returned +command-buffer. + +_errcode_ret_ Returns an appropriate error code. If _errcode_ret_ is `NULL`, no +error code is returned. + +{clRemapCommandBufferKHR} returns a valid command-buffer with _errcode_ret_ set +to {CL_SUCCESS} if the command-buffer is created successfully. Otherwise, it +returns a `NULL` value without setting _handles_ret_, and with one of the +following error values returned in _errcode_ret_: + +* {CL_INVALID_COMMAND_BUFFER_KHR} if _command_buffer_ is not a valid + command-buffer. + +* {CL_INVALID_VALUE} if _num_queues_ is 0, or if _queues_ is `NULL`. + +* {CL_INVALID_VALUE} if _automatic_ is {CL_FALSE} and _num_queues_ is not equal + to the number of queues used on creation of _command_buffer_. + +* {CL_INVALID_VALUE} if _handles_ or _handles_ret_ is `NULL` and + _num_handles_ is > 0, or either _handles_ or _handles_ret_ is not + `NULL` and _num_handles_ is 0. + +* {CL_INVALID_VALUE} if any handle in _handles_ is not a valid command handle + belonging to _command_buffer_. + +* {CL_INVALID_COMMAND_QUEUE} if any command-queue in _queues_ is not a valid + command-queue. + +* {CL_INVALID_CONTEXT} if _command_buffer_ and all the command-queues in + _queues_ do not have the same OpenCL context. + +* {CL_INVALID_OPERATION} if the platform does not support the + {CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR} flag. + +* {CL_INVALID_OPERATION} if the platform does not support the + {CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR} flag and _automatic_ is + {CL_TRUE}. + +* {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} if such an error would be returned by + passing _queues_ to {clCreateCommandBufferKHR}. + +* Any error relating to device support that can be returned by a command + recording entry-point may also be returned. As a command in _command_buffer_ + can have a configuration that is not supported by a device that is associated + with the queue in _queues_ the command is being remapped to. + +* {CL_OUT_OF_RESOURCES} if there is a failure to allocate resources + required by the OpenCL implementation on the device. + +* {CL_OUT_OF_HOST_MEMORY} if there is a failure to allocate resources + required by the OpenCL implementation on the host. + +==== Modifications to clCreateCommandBufferKHR + +==== New Property Flag + +Modify the {CL_COMMAND_BUFFER_FLAGS_KHR} property in the +{clCreateCommandBufferKHR} properties table to introduce a new flag to the +bitfield. The following text is now included in the description of property +values. + +[cols=",,",options="header",] +|==== +| *Recording Properties* +| *Property Value* +| *Description* + +| {CL_COMMAND_BUFFER_FLAGS_KHR} +| {cl_command_buffer_flags_khr_TYPE} +| {CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR} - All commands in the command-buffer + must use native synchronization, as reported by + {CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR}. This can be used as a safeguard + for performant applications that don't want to accidentally fallback to host + synchronization when passing multiple queues. +|==== + +==== Add to clCreateCommandBufferKHR description + +.Summary of command-buffer creation configurations +[width="100%",options="header"] + +|==== +| All devices associated with `queues` can device-side sync | Platform supports universal sync | Condition | Result + +.3+| Yes +.3+| Yes or No +| Any device does not support the multi-queue capability, and has more than one + queue targeting it +| Error - {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} +| User sets {CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR} flag +| OK +| Otherwise +| OK + +.3+| No +.3+| Yes +| Any device does not support the multi-queue capability, and has more than one + queue targeting it +| Error - {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} +| User sets {CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR} flag +| Error - {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} +| Otherwise +| OK - May be performance implications when synchronizing commands between devices + without device-side sync support. + +.2+| No +.2+| No +| Always +| Error - {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} + +|==== + +===== Parameter Updates + +Parameter descriptions changed to: + +_num_queues_ The number of command-queues listed in _queues_. + +_queues_ Is a pointer to a list of command-queues that the command-buffer may be +executed on. _queues_ must be a non-`NULL` value and length of the list equal to +_num_queues_. + +===== Error Updates + +The returned error: + +* {CL_INVALID_VALUE} if _num_queues_ is not one. + +Is changed to: + +* {CL_INVALID_VALUE} if _num_queues_ is zero. + +Additional errors: + +* {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} if _queues_ includes more than one + command-queue associated with a device that does not support capability + {CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR}. + +* {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} if the + {CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR} flag is set, and any device + associated with a command-queue in _queues_ does not have the other devices + associated with _queues_ enumerated in + {CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR}. + +* {CL_INCOMPATIBLE_COMMAND_QUEUE_KHR} if the platform doesn't support the + {CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR} capability, and any device + associated with a command-queue in _queues_ does not have the other devices + associated with _queues_ enumerated in + {CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR}. + +==== Command recording entry points + +The descriptions of command recording entry-points are modified as described in +this section. These changes apply to all of {clCommandCopyBufferKHR}, +{clCommandCopyBufferRectKHR}, {clCommandCopyBufferToImageKHR}, +{clCommandCopyImageKHR}, {clCommandCopyImageToBufferKHR}, +{clCommandFillBufferKHR}, {clCommandFillImageKHR}, and +{clCommandNDRangeKernelKHR}. + +===== Parameter Update + +Parameter description of _command_queue_ is changed to: + +_command_queue_ Specifies the command-queue the command will be recorded to. +If _command_queue_ is `NULL` then only one command-queue must have been set +on _command_buffer_ creation, otherwise _command_queue_ must be a non-`NULL` +value. + +===== Error Update + +The error condition: + +* {CL_INVALID_COMMAND_QUEUE} if _command_queue_ is not `NULL`. + +Is changed to : + +* {CL_INVALID_COMMAND_QUEUE} if _command_queue_ is `NULL` and _command_buffer_ + was created with more than one queue, or if _command_queue_ is non-`NULL` and + not a command-queue listed on _command_buffer_ creation. + +=== Sample Code + +[source,opencl] +---- +#define CL_CHECK(ERROR) \ + if (ERROR) { \ + std::cerr << "OpenCL error: " << ERROR << "\n"; \ + return ERROR; \ + } + +int main() { + cl_platform_id platform; + CL_CHECK(clGetPlatformIDs(1, &platform, nullptr)); + cl_platform_command_buffer_capabilities_khr platform_caps; + CL_CHECK(clGetPlatformInfo(platform, + CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR, + sizeof(platform_caps), &platform_caps, NULL)); + if (!(platform_caps & CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR)) { + std::cerr << "Command-buffer remapping not supported but used in example, " + "skipping\n"; + return 0; + } + + cl_uint num_devices = 0; + CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); + std::vector devices(num_devices); + CL_CHECK( + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, devices.data(), nullptr)); + + // Checks omitted for brevity that either a) the platform supports + // CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR or b) each device is listed + // in the others CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR + + cl_int error; + cl_context context = + clCreateContext(NULL, num_devices, devices.data(), NULL, NULL, &error); + CL_CHECK(error); + + std::vector queues(num_devices); + for (cl_uint i = 0; i < num_devices; i++) { + queues[i] = clCreateCommandQueue(context, devices[i], 0, &error); + CL_CHECK(error); + } + + const char *code = R"OpenCLC( + kernel void vector_addition(global int* tile1, global int* tile2, + global int* res) { + size_t index = get_global_id(0); + res[index] = tile1[index] + tile2[index]; + } + )OpenCLC"; + const size_t length = std::strlen(code); + + cl_program program = + clCreateProgramWithSource(context, 1, &code, &length, &error); + CL_CHECK(error); + + CL_CHECK( + clBuildProgram(program, num_devices, devices.data(), NULL, NULL, NULL)); + + cl_kernel kernel = clCreateKernel(program, "vector_addition", &error); + CL_CHECK(error); + + constexpr size_t frame_count = 60; + constexpr size_t frame_elements = 1024; + constexpr size_t frame_size = frame_elements * sizeof(cl_int); + + constexpr size_t tile_count = 16; + constexpr size_t tile_elements = frame_elements / tile_count; + constexpr size_t tile_size = tile_elements * sizeof(cl_int); + + cl_mem buffer_tile1 = + clCreateBuffer(context, CL_MEM_READ_ONLY, tile_size, NULL, &error); + CL_CHECK(error); + + cl_mem buffer_tile2 = + clCreateBuffer(context, CL_MEM_READ_ONLY, tile_size, NULL, &error); + CL_CHECK(error); + + cl_mem buffer_res = + clCreateBuffer(context, CL_MEM_WRITE_ONLY, tile_size, NULL, &error); + CL_CHECK(error); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(buffer_tile1), &buffer_tile1)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(buffer_tile2), &buffer_tile2)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(buffer_res), &buffer_res)); + + cl_command_buffer_khr original_cmdbuf = + clCreateCommandBufferKHR(num_devices, queues.data(), nullptr, &error); + CL_CHECK(error); + + cl_mem buffer_src1 = + clCreateBuffer(context, CL_MEM_READ_ONLY, frame_size, NULL, &error); + CL_CHECK(error); + + cl_mem buffer_src2 = + clCreateBuffer(context, CL_MEM_READ_ONLY, frame_size, NULL, &error); + CL_CHECK(error); + + cl_mem buffer_dst = + clCreateBuffer(context, CL_MEM_READ_WRITE, frame_size, NULL, &error); + CL_CHECK(error); + + cl_sync_point_khr tile_sync_point = 0; + for (size_t tile_index = 0; tile_index < tile_count; tile_index++) { + cl_sync_point_khr copy_sync_points[2]; + CL_CHECK(clCommandCopyBufferKHR( + original_cmdbuf, queues[tile_index % num_devices], buffer_src1, + buffer_tile1, tile_index * tile_size, 0, tile_size, + tile_sync_point ? 1 : 0, tile_sync_point ? &tile_sync_point : NULL, + ©_sync_points[0], NULL)); + + CL_CHECK(clCommandCopyBufferKHR( + original_cmdbuf, queues[tile_index % num_devices], buffer_src2, + buffer_tile2, tile_index * tile_size, 0, tile_size, + tile_sync_point ? 1 : 0, + tile_sync_point ? &tile_sync_point : nullptr, + ©_sync_points[1], NULL)); + + cl_sync_point_khr nd_sync_point; + CL_CHECK(clCommandNDRangeKernelKHR( + original_cmdbuf, queues[tile_index % num_devices], NULL, kernel, 1, + NULL, &tile_elements, NULL, 2, copy_sync_points, &nd_sync_point, NULL)); + + CL_CHECK(clCommandCopyBufferKHR( + original_cmdbuf, queues[tile_index % num_devices], buffer_res, + buffer_dst, 0, tile_index * tile_size, tile_size, 1, &nd_sync_point, + &tile_sync_point, NULL)); + } + + CL_CHECK(clFinalizeCommandBufferKHR(original_cmdbuf)); + + std::random_device random_device; + std::mt19937 random_engine{random_device()}; + std::uniform_int_distribution random_distribution{ + 0, std::numeric_limits::max() / 2}; + auto random_generator = [&]() { return random_distribution(random_engine); }; + + auto enqueue_frame = [&](cl_command_buffer_khr command_buffer) { + for (size_t frame_index = 0; frame_index < frame_count; frame_index++) { + std::array enqueue_events; + std::vector src1(frame_elements); + std::generate(src1.begin(), src1.end(), random_generator); + CL_CHECK(clEnqueueWriteBuffer(queues[0], buffer_src1, CL_FALSE, 0, + frame_size, src1.data(), 0, nullptr, + &enqueue_events[0])); + std::vector src2(frame_elements); + std::generate(src2.begin(), src2.end(), random_generator); + CL_CHECK(clEnqueueWriteBuffer(queues[0], buffer_src2, CL_FALSE, 0, + frame_size, src2.data(), 0, nullptr, + &enqueue_events[1])); + + CL_CHECK(clEnqueueCommandBufferKHR(0, NULL, command_buffer, 2, + enqueue_events.data(), + &enqueue_events[2])); + + CL_CHECK(clWaitForEvents(1, enqueue_events[2])); + + for (auto e : enqueue_events) { + CL_CHECK(clReleaseEvent(e)); + } + } + return 0; + }; + + error = enqueue_frame(original_cmdbuf); + CL_CHECK(error); + + // Remap from N queues to 1 queue and run again + cl_command_buffer_khr remapped_cmdbuf = clRemapCommandBufferKHR( + original_cmdbuf, CL_TRUE, 1, queues.data(), 0, NULL, NULL, &error); + CL_CHECK(error); + + error = enqueue_frame(remapped_cmdbuf); + CL_CHECK(error); + + for (unsigned i = 0; i < num_devices; ++i) { + CL_CHECK(clReleaseCommandQueue(queues[i])); + } + CL_CHECK(clReleaseMemObject(buffer_src1)); + CL_CHECK(clReleaseMemObject(buffer_src2)); + CL_CHECK(clReleaseMemObject(buffer_dst)); + + CL_CHECK(clReleaseMemObject(buffer_tile1)); + CL_CHECK(clReleaseMemObject(buffer_tile2)); + CL_CHECK(clReleaseMemObject(buffer_res)); + + CL_CHECK(clReleaseCommandBufferKHR(original_cmdbuf)); + CL_CHECK(clReleaseCommandBufferKHR(remapped_cmdbuf)); + + CL_CHECK(clReleaseKernel(kernel)); + CL_CHECK(clReleaseProgram(program)); + CL_CHECK(clReleaseContext(context)); + + return 0; +} +---- + +=== Issues + +. In cl_event profiling info for a command-buffer running across the queues for + several devices, how do we know what the first & last commands executed are + if there is concurrent execution across devices. ++ +-- +*RESOLVED*: Allowed an implementation to fallback to {CL_PROFILING_COMMAND_SUBMIT} +and {CL_PROFILING_COMMAND_COMPLETE} when reporting {CL_PROFILING_COMMAND_START} & +{CL_PROFILING_COMMAND_END}. +-- +. Is an atomic constraint required? This would forbid regular clEnqueue* commands, +from interleaving execution on a queue which a command-buffer is being executed +on. ++ +-- +*RESOLVED*: This behavior can block parallelism, and constraint is expressible +by the user through existing synchronization mechanisms if they require it. +-- +. It is currently an error if a set of command-queues passed to +{clEnqueueCommandBufferKHR} aren't compatible with those set on recording. +Should we relax this as an optional capability that allows an implementation to +do a more expensive command-buffer enqueue for this case? ++ +-- +*RESOLVED*: Added as an optional feature. +-- diff --git a/ext/quick_reference.asciidoc b/ext/quick_reference.asciidoc index 4d7559a0..c15ad235 100644 --- a/ext/quick_reference.asciidoc +++ b/ext/quick_reference.asciidoc @@ -29,6 +29,10 @@ | Record and Replay Commands | Provisional Extension +| <> +| Allow a command-buffer to contain commands targeting different devices +| Provisional Extension + | <> | Modify kernel execution commands between enqueues of a command-buffer | Provisional Extension diff --git a/xml/cl.xml b/xml/cl.xml index 149cdd63..307a3b61 100644 --- a/xml/cl.xml +++ b/xml/cl.xml @@ -250,6 +250,7 @@ server's OpenCL/api-docs repository. typedef cl_uint cl_command_buffer_structure_type_khr; typedef cl_bitfield cl_device_fp_atomic_capabilities_ext; typedef cl_uint cl_image_requirements_info_ext; + typedef cl_bitfield cl_platform_command_buffer_capabilities_khr; Structure types @@ -1292,20 +1293,29 @@ server's OpenCL/api-docs repository. + + + + + + + - + + - - + + + @@ -1366,7 +1376,7 @@ server's OpenCL/api-docs repository. - + @@ -1741,7 +1751,8 @@ server's OpenCL/api-docs repository. - + + @@ -3191,6 +3202,17 @@ server's OpenCL/api-docs repository. void* param_value size_t* param_value_size_ret + + cl_command_buffer_khr clRemapCommandBufferKHR + cl_command_buffer_khr command_buffer + cl_bool automatic + cl_uint num_queues + const cl_command_queue* queues + cl_uint num_handles + const cl_mutable_command_khr* handles + cl_mutable_command_khr* handles_ret + cl_int* errcode_ret + cl_int clSetContentSizeBufferPoCL cl_mem buffer @@ -7246,5 +7268,34 @@ server's OpenCL/api-docs repository. + + + + + + + + + + + + + + + + + + + + + + + + + + + + +