Skip to content

Commit

Permalink
Fix memory benchmarks for unexpected gl_SubgroupSize
Browse files Browse the repository at this point in the history
Some Intel GPUs have flexible subgroup sizes: the reported
subgroupSize can be 32 while minSubgroupSize is smaller.
In this case, unless the subgroup size is explicitly fixed
at pipeline creation time, gl_SubgroupSize will report 32 even
though the actual number of invocations in the subgroup may be
smaller (e.g. 8).

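In the memory benchmarks, compute the dynamic (actual) size of the
subgroup as the bit count of a ballot over its invocations, i.e.
subgroupBallotBitCount(subgroupBallot(true)), instead of relying on
gl_SubgroupSize. The alternative is to use the much more recent
(and less portable) subgroup size control extension
(VK_EXT_subgroup_size_control).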

Fixes: #43
  • Loading branch information
dneto0 committed Nov 24, 2023
1 parent e5af042 commit f44be0e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
6 changes: 4 additions & 2 deletions benchmarks/memory/copy_storage_buffer_scalar.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#extension GL_EXT_control_flow_attributes : enable
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable

layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

Expand All @@ -32,11 +33,12 @@ layout(set = 0, binding = 1) buffer OutputBuffer {
const uint WG_X = 32;

void main() {
const uint subgroup_size = subgroupBallotBitCount(subgroupBallot(true));
// Must guarantee index is in range during dispatch.
uint index = gl_WorkGroupID.x * WG_X * kElementsPerThread +
gl_SubgroupID * gl_SubgroupSize * kElementsPerThread +
gl_SubgroupID * subgroup_size * kElementsPerThread +
gl_SubgroupInvocationID;
uint stride = gl_SubgroupSize;
const uint stride = subgroup_size;
// We want to space out memory accesses by `stride`, so that adjacent threads
// access adjacent memory.
[[unroll]] for (uint i = 0; i < kElementsPerThread; ++i, index += stride) {
Expand Down
6 changes: 4 additions & 2 deletions benchmarks/memory/copy_storage_buffer_vector.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#extension GL_EXT_control_flow_attributes : enable
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable

layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

Expand All @@ -32,11 +33,12 @@ layout(set = 0, binding = 1) buffer OutputBuffer {
const uint WG_X = 32;

void main() {
const uint subgroup_size = subgroupBallotBitCount(subgroupBallot(true));
// Must guarantee index is in range during dispatch.
uint index = gl_WorkGroupID.x * WG_X * kElementsPerThread +
gl_SubgroupID * gl_SubgroupSize * kElementsPerThread +
gl_SubgroupID * subgroup_size * kElementsPerThread +
gl_SubgroupInvocationID;
uint stride = gl_SubgroupSize;
uint stride = subgroup_size;
// We want to space out memory accesses by `stride`, so that adjacent threads
// access adjacent memory.
[[unroll]] for (uint i = 0; i < kElementsPerThread; ++i, index += stride) {
Expand Down

0 comments on commit f44be0e

Please sign in to comment.