diff --git a/codegen/wgpu_native_patcher.py b/codegen/wgpu_native_patcher.py
index e683aadc..0773d279 100644
--- a/codegen/wgpu_native_patcher.py
+++ b/codegen/wgpu_native_patcher.py
@@ -121,22 +121,21 @@ def write_mappings():
 
     # Write a few native-only mappings: key => int
     pylines.append("enum_str2int = {")
-    for name in ["BackendType"]:
+    for name, use_snake in (
+        ("BackendType", False),
+        ("NativeFeature", True),
+        ("PipelineStatisticName", True),
+    ):
         pylines.append(f'    "{name}":' + " {")
         for key, val in hp.enums[name].items():
             if key == "Force32":
                 continue
+            if use_snake:
+                key = to_snake_case(key).replace("_", "-")
             pylines.append(f'        "{key}": {val},')
         pylines.append("    },")
-    for name in ["NativeFeature"]:
-        pylines.append(f'    "{name}":' + " {")
-        for key, val in hp.enums[name].items():
-            if key == "Force32":
-                continue
-            xkey = to_snake_case(key).replace("_", "-")
-            pylines.append(f'        "{xkey}": {val},')
-        pylines.append("    },")
     pylines.append("}")
+    pylines.append("")
 
     # Write a few native-only mappings: int => key
     # If possible, resolve to WebGPU names, otherwise use the native name.
diff --git a/docs/backends.rst b/docs/backends.rst
index 93a2d4fd..01c2f576 100644
--- a/docs/backends.rst
+++ b/docs/backends.rst
@@ -45,7 +45,6 @@ It also works out of the box, because the wgpu-native DLL is shipped with wgpu-p
 The wgpu_native backend provides a few extra functionalities:
 
 .. py:function:: wgpu.backends.wgpu_native.request_device_sync(adapter, trace_path, *, label="", required_features, required_limits, default_queue)
-
     An alternative to :func:`wgpu.GPUAdapter.request_adapter`, that streams a trace
     of all low level calls to disk, so the visualization can be replayed (also on
     other systems), investigated, and debugged.
@@ -188,6 +187,64 @@ they reduce driver overhead on the CPU.
     :param offset: The byte offset in the indirect buffer containing the first argument.
     :param count: The number of draw operations to perform.
 
+Some GPUs allow you to collect statistics on their pipelines. GPUs that support this
+have the feature "pipeline-statistics-query", and you must enable this feature when
+requesting the device.
+
+You create such a query set using the function
+``wgpu.backends.wgpu_native.create_statistics_query_set``.
+
+The possible statistics are:
+
+* ``PipelineStatisticName.VertexShaderInvocations`` = "vertex-shader-invocations"
+
+  * The number of times the vertex shader is called.
+
+* ``PipelineStatisticName.ClipperInvocations`` = "clipper-invocations"
+
+  * The number of primitives (e.g. triangles) that reach the clipper.
+
+* ``PipelineStatisticName.ClipperPrimitivesOut`` = "clipper-primitives-out"
+
+  * The number of primitives output by the clipper.
+
+* ``PipelineStatisticName.FragmentShaderInvocations`` = "fragment-shader-invocations"
+
+  * The number of times the fragment shader is called.
+
+* ``PipelineStatisticName.ComputeShaderInvocations`` = "compute-shader-invocations"
+
+  * The number of times the compute shader is called.
+
+The ``statistics`` argument is a list or tuple of statistic names. Each element of the
+sequence must be either:
+
+* The enumeration value, e.g. ``PipelineStatisticName.FragmentShaderInvocations``
+* A CamelCase string, e.g. ``"VertexShaderInvocations"``
+* A hyphenated string, e.g. ``"vertex-shader-invocations"``
+* An underscored string, e.g. ``"vertex_shader_invocations"``
+
+You may use any number of these statistics in a query set. Each result is an 8-byte
+unsigned integer, and the total size of each entry in the query set is 8 times
+the number of statistics chosen.
+
+The statistics are always written to the query set in the order listed above, even if
+they are given in a different order in the ``statistics`` argument.
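+
+For illustration, here is a minimal sketch that collects two statistics for a single
+render pass. It assumes a ``device`` requested with the "pipeline-statistics-query"
+feature and a ``color_attachments`` list that is set up as usual; adapt the buffer
+handling to your own needs.
+
+.. code-block:: python
+
+    import wgpu
+    from wgpu.backends.wgpu_native.extras import (
+        begin_pipeline_statistics_query,
+        create_statistics_query_set,
+        end_pipeline_statistics_query,
+    )
+
+    # One entry with two statistics -> 2 * 8 = 16 bytes to resolve.
+    query_set = create_statistics_query_set(
+        device,
+        count=1,
+        statistics=["vertex-shader-invocations", "fragment-shader-invocations"],
+    )
+    result_buffer = device.create_buffer(
+        size=2 * 8,
+        usage=wgpu.BufferUsage.QUERY_RESOLVE | wgpu.BufferUsage.COPY_SRC,
+    )
+
+    command_encoder = device.create_command_encoder()
+    render_pass = command_encoder.begin_render_pass(
+        color_attachments=color_attachments
+    )
+    begin_pipeline_statistics_query(render_pass, query_set, 0)
+    # ... set the pipeline and draw ...
+    end_pipeline_statistics_query(render_pass)
+    render_pass.end()
+    command_encoder.resolve_query_set(query_set, 0, 1, result_buffer, 0)
+    device.queue.submit([command_encoder.finish()])
+
+    # Each statistic is an 8-byte unsigned integer ("Q"), in the order listed above.
+    results = device.queue.read_buffer(result_buffer).cast("Q").tolist()
+    vertex_invocations, fragment_invocations = results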
+
+.. py:function:: wgpu.backends.wgpu_native.create_statistics_query_set(device, *, label="", count, statistics)
+
+    Create a query set that can hold ``count`` entries for the specified statistics.
+    The statistics are specified as a list of strings (or enum values).
+
+    :param device: The device.
+    :param count: The number of entries that can go into the query set.
+    :param statistics: A sequence of strings giving the desired statistics.
+
+.. py:function:: wgpu.backends.wgpu_native.begin_pipeline_statistics_query(encoder, query_set, index)
+
+    Start collecting statistics.
+
+    :param encoder: The ComputePassEncoder or RenderPassEncoder.
+    :param query_set: The query set into which to save the result.
+    :param index: The index of the entry in the query set into which to write the result.
+
+.. py:function:: wgpu.backends.wgpu_native.end_pipeline_statistics_query(encoder)
+
+    Stop collecting statistics and write them into the query set.
+
+    :param encoder: The ComputePassEncoder or RenderPassEncoder.
+
 
 The js_webgpu backend
 ---------------------
diff --git a/tests/test_wgpu_statistics_query.py b/tests/test_wgpu_statistics_query.py
new file mode 100644
index 00000000..1495046b
--- /dev/null
+++ b/tests/test_wgpu_statistics_query.py
@@ -0,0 +1,175 @@
+"""
+Test statistics queries.
+"""
+
+import sys
+
+import numpy as np
+import pytest
+from pytest import skip
+
+import wgpu
+from testutils import can_use_wgpu_lib, is_ci, run_tests
+from wgpu import TextureFormat
+from wgpu.backends.wgpu_native.extras import (
+    PipelineStatisticName,
+    begin_pipeline_statistics_query,
+    create_statistics_query_set,
+    end_pipeline_statistics_query,
+)
+
+if not can_use_wgpu_lib:
+    skip("Skipping tests that need the wgpu lib", allow_module_level=True)
+elif is_ci and sys.platform == "win32":
+    skip("These tests fail on dx12 for some reason", allow_module_level=True)
+
+
+default_shader_source = """
+
+// Draws a square with side 0.1 centered at the origin. For the second instance, the
+// vertices are taken clockwise rather than counterclockwise, so that we can test
+// culling.
+
+@vertex
+fn vertex(@builtin(vertex_index) vertex_index : u32,
+          @builtin(instance_index) instance_index : u32
+) -> @builtin(position) vec4f {
+    var positions = array(
+        vec2f(-0.05, -0.05),
+        vec2f( 0.05, -0.05),
+        vec2f(-0.05,  0.05),
+        vec2f( 0.05,  0.05),
+    );
+    var p = positions[vertex_index];
+    if instance_index == 1 {
+        // Swapping x and y will cause the coordinates to be cw instead of ccw
+        p = vec2f(p.y, p.x);
+    }
+    return vec4f(p, 0.0, 1.0);
+}
+
+@fragment
+fn fragment() -> @location(0) vec4f {
+    return vec4f();
+}
+
+@compute @workgroup_size(64)
+fn compute() {
+}
+"""
+
+
+def test_render_occluding_squares():
+    adapter = wgpu.gpu.request_adapter(power_preference="high-performance")
+    try:
+        device = adapter.request_device(required_features=["pipeline-statistics-query"])
+    except RuntimeError:
+        pytest.skip("pipeline-statistics-query not supported")
+
+    pipeline_layout = device.create_pipeline_layout(bind_group_layouts=[])
+
+    output_texture = device.create_texture(
+        size=[1024, 1024],
+        usage=wgpu.TextureUsage.RENDER_ATTACHMENT,
+        format=TextureFormat.rgba8unorm,
+    )
+
+    shader = device.create_shader_module(code=default_shader_source)
+    render_pipeline = device.create_render_pipeline(
+        layout=pipeline_layout,
+        vertex={
+            "module": shader,
+        },
+        fragment={
+            "module": shader,
+            "targets": [{"format": output_texture.format}],
+        },
+        primitive={
+            "topology": wgpu.PrimitiveTopology.triangle_strip,
+            "cull_mode": wgpu.CullMode.back,
+        },
+    )
+
+    compute_pipeline = device.create_compute_pipeline(
+        layout=pipeline_layout,
+        compute={"module": shader},
+    )
+
+    color_attachment = {
+        "clear_value": (0, 0, 0, 0),
+        "load_op": "clear",
+        "store_op": "store",
+        "view": output_texture.create_view(),
+    }
+
+    statistics_query_set = create_statistics_query_set(
+        device,
+        count=2,
+        statistics=[
+            "vertex-shader-invocations",  # name can be a hyphenated string
+            "ClipperInvocations",  # name can be CamelCase
+            "clipper-primitives-out",
+            "fragment_shader_invocations",  # name can have underscores
+            PipelineStatisticName.ComputeShaderInvocations,  # and there's an enum.
+        ],
+    )
+    statistics_buffer = device.create_buffer(
+        size=2 * 5 * np.uint64().itemsize,
+        usage=wgpu.BufferUsage.COPY_SRC | wgpu.BufferUsage.QUERY_RESOLVE,
+    )
+
+    command_encoder = device.create_command_encoder()
+
+    render_pass = command_encoder.begin_render_pass(
+        color_attachments=[color_attachment]
+    )
+    begin_pipeline_statistics_query(render_pass, statistics_query_set, 0)
+    render_pass.set_pipeline(render_pipeline)
+    render_pass.draw(4, 2)
+    end_pipeline_statistics_query(render_pass)
+    render_pass.end()
+
+    compute_pass = command_encoder.begin_compute_pass()
+    begin_pipeline_statistics_query(compute_pass, statistics_query_set, 1)
+    compute_pass.set_pipeline(compute_pipeline)
+    compute_pass.dispatch_workgroups(10)
+    end_pipeline_statistics_query(compute_pass)
+    compute_pass.end()
+
+    command_encoder.resolve_query_set(statistics_query_set, 0, 2, statistics_buffer, 0)
+    device.queue.submit([command_encoder.finish()])
+
+    render_result = (
+        device.queue.read_buffer(statistics_buffer, size=40).cast("Q").tolist()
+    )
+    compute_result = (
+        device.queue.read_buffer(statistics_buffer, buffer_offset=40)
+        .cast("Q")
+        .tolist()
+    )
+
+    # We know that the compute shader was invoked exactly 10 * 64 times.
+    assert compute_result == [0, 0, 0, 0, 10 * 64]
+    assert render_result[0] == 8  # 4 vertices, 2 instances
+    assert render_result[1] == 4  # 4 triangles
+    # unclear what exactly render_result[2] is.
+    assert render_result[3] > 1000
+    assert render_result[4] == 0  # no calls to the compute engine
+
+
+def test_enum_is_in_sync():
+    """
+    The enum PipelineStatisticName is created by hand, while the enum_str2int value
+    is generated automatically from wgpu.h. They should both contain the same strings.
+    If this test fails, their values have diverged.
+
+    Either fix PipelineStatisticName or modify this test and explain what the
+    difference is.
+    """
+    from wgpu.backends.wgpu_native._mappings import enum_str2int
+
+    enum_list = set(PipelineStatisticName)
+    native_list = set(enum_str2int["PipelineStatisticName"].keys())
+    assert enum_list == native_list
+
+
+if __name__ == "__main__":
+    run_tests(globals())
diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index d996278e..8e7f7279 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -1952,13 +1952,44 @@ def create_render_bundle_encoder(
         return GPURenderBundleEncoder(label, render_bundle_id, self)
 
     def create_query_set(self, *, label: str = "", type: enums.QueryType, count: int):
+        return self._create_query_set(label, type, count, None)
+
+    def _create_statistics_query_set(self, label, count, statistics):
+        values = []
+        for name in statistics:
+            key = to_snake_case(name.replace("_", "-"), "-")
+            value = enum_str2int["PipelineStatisticName"][key]
+            values.append(value)
+        values.sort()
+        return self._create_query_set(
+            label, lib.WGPUNativeQueryType_PipelineStatistics, count, values
+        )
+
+    def _create_query_set(self, label, type, count, statistics):
+        next_in_chain = ffi.NULL
+        if statistics:
+            c_statistics = ffi.new("WGPUPipelineStatisticName[]", statistics)
+            # H: chain: WGPUChainedStruct, pipelineStatistics: WGPUPipelineStatisticName *, pipelineStatisticCount: int
+            query_set_descriptor_extras = new_struct_p(
+                "WGPUQuerySetDescriptorExtras *",
+                pipelineStatisticCount=len(statistics),
+                pipelineStatistics=ffi.cast(
+                    "WGPUPipelineStatisticName const *", c_statistics
+                ),
+                # not used: chain
+            )
+            query_set_descriptor_extras.chain.sType = (
+                lib.WGPUSType_QuerySetDescriptorExtras
+            )
+            next_in_chain = ffi.cast("WGPUChainedStruct *", query_set_descriptor_extras)
+
         # H: nextInChain: WGPUChainedStruct *, label: char *, type: WGPUQueryType, count: int
         query_set_descriptor = new_struct_p(
             "WGPUQuerySetDescriptor *",
             label=to_c_label(label),
             type=type,
             count=count,
-            # not used: nextInChain
+            nextInChain=next_in_chain,
         )
 
         # H: WGPUQuerySet f(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor)
@@ -2601,8 +2632,6 @@ def begin_render_pass(
         timestamp_writes: structs.RenderPassTimestampWrites = optional,
         max_draw_count: int = 50000000,
     ):
-        # Note that occlusion_query_set is ignored because wgpu-native does not have it.
-
         c_timestamp_writes_struct = ffi.NULL
         if timestamp_writes is not None:
             check_struct("RenderPassTimestampWrites", timestamp_writes)
@@ -3036,6 +3065,16 @@ def dispatch_workgroups_indirect(
             self._internal, buffer_id, int(indirect_offset)
         )
 
+    def _begin_pipeline_statistics_query(self, query_set, query_index):
+        # H: void f(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex)
+        libf.wgpuComputePassEncoderBeginPipelineStatisticsQuery(
+            self._internal, query_set._internal, int(query_index)
+        )
+
+    def _end_pipeline_statistics_query(self):
+        # H: void f(WGPUComputePassEncoder computePassEncoder)
+        libf.wgpuComputePassEncoderEndPipelineStatisticsQuery(self._internal)
+
     def end(self):
         # H: void f(WGPUComputePassEncoder computePassEncoder)
         libf.wgpuComputePassEncoderEnd(self._internal)
@@ -3185,6 +3224,16 @@ def _multi_draw_indexed_indirect(self, buffer, offset, count):
             self._internal, buffer._internal, int(offset), int(count)
         )
 
+    def _begin_pipeline_statistics_query(self, query_set, query_index):
+        # H: void f(WGPURenderPassEncoder renderPassEncoder, WGPUQuerySet querySet, uint32_t queryIndex)
+        libf.wgpuRenderPassEncoderBeginPipelineStatisticsQuery(
+            self._internal, query_set._internal, int(query_index)
+        )
+
+    def _end_pipeline_statistics_query(self):
+        # H: void f(WGPURenderPassEncoder renderPassEncoder)
+        libf.wgpuRenderPassEncoderEndPipelineStatisticsQuery(self._internal)
+
 
 class GPURenderBundleEncoder(
     classes.GPURenderBundleEncoder,
diff --git a/wgpu/backends/wgpu_native/_mappings.py b/wgpu/backends/wgpu_native/_mappings.py
index 0a9d9f2a..976671cc 100644
--- a/wgpu/backends/wgpu_native/_mappings.py
+++ b/wgpu/backends/wgpu_native/_mappings.py
@@ -333,7 +333,15 @@
         "shader-primitive-index": 196639,
         "shader-early-depth-test": 196640,
     },
+    "PipelineStatisticName": {
+        "vertex-shader-invocations": 0,
+        "clipper-invocations": 1,
+        "clipper-primitives-out": 2,
+        "fragment-shader-invocations": 3,
+        "compute-shader-invocations": 4,
+    },
 }
+
 enum_int2str = {
     "BackendType": {
         0: "Undefined",
diff --git a/wgpu/backends/wgpu_native/extras.py b/wgpu/backends/wgpu_native/extras.py
index 4b363439..28aed319 100644
--- a/wgpu/backends/wgpu_native/extras.py
+++ b/wgpu/backends/wgpu_native/extras.py
@@ -1,14 +1,25 @@
 import os
-
-from ._api import GPUBindGroupLayout, structs, enums, Dict, logger
 from typing import List
 
+from . import GPUComputePassEncoder, GPURenderPassEncoder
+from ._api import Dict, GPUBindGroupLayout, enums, logger, structs
+from ...enums import Enum
+
+
 # NOTE: these functions represent backend-specific extra API.
 # NOTE: changes to this module must be reflected in docs/backends.rst.
 # We don't use Sphinx automodule because this way the doc build do not
 # need to be able to load wgpu-native.
 
 
+class PipelineStatisticName(Enum):  # wgpu native
+    VertexShaderInvocations = "vertex-shader-invocations"
+    ClipperInvocations = "clipper-invocations"
+    ClipperPrimitivesOut = "clipper-primitives-out"
+    FragmentShaderInvocations = "fragment-shader-invocations"
+    ComputeShaderInvocations = "compute-shader-invocations"
+
+
 def request_device_sync(
     adapter,
     trace_path,
@@ -69,22 +80,42 @@ def set_push_constants(
 
 
 def multi_draw_indirect(render_pass_encoder, buffer, *, offset=0, count):
     """
-    This is equvalent to
+    This is equivalent to
     for i in range(count):
         render_pass_encoder.draw(buffer, offset + i * 16)
 
-    You must enable the featue "multi-draw-indirect" to use this function.
+    You must enable the feature "multi-draw-indirect" to use this function.
""" render_pass_encoder._multi_draw_indirect(buffer, offset, count) def multi_draw_indexed_indirect(render_pass_encoder, buffer, *, offset=0, count): """ - This is equvalent to + This is equivalent to for i in range(count): render_pass_encoder.draw_indexed(buffer, offset + i * 20) - You must enable the featue "multi-draw-indirect" to use this function. + You must enable the feature "multi-draw-indirect" to use this function. """ render_pass_encoder._multi_draw_indexed_indirect(buffer, offset, count) + + +def create_statistics_query_set(device, *, label="", count: int, statistics): + """ + Create a query set that can collect the specified pipeline statistics. + You must enable the feature "pipeline-statitistics_query" to collect pipeline + statistics. + """ + return device._create_statistics_query_set(label, count, statistics) + + +def begin_pipeline_statistics_query(encoder, query_set, query_index): + print(encoder, type(encoder)) + assert isinstance(encoder, (GPURenderPassEncoder, GPUComputePassEncoder)) + encoder._begin_pipeline_statistics_query(query_set, query_index) + + +def end_pipeline_statistics_query(encoder): + assert isinstance(encoder, (GPURenderPassEncoder, GPUComputePassEncoder)) + encoder._end_pipeline_statistics_query() diff --git a/wgpu/resources/codegen_report.md b/wgpu/resources/codegen_report.md index 26022791..35cc493c 100644 --- a/wgpu/resources/codegen_report.md +++ b/wgpu/resources/codegen_report.md @@ -20,7 +20,7 @@ * Diffs for GPUQueue: add read_buffer, add read_texture, hide copy_external_image_to_texture * Validated 37 classes, 124 methods, 46 properties ### Patching API for backends/wgpu_native/_api.py -* Validated 37 classes, 106 methods, 0 properties +* Validated 37 classes, 112 methods, 0 properties ## Validating backends/wgpu_native/_api.py * Enum field FeatureName.texture-compression-bc-sliced-3d missing in wgpu.h * Enum field FeatureName.clip-distances missing in wgpu.h @@ -35,6 +35,6 @@ * Enum CanvasAlphaMode missing in wgpu.h * Enum CanvasToneMappingMode missing in wgpu.h * Wrote 236 enum mappings and 47 struct-field mappings to wgpu_native/_mappings.py -* Validated 136 C function calls -* Not using 69 C functions -* Validated 81 C structs +* Validated 140 C function calls +* Not using 65 C functions +* Validated 82 C structs