Merge branch 'async-native' of github.com:pygfx/wgpu-py into async-native
almarklein committed Oct 2, 2024
2 parents 1046c41 + aed2616 commit f55d79c
Showing 7 changed files with 342 additions and 23 deletions.
17 changes: 8 additions & 9 deletions codegen/wgpu_native_patcher.py
@@ -121,22 +121,21 @@ def write_mappings():

    # Write a few native-only mappings: key => int
    pylines.append("enum_str2int = {")
    for name in ["BackendType"]:
    for name, use_snake in (
        ("BackendType", False),
        ("NativeFeature", True),
        ("PipelineStatisticName", True),
    ):
        pylines.append(f'    "{name}":' + " {")
        for key, val in hp.enums[name].items():
            if key == "Force32":
                continue
            if use_snake:
                key = to_snake_case(key).replace("_", "-")
            pylines.append(f'        "{key}": {val},')
        pylines.append("    },")
    for name in ["NativeFeature"]:
        pylines.append(f'    "{name}":' + " {")
        for key, val in hp.enums[name].items():
            if key == "Force32":
                continue
            xkey = to_snake_case(key).replace("_", "-")
            pylines.append(f'        "{xkey}": {val},')
        pylines.append("    },")
    pylines.append("}")
    pylines.append("")

# Write a few native-only mappings: int => key
# If possible, resolve to WebGPU names, otherwise use the native name.
59 changes: 58 additions & 1 deletion docs/backends.rst
@@ -45,7 +45,6 @@ It also works out of the box, because the wgpu-native DLL is shipped with wgpu-p
The wgpu_native backend provides a few extra functionalities:

.. py:function:: wgpu.backends.wgpu_native.request_device_sync(adapter, trace_path, *, label="", required_features, required_limits, default_queue)
    An alternative to :func:`wgpu.GPUAdapter.request_device`, that streams a trace
    of all low-level calls to disk, so that the trace can be replayed (also on other systems),
    inspected, and debugged.
@@ -188,6 +187,64 @@ they reduce driver overhead on the CPU.
    :param offset: The byte offset in the indirect buffer containing the first argument.
    :param count: The number of draw operations to perform.

Some GPUs allow you to collect statistics on their pipelines. GPUs that support this
have the feature "pipeline-statistics-query", and you must enable this feature when
requesting the device.

You create a query set using the function
``wgpu.backends.wgpu_native.create_statistics_query_set``.

The possible statistics are:

* ``PipelineStatisticName.VertexShaderInvocations`` = "vertex-shader-invocations"
  * The number of times the vertex shader is called.
* ``PipelineStatisticName.ClipperInvocations`` = "clipper-invocations"
  * The number of primitives (e.g. triangles) that reach the clipper.
* ``PipelineStatisticName.ClipperPrimitivesOut`` = "clipper-primitives-out"
  * The number of primitives output by the clipper.
* ``PipelineStatisticName.FragmentShaderInvocations`` = "fragment-shader-invocations"
  * The number of times the fragment shader is called.
* ``PipelineStatisticName.ComputeShaderInvocations`` = "compute-shader-invocations"
  * The number of times the compute shader is called.

The ``statistics`` argument is a list or tuple of statistic names. Each element of the
sequence must be one of the following:

* The enum value, e.g. ``PipelineStatisticName.FragmentShaderInvocations``
* A CamelCase string, e.g. ``"VertexShaderInvocations"``
* A hyphenated string, e.g. ``"vertex-shader-invocations"``
* An underscored string, e.g. ``"vertex_shader_invocations"``

You may use any number of these statistics in a query set. Each result is an 8-byte
unsigned integer, and the total size of each entry in the query set is 8 times
the number of statistics chosen.

The statistics are always output to the query set in the order above, even if they are
given in a different order in the list.
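
For example, a minimal sketch of creating a statistics query set (assuming ``device`` was
requested with the "pipeline-statistics-query" feature enabled; the variable names are
illustrative only)::

    from wgpu.backends.wgpu_native.extras import create_statistics_query_set

    statistics = ["vertex-shader-invocations", "clipper-primitives-out"]
    query_set = create_statistics_query_set(device, count=2, statistics=statistics)

    # Each result is an 8-byte unsigned integer, so a buffer that receives the
    # resolved results needs count * len(statistics) * 8 bytes.
    buffer_size = 2 * len(statistics) * 8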

.. py:function:: wgpu.backends.wgpu_native.create_statistics_query_set(device, count, statistics)

    Create a query set that can hold ``count`` results for the specified statistics.
    The statistics are specified as a sequence of strings (or enum values).

    :param device: The device.
    :param count: The number of entries that go into the query set.
    :param statistics: A sequence of strings giving the desired statistics.

.. py:function:: wgpu.backends.wgpu_native.begin_pipeline_statistics_query(encoder, query_set, index)

    Start collecting statistics.

    :param encoder: The ComputePassEncoder or RenderPassEncoder.
    :param query_set: The query set into which to save the result.
    :param index: The index within the query set at which to write the result.

.. py:function:: wgpu.backends.wgpu_native.end_pipeline_statistics_query(encoder)

    Stop collecting statistics and write them into the query set.

    :param encoder: The ComputePassEncoder or RenderPassEncoder.
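
Putting the pieces together, a rough sketch of collecting and reading back statistics
(assuming ``device`` has the "pipeline-statistics-query" feature, ``query_set`` was created
as above with two statistics, and ``render_pipeline`` and ``color_attachment`` already
exist; the names are illustrative, not part of the API)::

    import wgpu
    from wgpu.backends.wgpu_native.extras import (
        begin_pipeline_statistics_query,
        end_pipeline_statistics_query,
    )

    # Buffer that receives the resolved results: 1 entry x 2 statistics x 8 bytes.
    result_buffer = device.create_buffer(
        size=2 * 8,
        usage=wgpu.BufferUsage.QUERY_RESOLVE | wgpu.BufferUsage.COPY_SRC,
    )

    encoder = device.create_command_encoder()
    render_pass = encoder.begin_render_pass(color_attachments=[color_attachment])
    begin_pipeline_statistics_query(render_pass, query_set, 0)
    render_pass.set_pipeline(render_pipeline)
    render_pass.draw(3)
    end_pipeline_statistics_query(render_pass)
    render_pass.end()

    # Resolve entry 0 of the query set into the buffer, then read it back.
    encoder.resolve_query_set(query_set, 0, 1, result_buffer, 0)
    device.queue.submit([encoder.finish()])
    results = device.queue.read_buffer(result_buffer).cast("Q").tolist()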


The js_webgpu backend
---------------------
175 changes: 175 additions & 0 deletions tests/test_wgpu_statistics_query.py
@@ -0,0 +1,175 @@
"""
Test statistics queries.
"""

import sys

import numpy as np
import pytest
from pytest import skip

import wgpu
from testutils import can_use_wgpu_lib, is_ci, run_tests
from wgpu import TextureFormat
from wgpu.backends.wgpu_native.extras import (
    PipelineStatisticName,
    begin_pipeline_statistics_query,
    create_statistics_query_set,
    end_pipeline_statistics_query,
)

if not can_use_wgpu_lib:
    skip("Skipping tests that need the wgpu lib", allow_module_level=True)
elif is_ci and sys.platform == "win32":
    skip("These tests fail on dx12 for some reason", allow_module_level=True)


default_shader_source = """
// Draws a square with side 0.1 centered at the indicated location.
// If reverse, we take the vertices clockwise rather than counterclockwise so that
// we can test culling.
@vertex
fn vertex(@builtin(vertex_index) vertex_index : u32,
          @builtin(instance_index) instance_index : u32
) -> @builtin(position) vec4<f32> {
    var positions = array<vec2f, 4>(
        vec2f(-0.05, -0.05),
        vec2f( 0.05, -0.05),
        vec2f(-0.05, 0.05),
        vec2f( 0.05, 0.05),
    );
    var p = positions[vertex_index];
    if instance_index == 1 {
        // Swapping x and y will cause the coordinates to be cw instead of ccw
        p = vec2f(p.y, p.x);
    }
    return vec4f(p, 0.0, 1.0);
}
@fragment
fn fragment( ) -> @location(0) vec4f {
    return vec4f();
}
@compute @workgroup_size(64)
fn compute() {
}
"""


def test_render_occluding_squares():
    adapter = wgpu.gpu.request_adapter(power_preference="high-performance")
    try:
        device = adapter.request_device(required_features=["pipeline-statistics-query"])
    except RuntimeError:
        pytest.skip("pipeline-statistics-query not supported")

    pipeline_layout = device.create_pipeline_layout(bind_group_layouts=[])

    output_texture = device.create_texture(
        size=[1024, 1024],
        usage=wgpu.TextureUsage.RENDER_ATTACHMENT,
        format=TextureFormat.rgba8unorm,
    )

    shader = device.create_shader_module(code=default_shader_source)
    render_pipeline = device.create_render_pipeline(
        layout=pipeline_layout,
        vertex={
            "module": shader,
        },
        fragment={
            "module": shader,
            "targets": [{"format": output_texture.format}],
        },
        primitive={
            "topology": wgpu.PrimitiveTopology.triangle_strip,
            "cull_mode": wgpu.CullMode.back,
        },
    )

    compute_pipeline = device.create_compute_pipeline(
        layout=pipeline_layout,
        compute={"module": shader},
    )

    color_attachment = {
        "clear_value": (0, 0, 0, 0),  # only first value matters
        "load_op": "clear",
        "store_op": "store",
        "view": output_texture.create_view(),
    }

    occlusion_query_set = create_statistics_query_set(
        device,
        count=2,
        statistics=[
            "vertex-shader-invocations",  # name can be a hyphenated string
            "ClipperInvocations",  # name can be CamelCase
            "clipper-primitives-out",
            "fragment_shader_invocations",  # name can have underscores
            PipelineStatisticName.ComputeShaderInvocations,  # and there's an enum.
        ],
    )
    occlusion_buffer = device.create_buffer(
        size=2 * 5 * np.uint64().itemsize,
        usage=wgpu.BufferUsage.COPY_SRC | wgpu.BufferUsage.QUERY_RESOLVE,
    )

    command_encoder = device.create_command_encoder()

    render_pass = command_encoder.begin_render_pass(
        color_attachments=[color_attachment]
    )
    begin_pipeline_statistics_query(render_pass, occlusion_query_set, 0)
    render_pass.set_pipeline(render_pipeline)
    render_pass.draw(4, 2)
    end_pipeline_statistics_query(render_pass)
    render_pass.end()

    compute_pass = command_encoder.begin_compute_pass()
    begin_pipeline_statistics_query(compute_pass, occlusion_query_set, 1)
    compute_pass.set_pipeline(compute_pipeline)
    compute_pass.dispatch_workgroups(10)
    end_pipeline_statistics_query(compute_pass)
    compute_pass.end()

    command_encoder.resolve_query_set(occlusion_query_set, 0, 2, occlusion_buffer, 0)
    device.queue.submit([command_encoder.finish()])

    render_result = (
        device.queue.read_buffer(occlusion_buffer, size=40).cast("Q").tolist()
    )
    compute_result = (
        device.queue.read_buffer(occlusion_buffer, buffer_offset=40).cast("Q").tolist()
    )

    # We know that compute was called 10 * 64 times, exactly
    assert compute_result == [0, 0, 0, 0, 10 * 64]
    assert render_result[0] == 8  # 4 vertices, 2 instances
    assert render_result[1] == 4  # 4 triangles
    # unclear what exactly render_result[2] is.
    assert render_result[3] > 1000
    assert render_result[4] == 0  # no calls to the compute engine


def test_enum_is_in_sync():
    """
    The enum PipelineStatisticName is created by hand, while the enum_str2int value
    is generated automatically from wgpu.h. They should both contain the same strings.
    If this test fails, their values have diverged. Either fix PipelineStatisticName
    or modify this test and explain what the difference is.
    """
    from wgpu.backends.wgpu_native._mappings import enum_str2int

    enum_list = set(PipelineStatisticName)
    native_list = set(enum_str2int["PipelineStatisticName"].keys())
    assert enum_list == native_list


if __name__ == "__main__":
    run_tests(globals())
55 changes: 52 additions & 3 deletions wgpu/backends/wgpu_native/_api.py
@@ -1952,13 +1952,44 @@ def create_render_bundle_encoder(
        return GPURenderBundleEncoder(label, render_bundle_id, self)

    def create_query_set(self, *, label: str = "", type: enums.QueryType, count: int):
        return self._create_query_set(label, type, count, None)

    def _create_statistics_query_set(self, label, count, statistics):
        values = []
        for name in statistics:
            key = to_snake_case(name.replace("_", "-"), "-")
            value = enum_str2int["PipelineStatisticName"][key]
            values.append(value)
        values.sort()
        return self._create_query_set(
            label, lib.WGPUNativeQueryType_PipelineStatistics, count, values
        )

    def _create_query_set(self, label, type, count, statistics):
        next_in_chain = ffi.NULL
        if statistics:
            c_statistics = ffi.new("WGPUPipelineStatisticName[]", statistics)
            # H: chain: WGPUChainedStruct, pipelineStatistics: WGPUPipelineStatisticName *, pipelineStatisticCount: int
            query_set_descriptor_extras = new_struct_p(
                "WGPUQuerySetDescriptorExtras *",
                pipelineStatisticCount=len(statistics),
                pipelineStatistics=ffi.cast(
                    "WGPUPipelineStatisticName const *", c_statistics
                ),
                # not used: chain
            )
            query_set_descriptor_extras.chain.sType = (
                lib.WGPUSType_QuerySetDescriptorExtras
            )
            next_in_chain = ffi.cast("WGPUChainedStruct *", query_set_descriptor_extras)

        # H: nextInChain: WGPUChainedStruct *, label: char *, type: WGPUQueryType, count: int
        query_set_descriptor = new_struct_p(
            "WGPUQuerySetDescriptor *",
            label=to_c_label(label),
            type=type,
            count=count,
            # not used: nextInChain
            nextInChain=next_in_chain,
        )

        # H: WGPUQuerySet f(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor)
@@ -2601,8 +2632,6 @@ def begin_render_pass(
        timestamp_writes: structs.RenderPassTimestampWrites = optional,
        max_draw_count: int = 50000000,
    ):
        # Note that occlusion_query_set is ignored because wgpu-native does not have it.

        c_timestamp_writes_struct = ffi.NULL
        if timestamp_writes is not None:
            check_struct("RenderPassTimestampWrites", timestamp_writes)
@@ -3036,6 +3065,16 @@ def dispatch_workgroups_indirect(
            self._internal, buffer_id, int(indirect_offset)
        )

    def _begin_pipeline_statistics_query(self, query_set, query_index):
        # H: void f(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex)
        libf.wgpuComputePassEncoderBeginPipelineStatisticsQuery(
            self._internal, query_set._internal, int(query_index)
        )

    def _end_pipeline_statistics_query(self):
        # H: void f(WGPUComputePassEncoder computePassEncoder)
        libf.wgpuComputePassEncoderEndPipelineStatisticsQuery(self._internal)

    def end(self):
        # H: void f(WGPUComputePassEncoder computePassEncoder)
        libf.wgpuComputePassEncoderEnd(self._internal)
@@ -3185,6 +3224,16 @@ def _multi_draw_indexed_indirect(self, buffer, offset, count):
            self._internal, buffer._internal, int(offset), int(count)
        )

    def _begin_pipeline_statistics_query(self, query_set, query_index):
        # H: void f(WGPURenderPassEncoder renderPassEncoder, WGPUQuerySet querySet, uint32_t queryIndex)
        libf.wgpuRenderPassEncoderBeginPipelineStatisticsQuery(
            self._internal, query_set._internal, int(query_index)
        )

    def _end_pipeline_statistics_query(self):
        # H: void f(WGPURenderPassEncoder renderPassEncoder)
        libf.wgpuRenderPassEncoderEndPipelineStatisticsQuery(self._internal)


class GPURenderBundleEncoder(
    classes.GPURenderBundleEncoder,
8 changes: 8 additions & 0 deletions wgpu/backends/wgpu_native/_mappings.py
@@ -333,7 +333,15 @@
"shader-primitive-index": 196639,
"shader-early-depth-test": 196640,
},
"PipelineStatisticName": {
"vertex-shader-invocations": 0,
"clipper-invocations": 1,
"clipper-primitives-out": 2,
"fragment-shader-invocations": 3,
"compute-shader-invocations": 4,
},
}

enum_int2str = {
"BackendType": {
0: "Undefined",