Merge branch 'async-native' of github.com:pygfx/wgpu-py into async-native
almarklein committed Oct 2, 2024
2 parents 1046c41 + aed2616 commit f55d79c
Showing 7 changed files with 342 additions and 23 deletions.
17 changes: 8 additions & 9 deletions codegen/wgpu_native_patcher.py
@@ -121,22 +121,21 @@ def write_mappings():

    # Write a few native-only mappings: key => int
    pylines.append("enum_str2int = {")
    for name in ["BackendType"]:
    for name, use_snake in (
        ("BackendType", False),
        ("NativeFeature", True),
        ("PipelineStatisticName", True),
    ):
        pylines.append(f'    "{name}":' + " {")
        for key, val in hp.enums[name].items():
            if key == "Force32":
                continue
            if use_snake:
                key = to_snake_case(key).replace("_", "-")
            pylines.append(f'        "{key}": {val},')
        pylines.append("    },")
    for name in ["NativeFeature"]:
        pylines.append(f'    "{name}":' + " {")
        for key, val in hp.enums[name].items():
            if key == "Force32":
                continue
            xkey = to_snake_case(key).replace("_", "-")
            pylines.append(f'        "{xkey}": {val},')
        pylines.append("    },")
    pylines.append("}")
    pylines.append("")

# Write a few native-only mappings: int => key
# If possible, resolve to WebGPU names, otherwise use the native name.
59 changes: 58 additions & 1 deletion docs/backends.rst
@@ -45,7 +45,6 @@ It also works out of the box, because the wgpu-native DLL is shipped with wgpu-p
The wgpu_native backend provides a few extra functionalities:

.. py:function:: wgpu.backends.wgpu_native.request_device_sync(adapter, trace_path, *, label="", required_features, required_limits, default_queue)
    An alternative to :func:`wgpu.GPUAdapter.request_device`, that streams a trace
    of all low-level calls to disk, so that the trace can be replayed (also on other systems),
    inspected, and debugged.
@@ -188,6 +187,64 @@ they reduce driver overhead on the CPU.
    :param offset: The byte offset in the indirect buffer containing the first argument.
    :param count: The number of draw operations to perform.

Some GPUs allow you to collect statistics on their pipelines. GPUs that support this
have the feature "pipeline-statistics-query", and you must enable this feature when
requesting the device.

You create a query set using the function
``wgpu.backends.wgpu_native.create_statistics_query_set``.

The possible statistics are:

* ``PipelineStatisticName.VertexShaderInvocations`` = "vertex-shader-invocations"
  * The number of times the vertex shader is called.
* ``PipelineStatisticName.ClipperInvocations`` = "clipper-invocations"
  * The number of primitives (e.g. triangles) that reach the clipper.
* ``PipelineStatisticName.ClipperPrimitivesOut`` = "clipper-primitives-out"
  * The number of primitives output by the clipper.
* ``PipelineStatisticName.FragmentShaderInvocations`` = "fragment-shader-invocations"
  * The number of times the fragment shader is called.
* ``PipelineStatisticName.ComputeShaderInvocations`` = "compute-shader-invocations"
  * The number of times the compute shader is called.

The ``statistics`` argument is a list or tuple of statistic names. Each element of the
sequence must be one of the following:

* The enum value, e.g. ``PipelineStatisticName.FragmentShaderInvocations``
* A CamelCase string, e.g. ``"VertexShaderInvocations"``
* A hyphenated string, e.g. ``"vertex-shader-invocations"``
* An underscored string, e.g. ``"vertex_shader_invocations"``

You may use any number of these statistics in a query set. Each result is an 8-byte
unsigned integer, and the total size of each entry in the query set is 8 times
the number of statistics chosen.

The statistics are always output to the query set in the order above, even if they are
given in a different order in the list.
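
For example, a minimal sketch of creating a statistics query set (assuming ``device`` was
requested with the "pipeline-statistics-query" feature enabled; the variable names are
illustrative only)::

    from wgpu.backends.wgpu_native.extras import create_statistics_query_set

    statistics = ["vertex-shader-invocations", "clipper-primitives-out"]
    query_set = create_statistics_query_set(device, count=2, statistics=statistics)

    # Each result is an 8-byte unsigned integer, so a buffer that receives the
    # resolved results needs count * len(statistics) * 8 bytes.
    buffer_size = 2 * len(statistics) * 8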

.. py:function:: wgpu.backends.wgpu_native.create_statistics_query_set(device, count, statistics)

    Create a query set that can hold ``count`` results for the specified statistics.
    The statistics are specified as a sequence of strings (or enum values).

    :param device: The device.
    :param count: The number of entries that go into the query set.
    :param statistics: A sequence of strings giving the desired statistics.

.. py:function:: wgpu.backends.wgpu_native.begin_pipeline_statistics_query(encoder, query_set, index)

    Start collecting statistics.

    :param encoder: The ComputePassEncoder or RenderPassEncoder.
    :param query_set: The query set into which to save the result.
    :param index: The index within the query set at which to write the result.

.. py:function:: wgpu.backends.wgpu_native.end_pipeline_statistics_query(encoder)

    Stop collecting statistics and write them into the query set.

    :param encoder: The ComputePassEncoder or RenderPassEncoder.
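
Putting the pieces together, a rough sketch of collecting and reading back statistics
(assuming ``device`` has the "pipeline-statistics-query" feature, ``query_set`` was created
as above with two statistics, and ``render_pipeline`` and ``color_attachment`` already
exist; the names are illustrative, not part of the API)::

    import wgpu
    from wgpu.backends.wgpu_native.extras import (
        begin_pipeline_statistics_query,
        end_pipeline_statistics_query,
    )

    # Buffer that receives the resolved results: 1 entry x 2 statistics x 8 bytes.
    result_buffer = device.create_buffer(
        size=2 * 8,
        usage=wgpu.BufferUsage.QUERY_RESOLVE | wgpu.BufferUsage.COPY_SRC,
    )

    encoder = device.create_command_encoder()
    render_pass = encoder.begin_render_pass(color_attachments=[color_attachment])
    begin_pipeline_statistics_query(render_pass, query_set, 0)
    render_pass.set_pipeline(render_pipeline)
    render_pass.draw(3)
    end_pipeline_statistics_query(render_pass)
    render_pass.end()

    # Resolve entry 0 of the query set into the buffer, then read it back.
    encoder.resolve_query_set(query_set, 0, 1, result_buffer, 0)
    device.queue.submit([encoder.finish()])
    results = device.queue.read_buffer(result_buffer).cast("Q").tolist()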


The js_webgpu backend
---------------------
175 changes: 175 additions & 0 deletions tests/test_wgpu_statistics_query.py
@@ -0,0 +1,175 @@
"""
Test statistics queries.
"""

import sys

import numpy as np
import pytest
from pytest import skip

import wgpu
from testutils import can_use_wgpu_lib, is_ci, run_tests
from wgpu import TextureFormat
from wgpu.backends.wgpu_native.extras import (
    PipelineStatisticName,
    begin_pipeline_statistics_query,
    create_statistics_query_set,
    end_pipeline_statistics_query,
)

if not can_use_wgpu_lib:
    skip("Skipping tests that need the wgpu lib", allow_module_level=True)
elif is_ci and sys.platform == "win32":
    skip("These tests fail on dx12 for some reason", allow_module_level=True)


default_shader_source = """
// Draws a square with side 0.1 centered at the indicated location.
// If reverse, we take the vertices clockwise rather than counterclockwise so that
// we can test culling.
@vertex
fn vertex(@builtin(vertex_index) vertex_index : u32,
          @builtin(instance_index) instance_index : u32
) -> @builtin(position) vec4<f32> {
    var positions = array<vec2f, 4>(
        vec2f(-0.05, -0.05),
        vec2f( 0.05, -0.05),
        vec2f(-0.05, 0.05),
        vec2f( 0.05, 0.05),
    );
    var p = positions[vertex_index];
    if instance_index == 1 {
        // Swapping x and y will cause the coordinates to be cw instead of ccw
        p = vec2f(p.y, p.x);
    }
    return vec4f(p, 0.0, 1.0);
}
@fragment
fn fragment( ) -> @location(0) vec4f {
    return vec4f();
}
@compute @workgroup_size(64)
fn compute() {
}
"""


def test_render_occluding_squares():
    adapter = wgpu.gpu.request_adapter(power_preference="high-performance")
    try:
        device = adapter.request_device(required_features=["pipeline-statistics-query"])
    except RuntimeError:
        pytest.skip("pipeline-statistics-query not supported")

    pipeline_layout = device.create_pipeline_layout(bind_group_layouts=[])

    output_texture = device.create_texture(
        size=[1024, 1024],
        usage=wgpu.TextureUsage.RENDER_ATTACHMENT,
        format=TextureFormat.rgba8unorm,
    )

    shader = device.create_shader_module(code=default_shader_source)
    render_pipeline = device.create_render_pipeline(
        layout=pipeline_layout,
        vertex={
            "module": shader,
        },
        fragment={
            "module": shader,
            "targets": [{"format": output_texture.format}],
        },
        primitive={
            "topology": wgpu.PrimitiveTopology.triangle_strip,
            "cull_mode": wgpu.CullMode.back,
        },
    )

    compute_pipeline = device.create_compute_pipeline(
        layout=pipeline_layout,
        compute={"module": shader},
    )

    color_attachment = {
        "clear_value": (0, 0, 0, 0),  # only first value matters
        "load_op": "clear",
        "store_op": "store",
        "view": output_texture.create_view(),
    }

    occlusion_query_set = create_statistics_query_set(
        device,
        count=2,
        statistics=[
            "vertex-shader-invocations",  # name can be a hyphenated string
            "ClipperInvocations",  # name can be CamelCase
            "clipper-primitives-out",
            "fragment_shader_invocations",  # name can have underscores
            PipelineStatisticName.ComputeShaderInvocations,  # and there's an enum.
        ],
    )
    occlusion_buffer = device.create_buffer(
        size=2 * 5 * np.uint64().itemsize,
        usage=wgpu.BufferUsage.COPY_SRC | wgpu.BufferUsage.QUERY_RESOLVE,
    )

    command_encoder = device.create_command_encoder()

    render_pass = command_encoder.begin_render_pass(
        color_attachments=[color_attachment]
    )
    begin_pipeline_statistics_query(render_pass, occlusion_query_set, 0)
    render_pass.set_pipeline(render_pipeline)
    render_pass.draw(4, 2)
    end_pipeline_statistics_query(render_pass)
    render_pass.end()

    compute_pass = command_encoder.begin_compute_pass()
    begin_pipeline_statistics_query(compute_pass, occlusion_query_set, 1)
    compute_pass.set_pipeline(compute_pipeline)
    compute_pass.dispatch_workgroups(10)
    end_pipeline_statistics_query(compute_pass)
    compute_pass.end()

    command_encoder.resolve_query_set(occlusion_query_set, 0, 2, occlusion_buffer, 0)
    device.queue.submit([command_encoder.finish()])

    render_result = (
        device.queue.read_buffer(occlusion_buffer, size=40).cast("Q").tolist()
    )
    compute_result = (
        device.queue.read_buffer(occlusion_buffer, buffer_offset=40).cast("Q").tolist()
    )

    # We know that compute was called 10 * 64 times, exactly
    assert compute_result == [0, 0, 0, 0, 10 * 64]
    assert render_result[0] == 8  # 4 vertices, 2 instances
    assert render_result[1] == 4  # 4 triangles
    # unclear what exactly render_result[2] is.
    assert render_result[3] > 1000
    assert render_result[4] == 0  # no calls to the compute engine


def test_enum_is_in_sync():
    """
    The enum PipelineStatisticName is created by hand, while the enum_str2int value
    is generated automatically from wgpu.h. They should both contain the same strings.
    If this test fails, their values have diverged. Either fix PipelineStatisticName
    or modify this test and explain what the difference is.
    """
    from wgpu.backends.wgpu_native._mappings import enum_str2int

    enum_list = set(PipelineStatisticName)
    native_list = set(enum_str2int["PipelineStatisticName"].keys())
    assert enum_list == native_list


if __name__ == "__main__":
    run_tests(globals())
55 changes: 52 additions & 3 deletions wgpu/backends/wgpu_native/_api.py
@@ -1952,13 +1952,44 @@ def create_render_bundle_encoder(
        return GPURenderBundleEncoder(label, render_bundle_id, self)

    def create_query_set(self, *, label: str = "", type: enums.QueryType, count: int):
        return self._create_query_set(label, type, count, None)

    def _create_statistics_query_set(self, label, count, statistics):
        values = []
        for name in statistics:
            key = to_snake_case(name.replace("_", "-"), "-")
            value = enum_str2int["PipelineStatisticName"][key]
            values.append(value)
        values.sort()
        return self._create_query_set(
            label, lib.WGPUNativeQueryType_PipelineStatistics, count, values
        )

    def _create_query_set(self, label, type, count, statistics):
        next_in_chain = ffi.NULL
        if statistics:
            c_statistics = ffi.new("WGPUPipelineStatisticName[]", statistics)
            # H: chain: WGPUChainedStruct, pipelineStatistics: WGPUPipelineStatisticName *, pipelineStatisticCount: int
            query_set_descriptor_extras = new_struct_p(
                "WGPUQuerySetDescriptorExtras *",
                pipelineStatisticCount=len(statistics),
                pipelineStatistics=ffi.cast(
                    "WGPUPipelineStatisticName const *", c_statistics
                ),
                # not used: chain
            )
            query_set_descriptor_extras.chain.sType = (
                lib.WGPUSType_QuerySetDescriptorExtras
            )
            next_in_chain = ffi.cast("WGPUChainedStruct *", query_set_descriptor_extras)

        # H: nextInChain: WGPUChainedStruct *, label: char *, type: WGPUQueryType, count: int
        query_set_descriptor = new_struct_p(
            "WGPUQuerySetDescriptor *",
            label=to_c_label(label),
            type=type,
            count=count,
            # not used: nextInChain
            nextInChain=next_in_chain,
        )

        # H: WGPUQuerySet f(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor)
@@ -2601,8 +2632,6 @@ def begin_render_pass(
        timestamp_writes: structs.RenderPassTimestampWrites = optional,
        max_draw_count: int = 50000000,
    ):
        # Note that occlusion_query_set is ignored because wgpu-native does not have it.

        c_timestamp_writes_struct = ffi.NULL
        if timestamp_writes is not None:
            check_struct("RenderPassTimestampWrites", timestamp_writes)
@@ -3036,6 +3065,16 @@ def dispatch_workgroups_indirect(
            self._internal, buffer_id, int(indirect_offset)
        )

    def _begin_pipeline_statistics_query(self, query_set, query_index):
        # H: void f(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex)
        libf.wgpuComputePassEncoderBeginPipelineStatisticsQuery(
            self._internal, query_set._internal, int(query_index)
        )

    def _end_pipeline_statistics_query(self):
        # H: void f(WGPUComputePassEncoder computePassEncoder)
        libf.wgpuComputePassEncoderEndPipelineStatisticsQuery(self._internal)

    def end(self):
        # H: void f(WGPUComputePassEncoder computePassEncoder)
        libf.wgpuComputePassEncoderEnd(self._internal)
@@ -3185,6 +3224,16 @@ def _multi_draw_indexed_indirect(self, buffer, offset, count):
            self._internal, buffer._internal, int(offset), int(count)
        )

    def _begin_pipeline_statistics_query(self, query_set, query_index):
        # H: void f(WGPURenderPassEncoder renderPassEncoder, WGPUQuerySet querySet, uint32_t queryIndex)
        libf.wgpuRenderPassEncoderBeginPipelineStatisticsQuery(
            self._internal, query_set._internal, int(query_index)
        )

    def _end_pipeline_statistics_query(self):
        # H: void f(WGPURenderPassEncoder renderPassEncoder)
        libf.wgpuRenderPassEncoderEndPipelineStatisticsQuery(self._internal)


class GPURenderBundleEncoder(
    classes.GPURenderBundleEncoder,
8 changes: 8 additions & 0 deletions wgpu/backends/wgpu_native/_mappings.py
@@ -333,7 +333,15 @@
"shader-primitive-index": 196639,
"shader-early-depth-test": 196640,
},
"PipelineStatisticName": {
"vertex-shader-invocations": 0,
"clipper-invocations": 1,
"clipper-primitives-out": 2,
"fragment-shader-invocations": 3,
"compute-shader-invocations": 4,
},
}

enum_int2str = {
"BackendType": {
0: "Undefined",