Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vulkan AS rebuild-on-replay: Copy AS input buffers on build #3426

Open
wants to merge 2 commits into
base: v1.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
512 changes: 512 additions & 0 deletions renderdoc/driver/vulkan/vk_acceleration_structure.cpp

Large diffs are not rendered by default.

82 changes: 80 additions & 2 deletions renderdoc/driver/vulkan/vk_acceleration_structure.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,58 @@

#pragma once

#include "vk_manager.h"
#include "vk_resources.h"

class WrappedVulkan;
struct VkInitialContents;

// Just holds the built flag, will eventually hold all the AS build data
struct VkAccelerationStructureInfo
{
struct GeometryData
{
struct Triangles
{
VkFormat vertexFormat;
VkDeviceSize vertexStride;
uint32_t maxVertex;
VkIndexType indexType;
bool hasTransformData;
};

struct Aabbs
{
VkDeviceSize stride;
};

VkGeometryTypeKHR geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR;
VkGeometryFlagsKHR flags;

VkDeviceMemory readbackMem;
VkDeviceSize memSize;

Triangles tris;
Aabbs aabbs;

VkAccelerationStructureBuildRangeInfoKHR buildRangeInfo;
};

~VkAccelerationStructureInfo();

void AddRef() { Atomic::Inc32(&refCount); }
void Release();

VkDevice device = VK_NULL_HANDLE;

VkAccelerationStructureTypeKHR type =
VkAccelerationStructureTypeKHR::VK_ACCELERATION_STRUCTURE_TYPE_GENERIC_KHR;
VkBuildAccelerationStructureFlagsKHR flags = 0;

rdcarray<GeometryData> geometryData;

bool accelerationStructureBuilt = false;

private:
int32_t refCount = 1;
};

class VulkanAccelerationStructureManager
Expand All @@ -43,8 +87,35 @@ class VulkanAccelerationStructureManager
bool isTLAS;
};

struct Allocation
{
VkDeviceMemory mem = VK_NULL_HANDLE;
VkDeviceSize size = 0;
VkBuffer buf = VK_NULL_HANDLE;
};

struct RecordAndOffset
{
VkResourceRecord *record = NULL;
VkDeviceAddress address = 0x0;
VkDeviceSize offset = 0;
};

explicit VulkanAccelerationStructureManager(WrappedVulkan *driver);

// Allocates readback mem and injects commands into the command buffer so that the input buffers
// are copied.
RDResult CopyInputBuffers(VkCommandBuffer commandBuffer,
const VkAccelerationStructureBuildGeometryInfoKHR &info,
const VkAccelerationStructureBuildRangeInfoKHR *buildRange,
CaptureState state);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This state seems redundant here - we don't do this during replay, and we don't need to differentiate between background and active capturing. Doing a quick look through unless I'm missing something this value is (ultimately) not used.

Same applies below to CopyAccelerationStructure.


// Copies the metadata from src to dst, the input buffers are identical so don't need to be
// duplicated. Compaction is ignored but the copy is still performed so the dst handle is valid
// on replay
void CopyAccelerationStructure(VkCommandBuffer commandBuffer,
const VkCopyAccelerationStructureInfoKHR &pInfo, CaptureState state);

// Called when the initial state is prepared. Any TLAS and BLAS data is copied into temporary
// buffers and the handles for that memory and the buffers is stored in the init state
bool Prepare(VkAccelerationStructureKHR unwrappedAs, const rdcarray<uint32_t> &queueFamilyIndices,
Expand All @@ -59,6 +130,13 @@ class VulkanAccelerationStructureManager
void Apply(ResourceId id, const VkInitialContents &initial);

private:
Allocation CreateReadBackMemory(VkDevice device, VkDeviceSize size, VkDeviceSize alignment = 0);

RecordAndOffset GetDeviceAddressData(VkDeviceAddress address) const;

template <typename T>
void DeletePreviousInfo(VkCommandBuffer commandBuffer, T *info, CaptureState state);

VkDeviceSize SerialisedASSize(VkAccelerationStructureKHR unwrappedAs);

WrappedVulkan *m_pDriver;
Expand Down
2 changes: 2 additions & 0 deletions renderdoc/driver/vulkan/vk_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,7 @@ class WrappedVulkan : public IFrameCapturer
void RemapQueueFamilyIndices(uint32_t &srcQueueFamily, uint32_t &dstQueueFamily);
uint32_t GetQueueFamilyIndex() const { return m_QueueFamilyIdx; }
bool ReleaseResource(WrappedVkRes *res);
const rdcarray<uint32_t> &GetQueueFamilyIndices() const { return m_QueueFamilyIndices; }

void AddDebugMessage(MessageCategory c, MessageSeverity sv, MessageSource src, rdcstr d);

Expand Down Expand Up @@ -1275,6 +1276,7 @@ class WrappedVulkan : public IFrameCapturer

void TrackBufferAddress(VkDevice device, VkBuffer buffer);
void UntrackBufferAddress(VkDevice device, VkBuffer buffer);
void GetResIDFromAddr(GPUAddressRange::Address addr, ResourceId &id, uint64_t &offs);

EventFlags GetEventFlags(uint32_t eid) { return m_EventFlags[eid]; }
rdcarray<EventUsage> GetUsage(ResourceId id) { return m_ResourceUses[id]; }
Expand Down
5 changes: 5 additions & 0 deletions renderdoc/driver/vulkan/vk_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ void WrappedVulkan::UntrackBufferAddress(VkDevice device, VkBuffer buffer)
m_AddressTracker.RemoveFrom(rng);
}

void WrappedVulkan::GetResIDFromAddr(GPUAddressRange::Address addr, ResourceId &id, uint64_t &offs)
{
m_AddressTracker.GetResIDFromAddr(addr, id, offs);
}

void WrappedVulkan::ChooseMemoryIndices()
{
// we need to do this little dance because Get*MemoryIndex checks to see if the existing
Expand Down
108 changes: 106 additions & 2 deletions renderdoc/driver/vulkan/vk_resources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3948,6 +3948,107 @@ void ImgRefs::Split(bool splitAspects, bool splitLevels, bool splitLayers)
areLayersSplit = newSplitLayerCount > 1;
}

void QueryPoolInfo::Add(uint32_t firstQuery, rdcarray<uint64_t> values)
{
Reset(firstQuery, (uint32_t)values.size());

m_Entries.reserve(m_Entries.size() + values.size());
for(uint64_t value : values)
m_Entries.emplace_back(firstQuery++, value);

std::sort(m_Entries.begin(), m_Entries.end());
}

void QueryPoolInfo::Reset(uint32_t firstQuery, uint32_t queryCount)
{
m_Entries.removeIf([&](const auto &entry) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please avoid auto except where necessary for lambdas or extremely ugly for std iterators, you can use Entry here.

Also please use explicit lambda capture rather than unconditional capture to be clear about what is stored in the lambda.

return (entry.index >= firstQuery) && (entry.index < (firstQuery + queryCount));
});
}

void QueryPoolInfo::Replace(uint32_t firstQuery, uint32_t queryCount, void *pData,
VkDeviceSize stride, VkQueryResultFlags flags) const
{
const auto writeEntry = [&](Entry queryPoolInfoEntry) {
const size_t num_bytes = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;

byte *pStart = (byte *)pData + (queryPoolInfoEntry.index * stride);
memcpy(pStart, &queryPoolInfoEntry.value, num_bytes);
};

Replace(firstQuery, queryCount, writeEntry);
}

void QueryPoolInfo::Replace(uint32_t firstQuery, uint32_t queryCount,
const std::function<void(uint32_t, rdcarray<uint64_t>)> &writeEntry) const
{
rdcarray<Entry> entries;
entries.reserve(queryCount);

// Swap out any AS compaction sizes with the replacements
Replace(firstQuery, queryCount, [&](Entry entry) { entries.push_back(entry); });

std::sort(entries.begin(), entries.end());

// Now batch into contiguous ranges and dispatch
for(size_t i = 0; i < entries.size();)
{
uint32_t queryIndex = entries[i].index;
rdcarray<uint64_t> batch;

while(queryIndex == entries[++i].index)
{
batch.push_back(entries[i].value);
++queryIndex;
}

writeEntry(queryIndex, std::move(batch));
}
}

bool QueryPoolInfo::HasReplacementEntries(uint32_t firstQuery, uint32_t queryCount) const
{
uint32_t start, end;
rdctie(start, end) = GetIntersection(firstQuery, queryCount);
return start <= end;
}

rdcpair<uint32_t, uint32_t> QueryPoolInfo::GetIntersection(uint32_t firstQuery,
uint32_t queryCount) const
{
if(m_Entries.empty())
return {1, 0}; // Invalid

const uint32_t start = RDCMAX(firstQuery, m_Entries.front().index);
const uint32_t end = RDCMIN(firstQuery + queryCount - 1, (uint32_t)m_Entries.back().index);
return {start, end};
}

void QueryPoolInfo::Replace(uint32_t firstQuery, uint32_t queryCount,
const std::function<void(Entry)> &writeEntry) const
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for this function to be separated out? It looks like it's only called from the other Replace() to apply any replacements so could be done inside that function? That should allow some simplification and ease of reading since it took me a couple of passes to understand the code flow with the functions being named the same.

{
if(!m_Entries.empty())
{
// Find the intersection of the two query ranges
uint32_t start, end;
rdctie(start, end) = GetIntersection(firstQuery, queryCount);
if(end < start)
return;

uint32_t j = 0;
for(uint32_t i = start; i < end; ++i)
{
// The indices are sparse but ordered
while(i != m_Entries[j].index)
{
++j;
}

writeEntry(m_Entries[j]);
}
}
}

VkResourceRecord::~VkResourceRecord()
{
// bufferviews and imageviews have non-owning pointers to the sparseinfo struct
Expand Down Expand Up @@ -3996,8 +4097,11 @@ VkResourceRecord::~VkResourceRecord()
if(resType == eResCommandPool)
SAFE_DELETE(cmdPoolInfo);

if(resType == eResAccelerationStructureKHR)
SAFE_DELETE(accelerationStructureInfo);
if(resType == eResQueryPool)
SAFE_DELETE(queryPoolInfo);

if(resType == eResAccelerationStructureKHR && accelerationStructureInfo)
accelerationStructureInfo->Release();
}

void VkResourceRecord::MarkImageFrameReferenced(VkResourceRecord *img, const ImageRange &range,
Expand Down
36 changes: 36 additions & 0 deletions renderdoc/driver/vulkan/vk_resources.h
Original file line number Diff line number Diff line change
Expand Up @@ -2236,6 +2236,41 @@ inline FrameRefType MarkMemoryReferenced(std::unordered_map<ResourceId, MemRefs>
return MarkMemoryReferenced(memRefs, mem, offset, size, refType, ComposeFrameRefs);
}

// Used to replace QueryPool results
class QueryPoolInfo
{
public:
void Add(uint32_t firstQuery, rdcarray<uint64_t> values);

void Reset(uint32_t firstQuery, uint32_t queryCount);

void Replace(uint32_t firstQuery, uint32_t queryCount, void *pData, VkDeviceSize stride,
VkQueryResultFlags flags) const;

// Calls writeEntry with matching contiguous entries, buffered into an array.
void Replace(uint32_t firstQuery, uint32_t queryCount,
const std::function<void(uint32_t, rdcarray<uint64_t>)> &writeEntry) const;

bool HasReplacementEntries(uint32_t firstQuery, uint32_t queryCount) const;

private:
struct Entry
{
Entry(uint32_t i, uint64_t v) : index(i), value(v) {}
bool operator<(Entry other) const { return index < other.index; }

uint32_t index;
uint64_t value;
};

rdcpair<uint32_t, uint32_t> GetIntersection(uint32_t firstQuery, uint32_t queryCount) const;

void Replace(uint32_t firstQuery, uint32_t queryCount,
const std::function<void(Entry)> &writeEntry) const;

rdcarray<Entry> m_Entries;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking about this more generally, query pools are single-typed so we know at creation time whether we're either going to replace all results or none of them (when the query pool type is compacted size).

Do you have any empirical data or intuition on how large those query pools are? I would assume either 1, some small fixed count (UE5 seems to use 64, configurable), or maybe the application's expected max number of BLASs (some thousands?).

Unless that seems unreasonable or you're otherwise worried about having a huge number of possible queries with only a few actually being valid, I think this code could be simplified a good deal by a) only allocating this struct when needed for patching and b) pre-allocating all the entries up front. Then you don't have to worry about intersecting, resizing this array on the fly, etc. We just read all our values out of it.

};

struct DescUpdateTemplate;
struct ImageLayouts;
struct VkAccelerationStructureInfo;
Expand Down Expand Up @@ -2332,6 +2367,7 @@ struct VkResourceRecord : public ResourceRecord
DescPoolInfo *descPoolInfo; // only for descriptor pools
CmdPoolInfo *cmdPoolInfo; // only for command pools
uint32_t queueFamilyIndex; // only for queues
QueryPoolInfo *queryPoolInfo; // only for query pools
VkAccelerationStructureInfo *accelerationStructureInfo; // only for acceleration structures
};

Expand Down
Loading
Loading