Reduce overhead of indirect draw validation
Allows indirect draws with different index buffer sizes to be
batched into a single dispatch, which can dramatically reduce the
overhead of the indirect draw validation done on D3D12 backends.
Indirect draws still need to be from the same indirect buffer for
batching to occur.
Bug: dawn:2329
Change-Id: I79c91ce900d141378d5275742d2d1a7b03d89531
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/170142
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Brandon Jones <bajones@chromium.org>
diff --git a/src/dawn/native/IndirectDrawMetadata.cpp b/src/dawn/native/IndirectDrawMetadata.cpp
index f64f036..5963655 100644
--- a/src/dawn/native/IndirectDrawMetadata.cpp
+++ b/src/dawn/native/IndirectDrawMetadata.cpp
@@ -186,8 +186,8 @@
DAWN_UNREACHABLE();
}
- const IndexedIndirectConfig config = {indirectBuffer, numIndexBufferElements,
- duplicateBaseVertexInstance, DrawType::Indexed};
+ const IndexedIndirectConfig config = {indirectBuffer, duplicateBaseVertexInstance,
+ DrawType::Indexed};
auto it = mIndexedIndirectBufferValidationInfo.find(config);
if (it == mIndexedIndirectBufferValidationInfo.end()) {
auto result = mIndexedIndirectBufferValidationInfo.emplace(
@@ -197,6 +197,7 @@
IndirectDraw draw{};
draw.inputBufferOffset = indirectOffset;
+ draw.numIndexBufferElements = numIndexBufferElements;
draw.cmd = cmd;
it->second.AddIndirectDraw(mMaxDrawCallsPerBatch, mMaxBatchOffsetRange, draw);
}
@@ -205,7 +206,7 @@
uint64_t indirectOffset,
bool duplicateBaseVertexInstance,
DrawIndirectCmd* cmd) {
- const IndexedIndirectConfig config = {indirectBuffer, 0, duplicateBaseVertexInstance,
+ const IndexedIndirectConfig config = {indirectBuffer, duplicateBaseVertexInstance,
DrawType::NonIndexed};
auto it = mIndexedIndirectBufferValidationInfo.find(config);
if (it == mIndexedIndirectBufferValidationInfo.end()) {
@@ -216,22 +217,21 @@
IndirectDraw draw{};
draw.inputBufferOffset = indirectOffset;
+ draw.numIndexBufferElements = 0;
draw.cmd = cmd;
it->second.AddIndirectDraw(mMaxDrawCallsPerBatch, mMaxBatchOffsetRange, draw);
}
bool IndirectDrawMetadata::IndexedIndirectConfig::operator<(
const IndexedIndirectConfig& other) const {
- return std::tie(inputIndirectBuffer, numIndexBufferElements, duplicateBaseVertexInstance,
- drawType) < std::tie(other.inputIndirectBuffer, other.numIndexBufferElements,
- other.duplicateBaseVertexInstance, other.drawType);
+ return std::tie(inputIndirectBuffer, duplicateBaseVertexInstance, drawType) <
+ std::tie(other.inputIndirectBuffer, other.duplicateBaseVertexInstance, other.drawType);
}
bool IndirectDrawMetadata::IndexedIndirectConfig::operator==(
const IndexedIndirectConfig& other) const {
- return std::tie(inputIndirectBuffer, numIndexBufferElements, duplicateBaseVertexInstance,
- drawType) == std::tie(other.inputIndirectBuffer, other.numIndexBufferElements,
- other.duplicateBaseVertexInstance, other.drawType);
+ return std::tie(inputIndirectBuffer, duplicateBaseVertexInstance, drawType) ==
+ std::tie(other.inputIndirectBuffer, other.duplicateBaseVertexInstance, other.drawType);
}
} // namespace dawn::native
diff --git a/src/dawn/native/IndirectDrawMetadata.h b/src/dawn/native/IndirectDrawMetadata.h
index eb4b812..c7a19fd 100644
--- a/src/dawn/native/IndirectDrawMetadata.h
+++ b/src/dawn/native/IndirectDrawMetadata.h
@@ -57,6 +57,7 @@
public:
struct IndirectDraw {
uint64_t inputBufferOffset;
+ uint64_t numIndexBufferElements;
// This is a pointer to the command that should be populated with the validated
// indirect scratch buffer. It is only valid up until the encoded command buffer
// is submitted.
@@ -111,7 +112,6 @@
};
struct IndexedIndirectConfig {
BufferBase* inputIndirectBuffer;
- uint64_t numIndexBufferElements;
bool duplicateBaseVertexInstance;
DrawType drawType;
diff --git a/src/dawn/native/IndirectDrawValidationEncoder.cpp b/src/dawn/native/IndirectDrawValidationEncoder.cpp
index a1488da..504d535 100644
--- a/src/dawn/native/IndirectDrawValidationEncoder.cpp
+++ b/src/dawn/native/IndirectDrawValidationEncoder.cpp
@@ -58,13 +58,24 @@
constexpr uint32_t kValidationEnabled = 4;
constexpr uint32_t kIndirectFirstInstanceEnabled = 8;
+// Equivalent to the IndirectDraw struct defined in the shader below.
+struct IndirectDraw {
+ uint32_t indirectOffset;
+ uint32_t numIndexBufferElementsLow;
+ uint32_t numIndexBufferElementsHigh;
+};
+static_assert(sizeof(IndirectDraw) == sizeof(uint32_t) * 3);
+static_assert(alignof(IndirectDraw) == alignof(uint32_t));
+
// Equivalent to the BatchInfo struct defined in the shader below.
struct BatchInfo {
- uint64_t numIndexBufferElements;
uint32_t numDraws;
uint32_t flags;
};
+// The size, in bytes, of the IndirectDraw struct defined in the shader below.
+constexpr uint32_t kIndirectDrawByteSize = sizeof(uint32_t) * 3;
+
// TODO(https://crbug.com/dawn/1108): Propagate validation feedback from this shader in
// various failure modes.
static const char sRenderValidationShaderSource[] = R"(
@@ -80,12 +91,16 @@
const kValidationEnabled = 4u;
const kIndirectFirstInstanceEnabled = 8u;
- struct BatchInfo {
+ struct IndirectDraw {
+ indirectOffset: u32,
numIndexBufferElementsLow: u32,
numIndexBufferElementsHigh: u32,
+ }
+
+ struct BatchInfo {
numDraws: u32,
flags: u32,
- indirectOffsets: array<u32>,
+ draws: array<IndirectDraw>,
}
struct IndirectParams {
@@ -125,7 +140,7 @@
fn set_pass(drawIndex: u32) {
let numInputParams = numIndirectParamsPerDrawCallInput();
var outIndex = drawIndex * numIndirectParamsPerDrawCallOutput();
- let inIndex = batch.indirectOffsets[drawIndex];
+ let inIndex = batch.draws[drawIndex].indirectOffset;
// The first 2 parameter is reserved for the duplicated first/baseVertex and firstInstance
@@ -154,7 +169,7 @@
return;
}
- let inputIndex = batch.indirectOffsets[id.x];
+ let inputIndex = batch.draws[id.x].indirectOffset;
if(!bool(batch.flags & kIndirectFirstInstanceEnabled)) {
// firstInstance is always the last parameter
let firstInstance = inputParams.data[inputIndex + numIndirectParamsPerDrawCallInput() - 1u];
@@ -169,23 +184,27 @@
return;
}
- if (batch.numIndexBufferElementsHigh >= 2u) {
+ let numIndexBufferElementsHigh = batch.draws[id.x].numIndexBufferElementsHigh;
+
+ if (numIndexBufferElementsHigh >= 2u) {
// firstIndex and indexCount are both u32. The maximum possible sum of these
// values is 0x1fffffffe, which is less than 0x200000000. Nothing to validate.
set_pass(id.x);
return;
}
+ let numIndexBufferElementsLow = batch.draws[id.x].numIndexBufferElementsLow;
+
let firstIndex = inputParams.data[inputIndex + kFirstIndexEntry];
- if (batch.numIndexBufferElementsHigh == 0u &&
- batch.numIndexBufferElementsLow < firstIndex) {
+ if (numIndexBufferElementsHigh == 0u &&
+ numIndexBufferElementsLow < firstIndex) {
fail(id.x);
return;
}
// Note that this subtraction may underflow, but only when
// numIndexBufferElementsHigh is 1u. The result is still correct in that case.
- let maxIndexCount = batch.numIndexBufferElementsLow - firstIndex;
+ let maxIndexCount = numIndexBufferElementsLow - firstIndex;
let indexCount = inputParams.data[inputIndex + kIndexCountEntry];
if (indexCount > maxIndexCount) {
fail(id.x);
@@ -233,7 +252,7 @@
}
size_t GetBatchDataSize(uint32_t numDraws) {
- return sizeof(BatchInfo) + numDraws * sizeof(uint32_t);
+ return sizeof(BatchInfo) + (numDraws * kIndirectDrawByteSize);
}
} // namespace
@@ -242,7 +261,7 @@
const uint64_t batchDrawCallLimitByDispatchSize =
static_cast<uint64_t>(limits.v1.maxComputeWorkgroupsPerDimension) * kWorkgroupSize;
const uint64_t batchDrawCallLimitByStorageBindingSize =
- (limits.v1.maxStorageBufferBindingSize - sizeof(BatchInfo)) / sizeof(uint32_t);
+ (limits.v1.maxStorageBufferBindingSize - sizeof(BatchInfo)) / kIndirectDrawByteSize;
return static_cast<uint32_t>(
std::min({batchDrawCallLimitByDispatchSize, batchDrawCallLimitByStorageBindingSize,
uint64_t(std::numeric_limits<uint32_t>::max())}));
@@ -261,7 +280,6 @@
struct Batch {
const IndirectDrawMetadata::IndirectValidationBatch* metadata;
- uint64_t numIndexBufferElements;
uint64_t dataBufferOffset;
uint64_t dataSize;
uint64_t inputIndirectOffset;
@@ -316,7 +334,6 @@
Batch newBatch;
newBatch.metadata = &batch;
- newBatch.numIndexBufferElements = config.numIndexBufferElements;
newBatch.dataSize = GetBatchDataSize(batch.draws.size());
newBatch.inputIndirectOffset = minOffsetAlignedDown;
newBatch.inputIndirectSize =
@@ -391,16 +408,22 @@
uint8_t* batchData = static_cast<uint8_t*>(pass.batchData.get());
for (Batch& batch : pass.batches) {
batch.batchInfo = new (&batchData[batch.dataBufferOffset]) BatchInfo();
- batch.batchInfo->numIndexBufferElements = batch.numIndexBufferElements;
batch.batchInfo->numDraws = static_cast<uint32_t>(batch.metadata->draws.size());
batch.batchInfo->flags = pass.flags;
- uint32_t* indirectOffsets = reinterpret_cast<uint32_t*>(batch.batchInfo + 1);
+ IndirectDraw* indirectDraw = reinterpret_cast<IndirectDraw*>(batch.batchInfo + 1);
uint64_t outputParamsOffset = batch.outputParamsOffset;
for (auto& draw : batch.metadata->draws) {
// The shader uses this to index an array of u32, hence the division by 4 bytes.
- *indirectOffsets++ =
+ indirectDraw->indirectOffset =
static_cast<uint32_t>((draw.inputBufferOffset - batch.inputIndirectOffset) / 4);
+ // The index buffer elements are 64 bit values, and so need to be set as a
+ // low uint32_t and a high uint32_t.
+ indirectDraw->numIndexBufferElementsLow =
+ static_cast<uint32_t>(draw.numIndexBufferElements & 0xFFFFFFFF);
+ indirectDraw->numIndexBufferElementsHigh =
+ static_cast<uint32_t>((draw.numIndexBufferElements >> 32) & 0xFFFFFFFF);
+ indirectDraw++;
draw.cmd->indirectBuffer = outputParamsBuffer.GetBuffer();
draw.cmd->indirectOffset = outputParamsOffset;