d3d11: Clear uniform buffer padding with UpdateSubresource
Using UpdateSubresource1 instead of CopySubresourceRegion to clear the padding
of uniform buffers avoids a GPU stall on Intel Gen12.
Bug: chromium:344814092
Change-Id: I72399ce88bfce6cecbf4b5506369c09dea08b782
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/193241
Reviewed-by: Austin Eng <enga@chromium.org>
Reviewed-by: Peng Huang <penghuang@chromium.org>
Commit-Queue: Jie A Chen <jie.a.chen@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn/native/d3d11/BufferD3D11.cpp b/src/dawn/native/d3d11/BufferD3D11.cpp
index 70812df..3401b6b 100644
--- a/src/dawn/native/d3d11/BufferD3D11.cpp
+++ b/src/dawn/native/d3d11/BufferD3D11.cpp
@@ -137,6 +137,8 @@
return 1;
}
+constexpr size_t kConstantBufferUpdateAlignment = 16;  // Byte alignment of offset and size required by UpdateSubresource1() on constant buffers.
+
} // namespace
// For CPU-to-GPU upload buffers(CopySrc|MapWrite), they can be emulated in the system memory, and
@@ -371,39 +373,31 @@
SetLabelImpl();
if (!mappedAtCreation) {
- if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
if (commandContext) {
- DAWN_TRY(ClearWholeBuffer(commandContext, 1u));
+ DAWN_TRY(ClearInitialResource(commandContext));
} else {
auto tmpCommandContext =
ToBackend(GetDevice()->GetQueue())
->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
- DAWN_TRY(ClearWholeBuffer(&tmpCommandContext, 1u));
+ DAWN_TRY(ClearInitialResource(&tmpCommandContext));
}
- }
-
- // Initialize the padding bytes to zero.
- if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
- uint32_t paddingBytes = GetAllocatedSize() - GetSize();
- if (paddingBytes > 0) {
- uint32_t clearSize = paddingBytes;
- uint64_t clearOffset = GetSize();
- if (commandContext) {
- DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));
-
- } else {
- auto tmpCommandContext =
- ToBackend(GetDevice()->GetQueue())
- ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
- DAWN_TRY(ClearInternal(&tmpCommandContext, 0, clearOffset, clearSize));
- }
- }
- }
}
return {};
}
+MaybeError Buffer::ClearInitialResource(const ScopedCommandRecordingContext* commandContext) {  // Creation-time clears, each gated by a device toggle.
+ if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
+ DAWN_TRY(ClearWholeBuffer(commandContext, 1u));  // Passes 1u, presumably the (non-zero) clear value.
+ }
+
+ // Zero the padding bytes (allocated size minus requested size). ClearPaddingInternal() is virtual, so a subclass may supply its own clearing path.
+ if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
+ DAWN_TRY(ClearPaddingInternal(commandContext));
+ }
+ return {};
+}
+
Buffer::~Buffer() = default;
bool Buffer::IsCPUWritableAtCreation() const {
@@ -568,6 +562,18 @@
return WriteInternal(commandContext, offset, clearData.data(), size);
}
+MaybeError Buffer::ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext) {  // Default path: clears the padding via ClearInternal().
+ uint32_t paddingBytes = GetAllocatedSize() - GetSize();  // Bytes allocated beyond the requested size.
+ if (paddingBytes == 0) {
+ return {};  // No padding, nothing to clear.
+ }
+ uint32_t clearSize = paddingBytes;
+ uint64_t clearOffset = GetSize();  // Padding starts right after the requested size.
+ DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));  // Second argument 0 is presumably the clear value.
+
+ return {};
+}
+
MaybeError Buffer::Write(const ScopedCommandRecordingContext* commandContext,
uint64_t offset,
const void* data,
@@ -847,7 +853,6 @@
if (size >= GetSize() && offset == 0) {
// Offset and size must be aligned with 16 for using UpdateSubresource1() on constant
// buffer.
- constexpr size_t kConstantBufferUpdateAlignment = 16;
size_t alignedSize = Align(size, kConstantBufferUpdateAlignment);
DAWN_ASSERT(alignedSize <= GetAllocatedSize());
std::unique_ptr<uint8_t[]> alignedBuffer;
@@ -940,4 +945,40 @@
return {};
}
+MaybeError GPUOnlyBuffer::ClearPaddingInternal(
+ const ScopedCommandRecordingContext* commandContext) {  // Zeroes the bytes allocated past GetSize().
+ uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+ if (paddingBytes == 0) {
+ return {};  // No padding, nothing to clear.
+ }
+
+ uint32_t clearSize = paddingBytes;
+ uint64_t clearOffset = GetSize();
+ // 'UpdateSubresource1' is preferred for updating uniform buffers, as it incurs no
+ // GPU stall (the originating change reports the stall on Intel Gen12 with CopySubresourceRegion).
+ if (GetUsage() & wgpu::BufferUsage::Uniform && !GetD3D11NonConstantBuffer()) {
+ clearSize = Align(paddingBytes, kConstantBufferUpdateAlignment);  // Presumably rounds the size up to a multiple of 16.
+ clearOffset = GetAllocatedSize() - clearSize;  // NOTE(review): if clearSize > paddingBytes this starts before GetSize(), so the tail of the buffer's own bytes is zeroed too -- confirm intended.
+
+ D3D11_BOX dstBox;
+ dstBox.left = clearOffset;  // NOTE(review): clearOffset is uint64_t; D3D11_BOX fields are UINT per the D3D11 docs -- confirm the narrowing is safe.
+ dstBox.top = 0;
+ dstBox.front = 0;
+ dstBox.right = GetAllocatedSize();  // Box ends at the end of the allocation.
+ dstBox.bottom = 1;
+ dstBox.back = 1;
+
+ std::vector<uint8_t> clearData(clearSize, 0);  // clearSize zero bytes used as the upload source.
+ commandContext->UpdateSubresource1(GetD3D11ConstantBuffer(),
+ /*DstSubresource=*/0, &dstBox, clearData.data(),
+ /*SrcRowPitch=*/0,
+ /*SrcDepthPitch=*/0,
+ /*CopyFlags=*/D3D11_COPY_DISCARD);  // NOTE(review): DISCARD with a partial box -- confirm no earlier buffer contents must survive this call.
+ } else {
+ DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));  // Same call the base-class implementation makes.
+ }
+
+ return {};
+}
+
} // namespace dawn::native::d3d11
diff --git a/src/dawn/native/d3d11/BufferD3D11.h b/src/dawn/native/d3d11/BufferD3D11.h
index 02791ff..f8e1175 100644
--- a/src/dawn/native/d3d11/BufferD3D11.h
+++ b/src/dawn/native/d3d11/BufferD3D11.h
@@ -144,11 +144,14 @@
uint64_t offset,
uint64_t size);
+ virtual MaybeError ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext);  // Zeroes the bytes between GetSize() and GetAllocatedSize(); overridable.
+
raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr;
private:
MaybeError Initialize(bool mappedAtCreation,
const ScopedCommandRecordingContext* commandContext);
+ MaybeError ClearInitialResource(const ScopedCommandRecordingContext* commandContext);  // Creation-time clears gated by device toggles; called from Initialize().
MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override;
void UnmapImpl() override;
bool IsCPUWritableAtCreation() const override;
@@ -209,6 +212,8 @@
const void* data,
size_t size) override;
+ MaybeError ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext) override;  // Uses UpdateSubresource1() for uniform buffers that have no non-constant D3D11 buffer.
+
bool mConstantBufferIsUpdated = true;
};
diff --git a/src/dawn/tests/end2end/QueueTests.cpp b/src/dawn/tests/end2end/QueueTests.cpp
index 9e9eba4..04f7e6f 100644
--- a/src/dawn/tests/end2end/QueueTests.cpp
+++ b/src/dawn/tests/end2end/QueueTests.cpp
@@ -216,6 +216,54 @@
EXPECT_BUFFER_U32_EQ(value, buffer, 0);
}
+// Tests queue.WriteBuffer() on a uniform buffer using byte offsets and sizes that are 4- or 16-byte aligned, reading the buffer back after every write.
+TEST_P(QueueWriteBufferTests, WriteUniformBufferWithVariousOffsetAndSizeAlignments) {
+ wgpu::BufferDescriptor descriptor;
+ descriptor.size = 128;  // Larger than `data` (16 u32 values); only the first 16 values are written and checked.
+ descriptor.usage =
+ wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform;
+ wgpu::Buffer buffer = device.CreateBuffer(&descriptor);
+
+ constexpr size_t kElementCount = 16;
+ uint32_t data[kElementCount] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+ constexpr size_t kElementBytes = sizeof(data[0]);
+ queue.WriteBuffer(buffer, 0, data, sizeof(data));  // Initial write of all 16 values at offset 0.
+ EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+ // Byte offset 4 (4-byte aligned only), size 4 bytes (4-byte aligned only).
+ size_t offset = 1;  // Element index; multiplied by kElementBytes to get the byte offset.
+ data[offset] = 100;
+ size_t size = kElementBytes;
+ queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+ EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+ // Byte offset 16 (16-byte aligned), size 16 bytes (16-byte aligned).
+ offset = 4;
+ data[offset] = 101;
+ data[offset + 1] = 102;
+ data[offset + 2] = 103;
+ data[offset + 3] = 104;
+ size = 4 * kElementBytes;
+ queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+ EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+ // Byte offset 40 (4-byte aligned only), size 16 bytes; `size` is reused from the previous step.
+ offset = 10;
+ data[offset] = 105;
+ data[offset + 1] = 106;
+ data[offset + 2] = 107;
+ data[offset + 3] = 108;
+ queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+ EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+ // Byte offset 48 (16-byte aligned), size 4 bytes (4-byte aligned only).
+ offset = 12;
+ data[offset] = 109;
+ size = kElementBytes;
+ queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+ EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+}
+
DAWN_INSTANTIATE_TEST(QueueWriteBufferTests,
D3D11Backend(),
D3D12Backend(),