d3d11: Clear uniform buffer padding with UpdateSubresource

Using UpdateSubresource instead of CopySubresourceRegion can avoid a GPU
stall on Intel Gen12.

Bug: chromium:344814092
Change-Id: I72399ce88bfce6cecbf4b5506369c09dea08b782
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/193241
Reviewed-by: Austin Eng <enga@chromium.org>
Reviewed-by: Peng Huang <penghuang@chromium.org>
Commit-Queue: Jie A Chen <jie.a.chen@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn/native/d3d11/BufferD3D11.cpp b/src/dawn/native/d3d11/BufferD3D11.cpp
index 70812df..3401b6b 100644
--- a/src/dawn/native/d3d11/BufferD3D11.cpp
+++ b/src/dawn/native/d3d11/BufferD3D11.cpp
@@ -137,6 +137,8 @@
     return 1;
 }
 
+constexpr size_t kConstantBufferUpdateAlignment = 16;  // Bytes; for constant buffer updates.
+
 }  // namespace
 
 // For CPU-to-GPU upload buffers(CopySrc|MapWrite), they can be emulated in the system memory, and
@@ -371,39 +373,31 @@
     SetLabelImpl();
 
     if (!mappedAtCreation) {
-        if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
             if (commandContext) {
-                DAWN_TRY(ClearWholeBuffer(commandContext, 1u));
+                DAWN_TRY(ClearInitialResource(commandContext));
             } else {
                 auto tmpCommandContext =
                     ToBackend(GetDevice()->GetQueue())
                         ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
-                DAWN_TRY(ClearWholeBuffer(&tmpCommandContext, 1u));
+                DAWN_TRY(ClearInitialResource(&tmpCommandContext));
             }
-        }
-
-        // Initialize the padding bytes to zero.
-        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
-            uint32_t paddingBytes = GetAllocatedSize() - GetSize();
-            if (paddingBytes > 0) {
-                uint32_t clearSize = paddingBytes;
-                uint64_t clearOffset = GetSize();
-                if (commandContext) {
-                    DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));
-
-                } else {
-                    auto tmpCommandContext =
-                        ToBackend(GetDevice()->GetQueue())
-                            ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
-                    DAWN_TRY(ClearInternal(&tmpCommandContext, 0, clearOffset, clearSize));
-                }
-            }
-        }
     }
 
     return {};
 }
 
+MaybeError Buffer::ClearInitialResource(const ScopedCommandRecordingContext* commandContext) {
+    const auto device = GetDevice();
+    if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
+        DAWN_TRY(ClearWholeBuffer(commandContext, 1u));  // Testing only: clear value 1.
+    }
+    // With lazy clearing enabled, the padding bytes are still zeroed at this point.
+    if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
+        DAWN_TRY(ClearPaddingInternal(commandContext));
+    }
+    return {};
+}
+
 Buffer::~Buffer() = default;
 
 bool Buffer::IsCPUWritableAtCreation() const {
@@ -568,6 +562,18 @@
     return WriteInternal(commandContext, offset, clearData.data(), size);
 }
 
+MaybeError Buffer::ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext) {
+    // The padding is the tail of the allocation that lies beyond the requested buffer size.
+    const uint64_t paddingOffset = GetSize();
+    const uint32_t paddingSize = GetAllocatedSize() - paddingOffset;
+
+    // Clear the padding with value 0; there is nothing to do when the two sizes match.
+    if (paddingSize != 0) {
+        DAWN_TRY(ClearInternal(commandContext, 0, paddingOffset, paddingSize));
+    }
+    return {};
+}
+
 MaybeError Buffer::Write(const ScopedCommandRecordingContext* commandContext,
                          uint64_t offset,
                          const void* data,
@@ -847,7 +853,6 @@
     if (size >= GetSize() && offset == 0) {
         // Offset and size must be aligned with 16 for using UpdateSubresource1() on constant
         // buffer.
-        constexpr size_t kConstantBufferUpdateAlignment = 16;
         size_t alignedSize = Align(size, kConstantBufferUpdateAlignment);
         DAWN_ASSERT(alignedSize <= GetAllocatedSize());
         std::unique_ptr<uint8_t[]> alignedBuffer;
@@ -940,4 +945,40 @@
     return {};
 }
 
+MaybeError GPUOnlyBuffer::ClearPaddingInternal(
+    const ScopedCommandRecordingContext* commandContext) {
+    const uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+    if (paddingBytes == 0) {
+        return {};
+    }
+
+    const bool useConstantBufferUpdate =
+        (GetUsage() & wgpu::BufferUsage::Uniform) && !GetD3D11NonConstantBuffer();
+    if (!useConstantBufferUpdate) {
+        // Generic path: clear exactly the padding bytes that follow the requested size.
+        DAWN_TRY(ClearInternal(commandContext, 0, GetSize(), paddingBytes));
+        return {};
+    }
+
+    // 'UpdateSubresource1' is preferable for updating uniform buffers, as it incurs no GPU
+    // stall. The range is aligned up to kConstantBufferUpdateAlignment and ends at the allocation.
+    const uint32_t zeroedSize = Align(paddingBytes, kConstantBufferUpdateAlignment);
+    const uint64_t zeroedOffset = GetAllocatedSize() - zeroedSize;
+    D3D11_BOX dstBox;
+    dstBox.left = zeroedOffset;
+    dstBox.top = 0;
+    dstBox.front = 0;
+    dstBox.right = GetAllocatedSize();
+    dstBox.bottom = 1;
+    dstBox.back = 1;
+    std::vector<uint8_t> zeros(zeroedSize, 0);
+    // NOTE(review): D3D11_COPY_DISCARD with a partial box -- confirm bytes outside it may be lost.
+    commandContext->UpdateSubresource1(GetD3D11ConstantBuffer(),
+                                       /*DstSubresource=*/0, &dstBox, zeros.data(),
+                                       /*SrcRowPitch=*/0,
+                                       /*SrcDepthPitch=*/0,
+                                       /*CopyFlags=*/D3D11_COPY_DISCARD);
+    return {};
+}
+
 }  // namespace dawn::native::d3d11
diff --git a/src/dawn/native/d3d11/BufferD3D11.h b/src/dawn/native/d3d11/BufferD3D11.h
index 02791ff..f8e1175 100644
--- a/src/dawn/native/d3d11/BufferD3D11.h
+++ b/src/dawn/native/d3d11/BufferD3D11.h
@@ -144,11 +144,14 @@
                                      uint64_t offset,
                                      uint64_t size);
 
+    virtual MaybeError ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext);
+
     raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr;
 
   private:
     MaybeError Initialize(bool mappedAtCreation,
                           const ScopedCommandRecordingContext* commandContext);
+    MaybeError ClearInitialResource(const ScopedCommandRecordingContext* commandContext);
     MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override;
     void UnmapImpl() override;
     bool IsCPUWritableAtCreation() const override;
@@ -209,6 +212,8 @@
                              const void* data,
                              size_t size) override;
 
+    MaybeError ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext) override;
+
     bool mConstantBufferIsUpdated = true;
 };
 
diff --git a/src/dawn/tests/end2end/QueueTests.cpp b/src/dawn/tests/end2end/QueueTests.cpp
index 9e9eba4..04f7e6f 100644
--- a/src/dawn/tests/end2end/QueueTests.cpp
+++ b/src/dawn/tests/end2end/QueueTests.cpp
@@ -216,6 +216,54 @@
     EXPECT_BUFFER_U32_EQ(value, buffer, 0);
 }
 
+// Test writing a uniform buffer at byte offsets and sizes that are 4- or 16-byte aligned.
+TEST_P(QueueWriteBufferTests, WriteUniformBufferWithVariousOffsetAndSizeAlignments) {
+    wgpu::BufferDescriptor descriptor;
+    descriptor.size = 128;
+    descriptor.usage =
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform;
+    wgpu::Buffer buffer = device.CreateBuffer(&descriptor);
+
+    constexpr size_t kElementCount = 16;
+    uint32_t data[kElementCount] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+    constexpr size_t kElementBytes = sizeof(data[0]);
+    queue.WriteBuffer(buffer, 0, data, sizeof(data));
+    EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+    // Byte alignments: offset -- 4, size -- 4 (one element at byte offset 4).
+    size_t offset = 1;
+    data[offset] = 100;
+    size_t size = kElementBytes;
+    queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+    EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+    // Byte alignments: offset -- 16, size -- 16 (four elements at byte offset 16).
+    offset = 4;
+    data[offset] = 101;
+    data[offset + 1] = 102;
+    data[offset + 2] = 103;
+    data[offset + 3] = 104;
+    size = 4 * kElementBytes;
+    queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+    EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+    // Byte alignments: offset -- 4, size -- 16 (four elements at byte offset 40).
+    offset = 10;
+    data[offset] = 105;
+    data[offset + 1] = 106;
+    data[offset + 2] = 107;
+    data[offset + 3] = 108;
+    queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+    EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+
+    // Byte alignments: offset -- 16, size -- 4 (one element at byte offset 48).
+    offset = 12;
+    data[offset] = 109;
+    size = kElementBytes;
+    queue.WriteBuffer(buffer, offset * kElementBytes, &data[offset], size);
+    EXPECT_BUFFER_U32_RANGE_EQ(data, buffer, 0, kElementCount);
+}
+
 DAWN_INSTANTIATE_TEST(QueueWriteBufferTests,
                       D3D11Backend(),
                       D3D12Backend(),