d3d11: Emulate upload buffers using system memory

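This optimizes buffer uploading: upload buffers (CopySrc | MapWrite) of
up to 4 MiB are now backed by system memory, so copies out of them use
D3D11 UpdateSubresource1 rather than CopySubresourceRegion, which can be
slow on Intel Gen12.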

Bug: dawn:1732
Bug: chromium:1485789
Change-Id: I3c16ecde10543aaef27a2d36510bed2844b8e3ed
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/168940
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Peng Huang <penghuang@chromium.org>
Commit-Queue: Jie A Chen <jie.a.chen@intel.com>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/src/dawn/native/d3d11/BufferD3D11.cpp b/src/dawn/native/d3d11/BufferD3D11.cpp
index f8b2395..79e9971 100644
--- a/src/dawn/native/d3d11/BufferD3D11.cpp
+++ b/src/dawn/native/d3d11/BufferD3D11.cpp
@@ -32,6 +32,7 @@
 #include <utility>
 #include <vector>
 
+#include "dawn/common/Alloc.h"
 #include "dawn/common/Assert.h"
 #include "dawn/common/Constants.h"
 #include "dawn/common/Math.h"
@@ -69,6 +70,10 @@
     return usage & kMappableBufferUsages;
 }
 
+bool IsUpload(wgpu::BufferUsage usage) {
+    return usage == (wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
+}
+
 D3D11_USAGE D3D11BufferUsage(wgpu::BufferUsage usage) {
     if (IsMappable(usage)) {
         return D3D11_USAGE_STAGING;
@@ -152,11 +157,41 @@
 
 }  // namespace
 
+// CPU-to-GPU upload buffers (CopySrc|MapWrite) can be emulated in system memory and then written
+// into the destination GPU buffer via ID3D11DeviceContext1::UpdateSubresource1.
+class UploadBuffer final : public Buffer {
+    using Buffer::Buffer;
+    ~UploadBuffer() override;
+
+    MaybeError InitializeInternal() override;
+    MaybeError MapInternal(const ScopedCommandRecordingContext* commandContext) override;
+    void UnmapInternal(const ScopedCommandRecordingContext* commandContext) override;
+
+    MaybeError ClearInternal(const ScopedCommandRecordingContext* commandContext,
+                             uint8_t clearValue,
+                             uint64_t offset = 0,
+                             uint64_t size = 0) override;
+
+    uint8_t* GetUploadData() override;
+
+    std::unique_ptr<uint8_t[]> mUploadData;
+};
+
 // static
 ResultOrError<Ref<Buffer>> Buffer::Create(Device* device,
                                           const UnpackedPtr<BufferDescriptor>& descriptor,
-                                          const ScopedCommandRecordingContext* commandContext) {
-    Ref<Buffer> buffer = AcquireRef(new Buffer(device, descriptor));
+                                          const ScopedCommandRecordingContext* commandContext,
+                                          bool allowUploadBufferEmulation) {
+    bool useUploadBuffer = allowUploadBufferEmulation;
+    useUploadBuffer &= IsUpload(descriptor->usage);
+    constexpr uint64_t kMaxUploadBufferSize = 4 * 1024 * 1024;
+    useUploadBuffer &= descriptor->size <= kMaxUploadBufferSize;
+    Ref<Buffer> buffer;
+    if (useUploadBuffer) {
+        buffer = AcquireRef(new UploadBuffer(device, descriptor));
+    } else {
+        buffer = AcquireRef(new Buffer(device, descriptor));
+    }
     DAWN_TRY(buffer->Initialize(descriptor->mappedAtCreation, commandContext));
     return buffer;
 }
@@ -180,6 +215,45 @@
     }
     mAllocatedSize = Align(size, alignment);
 
+    DAWN_TRY(InitializeInternal());
+
+    SetLabelImpl();
+
+    if (!mappedAtCreation) {
+        if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
+            if (commandContext) {
+                DAWN_TRY(ClearInternal(commandContext, 1u));
+            } else {
+                auto tmpCommandContext =
+                    ToBackend(GetDevice()->GetQueue())
+                        ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
+                DAWN_TRY(ClearInternal(&tmpCommandContext, 1u));
+            }
+        }
+
+        // Initialize the padding bytes to zero.
+        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
+            uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+            if (paddingBytes > 0) {
+                uint32_t clearSize = paddingBytes;
+                uint64_t clearOffset = GetSize();
+                if (commandContext) {
+                    DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));
+
+                } else {
+                    auto tmpCommandContext =
+                        ToBackend(GetDevice()->GetQueue())
+                            ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
+                    DAWN_TRY(ClearInternal(&tmpCommandContext, 0, clearOffset, clearSize));
+                }
+            }
+        }
+    }
+
+    return {};
+}
+
+MaybeError Buffer::InitializeInternal() {
     bool needsConstantBuffer = GetUsage() & wgpu::BufferUsage::Uniform;
     bool onlyNeedsConstantBuffer =
         needsConstantBuffer && IsSubset(GetUsage(), kD3D11AllowedUniformBufferUsages);
@@ -221,39 +295,6 @@
 
     DAWN_ASSERT(mD3d11NonConstantBuffer || mD3d11ConstantBuffer);
 
-    SetLabelImpl();
-
-    if (!mappedAtCreation) {
-        if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
-            if (commandContext) {
-                DAWN_TRY(ClearInternal(commandContext, 1u));
-            } else {
-                auto tmpCommandContext =
-                    ToBackend(GetDevice()->GetQueue())
-                        ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
-                DAWN_TRY(ClearInternal(&tmpCommandContext, 1u));
-            }
-        }
-
-        // Initialize the padding bytes to zero.
-        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
-            uint32_t paddingBytes = GetAllocatedSize() - GetSize();
-            if (paddingBytes > 0) {
-                uint32_t clearSize = paddingBytes;
-                uint64_t clearOffset = GetSize();
-                if (commandContext) {
-                    DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));
-
-                } else {
-                    auto tmpCommandContext =
-                        ToBackend(GetDevice()->GetQueue())
-                            ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
-                    DAWN_TRY(ClearInternal(&tmpCommandContext, 0, clearOffset, clearSize));
-                }
-            }
-        }
-    }
-
     return {};
 }
 
@@ -295,7 +336,7 @@
 }
 
 MaybeError Buffer::MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) {
-    DAWN_ASSERT(mD3d11NonConstantBuffer);
+    DAWN_ASSERT(mD3d11NonConstantBuffer || GetUploadData());
 
     mMapReadySerial = mLastUsageSerial;
     const ExecutionSerial completedSerial = GetDevice()->GetQueue()->GetCompletedCommandSerial();
@@ -327,7 +368,7 @@
 }
 
 void Buffer::UnmapImpl() {
-    DAWN_ASSERT(mD3d11NonConstantBuffer);
+    DAWN_ASSERT(mD3d11NonConstantBuffer || GetUploadData());
     mMapReadySerial = kMaxExecutionSerial;
     if (mMappedData) {
         auto commandContext = ToBackend(GetDevice()->GetQueue())
@@ -662,6 +703,15 @@
                                 size_t size,
                                 Buffer* destination,
                                 uint64_t destinationOffset) {
+    // Upload buffers are CopySrc|MapWrite only, so they can never be a copy destination.
+    DAWN_ASSERT(!destination->GetUploadData());
+    // Use UpdateSubresource1() if the source is an upload buffer.
+    if (source->GetUploadData()) {
+        DAWN_TRY(destination->WriteInternal(commandContext, destinationOffset,
+                                            source->GetUploadData() + sourceOffset, size));
+        return {};
+    }
+
     D3D11_BOX srcBox;
     srcBox.left = static_cast<UINT>(sourceOffset);
     srcBox.top = 0;
@@ -699,6 +749,10 @@
     return {};
 }
 
+uint8_t* Buffer::GetUploadData() {
+    return nullptr;
+}
+
 ResultOrError<Buffer::ScopedMap> Buffer::ScopedMap::Create(
     const ScopedCommandRecordingContext* commandContext,
     Buffer* buffer) {
@@ -752,4 +806,40 @@
     return mBuffer ? mBuffer->mMappedData.get() : nullptr;
 }
 
+UploadBuffer::~UploadBuffer() = default;
+
+MaybeError UploadBuffer::InitializeInternal() {
+    mUploadData = std::unique_ptr<uint8_t[]>(AllocNoThrow<uint8_t>(GetAllocatedSize()));
+    if (mUploadData == nullptr) {
+        return DAWN_OUT_OF_MEMORY_ERROR("Failed to allocate memory for buffer uploading.");
+    }
+    return {};
+}
+
+uint8_t* UploadBuffer::GetUploadData() {
+    return mUploadData.get();
+}
+
+MaybeError UploadBuffer::MapInternal(const ScopedCommandRecordingContext* commandContext) {
+    mMappedData = mUploadData.get();
+    return {};
+}
+
+void UploadBuffer::UnmapInternal(const ScopedCommandRecordingContext* commandContext) {
+    mMappedData = nullptr;
+}
+
+MaybeError UploadBuffer::ClearInternal(const ScopedCommandRecordingContext* commandContext,
+                                       uint8_t clearValue,
+                                       uint64_t offset,
+                                       uint64_t size) {
+    if (size == 0) {
+        DAWN_ASSERT(offset == 0);
+        size = GetAllocatedSize();
+    }
+
+    memset(mUploadData.get() + offset, clearValue, size);
+    return {};
+}
+
 }  // namespace dawn::native::d3d11
diff --git a/src/dawn/native/d3d11/BufferD3D11.h b/src/dawn/native/d3d11/BufferD3D11.h
index 7c185fb..288de0a 100644
--- a/src/dawn/native/d3d11/BufferD3D11.h
+++ b/src/dawn/native/d3d11/BufferD3D11.h
@@ -40,11 +40,12 @@
 class Device;
 class ScopedCommandRecordingContext;
 
-class Buffer final : public BufferBase {
+class Buffer : public BufferBase {
   public:
     static ResultOrError<Ref<Buffer>> Create(Device* device,
                                              const UnpackedPtr<BufferDescriptor>& descriptor,
-                                             const ScopedCommandRecordingContext* commandContext);
+                                             const ScopedCommandRecordingContext* commandContext,
+                                             bool allowUploadBufferEmulation = true);
 
     MaybeError EnsureDataInitialized(const ScopedCommandRecordingContext* commandContext);
     MaybeError EnsureDataInitializedAsDestination(
@@ -119,11 +120,27 @@
         bool mNeedsUnmap = false;
     };
 
-  private:
+  protected:
     using BufferBase::BufferBase;
 
     ~Buffer() override;
 
+    virtual MaybeError InitializeInternal();
+
+    virtual MaybeError MapInternal(const ScopedCommandRecordingContext* commandContext);
+    virtual void UnmapInternal(const ScopedCommandRecordingContext* commandContext);
+
+    // Clear the buffer without checking if the buffer is initialized.
+    virtual MaybeError ClearInternal(const ScopedCommandRecordingContext* commandContext,
+                                     uint8_t clearValue,
+                                     uint64_t offset = 0,
+                                     uint64_t size = 0);
+
+    virtual uint8_t* GetUploadData();
+
+    raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr;
+
+  private:
     MaybeError Initialize(bool mappedAtCreation,
                           const ScopedCommandRecordingContext* commandContext);
     MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override;
@@ -133,15 +150,7 @@
     MaybeError MapAtCreationImpl() override;
     void* GetMappedPointer() override;
 
-    MaybeError MapInternal(const ScopedCommandRecordingContext* commandContext);
-    void UnmapInternal(const ScopedCommandRecordingContext* commandContext);
-
     MaybeError InitializeToZero(const ScopedCommandRecordingContext* commandContext);
-    // Clear the buffer without checking if the buffer is initialized.
-    MaybeError ClearInternal(const ScopedCommandRecordingContext* commandContext,
-                             uint8_t clearValue,
-                             uint64_t offset = 0,
-                             uint64_t size = 0);
     // Write the buffer without checking if the buffer is initialized.
     MaybeError WriteInternal(const ScopedCommandRecordingContext* commandContext,
                              uint64_t bufferOffset,
@@ -159,7 +168,6 @@
     // The buffer object for non-constant buffer usages(e.g. storage buffer, vertex buffer, etc.)
     ComPtr<ID3D11Buffer> mD3d11NonConstantBuffer;
     bool mConstantBufferIsUpdated = true;
-    raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr;
     ExecutionSerial mMapReadySerial = kMaxExecutionSerial;
 };
 
diff --git a/src/dawn/native/d3d11/DeviceD3D11.cpp b/src/dawn/native/d3d11/DeviceD3D11.cpp
index e194452..dc5b66d 100644
--- a/src/dawn/native/d3d11/DeviceD3D11.cpp
+++ b/src/dawn/native/d3d11/DeviceD3D11.cpp
@@ -560,7 +560,8 @@
     Ref<BufferBase> buffer;
     // We don't cache the buffer if it's too large.
     if (bufferSize > kMaxStagingBufferSize) {
-        DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext));
+        DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext,
+                                               /*allowUploadBufferEmulation=*/false));
         return buffer;
     }
 
@@ -580,7 +581,8 @@
     }
 
     // Create a new staging buffer as no existing one can be re-used.
-    DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext));
+    DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext,
+                                           /*allowUploadBufferEmulation=*/false));
     mTotalStagingBufferSize += bufferSize;
 
     // Purge the old staging buffers if the total size is too large.
diff --git a/src/dawn/tests/end2end/BufferTests.cpp b/src/dawn/tests/end2end/BufferTests.cpp
index f7da0de..6a5a7ac 100644
--- a/src/dawn/tests/end2end/BufferTests.cpp
+++ b/src/dawn/tests/end2end/BufferTests.cpp
@@ -120,6 +120,13 @@
         descriptor.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
         return device.CreateBuffer(&descriptor);
     }
+
+    wgpu::Buffer CreateUniformBuffer(uint64_t size) {
+        wgpu::BufferDescriptor descriptor;
+        descriptor.size = size;
+        descriptor.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
+        return device.CreateBuffer(&descriptor);
+    }
 };
 
 void CheckMapping(const void* actual, const void* expected, size_t size) {
@@ -716,9 +723,10 @@
             constexpr int kRepeatCount = 50;
             constexpr int kBufferSize = 1024 * 1024 * 10;
             wgpu::Buffer tempWriteBuffer = CreateMapWriteBuffer(kBufferSize);
-            wgpu::Buffer tempReadBuffer = CreateMapReadBuffer(kBufferSize);
+            wgpu::Buffer tempReadBuffer = CreateUniformBuffer(kBufferSize);
             for (int i = 0; i < kRepeatCount; ++i) {
-                encoder.CopyBufferToBuffer(tempWriteBuffer, 0, tempReadBuffer, 0, kBufferSize);
+                encoder.CopyBufferToBuffer(tempWriteBuffer, 0, tempReadBuffer, 0,
+                                           kBufferSize - 1024);
             }
         }