d3d11: Use multiple staging buffers

Keep several staging buffers and track each one's 'mLastUsageSerial' so that
a buffer whose last use has already completed can be picked for re-use, which
avoids buffer map stalls in Buffer::WriteInternal.

Bug: dawn:2357
Change-Id: I7bda152640d329608702d9af7639c39bf1bd1c6a
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/170920
Commit-Queue: Jie A Chen <jie.a.chen@intel.com>
Reviewed-by: Austin Eng <enga@chromium.org>
Reviewed-by: Peng Huang <penghuang@chromium.org>
diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp
index d7a4a41..e508ce2 100644
--- a/src/dawn/native/Buffer.cpp
+++ b/src/dawn/native/Buffer.cpp
@@ -877,6 +877,10 @@
     mLastUsageSerial = serial;
 }
 
+ExecutionSerial BufferBase::GetLastUsageSerial() const {
+    return mLastUsageSerial;
+}
+
 void BufferBase::SetHasAccess(bool hasAccess) {
     mState = hasAccess ? BufferState::Unmapped : BufferState::SharedMemoryNoAccess;
 }
diff --git a/src/dawn/native/Buffer.h b/src/dawn/native/Buffer.h
index 20b13ea..5db99ce 100644
--- a/src/dawn/native/Buffer.h
+++ b/src/dawn/native/Buffer.h
@@ -86,6 +86,7 @@
 
     uint64_t GetSize() const;
     uint64_t GetAllocatedSize() const;
+    ExecutionSerial GetLastUsageSerial() const;
 
     // |GetUsageExternalOnly| returns the usage with which the buffer was created using the
     // base WebGPU API. Additional usages may be added for internal state tracking. |GetUsage|
diff --git a/src/dawn/native/d3d11/BufferD3D11.cpp b/src/dawn/native/d3d11/BufferD3D11.cpp
index 9f463f4..a571a37 100644
--- a/src/dawn/native/d3d11/BufferD3D11.cpp
+++ b/src/dawn/native/d3d11/BufferD3D11.cpp
@@ -630,11 +630,14 @@
     // mD3d11ConstantBuffer.
     Ref<BufferBase> stagingBuffer;
     DAWN_TRY_ASSIGN(stagingBuffer, ToBackend(GetDevice())->GetStagingBuffer(commandContext, size));
-
+    stagingBuffer->MarkUsedInPendingCommands();
     DAWN_TRY(ToBackend(stagingBuffer)->WriteInternal(commandContext, 0, data, size));
+    DAWN_TRY(Buffer::CopyInternal(commandContext, ToBackend(stagingBuffer.Get()),
+                                  /*sourceOffset=*/0,
+                                  /*size=*/size, this, offset));
+    ToBackend(GetDevice())->ReturnStagingBuffer(std::move(stagingBuffer));
 
-    return Buffer::CopyInternal(commandContext, ToBackend(stagingBuffer.Get()), /*sourceOffset=*/0,
-                                /*size=*/size, this, offset);
+    return {};
 }
 
 // static
diff --git a/src/dawn/native/d3d11/DeviceD3D11.cpp b/src/dawn/native/d3d11/DeviceD3D11.cpp
index e42ad64..e194452 100644
--- a/src/dawn/native/d3d11/DeviceD3D11.cpp
+++ b/src/dawn/native/d3d11/DeviceD3D11.cpp
@@ -378,7 +378,7 @@
     DAWN_ASSERT(GetState() == State::Disconnected);
 
     mImplicitPixelLocalStorageAttachmentTextureViews = {};
-    mStagingBuffer = nullptr;
+    mStagingBuffers.clear();
 
     Base::DestroyImpl();
 }
@@ -550,27 +550,58 @@
 ResultOrError<Ref<BufferBase>> Device::GetStagingBuffer(
     const ScopedCommandRecordingContext* commandContext,
     uint64_t size) {
-    constexpr uint64_t kMinSize = 4 * 1024;
-    constexpr uint64_t kMaxSize = 16 * 1024 * 1024;
-    uint64_t bufferSize = mStagingBuffer.Get() ? mStagingBuffer->GetSize() : 0;
-    if (size > bufferSize) {
-        bufferSize = Align(size, kMinSize);
-        BufferDescriptor descriptor;
-        descriptor.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
-        descriptor.size = bufferSize;
-        descriptor.mappedAtCreation = false;
-        descriptor.label = "DawnDeviceStagingBuffer";
-        Ref<BufferBase> buffer;
+    constexpr uint64_t kMinStagingBufferSize = 4 * 1024;
+    uint64_t bufferSize = Align(size, kMinStagingBufferSize);
+    BufferDescriptor descriptor;
+    descriptor.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
+    descriptor.size = bufferSize;
+    descriptor.mappedAtCreation = false;
+    descriptor.label = "DawnDeviceStagingBuffer";
+    Ref<BufferBase> buffer;
+    // We don't cache the buffer if it's too large.
+    if (bufferSize > kMaxStagingBufferSize) {
         DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext));
-        // We don't cache the buffer if it's too large.
-        if (bufferSize > kMaxSize) {
+        return buffer;
+    }
+
+    ExecutionSerial completedSerial = GetQueue()->GetCompletedCommandSerial();
+    for (auto it = mStagingBuffers.begin(); it != mStagingBuffers.end(); ++it) {
+        if ((*it)->GetLastUsageSerial() > completedSerial) {
+            // Neither this buffer nor any after it is ready, so stop the search.
+            break;
+        }
+
+        if ((*it)->GetSize() >= bufferSize) {
+            // This buffer is large enough. Remove it from the cache and stop searching.
+            buffer = *it;
+            mStagingBuffers.erase(it);
             return buffer;
         }
-        mStagingBuffer = buffer;
     }
-    // Ensure there is no more than 1 active usage of the staging buffer.
-    DAWN_ASSERT(mStagingBuffer->GetRefCountForTesting() <= 1);
-    return mStagingBuffer;
+
+    // Create a new staging buffer as no existing one can be re-used.
+    DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext));
+    mTotalStagingBufferSize += bufferSize;
+
+    // Purge the old staging buffers if the total size is too large.
+    constexpr uint64_t kMaxTotalSize = 16 * 1024 * 1024;
+    for (auto it = mStagingBuffers.begin(); it != mStagingBuffers.end() &&
+                                            mTotalStagingBufferSize > kMaxTotalSize &&
+                                            (*it)->GetLastUsageSerial() <= completedSerial;) {
+        mTotalStagingBufferSize -= (*it)->GetSize();
+        it = mStagingBuffers.erase(it);
+    }
+
+    return buffer;
+}
+
+void Device::ReturnStagingBuffer(Ref<BufferBase>&& buffer) {
+    DAWN_ASSERT(mStagingBuffers.empty() ||
+                mStagingBuffers.back()->GetLastUsageSerial() <= buffer->GetLastUsageSerial());
+    // Only buffers no larger than kMaxStagingBufferSize are cached for re-use.
+    if (buffer->GetSize() <= kMaxStagingBufferSize) {
+        mStagingBuffers.push_back(std::move(buffer));
+    }
 }
 
 }  // namespace dawn::native::d3d11
diff --git a/src/dawn/native/d3d11/DeviceD3D11.h b/src/dawn/native/d3d11/DeviceD3D11.h
index 7d34cb1..21aca83 100644
--- a/src/dawn/native/d3d11/DeviceD3D11.h
+++ b/src/dawn/native/d3d11/DeviceD3D11.h
@@ -99,15 +99,16 @@
         uint32_t implicitAttachmentIndex);
 
     // Grab a staging buffer, the size of which is no less than 'size'.
-    // Note: We assume only 1 staging buffer is active, so the client should release it as soon as
-    // possbile once the buffer usage is done.
+    // The buffer must be returned before the current pending serial advances.
     ResultOrError<Ref<BufferBase>> GetStagingBuffer(
         const ScopedCommandRecordingContext* commandContext,
         uint64_t size);
+    void ReturnStagingBuffer(Ref<BufferBase>&& buffer);
 
   private:
     using Base = d3d::Device;
     using Base::Base;
+    static constexpr uint64_t kMaxStagingBufferSize = 512 * 1024;
 
     ResultOrError<Ref<BindGroupBase>> CreateBindGroupImpl(
         const BindGroupDescriptor* descriptor) override;
@@ -162,8 +163,9 @@
     // TODO(dawn:1704): decide when to clear the cached implicit pixel local storage attachments.
     std::array<Ref<TextureViewBase>, kMaxPLSSlots> mImplicitPixelLocalStorageAttachmentTextureViews;
 
-    // The cached staging buffer.
-    Ref<BufferBase> mStagingBuffer;
+    // The cached staging buffers.
+    std::vector<Ref<BufferBase>> mStagingBuffers;
+    uint64_t mTotalStagingBufferSize = 0;
 };
 
 }  // namespace dawn::native::d3d11