Plumbs alignment offset through to RingBufferAllocator.

- By plumbing the offset through, the allocator can be smarter and only
  allocate extra space for padding when the next sub-allocation would
  actually be misaligned, rather than always over-allocating by
  (alignment - 1) bytes.
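
  For illustration (the values below are hypothetical), the padding in
  RingBufferAllocator::Allocate is now computed from the current end of
  the used region. With mUsedEndOffset = 6 and offsetAlignment = 4:

      alignmentOffset      = Align(6, 4) - 6 = 8 - 6 = 2
      alignedUsedEndOffset = 6 + 2 = 8

  The sub-allocation starts at offset 8 and only 2 bytes of padding are
  charged, whereas DynamicUploader::Allocate previously always reserved
  allocationSize + offsetAlignment - 1 bytes and aligned the returned
  offset after the fact.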

Bug: dawn:1985
Change-Id: I25f6762683cdd787e9a9d5ef3519231955a24914
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/151280
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Commit-Queue: Loko Kung <lokokung@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
diff --git a/src/dawn/native/DynamicUploader.cpp b/src/dawn/native/DynamicUploader.cpp
index f1de07b..980c441 100644
--- a/src/dawn/native/DynamicUploader.cpp
+++ b/src/dawn/native/DynamicUploader.cpp
@@ -32,7 +32,8 @@
 }
 
 ResultOrError<UploadHandle> DynamicUploader::AllocateInternal(uint64_t allocationSize,
-                                                              ExecutionSerial serial) {
+                                                              ExecutionSerial serial,
+                                                              uint64_t offsetAlignment) {
     // Disable further sub-allocation should the request be too large.
     if (allocationSize > kRingBufferSize) {
         BufferDescriptor bufferDesc = {};
@@ -54,25 +55,20 @@
     }
 
     // Note: Validation ensures size is already aligned.
-    // First-fit: find next smallest buffer large enough to satisfy the allocation request.
+    // First-fit: find the first buffer large enough to satisfy the allocation request.
+    uint64_t startOffset = RingBufferAllocator::kInvalidOffset;
     RingBuffer* targetRingBuffer = mRingBuffers.back().get();
     for (auto& ringBuffer : mRingBuffers) {
-        const RingBufferAllocator& ringBufferAllocator = ringBuffer->mAllocator;
+        RingBufferAllocator& ringBufferAllocator = ringBuffer->mAllocator;
         // Prevent overflow.
         DAWN_ASSERT(ringBufferAllocator.GetSize() >= ringBufferAllocator.GetUsedSize());
-        const uint64_t remainingSize =
-            ringBufferAllocator.GetSize() - ringBufferAllocator.GetUsedSize();
-        if (allocationSize <= remainingSize) {
+        startOffset = ringBufferAllocator.Allocate(allocationSize, serial, offsetAlignment);
+        if (startOffset != RingBufferAllocator::kInvalidOffset) {
             targetRingBuffer = ringBuffer.get();
             break;
         }
     }
 
-    uint64_t startOffset = RingBufferAllocator::kInvalidOffset;
-    if (targetRingBuffer != nullptr) {
-        startOffset = targetRingBuffer->mAllocator.Allocate(allocationSize, serial);
-    }
-
     // Upon failure, append a newly created ring buffer to fulfill the
     // request.
     if (startOffset == RingBufferAllocator::kInvalidOffset) {
@@ -126,19 +122,11 @@
     mReleasedStagingBuffers.ClearUpTo(lastCompletedSerial);
 }
 
-// TODO(dawn:512): Optimize this function so that it doesn't allocate additional memory
-// when it's not necessary.
 ResultOrError<UploadHandle> DynamicUploader::Allocate(uint64_t allocationSize,
                                                       ExecutionSerial serial,
                                                       uint64_t offsetAlignment) {
     DAWN_ASSERT(offsetAlignment > 0);
-    UploadHandle uploadHandle;
-    DAWN_TRY_ASSIGN(uploadHandle, AllocateInternal(allocationSize + offsetAlignment - 1, serial));
-    uint64_t additionalOffset =
-        Align(uploadHandle.startOffset, offsetAlignment) - uploadHandle.startOffset;
-    uploadHandle.mappedBuffer = static_cast<uint8_t*>(uploadHandle.mappedBuffer) + additionalOffset;
-    uploadHandle.startOffset += additionalOffset;
-    return uploadHandle;
+    return AllocateInternal(allocationSize, serial, offsetAlignment);
 }
 
 bool DynamicUploader::ShouldFlush() {
diff --git a/src/dawn/native/DynamicUploader.h b/src/dawn/native/DynamicUploader.h
index e49ae0c..80b8f11 100644
--- a/src/dawn/native/DynamicUploader.h
+++ b/src/dawn/native/DynamicUploader.h
@@ -63,7 +63,9 @@
         RingBufferAllocator mAllocator;
     };
 
-    ResultOrError<UploadHandle> AllocateInternal(uint64_t allocationSize, ExecutionSerial serial);
+    ResultOrError<UploadHandle> AllocateInternal(uint64_t allocationSize,
+                                                 ExecutionSerial serial,
+                                                 uint64_t offsetAlignment);
 
     std::vector<std::unique_ptr<RingBuffer>> mRingBuffers;
     SerialQueue<ExecutionSerial, Ref<BufferBase>> mReleasedStagingBuffers;
diff --git a/src/dawn/native/RingBufferAllocator.cpp b/src/dawn/native/RingBufferAllocator.cpp
index 3d02699..828d04dd 100644
--- a/src/dawn/native/RingBufferAllocator.cpp
+++ b/src/dawn/native/RingBufferAllocator.cpp
@@ -16,6 +16,8 @@
 
 #include <utility>
 
+#include "dawn/common/Math.h"
+
 // Note: Current RingBufferAllocator implementation uses two indices (start and end) to implement a
 // circular queue. However, this approach defines a full queue when one element is still unused.
 //
@@ -70,7 +72,9 @@
 // queue, which identifies an existing (or new) frames-worth of resources. Internally, the
 // ring-buffer maintains offsets of 3 "memory" states: Free, Reclaimed, and Used. This is done
 // in FIFO order as older frames would free resources before newer ones.
-uint64_t RingBufferAllocator::Allocate(uint64_t allocationSize, ExecutionSerial serial) {
+uint64_t RingBufferAllocator::Allocate(uint64_t allocationSize,
+                                       ExecutionSerial serial,
+                                       uint64_t offsetAlignment) {
     // Check if the buffer is full by comparing the used size.
     // If the buffer is not split where waste occurs (e.g. cannot fit new sub-alloc in front), a
     // subsequent sub-alloc could fail where the used size was previously adjusted to include
@@ -86,43 +90,45 @@
     }
 
     uint64_t startOffset = kInvalidOffset;
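+    // Bytes charged to the current request, including alignment padding and any
+    // space wasted at the end of the buffer when wrapping to the front.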
+    uint64_t currentRequestSize = 0u;
+
+    // Compute an alignment offset for the buffer if allocating at the end.
+    const uint64_t alignmentOffset = Align(mUsedEndOffset, offsetAlignment) - mUsedEndOffset;
+    const uint64_t alignedUsedEndOffset = mUsedEndOffset + alignmentOffset;
 
     // Check if the buffer is NOT split (i.e. sub-alloc on ends)
     if (mUsedStartOffset <= mUsedEndOffset) {
         // Order is important (try to sub-alloc at end first).
         // This is due to FIFO order where sub-allocs are inserted from left-to-right (when not
         // wrapped).
-        if (mUsedEndOffset + allocationSize <= mMaxBlockSize) {
-            startOffset = mUsedEndOffset;
-            mUsedEndOffset += allocationSize;
-            mUsedSize += allocationSize;
-            mCurrentRequestSize += allocationSize;
+        if (alignedUsedEndOffset + allocationSize <= mMaxBlockSize) {
+            startOffset = alignedUsedEndOffset;
+            mUsedSize += allocationSize + alignmentOffset;
+            currentRequestSize = allocationSize + alignmentOffset;
         } else if (allocationSize <= mUsedStartOffset) {  // Try to sub-alloc at front.
             // Count the space at the end so that a subsequent
             // sub-alloc cannot fail when the buffer is full.
             const uint64_t requestSize = (mMaxBlockSize - mUsedEndOffset) + allocationSize;
 
             startOffset = 0;
-            mUsedEndOffset = allocationSize;
             mUsedSize += requestSize;
-            mCurrentRequestSize += requestSize;
+            currentRequestSize = requestSize;
         }
-    } else if (mUsedEndOffset + allocationSize <=
-               mUsedStartOffset) {  // Otherwise, buffer is split where sub-alloc must be
-                                    // in-between.
-        startOffset = mUsedEndOffset;
-        mUsedEndOffset += allocationSize;
-        mUsedSize += allocationSize;
-        mCurrentRequestSize += allocationSize;
+    } else if (alignedUsedEndOffset + allocationSize <= mUsedStartOffset) {
+        // Otherwise, buffer is split where sub-alloc must be in-between.
+        startOffset = alignedUsedEndOffset;
+        mUsedSize += allocationSize + alignmentOffset;
+        currentRequestSize = allocationSize + alignmentOffset;
     }
 
     if (startOffset != kInvalidOffset) {
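+        // startOffset is already aligned, so the used region ends immediately
+        // after this sub-allocation.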
+        mUsedEndOffset = startOffset + allocationSize;
+
         Request request;
         request.endOffset = mUsedEndOffset;
-        request.size = mCurrentRequestSize;
+        request.size = currentRequestSize;
 
         mInflightRequests.Enqueue(std::move(request), serial);
-        mCurrentRequestSize = 0;  // reset
     }
 
     return startOffset;
diff --git a/src/dawn/native/RingBufferAllocator.h b/src/dawn/native/RingBufferAllocator.h
index 6aeb142..e3b6840 100644
--- a/src/dawn/native/RingBufferAllocator.h
+++ b/src/dawn/native/RingBufferAllocator.h
@@ -33,7 +33,9 @@
 
     RingBufferAllocator& operator=(const RingBufferAllocator&);
 
-    uint64_t Allocate(uint64_t allocationSize, ExecutionSerial serial);
+    uint64_t Allocate(uint64_t allocationSize,
+                      ExecutionSerial serial,
+                      uint64_t offsetAlignment = 1);
     void Deallocate(ExecutionSerial lastCompletedSerial);
 
     uint64_t GetSize() const;
@@ -55,8 +57,6 @@
     uint64_t mUsedStartOffset = 0;  // Head of used sub-alloc requests (in bytes).
     uint64_t mMaxBlockSize = 0;     // Max size of the ring buffer (in bytes).
     uint64_t mUsedSize = 0;         // Size of the sub-alloc requests (in bytes) of the ring buffer.
-    uint64_t mCurrentRequestSize =
-        0;  // Size of the sub-alloc requests (in bytes) of the current serial.
 };
 }  // namespace dawn::native
 
diff --git a/src/dawn/tests/end2end/QueueTests.cpp b/src/dawn/tests/end2end/QueueTests.cpp
index 0cbd7c7..f38fdb4 100644
--- a/src/dawn/tests/end2end/QueueTests.cpp
+++ b/src/dawn/tests/end2end/QueueTests.cpp
@@ -174,6 +174,19 @@
     EXPECT_BUFFER_U32_RANGE_EQ(expectedData.data(), buffer, 0, kElements);
 }
 
+// Test using the max buffer size. Regression test for dawn:1985. We don't validate the buffer
+// contents here since that would take much longer; we only check that no errors occur.
+TEST_P(QueueWriteBufferTests, MaxBufferSizeWriteBuffer) {
+    uint32_t maxBufferSize = GetSupportedLimits().limits.maxBufferSize;
+    wgpu::BufferDescriptor descriptor;
+    descriptor.size = maxBufferSize;
+    descriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::Buffer buffer = device.CreateBuffer(&descriptor);
+
+    std::vector<uint8_t> data(maxBufferSize);
+    queue.WriteBuffer(buffer, 0, data.data(), maxBufferSize);
+}
+
 // Test a special code path: when the dynamic uploader already contains some unaligned
 // data, it might be necessary to use a ring buffer with a properly aligned offset.
 TEST_P(QueueWriteBufferTests, UnalignedDynamicUploader) {