Add Buffer::GetAllocatedSize()

Buffer allocations may need padding based on backend requirements.
GetAllocatedSize returns the size of the buffer actually allocated.

This CL also updates the backends to use GetAllocatedSize for buffer
clearing and barrier operations which should operate over the
entire allocated resource.

Bug: dawn:1011
Change-Id: Ic5233214414fa090725a4f50fff70d6e178494c3
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/60701
Reviewed-by: Stephen White <senorblanco@chromium.org>
Commit-Queue: Austin Eng <enga@chromium.org>
diff --git a/src/dawn_native/Buffer.cpp b/src/dawn_native/Buffer.cpp
index 7bb1633..3e1b72f 100644
--- a/src/dawn_native/Buffer.cpp
+++ b/src/dawn_native/Buffer.cpp
@@ -177,6 +177,13 @@
         return mSize;
     }
 
+    uint64_t BufferBase::GetAllocatedSize() const {
+        ASSERT(!IsError());
+        // The backend must initialize this value.
+        ASSERT(mAllocatedSize != 0);
+        return mAllocatedSize;
+    }
+
     wgpu::BufferUsage BufferBase::GetUsage() const {
         ASSERT(!IsError());
         return mUsage;
@@ -185,13 +192,29 @@
     MaybeError BufferBase::MapAtCreation() {
         DAWN_TRY(MapAtCreationInternal());
 
+        void* ptr;
+        size_t size;
+        if (mSize == 0) {
+            return {};
+        } else if (mStagingBuffer) {
+            // If there is a staging buffer for initialization, clear its contents directly.
+            // It should be exactly as large as the buffer allocation.
+            ptr = mStagingBuffer->GetMappedPointer();
+            size = mStagingBuffer->GetSize();
+            ASSERT(size == GetAllocatedSize());
+        } else {
+            // Otherwise, the buffer is directly mappable on the CPU.
+            ptr = GetMappedPointerImpl();
+            size = GetAllocatedSize();
+        }
+
         DeviceBase* device = GetDevice();
         if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
-            memset(GetMappedRange(0, mSize), uint8_t(0u), mSize);
+            memset(ptr, uint8_t(0u), size);
             SetIsDataInitialized();
             device->IncrementLazyClearCountForTesting();
         } else if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
-            memset(GetMappedRange(0, mSize), uint8_t(1u), mSize);
+            memset(ptr, uint8_t(1u), size);
         }
 
         return {};
@@ -215,9 +238,12 @@
         } else {
             // If any of these fail, the buffer will be deleted and replaced with an
             // error buffer.
+            // The staging buffer is used to return mappable data to inititalize the buffer
+            // contents. Allocate one as large as the real buffer size so that every byte is
+            // initialized.
             // TODO(crbug.com/dawn/828): Suballocate and reuse memory from a larger staging buffer
             // so we don't create many small buffers.
-            DAWN_TRY_ASSIGN(mStagingBuffer, GetDevice()->CreateStagingBuffer(GetSize()));
+            DAWN_TRY_ASSIGN(mStagingBuffer, GetDevice()->CreateStagingBuffer(GetAllocatedSize()));
         }
 
         return {};
@@ -343,11 +369,14 @@
 
     MaybeError BufferBase::CopyFromStagingBuffer() {
         ASSERT(mStagingBuffer);
-        if (GetSize() == 0) {
+        if (mSize == 0) {
+            // Staging buffer is not created if zero size.
+            ASSERT(mStagingBuffer == nullptr);
             return {};
         }
 
-        DAWN_TRY(GetDevice()->CopyFromStagingToBuffer(mStagingBuffer.get(), 0, this, 0, GetSize()));
+        DAWN_TRY(GetDevice()->CopyFromStagingToBuffer(mStagingBuffer.get(), 0, this, 0,
+                                                      GetAllocatedSize()));
 
         DynamicUploader* uploader = GetDevice()->GetDynamicUploader();
         uploader->ReleaseStagingBuffer(std::move(mStagingBuffer));
diff --git a/src/dawn_native/Buffer.h b/src/dawn_native/Buffer.h
index b3a303c..543ba71 100644
--- a/src/dawn_native/Buffer.h
+++ b/src/dawn_native/Buffer.h
@@ -54,6 +54,7 @@
         static BufferBase* MakeError(DeviceBase* device, const BufferDescriptor* descriptor);
 
         uint64_t GetSize() const;
+        uint64_t GetAllocatedSize() const;
         wgpu::BufferUsage GetUsage() const;
 
         MaybeError MapAtCreation();
@@ -89,6 +90,8 @@
 
         MaybeError MapAtCreationInternal();
 
+        uint64_t mAllocatedSize = 0;
+
       private:
         virtual MaybeError MapAtCreationImpl() = 0;
         virtual MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) = 0;
diff --git a/src/dawn_native/d3d12/BufferD3D12.cpp b/src/dawn_native/d3d12/BufferD3D12.cpp
index aea387e..97a90be 100644
--- a/src/dawn_native/d3d12/BufferD3D12.cpp
+++ b/src/dawn_native/d3d12/BufferD3D12.cpp
@@ -102,17 +102,25 @@
     }
 
     MaybeError Buffer::Initialize(bool mappedAtCreation) {
-        D3D12_RESOURCE_DESC resourceDescriptor;
-        resourceDescriptor.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
-        resourceDescriptor.Alignment = 0;
         // D3D buffers are always resource size aligned to 64KB. However, D3D12's validation forbids
         // binding a CBV to an unaligned size. To prevent, one can always safely align the buffer
         // desc size to the CBV data alignment as other buffer usages ignore it (no size check).
         // The validation will still enforce bound checks with the unaligned size returned by
         // GetSize().
         // https://docs.microsoft.com/en-us/windows/win32/direct3d12/uploading-resources#buffer-alignment
-        resourceDescriptor.Width =
-            Align(std::max(GetSize(), uint64_t(4u)), D3D12BufferSizeAlignment(GetUsage()));
+        // Allocate at least 4 bytes so clamped accesses are always in bounds.
+        uint64_t size = std::max(GetSize(), uint64_t(4u));
+        size_t alignment = D3D12BufferSizeAlignment(GetUsage());
+        if (size > std::numeric_limits<uint64_t>::max() - alignment) {
+            // Alignment would overlow.
+            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+        }
+        mAllocatedSize = Align(size, alignment);
+
+        D3D12_RESOURCE_DESC resourceDescriptor;
+        resourceDescriptor.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+        resourceDescriptor.Alignment = 0;
+        resourceDescriptor.Width = mAllocatedSize;
         resourceDescriptor.Height = 1;
         resourceDescriptor.DepthOrArraySize = 1;
         resourceDescriptor.MipLevels = 1;
@@ -328,7 +336,7 @@
 
         // The buffers with mappedAtCreation == true will be initialized in
         // BufferBase::MapAtCreation().
-        DAWN_TRY(MapInternal(true, 0, size_t(GetSize()), "D3D12 map at creation"));
+        DAWN_TRY(MapInternal(true, 0, size_t(GetAllocatedSize()), "D3D12 map at creation"));
 
         return {};
     }
@@ -442,22 +450,23 @@
         // The state of the buffers on UPLOAD heap must always be GENERIC_READ and cannot be
         // changed away, so we can only clear such buffer with buffer mapping.
         if (D3D12HeapType(GetUsage()) == D3D12_HEAP_TYPE_UPLOAD) {
-            DAWN_TRY(MapInternal(true, 0, size_t(GetSize()), "D3D12 map at clear buffer"));
-            memset(mMappedData, clearValue, GetSize());
+            DAWN_TRY(MapInternal(true, 0, size_t(GetAllocatedSize()), "D3D12 map at clear buffer"));
+            memset(mMappedData, clearValue, GetAllocatedSize());
             UnmapImpl();
         } else {
             // TODO(crbug.com/dawn/852): use ClearUnorderedAccessView*() when the buffer usage
             // includes STORAGE.
             DynamicUploader* uploader = device->GetDynamicUploader();
             UploadHandle uploadHandle;
-            DAWN_TRY_ASSIGN(uploadHandle,
-                            uploader->Allocate(GetSize(), device->GetPendingCommandSerial(),
-                                               kCopyBufferToBufferOffsetAlignment));
+            DAWN_TRY_ASSIGN(uploadHandle, uploader->Allocate(GetAllocatedSize(),
+                                                             device->GetPendingCommandSerial(),
+                                                             kCopyBufferToBufferOffsetAlignment));
 
-            memset(uploadHandle.mappedBuffer, clearValue, GetSize());
+            memset(uploadHandle.mappedBuffer, clearValue, GetAllocatedSize());
 
             device->CopyFromStagingToBufferImpl(commandContext, uploadHandle.stagingBuffer,
-                                                uploadHandle.startOffset, this, 0, GetSize());
+                                                uploadHandle.startOffset, this, 0,
+                                                GetAllocatedSize());
         }
 
         return {};
diff --git a/src/dawn_native/metal/BufferMTL.mm b/src/dawn_native/metal/BufferMTL.mm
index 6abf102..237fcd0 100644
--- a/src/dawn_native/metal/BufferMTL.mm
+++ b/src/dawn_native/metal/BufferMTL.mm
@@ -48,21 +48,30 @@
         if (GetSize() > std::numeric_limits<NSUInteger>::max()) {
             return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
         }
-        NSUInteger currentSize = static_cast<NSUInteger>(std::max(GetSize(), uint64_t(4u)));
+
+        uint32_t alignment = 1;
+#ifdef DAWN_PLATFORM_MACOS
+        // [MTLBlitCommandEncoder fillBuffer] requires the size to be a multiple of 4 on MacOS.
+        alignment = 4;
+#endif
 
         // Metal validation layer requires the size of uniform buffer and storage buffer to be no
         // less than the size of the buffer block defined in shader, and the overall size of the
         // buffer must be aligned to the largest alignment of its members.
         if (GetUsage() &
             (wgpu::BufferUsage::Uniform | wgpu::BufferUsage::Storage | kInternalStorageBuffer)) {
-            if (currentSize >
-                std::numeric_limits<NSUInteger>::max() - kMinUniformOrStorageBufferAlignment) {
-                // Alignment would overlow.
-                return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
-            }
-            currentSize = Align(currentSize, kMinUniformOrStorageBufferAlignment);
+            ASSERT(IsAligned(kMinUniformOrStorageBufferAlignment, alignment));
+            alignment = kMinUniformOrStorageBufferAlignment;
         }
 
+        // Allocate at least 4 bytes so clamped accesses are always in bounds.
+        NSUInteger currentSize = static_cast<NSUInteger>(std::max(GetSize(), uint64_t(4u)));
+        if (currentSize > std::numeric_limits<NSUInteger>::max() - alignment) {
+            // Alignment would overlow.
+            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+        }
+        currentSize = Align(currentSize, kMinUniformOrStorageBufferAlignment);
+
         if (@available(iOS 12, macOS 10.14, *)) {
             NSUInteger maxBufferSize = [ToBackend(GetDevice())->GetMTLDevice() maxBufferLength];
             if (currentSize > maxBufferSize) {
@@ -83,6 +92,7 @@
             return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
         }
 
+        mAllocatedSize = currentSize;
         mMtlBuffer.Acquire([ToBackend(GetDevice())->GetMTLDevice()
             newBufferWithLength:currentSize
                         options:storageMode]);
@@ -189,14 +199,9 @@
 
     void Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
         ASSERT(commandContext != nullptr);
-
-        // Metal validation layer doesn't allow the length of the range in fillBuffer() to be 0.
-        if (GetSize() == 0u) {
-            return;
-        }
-
+        ASSERT(GetAllocatedSize() > 0);
         [commandContext->EnsureBlit() fillBuffer:mMtlBuffer.Get()
-                                           range:NSMakeRange(0, GetSize())
+                                           range:NSMakeRange(0, GetAllocatedSize())
                                            value:clearValue];
     }
 
diff --git a/src/dawn_native/null/DeviceNull.cpp b/src/dawn_native/null/DeviceNull.cpp
index 6ad06ee..06945e4 100644
--- a/src/dawn_native/null/DeviceNull.cpp
+++ b/src/dawn_native/null/DeviceNull.cpp
@@ -265,6 +265,7 @@
     Buffer::Buffer(Device* device, const BufferDescriptor* descriptor)
         : BufferBase(device, descriptor) {
         mBackingData = std::unique_ptr<uint8_t[]>(new uint8_t[GetSize()]);
+        mAllocatedSize = GetSize();
     }
 
     Buffer::~Buffer() {
diff --git a/src/dawn_native/opengl/BufferGL.cpp b/src/dawn_native/opengl/BufferGL.cpp
index e6b9507..dd90fcc 100644
--- a/src/dawn_native/opengl/BufferGL.cpp
+++ b/src/dawn_native/opengl/BufferGL.cpp
@@ -35,7 +35,8 @@
 
     Buffer::Buffer(Device* device, const BufferDescriptor* descriptor)
         : BufferBase(device, descriptor) {
-        uint64_t size = GetAppliedSize();
+        // Allocate at least 4 bytes so clamped accesses are always in bounds.
+        mAllocatedSize = std::max(GetSize(), uint64_t(4u));
 
         device->gl.GenBuffers(1, &mBuffer);
         device->gl.BindBuffer(GL_ARRAY_BUFFER, mBuffer);
@@ -44,10 +45,11 @@
         // BufferBase::MapAtCreation().
         if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting) &&
             !descriptor->mappedAtCreation) {
-            std::vector<uint8_t> clearValues(size, 1u);
-            device->gl.BufferData(GL_ARRAY_BUFFER, size, clearValues.data(), GL_STATIC_DRAW);
+            std::vector<uint8_t> clearValues(mAllocatedSize, 1u);
+            device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, clearValues.data(),
+                                  GL_STATIC_DRAW);
         } else {
-            device->gl.BufferData(GL_ARRAY_BUFFER, size, nullptr, GL_STATIC_DRAW);
+            device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, nullptr, GL_STATIC_DRAW);
         }
     }
 
@@ -66,10 +68,6 @@
         return mBuffer;
     }
 
-    uint64_t Buffer::GetAppliedSize() const {
-        return std::max(GetSize(), uint64_t(4u));
-    }
-
     void Buffer::EnsureDataInitialized() {
         if (IsDataInitialized() ||
             !GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
@@ -109,7 +107,7 @@
         ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse));
         ASSERT(!IsDataInitialized());
 
-        const uint64_t size = GetAppliedSize();
+        const uint64_t size = GetAllocatedSize();
         Device* device = ToBackend(GetDevice());
 
         const std::vector<uint8_t> clearValues(size, 0u);
diff --git a/src/dawn_native/opengl/BufferGL.h b/src/dawn_native/opengl/BufferGL.h
index 8038123..2259161 100644
--- a/src/dawn_native/opengl/BufferGL.h
+++ b/src/dawn_native/opengl/BufferGL.h
@@ -47,8 +47,6 @@
         MaybeError MapAtCreationImpl() override;
         void* GetMappedPointerImpl() override;
 
-        uint64_t GetAppliedSize() const;
-
         void InitializeToZero();
 
         GLuint mBuffer = 0;
diff --git a/src/dawn_native/vulkan/BufferVk.cpp b/src/dawn_native/vulkan/BufferVk.cpp
index 0a5d1aa..7b796f9 100644
--- a/src/dawn_native/vulkan/BufferVk.cpp
+++ b/src/dawn_native/vulkan/BufferVk.cpp
@@ -137,13 +137,24 @@
     }
 
     MaybeError Buffer::Initialize(bool mappedAtCreation) {
+        // Allocate at least 4 bytes so clamped accesses are always in bounds.
+        // Also, Vulkan requires the size to be non-zero.
+        uint64_t size = std::max(GetSize(), uint64_t(4u));
+        // vkCmdFillBuffer requires the size to be a multiple of 4.
+        size_t alignment = 4u;
+        if (size > std::numeric_limits<uint64_t>::max() - alignment) {
+            // Alignment would overlow.
+            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+        }
+        mAllocatedSize = Align(size, alignment);
+
         // Avoid passing ludicrously large sizes to drivers because it causes issues: drivers add
         // some constants to the size passed and align it, but for values close to the maximum
         // VkDeviceSize this can cause overflows and makes drivers crash or return bad sizes in the
         // VkmemoryRequirements. See https://gitlab.khronos.org/vulkan/vulkan/issues/1904
         // Any size with one of two top bits of VkDeviceSize set is a HUGE allocation and we can
         // safely return an OOM error.
-        if (GetSize() & (uint64_t(3) << uint64_t(62))) {
+        if (mAllocatedSize & (uint64_t(3) << uint64_t(62))) {
             return DAWN_OUT_OF_MEMORY_ERROR("Buffer size is HUGE and could cause overflows");
         }
 
@@ -151,7 +162,7 @@
         createInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
         createInfo.pNext = nullptr;
         createInfo.flags = 0;
-        createInfo.size = std::max(GetSize(), uint64_t(4u));
+        createInfo.size = mAllocatedSize;
         // Add CopyDst for non-mappable buffer initialization with mappedAtCreation
         // and robust resource initialization.
         createInfo.usage = VulkanBufferUsage(GetUsage() | wgpu::BufferUsage::CopyDst);
@@ -243,10 +254,8 @@
         barrier->dstQueueFamilyIndex = 0;
         barrier->buffer = mHandle;
         barrier->offset = 0;
-        // Size must be non-zero or VK_WHOLE_SIZE. Use WHOLE_SIZE
-        // instead of GetSize() because the buffer allocation may
-        // be padded.
-        barrier->size = VK_WHOLE_SIZE;
+        // VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
+        barrier->size = GetAllocatedSize();
 
         mLastUsage = usage;
 
@@ -347,18 +356,15 @@
 
     void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue) {
         ASSERT(recordingContext != nullptr);
-
-        // Vulkan validation layer doesn't allow the `size` in vkCmdFillBuffer() to be 0.
-        if (GetSize() == 0u) {
-            return;
-        }
+        ASSERT(GetAllocatedSize() > 0);
 
         TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
 
         Device* device = ToBackend(GetDevice());
-        // This code is fine. According to jiawei.shao@intel.com, VK_WHOLE_SIZE doesn't work
-        // on old Windows Intel Vulkan drivers, so we don't use it.
-        device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, 0, GetSize(),
+        // VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
+        // Note: Allocated size must be a multiple of 4.
+        ASSERT(GetAllocatedSize() % 4 == 0);
+        device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, 0, GetAllocatedSize(),
                                  clearValue);
     }
 }}  // namespace dawn_native::vulkan