Initialize padding on buffer allocations

Buffer allocations in Dawn may be padded. This padding
could be visible, depending on how the backend
implements robust vertex buffer access. This commit
updates buffer creation to clear all padding bytes
immediately after creation.
It is not counted as a lazy clear.
Also: Add a test that reads off the end of a padded
vertex buffer to check that the padding bytes are also
initialized.

Also: Update Metal buffers to always allocate enough
space for Tint's vertex pulling, and pass the full
allocated size to Tint. While writing the test in this
commit, a bug was found where Tint assumes there is at
least 4 bytes in the vertex pulling buffer. The WebGPU
API currently allows zero-sized bindings, so Dawn needs
to always allocate enough space for one element.

Also: Update Vulkan vertex/index buffers to allocate
at least one more byte than requested because
vkCmdBindVertexBuffers and vkCmdBindIndexBuffer disallow
the offset to be equal to the buffer size. We need
at least one more byte to make zero-sized bindings
at the end of the buffer valid.

Lastly: Update helper so that a T2B copy of a single
row that touches all bytes in a buffer updates the
buffer state to be fully initialized. This is a small
optimization that is necessary to write the test.

Tested-In: https://chromium-review.googlesource.com/c/chromium/src/+/3107782
No-Try: true
Bug: dawn:837
Change-Id: I829f4764509c4ec784c5aeaaf40b6dcbd4be6866
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/62161
Commit-Queue: Austin Eng <enga@chromium.org>
Reviewed-by: Kai Ninomiya <kainino@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
(cherry picked from commit ddd0a0a856a2bd7013e82291974f1107f02fc1a9)
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/62382
diff --git a/src/dawn_native/CommandBuffer.cpp b/src/dawn_native/CommandBuffer.cpp
index 19b8010..b6ba4f0 100644
--- a/src/dawn_native/CommandBuffer.cpp
+++ b/src/dawn_native/CommandBuffer.cpp
@@ -17,6 +17,7 @@
 #include "common/BitSetIterator.h"
 #include "dawn_native/Buffer.h"
 #include "dawn_native/CommandEncoder.h"
+#include "dawn_native/CommandValidation.h"
 #include "dawn_native/Commands.h"
 #include "dawn_native/Format.h"
 #include "dawn_native/Texture.h"
@@ -171,27 +172,37 @@
         ASSERT(copy != nullptr);
 
         if (copy->destination.offset > 0) {
+            // The copy doesn't touch the start of the buffer.
             return false;
         }
 
         const TextureBase* texture = copy->source.texture.Get();
         const TexelBlockInfo& blockInfo =
             texture->GetFormat().GetAspectInfo(copy->source.aspect).block;
+        const uint64_t widthInBlocks = copy->copySize.width / blockInfo.width;
         const uint64_t heightInBlocks = copy->copySize.height / blockInfo.height;
+        const bool multiSlice = copy->copySize.depthOrArrayLayers > 1;
+        const bool multiRow = multiSlice || heightInBlocks > 1;
 
-        if (copy->destination.rowsPerImage > heightInBlocks) {
+        if (multiSlice && copy->destination.rowsPerImage > heightInBlocks) {
+            // There are gaps between slices that aren't overwritten
             return false;
         }
 
-        const uint64_t copyTextureDataSizePerRow =
-            copy->copySize.width / blockInfo.width * blockInfo.byteSize;
-        if (copy->destination.bytesPerRow > copyTextureDataSizePerRow) {
+        const uint64_t copyTextureDataSizePerRow = widthInBlocks * blockInfo.byteSize;
+        if (multiRow && copy->destination.bytesPerRow > copyTextureDataSizePerRow) {
+            // There are gaps between rows that aren't overwritten
             return false;
         }
 
-        const uint64_t overwrittenRangeSize =
-            copyTextureDataSizePerRow * heightInBlocks * copy->copySize.depthOrArrayLayers;
-        if (copy->destination.buffer->GetSize() > overwrittenRangeSize) {
+        // After the above checks, we're sure the copy has no gaps.
+        // Now, compute the total number of bytes written.
+        const uint64_t writtenBytes =
+            ComputeRequiredBytesInCopy(blockInfo, copy->copySize, copy->destination.bytesPerRow,
+                                       copy->destination.rowsPerImage)
+                .AcquireSuccess();
+        if (!copy->destination.buffer->IsFullBufferRange(copy->destination.offset, writtenBytes)) {
+            // The written bytes don't cover the whole buffer.
             return false;
         }
 
diff --git a/src/dawn_native/d3d12/BufferD3D12.cpp b/src/dawn_native/d3d12/BufferD3D12.cpp
index 97a90be..399a02c 100644
--- a/src/dawn_native/d3d12/BufferD3D12.cpp
+++ b/src/dawn_native/d3d12/BufferD3D12.cpp
@@ -168,6 +168,21 @@
             DAWN_TRY(ClearBuffer(commandRecordingContext, uint8_t(1u)));
         }
 
+        // Initialize the padding bytes to zero.
+        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) &&
+            !mappedAtCreation) {
+            uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+            if (paddingBytes > 0) {
+                CommandRecordingContext* commandRecordingContext;
+                DAWN_TRY_ASSIGN(commandRecordingContext,
+                                ToBackend(GetDevice())->GetPendingCommandContext());
+
+                uint32_t clearSize = paddingBytes;
+                uint64_t clearOffset = GetSize();
+                DAWN_TRY(ClearBuffer(commandRecordingContext, 0, clearOffset, clearSize));
+            }
+        }
+
         return {};
     }
 
@@ -444,29 +459,33 @@
         return {};
     }
 
-    MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
+    MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext,
+                                   uint8_t clearValue,
+                                   uint64_t offset,
+                                   uint64_t size) {
         Device* device = ToBackend(GetDevice());
+        size = size > 0 ? size : GetAllocatedSize();
 
         // The state of the buffers on UPLOAD heap must always be GENERIC_READ and cannot be
         // changed away, so we can only clear such buffer with buffer mapping.
         if (D3D12HeapType(GetUsage()) == D3D12_HEAP_TYPE_UPLOAD) {
-            DAWN_TRY(MapInternal(true, 0, size_t(GetAllocatedSize()), "D3D12 map at clear buffer"));
-            memset(mMappedData, clearValue, GetAllocatedSize());
+            DAWN_TRY(MapInternal(true, static_cast<size_t>(offset), static_cast<size_t>(size),
+                                 "D3D12 map at clear buffer"));
+            memset(mMappedData, clearValue, size);
             UnmapImpl();
         } else {
             // TODO(crbug.com/dawn/852): use ClearUnorderedAccessView*() when the buffer usage
             // includes STORAGE.
             DynamicUploader* uploader = device->GetDynamicUploader();
             UploadHandle uploadHandle;
-            DAWN_TRY_ASSIGN(uploadHandle, uploader->Allocate(GetAllocatedSize(),
-                                                             device->GetPendingCommandSerial(),
-                                                             kCopyBufferToBufferOffsetAlignment));
+            DAWN_TRY_ASSIGN(uploadHandle,
+                            uploader->Allocate(size, device->GetPendingCommandSerial(),
+                                               kCopyBufferToBufferOffsetAlignment));
 
-            memset(uploadHandle.mappedBuffer, clearValue, GetAllocatedSize());
+            memset(uploadHandle.mappedBuffer, clearValue, size);
 
             device->CopyFromStagingToBufferImpl(commandContext, uploadHandle.stagingBuffer,
-                                                uploadHandle.startOffset, this, 0,
-                                                GetAllocatedSize());
+                                                uploadHandle.startOffset, this, offset, size);
         }
 
         return {};
diff --git a/src/dawn_native/d3d12/BufferD3D12.h b/src/dawn_native/d3d12/BufferD3D12.h
index 4cd3d51..ed19efa 100644
--- a/src/dawn_native/d3d12/BufferD3D12.h
+++ b/src/dawn_native/d3d12/BufferD3D12.h
@@ -68,7 +68,10 @@
                                                   wgpu::BufferUsage newUsage);
 
         MaybeError InitializeToZero(CommandRecordingContext* commandContext);
-        MaybeError ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
+        MaybeError ClearBuffer(CommandRecordingContext* commandContext,
+                               uint8_t clearValue,
+                               uint64_t offset = 0,
+                               uint64_t size = 0);
 
         ResourceHeapAllocation mResourceAllocation;
         bool mFixedResourceState = false;
diff --git a/src/dawn_native/metal/BufferMTL.h b/src/dawn_native/metal/BufferMTL.h
index 409b89b..0c7c5e2 100644
--- a/src/dawn_native/metal/BufferMTL.h
+++ b/src/dawn_native/metal/BufferMTL.h
@@ -52,7 +52,10 @@
         MaybeError MapAtCreationImpl() override;
 
         void InitializeToZero(CommandRecordingContext* commandContext);
-        void ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
+        void ClearBuffer(CommandRecordingContext* commandContext,
+                         uint8_t clearValue,
+                         uint64_t offset = 0,
+                         uint64_t size = 0);
 
         NSPRef<id<MTLBuffer>> mMtlBuffer;
     };
diff --git a/src/dawn_native/metal/BufferMTL.mm b/src/dawn_native/metal/BufferMTL.mm
index 237fcd0..c2c5a31 100644
--- a/src/dawn_native/metal/BufferMTL.mm
+++ b/src/dawn_native/metal/BufferMTL.mm
@@ -45,10 +45,6 @@
             storageMode = MTLResourceStorageModePrivate;
         }
 
-        if (GetSize() > std::numeric_limits<NSUInteger>::max()) {
-            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
-        }
-
         uint32_t alignment = 1;
 #ifdef DAWN_PLATFORM_MACOS
         // [MTLBlitCommandEncoder fillBuffer] requires the size to be a multiple of 4 on MacOS.
@@ -64,13 +60,25 @@
             alignment = kMinUniformOrStorageBufferAlignment;
         }
 
-        // Allocate at least 4 bytes so clamped accesses are always in bounds.
-        NSUInteger currentSize = static_cast<NSUInteger>(std::max(GetSize(), uint64_t(4u)));
+        // The vertex pulling transform requires at least 4 bytes in the buffer.
+        // 0-sized vertex buffer bindings are allowed, so we always need an additional 4 bytes
+        // after the end.
+        NSUInteger extraBytes = 0u;
+        if ((GetUsage() & wgpu::BufferUsage::Vertex) != 0) {
+            extraBytes = 4u;
+        }
+
+        if (GetSize() > std::numeric_limits<NSUInteger>::max() - extraBytes) {
+            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+        }
+        NSUInteger currentSize =
+            std::max(static_cast<NSUInteger>(GetSize()) + extraBytes, NSUInteger(4));
+
         if (currentSize > std::numeric_limits<NSUInteger>::max() - alignment) {
             // Alignment would overlow.
             return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
         }
-        currentSize = Align(currentSize, kMinUniformOrStorageBufferAlignment);
+        currentSize = Align(currentSize, alignment);
 
         if (@available(iOS 12, macOS 10.14, *)) {
             NSUInteger maxBufferSize = [ToBackend(GetDevice())->GetMTLDevice() maxBufferLength];
@@ -109,6 +117,19 @@
             ClearBuffer(commandContext, uint8_t(1u));
         }
 
+        // Initialize the padding bytes to zero.
+        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) &&
+            !mappedAtCreation) {
+            uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+            if (paddingBytes > 0) {
+                uint32_t clearSize = Align(paddingBytes, 4);
+                uint64_t clearOffset = GetAllocatedSize() - clearSize;
+
+                CommandRecordingContext* commandContext =
+                    ToBackend(GetDevice())->GetPendingCommandContext();
+                ClearBuffer(commandContext, 0, clearOffset, clearSize);
+            }
+        }
         return {};
     }
 
@@ -197,11 +218,15 @@
         GetDevice()->IncrementLazyClearCountForTesting();
     }
 
-    void Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
+    void Buffer::ClearBuffer(CommandRecordingContext* commandContext,
+                             uint8_t clearValue,
+                             uint64_t offset,
+                             uint64_t size) {
         ASSERT(commandContext != nullptr);
-        ASSERT(GetAllocatedSize() > 0);
+        size = size > 0 ? size : GetAllocatedSize();
+        ASSERT(size > 0);
         [commandContext->EnsureBlit() fillBuffer:mMtlBuffer.Get()
-                                           range:NSMakeRange(0, GetAllocatedSize())
+                                           range:NSMakeRange(offset, size)
                                            value:clearValue];
     }
 
diff --git a/src/dawn_native/metal/CommandBufferMTL.mm b/src/dawn_native/metal/CommandBufferMTL.mm
index e49ab10..f41cff7 100644
--- a/src/dawn_native/metal/CommandBufferMTL.mm
+++ b/src/dawn_native/metal/CommandBufferMTL.mm
@@ -548,7 +548,8 @@
                 mVertexBufferOffsets[slot] = offset;
 
                 ASSERT(buffer->GetSize() < std::numeric_limits<uint32_t>::max());
-                mVertexBufferBindingSizes[slot] = static_cast<uint32_t>(buffer->GetSize() - offset);
+                mVertexBufferBindingSizes[slot] =
+                    static_cast<uint32_t>(buffer->GetAllocatedSize() - offset);
                 mDirtyVertexBuffers.set(slot);
             }
 
diff --git a/src/dawn_native/opengl/BufferGL.cpp b/src/dawn_native/opengl/BufferGL.cpp
index dd90fcc..1e102d6 100644
--- a/src/dawn_native/opengl/BufferGL.cpp
+++ b/src/dawn_native/opengl/BufferGL.cpp
@@ -49,6 +49,7 @@
             device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, clearValues.data(),
                                   GL_STATIC_DRAW);
         } else {
+            // Drivers return zeroed memory for a nullptr data argument (the GL spec
             device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, nullptr, GL_STATIC_DRAW);
         }
     }
diff --git a/src/dawn_native/vulkan/BufferVk.cpp b/src/dawn_native/vulkan/BufferVk.cpp
index 7b796f9..8139b29 100644
--- a/src/dawn_native/vulkan/BufferVk.cpp
+++ b/src/dawn_native/vulkan/BufferVk.cpp
@@ -137,16 +137,34 @@
     }
 
     MaybeError Buffer::Initialize(bool mappedAtCreation) {
+        // vkCmdFillBuffer requires the size to be a multiple of 4.
+        constexpr size_t kAlignment = 4u;
+
+        uint32_t extraBytes = 0u;
+        if (GetUsage() & (wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Index)) {
+        // vkCmdBindIndexBuffer and vkCmdBindVertexBuffers are invalid if the offset
+        // is equal to the whole buffer size. Allocate at least one more byte so it
+        // is valid to bindVertex/IndexBuffer with a zero-sized range at the end
+        // of the buffer with (offset=buffer.size, size=0).
+            extraBytes = 1u;
+        }
+
+        uint64_t size = GetSize();
+        if (size > std::numeric_limits<uint64_t>::max() - extraBytes) {
+            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+        }
+
+        size += extraBytes;
+
         // Allocate at least 4 bytes so clamped accesses are always in bounds.
         // Also, Vulkan requires the size to be non-zero.
-        uint64_t size = std::max(GetSize(), uint64_t(4u));
-        // vkCmdFillBuffer requires the size to be a multiple of 4.
-        size_t alignment = 4u;
-        if (size > std::numeric_limits<uint64_t>::max() - alignment) {
+        size = std::max(size, uint64_t(4u));
+
+        if (size > std::numeric_limits<uint64_t>::max() - kAlignment) {
             // Alignment would overlow.
             return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
         }
-        mAllocatedSize = Align(size, alignment);
+        mAllocatedSize = Align(size, kAlignment);
 
         // Avoid passing ludicrously large sizes to drivers because it causes issues: drivers add
         // some constants to the size passed and align it, but for values close to the maximum
@@ -200,6 +218,17 @@
             ClearBuffer(device->GetPendingRecordingContext(), 0x01010101);
         }
 
+        // Initialize the padding bytes to zero.
+        if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) && !mappedAtCreation) {
+            uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+            if (paddingBytes > 0) {
+                uint32_t clearSize = Align(paddingBytes, 4);
+                uint64_t clearOffset = GetAllocatedSize() - clearSize;
+
+                CommandRecordingContext* recordingContext = device->GetPendingRecordingContext();
+                ClearBuffer(recordingContext, 0, clearOffset, clearSize);
+            }
+        }
         return {};
     }
 
@@ -354,17 +383,21 @@
         SetIsDataInitialized();
     }
 
-    void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue) {
+    void Buffer::ClearBuffer(CommandRecordingContext* recordingContext,
+                             uint32_t clearValue,
+                             uint64_t offset,
+                             uint64_t size) {
         ASSERT(recordingContext != nullptr);
-        ASSERT(GetAllocatedSize() > 0);
+        size = size > 0 ? size : GetAllocatedSize();
+        ASSERT(size > 0);
 
         TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
 
         Device* device = ToBackend(GetDevice());
         // VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
         // Note: Allocated size must be a multiple of 4.
-        ASSERT(GetAllocatedSize() % 4 == 0);
-        device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, 0, GetAllocatedSize(),
+        ASSERT(size % 4 == 0);
+        device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, offset, size,
                                  clearValue);
     }
 }}  // namespace dawn_native::vulkan
diff --git a/src/dawn_native/vulkan/BufferVk.h b/src/dawn_native/vulkan/BufferVk.h
index e9a8951..1c40140 100644
--- a/src/dawn_native/vulkan/BufferVk.h
+++ b/src/dawn_native/vulkan/BufferVk.h
@@ -55,7 +55,10 @@
 
         MaybeError Initialize(bool mappedAtCreation);
         void InitializeToZero(CommandRecordingContext* recordingContext);
-        void ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue);
+        void ClearBuffer(CommandRecordingContext* recordingContext,
+                         uint32_t clearValue,
+                         uint64_t offset = 0,
+                         uint64_t size = 0);
 
         MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override;
         void UnmapImpl() override;
diff --git a/src/tests/end2end/BufferZeroInitTests.cpp b/src/tests/end2end/BufferZeroInitTests.cpp
index 57be4d5..04c961b 100644
--- a/src/tests/end2end/BufferZeroInitTests.cpp
+++ b/src/tests/end2end/BufferZeroInitTests.cpp
@@ -14,6 +14,7 @@
 
 #include "tests/DawnTest.h"
 
+#include "common/Math.h"
 #include "utils/ComboRenderPipelineDescriptor.h"
 #include "utils/TestUtils.h"
 #include "utils/WGPUHelpers.h"
@@ -201,8 +202,10 @@
         EXPECT_PIXEL_RGBA8_EQ(kExpectedColor, outputTexture, 0u, 0u);
     }
 
-    wgpu::RenderPipeline CreateRenderPipelineForTest(const char* vertexShader,
-                                                     uint32_t vertexBufferCount = 1u) {
+    wgpu::RenderPipeline CreateRenderPipelineForTest(
+        const char* vertexShader,
+        uint32_t vertexBufferCount = 1u,
+        wgpu::VertexFormat vertexFormat = wgpu::VertexFormat::Float32x4) {
         constexpr wgpu::TextureFormat kColorAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm;
 
         wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, vertexShader);
@@ -219,9 +222,9 @@
         descriptor.cFragment.module = fsModule;
         descriptor.primitive.topology = wgpu::PrimitiveTopology::PointList;
         descriptor.vertex.bufferCount = vertexBufferCount;
-        descriptor.cBuffers[0].arrayStride = 4 * sizeof(float);
+        descriptor.cBuffers[0].arrayStride = Align(utils::VertexFormatSize(vertexFormat), 4);
         descriptor.cBuffers[0].attributeCount = 1;
-        descriptor.cAttributes[0].format = wgpu::VertexFormat::Float32x4;
+        descriptor.cAttributes[0].format = vertexFormat;
         descriptor.cTargets[0].format = kColorAttachmentFormat;
         return device.CreateRenderPipeline(&descriptor);
     }
@@ -1128,6 +1131,114 @@
     }
 }
 
+// Test for crbug.com/dawn/837.
+// Test that the padding after a buffer allocation is initialized to 0.
+// This test makes an unaligned vertex buffer which should be padded in the backend
+// allocation. It then tries to index off the end of the vertex buffer in an indexed
+// draw call. A backend which implements robust buffer access via clamping should
+// still see zeros at the end of the buffer.
+TEST_P(BufferZeroInitTest, PaddingInitialized) {
+    DAWN_SUPPRESS_TEST_IF(IsANGLE());  // TODO(crbug.com/dawn/1084).
+
+    constexpr wgpu::TextureFormat kColorAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm;
+    // A small sub-4-byte format means a single vertex can fit entirely within the padded buffer,
+    // touching some of the padding. Test a small format, as well as larger formats.
+    for (wgpu::VertexFormat vertexFormat :
+         {wgpu::VertexFormat::Unorm8x2, wgpu::VertexFormat::Float16x2,
+          wgpu::VertexFormat::Float32x2}) {
+        wgpu::RenderPipeline renderPipeline =
+            CreateRenderPipelineForTest(R"(
+            struct VertexOut {
+                [[location(0)]] color : vec4<f32>;
+                [[builtin(position)]] position : vec4<f32>;
+            };
+
+            [[stage(vertex)]] fn main([[location(0)]] pos : vec2<f32>) -> VertexOut {
+                var output : VertexOut;
+                if (all(pos == vec2<f32>(0.0, 0.0))) {
+                    output.color = vec4<f32>(0.0, 1.0, 0.0, 1.0);
+                } else {
+                    output.color = vec4<f32>(1.0, 0.0, 0.0, 1.0);
+                }
+                output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+                return output;
+            })",
+                                        /* vertexBufferCount */ 1u, vertexFormat);
+
+        // Create an index buffer that indexes off the end of the vertex buffer.
+        wgpu::Buffer indexBuffer =
+            utils::CreateBufferFromData<uint32_t>(device, wgpu::BufferUsage::Index, {1});
+
+        const uint32_t vertexFormatSize = utils::VertexFormatSize(vertexFormat);
+
+        // Create an 8-bit texture to use to initialize buffer contents.
+        wgpu::TextureDescriptor initTextureDesc = {};
+        initTextureDesc.size = {vertexFormatSize + 4, 1, 1};
+        initTextureDesc.format = wgpu::TextureFormat::R8Unorm;
+        initTextureDesc.usage = wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst;
+        wgpu::ImageCopyTexture zeroTextureSrc =
+            utils::CreateImageCopyTexture(device.CreateTexture(&initTextureDesc), 0, {0, 0, 0});
+        {
+            wgpu::TextureDataLayout layout =
+                utils::CreateTextureDataLayout(0, wgpu::kCopyStrideUndefined);
+            std::vector<uint8_t> data(initTextureDesc.size.width);
+            queue.WriteTexture(&zeroTextureSrc, data.data(), data.size(), &layout,
+                               &initTextureDesc.size);
+        }
+
+        for (uint32_t extraBytes : {0, 1, 2, 3, 4}) {
+            // Create a vertex buffer to hold a single vertex attribute.
+            // Uniform usage is added to force even more padding on D3D12.
+            // The buffer is internally padded and allocated as a larger buffer.
+            const uint32_t vertexBufferSize = vertexFormatSize + extraBytes;
+            for (uint32_t vertexBufferOffset = 0; vertexBufferOffset <= vertexBufferSize;
+                 vertexBufferOffset += 4u) {
+                wgpu::Buffer vertexBuffer = CreateBuffer(
+                    vertexBufferSize, wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Uniform |
+                                          wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst);
+
+                // "Fully" initialize the buffer with a copy from an 8-bit texture, touching
+                // everything except the padding. From the point-of-view of the API, all
+                // |vertexBufferSize| bytes are initialized. Note: Uses CopyTextureToBuffer because
+                // it does not require 4-byte alignment.
+                {
+                    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+                    wgpu::ImageCopyBuffer dst =
+                        utils::CreateImageCopyBuffer(vertexBuffer, 0, wgpu::kCopyStrideUndefined);
+                    wgpu::Extent3D extent = {vertexBufferSize, 1, 1};
+                    encoder.CopyTextureToBuffer(&zeroTextureSrc, &dst, &extent);
+
+                    wgpu::CommandBuffer commandBuffer = encoder.Finish();
+                    EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer));
+                }
+
+                wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+
+                wgpu::Texture colorAttachment =
+                    CreateAndInitializeTexture({1, 1, 1}, kColorAttachmentFormat);
+                utils::ComboRenderPassDescriptor renderPassDescriptor(
+                    {colorAttachment.CreateView()});
+
+                wgpu::RenderPassEncoder renderPass = encoder.BeginRenderPass(&renderPassDescriptor);
+
+                renderPass.SetVertexBuffer(0, vertexBuffer, vertexBufferOffset);
+                renderPass.SetIndexBuffer(indexBuffer, wgpu::IndexFormat::Uint32);
+
+                renderPass.SetPipeline(renderPipeline);
+                renderPass.DrawIndexed(1);
+                renderPass.EndPass();
+
+                wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+                EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer));
+
+                constexpr RGBA8 kExpectedPixelValue = {0, 255, 0, 255};
+                EXPECT_PIXEL_RGBA8_EQ(kExpectedPixelValue, colorAttachment, 0, 0);
+            }
+        }
+    }
+}
+
 // Test the buffer will be lazily initialized correctly when its first use is in SetIndexBuffer.
 TEST_P(BufferZeroInitTest, SetIndexBuffer) {
     // Bind the whole buffer as an index buffer.
diff --git a/src/tests/end2end/TextureZeroInitTests.cpp b/src/tests/end2end/TextureZeroInitTests.cpp
index 56084a3..e43a530 100644
--- a/src/tests/end2end/TextureZeroInitTests.cpp
+++ b/src/tests/end2end/TextureZeroInitTests.cpp
@@ -2060,6 +2060,84 @@
                                           kViewMipLevel, 0, true);
 }
 
+// Test uploading then reading back from a 2D array compressed texture.
+// This is a regression test for a bug where the final destination buffer
+// was considered fully initialized even though there was a 256-byte
+// stride between images.
+TEST_P(CompressedTextureZeroInitTest, Copy2DArrayCompressedB2T2B) {
+    // TODO(crbug.com/dawn/643): diagnose and fix this failure on OpenGL.
+    DAWN_SUPPRESS_TEST_IF(IsOpenGL() || IsOpenGLES());
+
+    // create srcTexture with data
+    wgpu::TextureDescriptor textureDescriptor = CreateTextureDescriptor(
+        4, 5, wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst, utils::kBCFormats[0]);
+    textureDescriptor.size = {8, 8, 5};
+    wgpu::Texture srcTexture = device.CreateTexture(&textureDescriptor);
+
+    uint32_t mipLevel = 2;
+    wgpu::Extent3D copyExtent3D = {4, 4, 5};
+
+    uint32_t copyWidthInBlock = copyExtent3D.width / kFormatBlockByteSize;
+    uint32_t copyHeightInBlock = copyExtent3D.height / kFormatBlockByteSize;
+    uint32_t copyRowsPerImage = copyHeightInBlock;
+    uint32_t copyBytesPerRow =
+        Align(copyWidthInBlock * utils::GetTexelBlockSizeInBytes(textureDescriptor.format),
+              kTextureBytesPerRowAlignment);
+
+    // Generate data to upload
+    std::vector<uint8_t> data(utils::RequiredBytesInCopy(copyBytesPerRow, copyRowsPerImage,
+                                                         copyExtent3D, textureDescriptor.format));
+    for (size_t i = 0; i < data.size(); ++i) {
+        data[i] = i % 255;
+    }
+
+    // Copy texture data from a staging buffer to the destination texture.
+    wgpu::Buffer stagingBuffer =
+        utils::CreateBufferFromData(device, data.data(), data.size(), wgpu::BufferUsage::CopySrc);
+    wgpu::ImageCopyBuffer imageCopyBufferSrc =
+        utils::CreateImageCopyBuffer(stagingBuffer, 0, copyBytesPerRow, copyRowsPerImage);
+
+    wgpu::ImageCopyTexture imageCopyTexture =
+        utils::CreateImageCopyTexture(srcTexture, mipLevel, {0, 0, 0});
+
+    {
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToTexture(&imageCopyBufferSrc, &imageCopyTexture, &copyExtent3D);
+        wgpu::CommandBuffer copy = encoder.Finish();
+        EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &copy));
+    }
+
+    // Create a buffer to read back the data. It is the same size as the upload buffer.
+    wgpu::BufferDescriptor readbackDesc = {};
+    readbackDesc.size = data.size();
+    readbackDesc.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::Buffer readbackBuffer = device.CreateBuffer(&readbackDesc);
+
+    // Copy the texture to the readback buffer.
+    wgpu::ImageCopyBuffer imageCopyBufferDst =
+        utils::CreateImageCopyBuffer(readbackBuffer, 0, copyBytesPerRow, copyRowsPerImage);
+    {
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBufferDst, &copyExtent3D);
+        wgpu::CommandBuffer copy = encoder.Finish();
+
+        // Expect a lazy clear because the padding in the copy is not touched.
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &copy));
+    }
+
+    // Generate expected data. It is the same as the upload data, but padding is zero.
+    std::vector<uint8_t> expected(data.size(), 0);
+    for (uint32_t z = 0; z < copyExtent3D.depthOrArrayLayers; ++z) {
+        for (uint32_t y = 0; y < copyHeightInBlock; ++y) {
+            memcpy(&expected[copyBytesPerRow * y + copyBytesPerRow * copyRowsPerImage * z],
+                   &data[copyBytesPerRow * y + copyBytesPerRow * copyRowsPerImage * z],
+                   copyWidthInBlock * utils::GetTexelBlockSizeInBytes(textureDescriptor.format));
+        }
+    }
+    // Check final contents
+    EXPECT_BUFFER_U8_RANGE_EQ(expected.data(), readbackBuffer, 0, expected.size());
+}
+
 DAWN_INSTANTIATE_TEST(CompressedTextureZeroInitTest,
                       D3D12Backend({"nonzero_clear_resources_on_creation_for_testing"}),
                       MetalBackend({"nonzero_clear_resources_on_creation_for_testing"}),
diff --git a/src/utils/TestUtils.cpp b/src/utils/TestUtils.cpp
index adbdd28..8f359ab 100644
--- a/src/utils/TestUtils.cpp
+++ b/src/utils/TestUtils.cpp
@@ -134,4 +134,47 @@
         device.GetQueue().WriteTexture(&imageCopyTexture, data.data(), 1, &textureDataLayout,
                                        &copyExtent);
     }
+
+    uint32_t VertexFormatSize(wgpu::VertexFormat format) {
+        switch (format) {
+            case wgpu::VertexFormat::Uint8x2:
+            case wgpu::VertexFormat::Sint8x2:
+            case wgpu::VertexFormat::Unorm8x2:
+            case wgpu::VertexFormat::Snorm8x2:
+                return 2;
+            case wgpu::VertexFormat::Uint8x4:
+            case wgpu::VertexFormat::Sint8x4:
+            case wgpu::VertexFormat::Unorm8x4:
+            case wgpu::VertexFormat::Snorm8x4:
+            case wgpu::VertexFormat::Uint16x2:
+            case wgpu::VertexFormat::Sint16x2:
+            case wgpu::VertexFormat::Unorm16x2:
+            case wgpu::VertexFormat::Snorm16x2:
+            case wgpu::VertexFormat::Float16x2:
+            case wgpu::VertexFormat::Float32:
+            case wgpu::VertexFormat::Uint32:
+            case wgpu::VertexFormat::Sint32:
+                return 4;
+            case wgpu::VertexFormat::Uint16x4:
+            case wgpu::VertexFormat::Sint16x4:
+            case wgpu::VertexFormat::Unorm16x4:
+            case wgpu::VertexFormat::Snorm16x4:
+            case wgpu::VertexFormat::Float16x4:
+            case wgpu::VertexFormat::Float32x2:
+            case wgpu::VertexFormat::Uint32x2:
+            case wgpu::VertexFormat::Sint32x2:
+                return 8;
+            case wgpu::VertexFormat::Float32x3:
+            case wgpu::VertexFormat::Uint32x3:
+            case wgpu::VertexFormat::Sint32x3:
+                return 12;
+            case wgpu::VertexFormat::Float32x4:
+            case wgpu::VertexFormat::Uint32x4:
+            case wgpu::VertexFormat::Sint32x4:
+                return 16;
+            case wgpu::VertexFormat::Undefined:
+                UNREACHABLE();
+        }
+    }
+
 }  // namespace utils
diff --git a/src/utils/TestUtils.h b/src/utils/TestUtils.h
index d611124..02b0daf 100644
--- a/src/utils/TestUtils.h
+++ b/src/utils/TestUtils.h
@@ -60,6 +60,8 @@
     // in it will contain 1 byte of data.
     void UnalignDynamicUploader(wgpu::Device device);
 
+    uint32_t VertexFormatSize(wgpu::VertexFormat format);
+
 }  // namespace utils
 
 #endif  // UTILS_TESTHELPERS_H_