Implement buffer lazy initialization before CopyBufferToBuffer

This patch implements buffer lazy initialization before
CopyBufferToBuffer() behind the toggle LazyClearBufferOnFirstUse.
- If the source buffer is not initialized, it will be cleared to 0
  before CopyBufferToBuffer().
- If the destination buffer is not initialized and the copy doesn't
  overwrite the whole buffer, it will be cleared to 0 before
  CopyBufferToBuffer(). Otherwise the clear is skipped and the buffer
  is only marked as initialized, because the copy overwrites all of it.

BUG=dawn:414
TEST=dawn_end2end_tests

Change-Id: I3d0512c6376a1ed8928e86f8e56fefebc16910fa
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/24360
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/src/dawn_native/d3d12/BufferD3D12.cpp b/src/dawn_native/d3d12/BufferD3D12.cpp
index 6ff9f0b..2521e93 100644
--- a/src/dawn_native/d3d12/BufferD3D12.cpp
+++ b/src/dawn_native/d3d12/BufferD3D12.cpp
@@ -313,10 +313,44 @@
         return mResourceAllocation.GetInfo().mMethod == allocationMethod;
     }
 
-    MaybeError Buffer::ClearBufferContentsToZero(CommandRecordingContext* commandContext) {
+    MaybeError Buffer::EnsureDataInitialized(CommandRecordingContext* commandContext) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return {};
+        }
+
+        DAWN_TRY(InitializeToZero(commandContext));
+
+        return {};
+    }
+
+    MaybeError Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* commandContext,
+                                                          uint64_t offset,
+                                                          uint64_t size) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return {};
+        }
+
+        if (IsFullBufferRange(offset, size)) {
+            SetIsDataInitialized();
+        } else {
+            DAWN_TRY(InitializeToZero(commandContext));
+        }
+
+        return {};
+    }
+
+    MaybeError Buffer::InitializeToZero(CommandRecordingContext* commandContext) {
         ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
         ASSERT(!IsDataInitialized());
 
+        // TODO(jiawei.shao@intel.com): skip initializing the buffer when it is created on a heap
+        // that has already been zero initialized.
         DAWN_TRY(ClearBuffer(commandContext, uint8_t(0u)));
         SetIsDataInitialized();
         GetDevice()->IncrementLazyClearCountForTesting();
diff --git a/src/dawn_native/d3d12/BufferD3D12.h b/src/dawn_native/d3d12/BufferD3D12.h
index 33e0580..bb25558 100644
--- a/src/dawn_native/d3d12/BufferD3D12.h
+++ b/src/dawn_native/d3d12/BufferD3D12.h
@@ -44,7 +44,10 @@
         bool CheckAllocationMethodForTesting(AllocationMethod allocationMethod) const;
         bool CheckIsResidentForTesting() const;
 
-        MaybeError ClearBufferContentsToZero(CommandRecordingContext* commandContext);
+        MaybeError EnsureDataInitialized(CommandRecordingContext* commandContext);
+        MaybeError EnsureDataInitializedAsDestination(CommandRecordingContext* commandContext,
+                                                      uint64_t offset,
+                                                      uint64_t size);
 
       private:
         ~Buffer() override;
@@ -63,6 +66,7 @@
                                                   D3D12_RESOURCE_BARRIER* barrier,
                                                   wgpu::BufferUsage newUsage);
 
+        MaybeError InitializeToZero(CommandRecordingContext* commandContext);
         MaybeError ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
 
         ResourceHeapAllocation mResourceAllocation;
diff --git a/src/dawn_native/d3d12/CommandBufferD3D12.cpp b/src/dawn_native/d3d12/CommandBufferD3D12.cpp
index 0b8b124..cd30dc5 100644
--- a/src/dawn_native/d3d12/CommandBufferD3D12.cpp
+++ b/src/dawn_native/d3d12/CommandBufferD3D12.cpp
@@ -582,6 +582,10 @@
                     Buffer* srcBuffer = ToBackend(copy->source.Get());
                     Buffer* dstBuffer = ToBackend(copy->destination.Get());
 
+                    DAWN_TRY(srcBuffer->EnsureDataInitialized(commandContext));
+                    DAWN_TRY(dstBuffer->EnsureDataInitializedAsDestination(
+                        commandContext, copy->destinationOffset, copy->size));
+
                     srcBuffer->TrackUsageAndTransitionNow(commandContext,
                                                           wgpu::BufferUsage::CopySrc);
                     dstBuffer->TrackUsageAndTransitionNow(commandContext,
diff --git a/src/dawn_native/d3d12/DeviceD3D12.cpp b/src/dawn_native/d3d12/DeviceD3D12.cpp
index 0a647ad..01e7ead 100644
--- a/src/dawn_native/d3d12/DeviceD3D12.cpp
+++ b/src/dawn_native/d3d12/DeviceD3D12.cpp
@@ -337,15 +337,8 @@
 
         Buffer* dstBuffer = ToBackend(destination);
 
-        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
-        // buffer lazy initialization is completed.
-        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) && !dstBuffer->IsDataInitialized()) {
-            if (dstBuffer->IsFullBufferRange(destinationOffset, size)) {
-                dstBuffer->SetIsDataInitialized();
-            } else {
-                DAWN_TRY(dstBuffer->ClearBufferContentsToZero(commandRecordingContext));
-            }
-        }
+        DAWN_TRY(dstBuffer->EnsureDataInitializedAsDestination(commandRecordingContext,
+                                                               destinationOffset, size));
 
         CopyFromStagingToBufferImpl(commandRecordingContext, source, sourceOffset, destination,
                                     destinationOffset, size);
diff --git a/src/dawn_native/metal/BufferMTL.h b/src/dawn_native/metal/BufferMTL.h
index 74d2cdb..38c403c 100644
--- a/src/dawn_native/metal/BufferMTL.h
+++ b/src/dawn_native/metal/BufferMTL.h
@@ -31,7 +31,10 @@
                                                  const BufferDescriptor* descriptor);
         id<MTLBuffer> GetMTLBuffer() const;
 
-        void ClearBufferContentsToZero(CommandRecordingContext* commandContext);
+        void EnsureDataInitialized(CommandRecordingContext* commandContext);
+        void EnsureDataInitializedAsDestination(CommandRecordingContext* commandContext,
+                                                uint64_t offset,
+                                                uint64_t size);
 
       private:
         using BufferBase::BufferBase;
@@ -47,6 +50,7 @@
         bool IsMapWritable() const override;
         MaybeError MapAtCreationImpl() override;
 
+        void InitializeToZero(CommandRecordingContext* commandContext);
         void ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
 
         id<MTLBuffer> mMtlBuffer = nil;
diff --git a/src/dawn_native/metal/BufferMTL.mm b/src/dawn_native/metal/BufferMTL.mm
index c14a5a2..5b10a8d 100644
--- a/src/dawn_native/metal/BufferMTL.mm
+++ b/src/dawn_native/metal/BufferMTL.mm
@@ -134,7 +134,35 @@
         mMtlBuffer = nil;
     }
 
-    void Buffer::ClearBufferContentsToZero(CommandRecordingContext* commandContext) {
+    void Buffer::EnsureDataInitialized(CommandRecordingContext* commandContext) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return;
+        }
+
+        InitializeToZero(commandContext);
+    }
+
+    void Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* commandContext,
+                                                    uint64_t offset,
+                                                    uint64_t size) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return;
+        }
+
+        if (IsFullBufferRange(offset, size)) {
+            SetIsDataInitialized();
+        } else {
+            InitializeToZero(commandContext);
+        }
+    }
+
+    void Buffer::InitializeToZero(CommandRecordingContext* commandContext) {
         ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
         ASSERT(!IsDataInitialized());
 
diff --git a/src/dawn_native/metal/CommandBufferMTL.mm b/src/dawn_native/metal/CommandBufferMTL.mm
index 4efc7a6..0f923f4 100644
--- a/src/dawn_native/metal/CommandBufferMTL.mm
+++ b/src/dawn_native/metal/CommandBufferMTL.mm
@@ -721,6 +721,11 @@
                 case Command::CopyBufferToBuffer: {
                     CopyBufferToBufferCmd* copy = mCommands.NextCommand<CopyBufferToBufferCmd>();
 
+                    ToBackend(copy->source)->EnsureDataInitialized(commandContext);
+                    ToBackend(copy->destination)
+                        ->EnsureDataInitializedAsDestination(commandContext,
+                                                             copy->destinationOffset, copy->size);
+
                     [commandContext->EnsureBlit()
                            copyFromBuffer:ToBackend(copy->source)->GetMTLBuffer()
                              sourceOffset:copy->sourceOffset
diff --git a/src/dawn_native/metal/DeviceMTL.mm b/src/dawn_native/metal/DeviceMTL.mm
index 01b63b2..07f11cd 100644
--- a/src/dawn_native/metal/DeviceMTL.mm
+++ b/src/dawn_native/metal/DeviceMTL.mm
@@ -254,16 +254,9 @@
         // this function.
         ASSERT(size != 0);
 
-        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
-        // buffer lazy initialization is completed.
-        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) &&
-            !destination->IsDataInitialized()) {
-            if (destination->IsFullBufferRange(destinationOffset, size)) {
-                destination->SetIsDataInitialized();
-            } else {
-                ToBackend(destination)->ClearBufferContentsToZero(GetPendingCommandContext());
-            }
-        }
+        ToBackend(destination)
+            ->EnsureDataInitializedAsDestination(GetPendingCommandContext(), destinationOffset,
+                                                 size);
 
         id<MTLBuffer> uploadBuffer = ToBackend(source)->GetBufferHandle();
         id<MTLBuffer> buffer = ToBackend(destination)->GetMTLBuffer();
diff --git a/src/dawn_native/opengl/BufferGL.cpp b/src/dawn_native/opengl/BufferGL.cpp
index c56cc55..c0bd7f3 100644
--- a/src/dawn_native/opengl/BufferGL.cpp
+++ b/src/dawn_native/opengl/BufferGL.cpp
@@ -51,7 +51,33 @@
         return std::max(GetSize(), uint64_t(4u));
     }
 
-    void Buffer::ClearBufferContentsToZero() {
+    void Buffer::EnsureDataInitialized() {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return;
+        }
+
+        InitializeToZero();
+    }
+
+    void Buffer::EnsureDataInitializedAsDestination(uint64_t offset, uint64_t size) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return;
+        }
+
+        if (IsFullBufferRange(offset, size)) {
+            SetIsDataInitialized();
+        } else {
+            InitializeToZero();
+        }
+    }
+
+    void Buffer::InitializeToZero() {
         ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
         ASSERT(!IsDataInitialized());
 
@@ -61,9 +87,9 @@
         const std::vector<uint8_t> clearValues(size, 0u);
         device->gl.BindBuffer(GL_ARRAY_BUFFER, mBuffer);
         device->gl.BufferSubData(GL_ARRAY_BUFFER, 0, size, clearValues.data());
+        device->IncrementLazyClearCountForTesting();
 
         SetIsDataInitialized();
-        device->IncrementLazyClearCountForTesting();
     }
 
     bool Buffer::IsMapWritable() const {
diff --git a/src/dawn_native/opengl/BufferGL.h b/src/dawn_native/opengl/BufferGL.h
index 121b1d6..9bc870e 100644
--- a/src/dawn_native/opengl/BufferGL.h
+++ b/src/dawn_native/opengl/BufferGL.h
@@ -29,7 +29,8 @@
 
         GLuint GetHandle() const;
 
-        void ClearBufferContentsToZero();
+        void EnsureDataInitialized();
+        void EnsureDataInitializedAsDestination(uint64_t offset, uint64_t size);
 
       private:
         ~Buffer() override;
@@ -44,6 +45,8 @@
         void* GetMappedPointerImpl() override;
         uint64_t GetAppliedSize() const;
 
+        void InitializeToZero();
+
         GLuint mBuffer = 0;
         void* mMappedData = nullptr;
     };
diff --git a/src/dawn_native/opengl/CommandBufferGL.cpp b/src/dawn_native/opengl/CommandBufferGL.cpp
index 4180eb5..3689277 100644
--- a/src/dawn_native/opengl/CommandBufferGL.cpp
+++ b/src/dawn_native/opengl/CommandBufferGL.cpp
@@ -493,6 +493,10 @@
                 case Command::CopyBufferToBuffer: {
                     CopyBufferToBufferCmd* copy = mCommands.NextCommand<CopyBufferToBufferCmd>();
 
+                    ToBackend(copy->source)->EnsureDataInitialized();
+                    ToBackend(copy->destination)
+                        ->EnsureDataInitializedAsDestination(copy->destinationOffset, copy->size);
+
                     gl.BindBuffer(GL_PIXEL_PACK_BUFFER, ToBackend(copy->source)->GetHandle());
                     gl.BindBuffer(GL_PIXEL_UNPACK_BUFFER,
                                   ToBackend(copy->destination)->GetHandle());
diff --git a/src/dawn_native/opengl/QueueGL.cpp b/src/dawn_native/opengl/QueueGL.cpp
index 2547f17..88d1575 100644
--- a/src/dawn_native/opengl/QueueGL.cpp
+++ b/src/dawn_native/opengl/QueueGL.cpp
@@ -44,16 +44,7 @@
                                       size_t size) {
         const OpenGLFunctions& gl = ToBackend(GetDevice())->gl;
 
-        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
-        // buffer lazy initialization is completed.
-        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) &&
-            !buffer->IsDataInitialized()) {
-            if (buffer->IsFullBufferRange(bufferOffset, size)) {
-                buffer->SetIsDataInitialized();
-            } else {
-                ToBackend(buffer)->ClearBufferContentsToZero();
-            }
-        }
+        ToBackend(buffer)->EnsureDataInitializedAsDestination(bufferOffset, size);
 
         gl.BindBuffer(GL_ARRAY_BUFFER, ToBackend(buffer)->GetHandle());
         gl.BufferSubData(GL_ARRAY_BUFFER, bufferOffset, size, data);
diff --git a/src/dawn_native/vulkan/BufferVk.cpp b/src/dawn_native/vulkan/BufferVk.cpp
index 11f56f5..4175324 100644
--- a/src/dawn_native/vulkan/BufferVk.cpp
+++ b/src/dawn_native/vulkan/BufferVk.cpp
@@ -278,14 +278,41 @@
         }
     }
 
-    void Buffer::ClearBufferContentsToZero(CommandRecordingContext* recordingContext) {
+    void Buffer::EnsureDataInitialized(CommandRecordingContext* recordingContext) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return;
+        }
+
+        InitializeToZero(recordingContext);
+    }
+
+    void Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* recordingContext,
+                                                    uint64_t offset,
+                                                    uint64_t size) {
+        // TODO(jiawei.shao@intel.com): check Toggle::LazyClearResourceOnFirstUse
+        // instead when buffer lazy initialization is completely supported.
+        if (IsDataInitialized() ||
+            !GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            return;
+        }
+
+        if (IsFullBufferRange(offset, size)) {
+            SetIsDataInitialized();
+        } else {
+            InitializeToZero(recordingContext);
+        }
+    }
+
+    void Buffer::InitializeToZero(CommandRecordingContext* recordingContext) {
         ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
         ASSERT(!IsDataInitialized());
 
         ClearBuffer(recordingContext, 0u);
-
-        SetIsDataInitialized();
         GetDevice()->IncrementLazyClearCountForTesting();
+        SetIsDataInitialized();
     }
 
     void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue) {
diff --git a/src/dawn_native/vulkan/BufferVk.h b/src/dawn_native/vulkan/BufferVk.h
index 6fb8592..14be757 100644
--- a/src/dawn_native/vulkan/BufferVk.h
+++ b/src/dawn_native/vulkan/BufferVk.h
@@ -43,12 +43,16 @@
                                 VkPipelineStageFlags* srcStages,
                                 VkPipelineStageFlags* dstStages);
 
-        void ClearBufferContentsToZero(CommandRecordingContext* recordingContext);
+        void EnsureDataInitialized(CommandRecordingContext* recordingContext);
+        void EnsureDataInitializedAsDestination(CommandRecordingContext* recordingContext,
+                                                uint64_t offset,
+                                                uint64_t size);
 
       private:
         ~Buffer() override;
         using BufferBase::BufferBase;
         MaybeError Initialize();
+        void InitializeToZero(CommandRecordingContext* recordingContext);
         void ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue);
 
         // Dawn API
diff --git a/src/dawn_native/vulkan/CommandBufferVk.cpp b/src/dawn_native/vulkan/CommandBufferVk.cpp
index 3a2bec9..dcbbf0d 100644
--- a/src/dawn_native/vulkan/CommandBufferVk.cpp
+++ b/src/dawn_native/vulkan/CommandBufferVk.cpp
@@ -424,6 +424,10 @@
                     Buffer* srcBuffer = ToBackend(copy->source.Get());
                     Buffer* dstBuffer = ToBackend(copy->destination.Get());
 
+                    srcBuffer->EnsureDataInitialized(recordingContext);
+                    dstBuffer->EnsureDataInitializedAsDestination(
+                        recordingContext, copy->destinationOffset, copy->size);
+
                     srcBuffer->TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopySrc);
                     dstBuffer->TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
 
diff --git a/src/dawn_native/vulkan/DeviceVk.cpp b/src/dawn_native/vulkan/DeviceVk.cpp
index db13577..da17b6c 100644
--- a/src/dawn_native/vulkan/DeviceVk.cpp
+++ b/src/dawn_native/vulkan/DeviceVk.cpp
@@ -592,16 +592,8 @@
 
         CommandRecordingContext* recordingContext = GetPendingRecordingContext();
 
-        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
-        // buffer lazy initialization is completed.
-        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) &&
-            !destination->IsDataInitialized()) {
-            if (destination->IsFullBufferRange(destinationOffset, size)) {
-                destination->SetIsDataInitialized();
-            } else {
-                ToBackend(destination)->ClearBufferContentsToZero(recordingContext);
-            }
-        }
+        ToBackend(destination)
+            ->EnsureDataInitializedAsDestination(recordingContext, destinationOffset, size);
 
         // Insert memory barrier to ensure host write operations are made visible before
         // copying from the staging buffer. However, this barrier can be removed (see note below).
diff --git a/src/tests/end2end/BufferZeroInitTests.cpp b/src/tests/end2end/BufferZeroInitTests.cpp
index 5392e49..28e43f5 100644
--- a/src/tests/end2end/BufferZeroInitTests.cpp
+++ b/src/tests/end2end/BufferZeroInitTests.cpp
@@ -87,6 +87,179 @@
     }
 }
 
+// Test that the code path of CopyBufferToBuffer clears the source buffer correctly when it is the
+// first use of the source buffer.
+TEST_P(BufferZeroInitTest, CopyBufferToBufferSource) {
+    constexpr uint64_t kBufferSize = 16u;
+    constexpr wgpu::BufferUsage kBufferUsage =
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::BufferDescriptor bufferDescriptor;
+    bufferDescriptor.size = kBufferSize;
+    bufferDescriptor.usage = kBufferUsage;
+
+    constexpr std::array<uint8_t, kBufferSize> kInitialData = {
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}};
+
+    wgpu::Buffer dstBuffer =
+        utils::CreateBufferFromData(device, kInitialData.data(), kBufferSize, kBufferUsage);
+
+    constexpr std::array<uint32_t, kBufferSize / sizeof(uint32_t)> kExpectedData = {{0, 0, 0, 0}};
+
+    // Full copy from the source buffer
+    {
+        wgpu::Buffer srcBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, 0, dstBuffer, 0, kBufferSize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+        EXPECT_BUFFER_U32_RANGE_EQ(kExpectedData.data(), srcBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+
+    // Partial copy from the source buffer
+    // srcOffset == 0
+    {
+        constexpr uint64_t kSrcOffset = 0;
+        constexpr uint64_t kCopySize = kBufferSize / 2;
+
+        wgpu::Buffer srcBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, kSrcOffset, dstBuffer, 0, kCopySize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+        EXPECT_BUFFER_U32_RANGE_EQ(kExpectedData.data(), srcBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+
+    // srcOffset > 0 and srcOffset + copySize == srcBufferSize
+    {
+        constexpr uint64_t kSrcOffset = kBufferSize / 2;
+        constexpr uint64_t kCopySize = kBufferSize - kSrcOffset;
+
+        wgpu::Buffer srcBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, kSrcOffset, dstBuffer, 0, kCopySize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+        EXPECT_BUFFER_U32_RANGE_EQ(kExpectedData.data(), srcBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+
+    // srcOffset > 0 and srcOffset + copySize < srcBufferSize
+    {
+        constexpr uint64_t kSrcOffset = kBufferSize / 4;
+        constexpr uint64_t kCopySize = kBufferSize / 2;
+
+        wgpu::Buffer srcBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, kSrcOffset, dstBuffer, 0, kCopySize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+        EXPECT_BUFFER_U32_RANGE_EQ(kExpectedData.data(), srcBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+}
+
+// Test that the code path of CopyBufferToBuffer clears the destination buffer correctly when it is
+// the first use of the destination buffer.
+TEST_P(BufferZeroInitTest, CopyBufferToBufferDestination) {
+    constexpr uint64_t kBufferSize = 16u;
+    constexpr wgpu::BufferUsage kBufferUsage =
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::BufferDescriptor bufferDescriptor;
+    bufferDescriptor.size = kBufferSize;
+    bufferDescriptor.usage = kBufferUsage;
+
+    const std::array<uint8_t, kBufferSize> kInitialData = {
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}};
+    wgpu::Buffer srcBuffer =
+        utils::CreateBufferFromData(device, kInitialData.data(), kBufferSize, kBufferUsage);
+
+    // A full copy into the destination buffer doesn't need lazy initialization at all.
+    {
+        wgpu::Buffer dstBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, 0, dstBuffer, 0, kBufferSize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer));
+
+        EXPECT_BUFFER_U32_RANGE_EQ(reinterpret_cast<const uint32_t*>(kInitialData.data()),
+                                   dstBuffer, 0, kBufferSize / sizeof(uint32_t));
+    }
+
+    // A partial copy into the destination buffer needs lazy initialization.
+    // dstOffset == 0
+    {
+        constexpr uint32_t kDstOffset = 0;
+        constexpr uint32_t kCopySize = kBufferSize / 2;
+
+        wgpu::Buffer dstBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, 0, dstBuffer, kDstOffset, kCopySize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+
+        std::array<uint8_t, kBufferSize> expectedData;
+        expectedData.fill(0);
+        for (uint32_t index = kDstOffset; index < kDstOffset + kCopySize; ++index) {
+            expectedData[index] = kInitialData[index - kDstOffset];
+        }
+
+        EXPECT_BUFFER_U32_RANGE_EQ(reinterpret_cast<uint32_t*>(expectedData.data()), dstBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+
+    // dstOffset > 0 and dstOffset + copySize == kBufferSize
+    {
+        constexpr uint32_t kDstOffset = kBufferSize / 2;
+        constexpr uint32_t kCopySize = kBufferSize - kDstOffset;
+
+        wgpu::Buffer dstBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, 0, dstBuffer, kDstOffset, kCopySize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+
+        std::array<uint8_t, kBufferSize> expectedData;
+        expectedData.fill(0);
+        for (uint32_t index = kDstOffset; index < kDstOffset + kCopySize; ++index) {
+            expectedData[index] = kInitialData[index - kDstOffset];
+        }
+
+        EXPECT_BUFFER_U32_RANGE_EQ(reinterpret_cast<uint32_t*>(expectedData.data()), dstBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+
+    // dstOffset > 0 and dstOffset + copySize < kBufferSize
+    {
+        constexpr uint32_t kDstOffset = kBufferSize / 4;
+        constexpr uint32_t kCopySize = kBufferSize / 2;
+
+        wgpu::Buffer dstBuffer = device.CreateBuffer(&bufferDescriptor);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.CopyBufferToBuffer(srcBuffer, 0, dstBuffer, kDstOffset, kCopySize);
+        wgpu::CommandBuffer commandBuffer = encoder.Finish();
+
+        EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &commandBuffer));
+
+        std::array<uint8_t, kBufferSize> expectedData;
+        expectedData.fill(0);
+        for (uint32_t index = kDstOffset; index < kDstOffset + kCopySize; ++index) {
+            expectedData[index] = kInitialData[index - kDstOffset];
+        }
+
+        EXPECT_BUFFER_U32_RANGE_EQ(reinterpret_cast<uint32_t*>(expectedData.data()), dstBuffer, 0,
+                                   kBufferSize / sizeof(uint32_t));
+    }
+}
+
 DAWN_INSTANTIATE_TEST(BufferZeroInitTest,
                       D3D12Backend({"nonzero_clear_resources_on_creation_for_testing",
                                     "lazy_clear_buffer_on_first_use"}),