Implement buffer lazy initialization before writeBuffer

This patch implements buffer lazy initialization before writeBuffer():
if the buffer is not yet initialized and writeBuffer() doesn't cover the
whole buffer, the buffer is first cleared to 0; otherwise no clear is
performed (a write covering the whole buffer just marks it initialized).

This patch also introduces a toggle LazyClearBufferOnFirstUse for the
development of buffer lazy initialization: until buffer lazy
initialization is completely supported, all the related code will
only be enabled behind this toggle to prevent buffers with valid
content from being unexpectedly cleared.

BUG=dawn:414
TEST=dawn_end2end_tests

Change-Id: I99a2aa98ca4b9b21d69c6b32080afb525e2c4ad3
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/24041
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn_native/Buffer.cpp b/src/dawn_native/Buffer.cpp
index 0e4c384..0414159 100644
--- a/src/dawn_native/Buffer.cpp
+++ b/src/dawn_native/Buffer.cpp
@@ -488,6 +488,10 @@
         mState = BufferState::Destroyed;
     }
 
+    bool BufferBase::IsMapped() const {
+        return mState == BufferState::Mapped;
+    }
+
     void BufferBase::OnMapCommandSerialFinished(uint32_t mapSerial, bool isWrite) {
         void* data = GetMappedPointerImpl();
         if (isWrite) {
@@ -497,4 +501,16 @@
         }
     }
 
+    bool BufferBase::IsDataInitialized() const {
+        return mIsDataInitialized;
+    }
+
+    void BufferBase::SetIsDataInitialized() {
+        mIsDataInitialized = true;
+    }
+
+    bool BufferBase::IsFullBufferRange(uint64_t offset, uint64_t size) const {
+        return offset == 0 && size == GetSize();
+    }
+
 }  // namespace dawn_native
diff --git a/src/dawn_native/Buffer.h b/src/dawn_native/Buffer.h
index c4f1d32..ea67fab 100644
--- a/src/dawn_native/Buffer.h
+++ b/src/dawn_native/Buffer.h
@@ -40,8 +40,6 @@
         };
 
       public:
-        enum class ClearValue { Zero, NonZero };
-
         BufferBase(DeviceBase* device, const BufferDescriptor* descriptor);
 
         static BufferBase* MakeError(DeviceBase* device);
@@ -57,6 +55,10 @@
 
         MaybeError ValidateCanUseOnQueueNow() const;
 
+        bool IsFullBufferRange(uint64_t offset, uint64_t size) const;
+        bool IsDataInitialized() const;
+        void SetIsDataInitialized();
+
         // Dawn API
         void SetSubData(uint64_t start, uint64_t count, const void* data);
         void MapReadAsync(WGPUBufferMapReadCallback callback, void* userdata);
@@ -81,6 +83,8 @@
 
         void DestroyInternal();
 
+        bool IsMapped() const;
+
       private:
         virtual MaybeError MapAtCreationImpl(uint8_t** mappedPointer) = 0;
         virtual MaybeError MapReadAsyncImpl(uint32_t serial) = 0;
@@ -109,6 +113,8 @@
         std::unique_ptr<StagingBufferBase> mStagingBuffer;
 
         BufferState mState;
+
+        bool mIsDataInitialized = false;
     };
 
 }  // namespace dawn_native
diff --git a/src/dawn_native/Toggles.cpp b/src/dawn_native/Toggles.cpp
index 9eb526a..7f40ebd 100644
--- a/src/dawn_native/Toggles.cpp
+++ b/src/dawn_native/Toggles.cpp
@@ -29,111 +29,116 @@
         using ToggleEnumAndInfoList =
             std::array<ToggleEnumAndInfo, static_cast<size_t>(Toggle::EnumCount)>;
 
-        static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {{
-            {Toggle::EmulateStoreAndMSAAResolve,
-             {"emulate_store_and_msaa_resolve",
-              "Emulate storing into multisampled color attachments and doing MSAA resolve "
-              "simultaneously. This workaround is enabled by default on the Metal drivers that do "
-              "not support MTLStoreActionStoreAndMultisampleResolve. To support StoreOp::Store on "
-              "those platforms, we should do MSAA resolve in another render pass after ending the "
-              "previous one.",
-              "https://crbug.com/dawn/56"}},
-            {Toggle::NonzeroClearResourcesOnCreationForTesting,
-             {"nonzero_clear_resources_on_creation_for_testing",
-              "Clears texture to full 1 bits as soon as they are created, but doesn't update "
-              "the tracking state of the texture. This way we can test the logic of clearing "
-              "textures that use recycled memory.",
-              "https://crbug.com/dawn/145"}},
-            {Toggle::AlwaysResolveIntoZeroLevelAndLayer,
-             {"always_resolve_into_zero_level_and_layer",
-              "When the resolve target is a texture view that is created on the non-zero level or "
-              "layer of a texture, we first resolve into a temporarily 2D texture with only one "
-              "mipmap level and one array layer, and copy the result of MSAA resolve into the "
-              "true resolve target. This workaround is enabled by default on the Metal drivers "
-              "that have bugs when setting non-zero resolveLevel or resolveSlice.",
-              "https://crbug.com/dawn/56"}},
-            {Toggle::LazyClearResourceOnFirstUse,
-             {"lazy_clear_resource_on_first_use",
-              "Clears resource to zero on first usage. This initializes the resource "
-              "so that no dirty bits from recycled memory is present in the new resource.",
-              "https://crbug.com/dawn/145"}},
-            {Toggle::TurnOffVsync,
-             {"turn_off_vsync",
-              "Turn off vsync when rendering. In order to do performance test or run perf tests, "
-              "turn off vsync so that the fps can exeed 60.",
-              "https://crbug.com/dawn/237"}},
-            {Toggle::UseTemporaryBufferInCompressedTextureToTextureCopy,
-             {"use_temporary_buffer_in_texture_to_texture_copy",
-              "Split texture-to-texture copy into two copies: copy from source texture into a "
-              "temporary buffer, and copy from the temporary buffer into the destination texture "
-              "when copying between compressed textures that don't have block-aligned sizes. This "
-              "workaround is enabled by default on all Vulkan drivers to solve an issue in the "
-              "Vulkan SPEC about the texture-to-texture copies with compressed formats. See #1005 "
-              "(https://github.com/KhronosGroup/Vulkan-Docs/issues/1005) for more details.",
-              "https://crbug.com/dawn/42"}},
-            {Toggle::UseD3D12ResourceHeapTier2,
-             {"use_d3d12_resource_heap_tier2",
-              "Enable support for resource heap tier 2. Resource heap tier 2 allows mixing of "
-              "texture and buffers in the same heap. This allows better heap re-use and reduces "
-              "fragmentation.",
-              "https://crbug.com/dawn/27"}},
-            {Toggle::UseD3D12RenderPass,
-             {"use_d3d12_render_pass",
-              "Use the D3D12 render pass API introduced in Windows build 1809 by default. On "
-              "versions of Windows prior to build 1809, or when this toggle is turned off, Dawn "
-              "will emulate a render pass.",
-              "https://crbug.com/dawn/36"}},
-            {Toggle::UseD3D12ResidencyManagement,
-             {"use_d3d12_residency_management",
-              "Enable residency management. This allows page-in and page-out of resource heaps in "
-              "GPU memory. This component improves overcommitted performance by keeping the most "
-              "recently used resources local to the GPU. Turning this component off can cause "
-              "allocation failures when application memory exceeds physical device memory.",
-              "https://crbug.com/dawn/193"}},
-            {Toggle::SkipValidation,
-             {"skip_validation", "Skip expensive validation of Dawn commands.",
-              "https://crbug.com/dawn/271"}},
-            {Toggle::UseSpvc,
-             {"use_spvc",
-              "Enable use of spvc for shader compilation, instead of accessing spirv_cross "
-              "directly.",
-              "https://crbug.com/dawn/288"}},
-            {Toggle::UseSpvcParser,
-             {"use_spvc_parser",
-              "Enable usage of spvc's internal parsing and IR generation code, instead of "
-              "spirv_cross's.",
-              "https://crbug.com/dawn/288"}},
-            {Toggle::VulkanUseD32S8,
-             {"vulkan_use_d32s8",
-              "Vulkan mandates support of either D32_FLOAT_S8 or D24_UNORM_S8. When available the "
-              "backend will use D32S8 (toggle to on) but setting the toggle to off will make it"
-              "use the D24S8 format when possible.",
-              "https://crbug.com/dawn/286"}},
-            {Toggle::MetalDisableSamplerCompare,
-             {"metal_disable_sampler_compare",
-              "Disables the use of sampler compare on Metal. This is unsupported before A9 "
-              "processors.",
-              "https://crbug.com/dawn/342"}},
-            {Toggle::DisableBaseVertex,
-             {"disable_base_vertex",
-              "Disables the use of non-zero base vertex which is unsupported on some platforms.",
-              "https://crbug.com/dawn/343"}},
-            {Toggle::DisableBaseInstance,
-             {"disable_base_instance",
-              "Disables the use of non-zero base instance which is unsupported on some "
-              "platforms.",
-              "https://crbug.com/dawn/343"}},
-            {Toggle::UseD3D12SmallShaderVisibleHeapForTesting,
-             {"use_d3d12_small_shader_visible_heap",
-              "Enable use of a small D3D12 shader visible heap, instead of using a large one by "
-              "default. This setting is used to test bindgroup encoding.",
-              "https://crbug.com/dawn/155"}},
-            {Toggle::UseDXC,
-             {"use_dxc", "Use DXC instead of FXC for compiling HLSL",
-              "https://crbug.com/dawn/402"}},
-            {Toggle::DisableRobustness,
-             {"disable_robustness", "Disable robust buffer access", "https://crbug.com/dawn/480"}},
-        }};
+        static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {
+            {{Toggle::EmulateStoreAndMSAAResolve,
+              {"emulate_store_and_msaa_resolve",
+               "Emulate storing into multisampled color attachments and doing MSAA resolve "
+               "simultaneously. This workaround is enabled by default on the Metal drivers that do "
+               "not support MTLStoreActionStoreAndMultisampleResolve. To support StoreOp::Store on "
+               "those platforms, we should do MSAA resolve in another render pass after ending the "
+               "previous one.",
+               "https://crbug.com/dawn/56"}},
+             {Toggle::NonzeroClearResourcesOnCreationForTesting,
+              {"nonzero_clear_resources_on_creation_for_testing",
+               "Clears texture to full 1 bits as soon as they are created, but doesn't update "
+               "the tracking state of the texture. This way we can test the logic of clearing "
+               "textures that use recycled memory.",
+               "https://crbug.com/dawn/145"}},
+             {Toggle::AlwaysResolveIntoZeroLevelAndLayer,
+              {"always_resolve_into_zero_level_and_layer",
+               "When the resolve target is a texture view that is created on the non-zero level or "
+               "layer of a texture, we first resolve into a temporarily 2D texture with only one "
+               "mipmap level and one array layer, and copy the result of MSAA resolve into the "
+               "true resolve target. This workaround is enabled by default on the Metal drivers "
+               "that have bugs when setting non-zero resolveLevel or resolveSlice.",
+               "https://crbug.com/dawn/56"}},
+             {Toggle::LazyClearResourceOnFirstUse,
+              {"lazy_clear_resource_on_first_use",
+               "Clears resource to zero on first usage. This initializes the resource "
+               "so that no dirty bits from recycled memory is present in the new resource.",
+               "https://crbug.com/dawn/145"}},
+             {Toggle::TurnOffVsync,
+              {"turn_off_vsync",
+               "Turn off vsync when rendering. In order to do performance test or run perf tests, "
+               "turn off vsync so that the fps can exeed 60.",
+               "https://crbug.com/dawn/237"}},
+             {Toggle::UseTemporaryBufferInCompressedTextureToTextureCopy,
+              {"use_temporary_buffer_in_texture_to_texture_copy",
+               "Split texture-to-texture copy into two copies: copy from source texture into a "
+               "temporary buffer, and copy from the temporary buffer into the destination texture "
+               "when copying between compressed textures that don't have block-aligned sizes. This "
+               "workaround is enabled by default on all Vulkan drivers to solve an issue in the "
+               "Vulkan SPEC about the texture-to-texture copies with compressed formats. See #1005 "
+               "(https://github.com/KhronosGroup/Vulkan-Docs/issues/1005) for more details.",
+               "https://crbug.com/dawn/42"}},
+             {Toggle::UseD3D12ResourceHeapTier2,
+              {"use_d3d12_resource_heap_tier2",
+               "Enable support for resource heap tier 2. Resource heap tier 2 allows mixing of "
+               "texture and buffers in the same heap. This allows better heap re-use and reduces "
+               "fragmentation.",
+               "https://crbug.com/dawn/27"}},
+             {Toggle::UseD3D12RenderPass,
+              {"use_d3d12_render_pass",
+               "Use the D3D12 render pass API introduced in Windows build 1809 by default. On "
+               "versions of Windows prior to build 1809, or when this toggle is turned off, Dawn "
+               "will emulate a render pass.",
+               "https://crbug.com/dawn/36"}},
+             {Toggle::UseD3D12ResidencyManagement,
+              {"use_d3d12_residency_management",
+               "Enable residency management. This allows page-in and page-out of resource heaps in "
+               "GPU memory. This component improves overcommitted performance by keeping the most "
+               "recently used resources local to the GPU. Turning this component off can cause "
+               "allocation failures when application memory exceeds physical device memory.",
+               "https://crbug.com/dawn/193"}},
+             {Toggle::SkipValidation,
+              {"skip_validation", "Skip expensive validation of Dawn commands.",
+               "https://crbug.com/dawn/271"}},
+             {Toggle::UseSpvc,
+              {"use_spvc",
+               "Enable use of spvc for shader compilation, instead of accessing spirv_cross "
+               "directly.",
+               "https://crbug.com/dawn/288"}},
+             {Toggle::UseSpvcParser,
+              {"use_spvc_parser",
+               "Enable usage of spvc's internal parsing and IR generation code, instead of "
+               "spirv_cross's.",
+               "https://crbug.com/dawn/288"}},
+             {Toggle::VulkanUseD32S8,
+              {"vulkan_use_d32s8",
+               "Vulkan mandates support of either D32_FLOAT_S8 or D24_UNORM_S8. When available the "
+               "backend will use D32S8 (toggle to on) but setting the toggle to off will make it"
+               "use the D24S8 format when possible.",
+               "https://crbug.com/dawn/286"}},
+             {Toggle::MetalDisableSamplerCompare,
+              {"metal_disable_sampler_compare",
+               "Disables the use of sampler compare on Metal. This is unsupported before A9 "
+               "processors.",
+               "https://crbug.com/dawn/342"}},
+             {Toggle::DisableBaseVertex,
+              {"disable_base_vertex",
+               "Disables the use of non-zero base vertex which is unsupported on some platforms.",
+               "https://crbug.com/dawn/343"}},
+             {Toggle::DisableBaseInstance,
+              {"disable_base_instance",
+               "Disables the use of non-zero base instance which is unsupported on some "
+               "platforms.",
+               "https://crbug.com/dawn/343"}},
+             {Toggle::UseD3D12SmallShaderVisibleHeapForTesting,
+              {"use_d3d12_small_shader_visible_heap",
+               "Enable use of a small D3D12 shader visible heap, instead of using a large one by "
+               "default. This setting is used to test bindgroup encoding.",
+               "https://crbug.com/dawn/155"}},
+             {Toggle::UseDXC,
+              {"use_dxc", "Use DXC instead of FXC for compiling HLSL",
+               "https://crbug.com/dawn/402"}},
+             {Toggle::DisableRobustness,
+              {"disable_robustness", "Disable robust buffer access", "https://crbug.com/dawn/480"}},
+             {Toggle::LazyClearBufferOnFirstUse,
+              {"lazy_clear_buffer_on_first_use",
+               "Clear buffers on their first use. This is a temporary toggle only for the "
+               "development of buffer lazy initialization and will be removed after buffer lazy "
+               "initialization is completely implemented.",
+               "https://crbug.com/dawn/414"}}}};
 
     }  // anonymous namespace
 
diff --git a/src/dawn_native/Toggles.h b/src/dawn_native/Toggles.h
index 1e7e3e3..f9d66a2 100644
--- a/src/dawn_native/Toggles.h
+++ b/src/dawn_native/Toggles.h
@@ -43,6 +43,7 @@
         UseD3D12SmallShaderVisibleHeapForTesting,
         UseDXC,
         DisableRobustness,
+        LazyClearBufferOnFirstUse,
 
         EnumCount,
         InvalidEnum = EnumCount,
diff --git a/src/dawn_native/d3d12/BufferD3D12.cpp b/src/dawn_native/d3d12/BufferD3D12.cpp
index 65adaf0..a7b8fd8 100644
--- a/src/dawn_native/d3d12/BufferD3D12.cpp
+++ b/src/dawn_native/d3d12/BufferD3D12.cpp
@@ -123,7 +123,11 @@
             ToBackend(GetDevice())->AllocateMemory(heapType, resourceDescriptor, bufferUsage));
 
         if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
-            DAWN_TRY(ClearBuffer(ClearValue::NonZero));
+            CommandRecordingContext* commandRecordingContext;
+            DAWN_TRY_ASSIGN(commandRecordingContext,
+                            ToBackend(GetDevice())->GetPendingCommandContext());
+
+            DAWN_TRY(ClearBuffer(commandRecordingContext, uint8_t(1u)));
         }
 
         return {};
@@ -310,18 +314,25 @@
         return mResourceAllocation.GetInfo().mMethod == allocationMethod;
     }
 
-    MaybeError Buffer::ClearBuffer(ClearValue clearValue) {
-        // TODO(jiawei.shao@intel.com): support buffer lazy-initialization to 0.
-        ASSERT(clearValue == BufferBase::ClearValue::NonZero);
-        constexpr uint8_t kClearBufferValue = 1u;
+    MaybeError Buffer::ClearBufferContentsToZero(CommandRecordingContext* commandContext) {
+        ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
+        ASSERT(!IsDataInitialized());
 
+        DAWN_TRY(ClearBuffer(commandContext, uint8_t(0u)));
+        SetIsDataInitialized();
+        GetDevice()->IncrementLazyClearCountForTesting();
+
+        return {};
+    }
+
+    MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
         Device* device = ToBackend(GetDevice());
 
         // The state of the buffers on UPLOAD heap must always be GENERIC_READ and cannot be
         // changed away, so we can only clear such buffer with buffer mapping.
         if (D3D12HeapType(GetUsage()) == D3D12_HEAP_TYPE_UPLOAD) {
             DAWN_TRY(MapInternal(true, "D3D12 map at clear buffer"));
-            memset(mMappedData, kClearBufferValue, GetSize());
+            memset(mMappedData, clearValue, GetSize());
             UnmapImpl();
         } else {
             // TODO(jiawei.shao@intel.com): use ClearUnorderedAccessView*() when the buffer usage
@@ -331,10 +342,10 @@
             DAWN_TRY_ASSIGN(uploadHandle,
                             uploader->Allocate(GetSize(), device->GetPendingCommandSerial()));
 
-            memset(uploadHandle.mappedBuffer, kClearBufferValue, GetSize());
+            memset(uploadHandle.mappedBuffer, clearValue, GetSize());
 
-            DAWN_TRY(device->CopyFromStagingToBuffer(uploadHandle.stagingBuffer,
-                                                     uploadHandle.startOffset, this, 0, GetSize()));
+            device->CopyFromStagingToBufferImpl(commandContext, uploadHandle.stagingBuffer,
+                                                uploadHandle.startOffset, this, 0, GetSize());
         }
 
         return {};
diff --git a/src/dawn_native/d3d12/BufferD3D12.h b/src/dawn_native/d3d12/BufferD3D12.h
index a081986..a364250 100644
--- a/src/dawn_native/d3d12/BufferD3D12.h
+++ b/src/dawn_native/d3d12/BufferD3D12.h
@@ -44,6 +44,8 @@
         bool CheckAllocationMethodForTesting(AllocationMethod allocationMethod) const;
         bool CheckIsResidentForTesting() const;
 
+        MaybeError ClearBufferContentsToZero(CommandRecordingContext* commandContext);
+
       private:
         ~Buffer() override;
         // Dawn API
@@ -61,7 +63,7 @@
                                                   D3D12_RESOURCE_BARRIER* barrier,
                                                   wgpu::BufferUsage newUsage);
 
-        MaybeError ClearBuffer(ClearValue clearValue);
+        MaybeError ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
 
         ResourceHeapAllocation mResourceAllocation;
         bool mFixedResourceState = false;
diff --git a/src/dawn_native/d3d12/DeviceD3D12.cpp b/src/dawn_native/d3d12/DeviceD3D12.cpp
index d4b9b51..7ecb984 100644
--- a/src/dawn_native/d3d12/DeviceD3D12.cpp
+++ b/src/dawn_native/d3d12/DeviceD3D12.cpp
@@ -336,16 +336,39 @@
         DAWN_TRY_ASSIGN(commandRecordingContext, GetPendingCommandContext());
 
         Buffer* dstBuffer = ToBackend(destination);
-        StagingBuffer* srcBuffer = ToBackend(source);
-        dstBuffer->TrackUsageAndTransitionNow(commandRecordingContext, wgpu::BufferUsage::CopyDst);
 
-        commandRecordingContext->GetCommandList()->CopyBufferRegion(
-            dstBuffer->GetD3D12Resource(), destinationOffset, srcBuffer->GetResource(),
-            sourceOffset, size);
+        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
+        // buffer lazy initialization is completed.
+        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) && !dstBuffer->IsDataInitialized()) {
+            if (dstBuffer->IsFullBufferRange(destinationOffset, size)) {
+                dstBuffer->SetIsDataInitialized();
+            } else {
+                DAWN_TRY(dstBuffer->ClearBufferContentsToZero(commandRecordingContext));
+            }
+        }
+
+        CopyFromStagingToBufferImpl(commandRecordingContext, source, sourceOffset, destination,
+                                    destinationOffset, size);
 
         return {};
     }
 
+    void Device::CopyFromStagingToBufferImpl(CommandRecordingContext* commandContext,
+                                             StagingBufferBase* source,
+                                             uint64_t sourceOffset,
+                                             BufferBase* destination,
+                                             uint64_t destinationOffset,
+                                             uint64_t size) {
+        ASSERT(commandContext != nullptr);
+        Buffer* dstBuffer = ToBackend(destination);
+        StagingBuffer* srcBuffer = ToBackend(source);
+        dstBuffer->TrackUsageAndTransitionNow(commandContext, wgpu::BufferUsage::CopyDst);
+
+        commandContext->GetCommandList()->CopyBufferRegion(
+            dstBuffer->GetD3D12Resource(), destinationOffset, srcBuffer->GetResource(),
+            sourceOffset, size);
+    }
+
     void Device::DeallocateMemory(ResourceHeapAllocation& allocation) {
         mResourceAllocatorManager->DeallocateMemory(allocation);
     }
diff --git a/src/dawn_native/d3d12/DeviceD3D12.h b/src/dawn_native/d3d12/DeviceD3D12.h
index 1ee4092..983ed67 100644
--- a/src/dawn_native/d3d12/DeviceD3D12.h
+++ b/src/dawn_native/d3d12/DeviceD3D12.h
@@ -91,6 +91,13 @@
                                            uint64_t destinationOffset,
                                            uint64_t size) override;
 
+        void CopyFromStagingToBufferImpl(CommandRecordingContext* commandContext,
+                                         StagingBufferBase* source,
+                                         uint64_t sourceOffset,
+                                         BufferBase* destination,
+                                         uint64_t destinationOffset,
+                                         uint64_t size);
+
         ResultOrError<ResourceHeapAllocation> AllocateMemory(
             D3D12_HEAP_TYPE heapType,
             const D3D12_RESOURCE_DESC& resourceDescriptor,
diff --git a/src/dawn_native/metal/BufferMTL.h b/src/dawn_native/metal/BufferMTL.h
index 98bab96..f204e53 100644
--- a/src/dawn_native/metal/BufferMTL.h
+++ b/src/dawn_native/metal/BufferMTL.h
@@ -22,6 +22,7 @@
 
 namespace dawn_native { namespace metal {
 
+    class CommandRecordingContext;
     class Device;
 
     class Buffer : public BufferBase {
@@ -29,6 +30,8 @@
         static ResultOrError<Buffer*> Create(Device* device, const BufferDescriptor* descriptor);
         id<MTLBuffer> GetMTLBuffer() const;
 
+        void ClearBufferContentsToZero(CommandRecordingContext* commandContext);
+
       private:
         using BufferBase::BufferBase;
         MaybeError Initialize();
@@ -43,7 +46,7 @@
         bool IsMapWritable() const override;
         MaybeError MapAtCreationImpl(uint8_t** mappedPointer) override;
 
-        void ClearBuffer(BufferBase::ClearValue clearValue);
+        void ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
 
         id<MTLBuffer> mMtlBuffer = nil;
     };
diff --git a/src/dawn_native/metal/BufferMTL.mm b/src/dawn_native/metal/BufferMTL.mm
index 46724bf..858eab6 100644
--- a/src/dawn_native/metal/BufferMTL.mm
+++ b/src/dawn_native/metal/BufferMTL.mm
@@ -15,6 +15,7 @@
 #include "dawn_native/metal/BufferMTL.h"
 
 #include "common/Math.h"
+#include "dawn_native/metal/CommandRecordingContext.h"
 #include "dawn_native/metal/DeviceMTL.h"
 
 #include <limits>
@@ -87,7 +88,9 @@
         }
 
         if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
-            ClearBuffer(BufferBase::ClearValue::NonZero);
+            CommandRecordingContext* commandContext =
+                ToBackend(GetDevice())->GetPendingCommandContext();
+            ClearBuffer(commandContext, uint8_t(1u));
         }
 
         return {};
@@ -132,16 +135,21 @@
         mMtlBuffer = nil;
     }
 
-    void Buffer::ClearBuffer(BufferBase::ClearValue clearValue) {
-        // TODO(jiawei.shao@intel.com): support buffer lazy-initialization to 0.
-        ASSERT(clearValue == BufferBase::ClearValue::NonZero);
-        const uint8_t clearBufferValue = 1;
+    void Buffer::ClearBufferContentsToZero(CommandRecordingContext* commandContext) {
+        ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
+        ASSERT(!IsDataInitialized());
 
-        Device* device = ToBackend(GetDevice());
-        CommandRecordingContext* commandContext = device->GetPendingCommandContext();
+        ClearBuffer(commandContext, uint8_t(0u));
+
+        SetIsDataInitialized();
+        GetDevice()->IncrementLazyClearCountForTesting();
+    }
+
+    void Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
+        ASSERT(commandContext != nullptr);
         [commandContext->EnsureBlit() fillBuffer:mMtlBuffer
                                            range:NSMakeRange(0, GetSize())
-                                           value:clearBufferValue];
+                                           value:clearValue];
     }
 
 }}  // namespace dawn_native::metal
diff --git a/src/dawn_native/metal/DeviceMTL.mm b/src/dawn_native/metal/DeviceMTL.mm
index d8d0feb..40875c5 100644
--- a/src/dawn_native/metal/DeviceMTL.mm
+++ b/src/dawn_native/metal/DeviceMTL.mm
@@ -254,6 +254,17 @@
         // this function.
         ASSERT(size != 0);
 
+        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
+        // buffer lazy initialization is completed.
+        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) &&
+            !destination->IsDataInitialized()) {
+            if (destination->IsFullBufferRange(destinationOffset, size)) {
+                destination->SetIsDataInitialized();
+            } else {
+                ToBackend(destination)->ClearBufferContentsToZero(GetPendingCommandContext());
+            }
+        }
+
         id<MTLBuffer> uploadBuffer = ToBackend(source)->GetBufferHandle();
         id<MTLBuffer> buffer = ToBackend(destination)->GetMTLBuffer();
         [GetPendingCommandContext()->EnsureBlit() copyFromBuffer:uploadBuffer
diff --git a/src/dawn_native/null/DeviceNull.cpp b/src/dawn_native/null/DeviceNull.cpp
index fd91585..b654347 100644
--- a/src/dawn_native/null/DeviceNull.cpp
+++ b/src/dawn_native/null/DeviceNull.cpp
@@ -197,6 +197,10 @@
                                                BufferBase* destination,
                                                uint64_t destinationOffset,
                                                uint64_t size) {
+        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse)) {
+            destination->SetIsDataInitialized();
+        }
+
         auto operation = std::make_unique<CopyFromStagingToBufferOperation>();
         operation->staging = source;
         operation->destination = ToBackend(destination);
diff --git a/src/dawn_native/opengl/BufferGL.cpp b/src/dawn_native/opengl/BufferGL.cpp
index 7e91a49..f9839bd 100644
--- a/src/dawn_native/opengl/BufferGL.cpp
+++ b/src/dawn_native/opengl/BufferGL.cpp
@@ -24,7 +24,7 @@
         : BufferBase(device, descriptor) {
         // TODO(cwallez@chromium.org): Have a global "zero" buffer instead of creating a new 4-byte
         // buffer?
-        uint64_t size = std::max(GetSize(), uint64_t(4u));
+        uint64_t size = GetAppliedSize();
 
         device->gl.GenBuffers(1, &mBuffer);
         device->gl.BindBuffer(GL_ARRAY_BUFFER, mBuffer);
@@ -45,6 +45,27 @@
         return mBuffer;
     }
 
+    uint64_t Buffer::GetAppliedSize() const {
+        // TODO(cwallez@chromium.org): Have a global "zero" buffer instead of creating a new 4-byte
+        // buffer?
+        return std::max(GetSize(), uint64_t(4u));
+    }
+
+    void Buffer::ClearBufferContentsToZero() {
+        ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
+        ASSERT(!IsDataInitialized());
+
+        const uint64_t size = GetAppliedSize();
+        Device* device = ToBackend(GetDevice());
+
+        const std::vector<uint8_t> clearValues(size, 0u);
+        device->gl.BindBuffer(GL_ARRAY_BUFFER, mBuffer);
+        device->gl.BufferSubData(GL_ARRAY_BUFFER, 0, size, clearValues.data());
+
+        SetIsDataInitialized();
+        device->IncrementLazyClearCountForTesting();
+    }
+
     bool Buffer::IsMapWritable() const {
         // TODO(enga): All buffers in GL can be mapped. Investigate if mapping them will cause the
         // driver to migrate it to shared memory.
diff --git a/src/dawn_native/opengl/BufferGL.h b/src/dawn_native/opengl/BufferGL.h
index 9949829..272af03 100644
--- a/src/dawn_native/opengl/BufferGL.h
+++ b/src/dawn_native/opengl/BufferGL.h
@@ -29,6 +29,8 @@
 
         GLuint GetHandle() const;
 
+        void ClearBufferContentsToZero();
+
       private:
         ~Buffer() override;
         // Dawn API
@@ -40,6 +42,7 @@
         bool IsMapWritable() const override;
         MaybeError MapAtCreationImpl(uint8_t** mappedPointer) override;
         void* GetMappedPointerImpl() override;
+        uint64_t GetAppliedSize() const;  // GetSize(), but never less than 4 bytes.
 
         GLuint mBuffer = 0;
         void* mMappedData = nullptr;
diff --git a/src/dawn_native/opengl/QueueGL.cpp b/src/dawn_native/opengl/QueueGL.cpp
index a33cbd0..2547f17 100644
--- a/src/dawn_native/opengl/QueueGL.cpp
+++ b/src/dawn_native/opengl/QueueGL.cpp
@@ -44,6 +44,17 @@
                                       size_t size) {
         const OpenGLFunctions& gl = ToBackend(GetDevice())->gl;
 
+        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
+        // buffer lazy initialization is completed.
+        if (GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) &&
+            !buffer->IsDataInitialized()) {
+            if (buffer->IsFullBufferRange(bufferOffset, size)) {
+                buffer->SetIsDataInitialized();
+            } else {
+                ToBackend(buffer)->ClearBufferContentsToZero();
+            }
+        }
+
         gl.BindBuffer(GL_ARRAY_BUFFER, ToBackend(buffer)->GetHandle());
         gl.BufferSubData(GL_ARRAY_BUFFER, bufferOffset, size, data);
         return {};
diff --git a/src/dawn_native/vulkan/BufferVk.cpp b/src/dawn_native/vulkan/BufferVk.cpp
index 0b9385e..74e8469 100644
--- a/src/dawn_native/vulkan/BufferVk.cpp
+++ b/src/dawn_native/vulkan/BufferVk.cpp
@@ -166,7 +166,7 @@
             "vkBindBufferMemory"));
 
         if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
-            ClearBuffer(device->GetPendingRecordingContext(), ClearValue::NonZero);
+            ClearBuffer(device->GetPendingRecordingContext(), 0x01010101);
         }
 
         return {};
@@ -287,20 +287,25 @@
         }
     }
 
-    void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, ClearValue clearValue) {
+    void Buffer::ClearBufferContentsToZero(CommandRecordingContext* recordingContext) {
+        ASSERT(GetDevice()->IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse));
+        ASSERT(!IsDataInitialized());  // Caller must check the buffer is still uninitialized.
+
+        ClearBuffer(recordingContext, 0u);  // Records a zero fill into recordingContext.
+
+        SetIsDataInitialized();  // Later writes will skip the lazy clear.
+        GetDevice()->IncrementLazyClearCountForTesting();  // Test-only counter.
+    }
+
+    void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue) {
         ASSERT(recordingContext != nullptr);
 
-        // TODO(jiawei.shao@intel.com): support buffer lazy-initialization to 0.
-        ASSERT(clearValue == BufferBase::ClearValue::NonZero);
-
-        constexpr uint32_t kClearBufferValue = 0x01010101;
-
         TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
 
         Device* device = ToBackend(GetDevice());
         // TODO(jiawei.shao@intel.com): find out why VK_WHOLE_SIZE doesn't work on old Windows Intel
         // Vulkan drivers.
         device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, 0, GetSize(),
-                                 kClearBufferValue);
+                                 clearValue);
     }
 }}  // namespace dawn_native::vulkan
diff --git a/src/dawn_native/vulkan/BufferVk.h b/src/dawn_native/vulkan/BufferVk.h
index 1c048703..ace13a9 100644
--- a/src/dawn_native/vulkan/BufferVk.h
+++ b/src/dawn_native/vulkan/BufferVk.h
@@ -45,11 +45,13 @@
                                 VkPipelineStageFlags* srcStages,
                                 VkPipelineStageFlags* dstStages);
 
+        void ClearBufferContentsToZero(CommandRecordingContext* recordingContext);
+
       private:
         ~Buffer() override;
         using BufferBase::BufferBase;
         MaybeError Initialize();
-        void ClearBuffer(CommandRecordingContext* recordingContext, ClearValue clearValue);
+        void ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue);
 
         // Dawn API
         MaybeError MapReadAsyncImpl(uint32_t serial) override;
diff --git a/src/dawn_native/vulkan/DeviceVk.cpp b/src/dawn_native/vulkan/DeviceVk.cpp
index 2415b49..5a2a905 100644
--- a/src/dawn_native/vulkan/DeviceVk.cpp
+++ b/src/dawn_native/vulkan/DeviceVk.cpp
@@ -590,6 +590,19 @@
         // calling this function.
         ASSERT(size != 0);
 
+        CommandRecordingContext* recordingContext = GetPendingRecordingContext();
+
+        // TODO(jiawei.shao@intel.com): use Toggle::LazyClearResourceOnFirstUse when the support of
+        // buffer lazy initialization is completed.
+        if (IsToggleEnabled(Toggle::LazyClearBufferOnFirstUse) &&
+            !destination->IsDataInitialized()) {
+            if (destination->IsFullBufferRange(destinationOffset, size)) {
+                destination->SetIsDataInitialized();
+            } else {
+                ToBackend(destination)->ClearBufferContentsToZero(recordingContext);
+            }
+        }
+
         // Insert memory barrier to ensure host write operations are made visible before
         // copying from the staging buffer. However, this barrier can be removed (see note below).
         //
@@ -599,7 +612,6 @@
 
         // Insert pipeline barrier to ensure correct ordering with previous memory operations on the
         // buffer.
-        CommandRecordingContext* recordingContext = GetPendingRecordingContext();
         ToBackend(destination)->TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
 
         VkBufferCopy copy;
diff --git a/src/tests/BUILD.gn b/src/tests/BUILD.gn
index 201ff08..853e1f0 100644
--- a/src/tests/BUILD.gn
+++ b/src/tests/BUILD.gn
@@ -259,6 +259,7 @@
     "end2end/BasicTests.cpp",
     "end2end/BindGroupTests.cpp",
     "end2end/BufferTests.cpp",
+    "end2end/BufferZeroInitTests.cpp",
     "end2end/ClipSpaceTests.cpp",
     "end2end/ColorStateTests.cpp",
     "end2end/CompressedTextureFormatTests.cpp",
diff --git a/src/tests/end2end/BufferZeroInitTests.cpp b/src/tests/end2end/BufferZeroInitTests.cpp
new file mode 100644
index 0000000..5392e49
--- /dev/null
+++ b/src/tests/end2end/BufferZeroInitTests.cpp
@@ -0,0 +1,98 @@
+// Copyright 2020 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/DawnTest.h"
+
+#include "utils/WGPUHelpers.h"
+// Runs `statement` and, unless the test uses the wire, expects exactly N lazy clears from it.
+#define EXPECT_LAZY_CLEAR(N, statement)                                                       \
+    do {                                                                                      \
+        if (UsesWire()) {                                                                     \
+            statement;                                                                        \
+        } else {                                                                              \
+            size_t lazyClearsBefore = dawn_native::GetLazyClearCountForTesting(device.Get()); \
+            statement;                                                                        \
+            size_t lazyClearsAfter = dawn_native::GetLazyClearCountForTesting(device.Get());  \
+            EXPECT_EQ(N, lazyClearsAfter - lazyClearsBefore);                                 \
+        }                                                                                     \
+    } while (0)
+
+class BufferZeroInitTest : public DawnTest {  // Fixture for the buffer lazy zero-init tests.
+  public:
+    wgpu::Buffer CreateBuffer(uint64_t size, wgpu::BufferUsage usage) {
+        wgpu::BufferDescriptor descriptor;  // Only size and usage are set below.
+        descriptor.size = size;
+        descriptor.usage = usage;
+        return device.CreateBuffer(&descriptor);  // No data is written to the buffer here.
+    }
+};
+
+// Test that calling writeBuffer to overwrite the entire buffer doesn't need to lazily initialize
+// the destination buffer.
+TEST_P(BufferZeroInitTest, WriteBufferToEntireBuffer) {
+    constexpr uint32_t kBufferSize = 8u;  // In bytes: room for two uint32_t values.
+    constexpr wgpu::BufferUsage kBufferUsage =
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::Buffer buffer = CreateBuffer(kBufferSize, kBufferUsage);
+
+    constexpr std::array<uint32_t, kBufferSize / sizeof(uint32_t)> kExpectedData = {
+        {0x02020202u, 0x02020202u}};  // Distinct from 0 and from 0x01010101.
+    EXPECT_LAZY_CLEAR(0u, queue.WriteBuffer(buffer, 0, kExpectedData.data(), kBufferSize));
+
+    EXPECT_BUFFER_U32_RANGE_EQ(kExpectedData.data(), buffer, 0, kBufferSize / sizeof(uint32_t));
+}
+
+// Test that calling writeBuffer to overwrite a part of the buffer needs to lazily initialize the
+// destination buffer.
+TEST_P(BufferZeroInitTest, WriteBufferToSubBuffer) {
+    constexpr uint32_t kBufferSize = 8u;  // In bytes: room for two uint32_t values.
+    constexpr wgpu::BufferUsage kBufferUsage =
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+
+    constexpr uint32_t kCopyValue = 0x02020202u;  // Distinct from 0 and from 0x01010101.
+
+    // offset == 0: write the first 4 bytes; the last 4 bytes must read back as 0.
+    {
+        wgpu::Buffer buffer = CreateBuffer(kBufferSize, kBufferUsage);
+
+        constexpr uint32_t kCopyOffset = 0u;
+        EXPECT_LAZY_CLEAR(1u,
+                          queue.WriteBuffer(buffer, kCopyOffset, &kCopyValue, sizeof(kCopyValue)));
+
+        EXPECT_BUFFER_U32_EQ(kCopyValue, buffer, kCopyOffset);
+        EXPECT_BUFFER_U32_EQ(0, buffer, kBufferSize - sizeof(kCopyValue));
+    }
+
+    // offset > 0: write the last 4 bytes; the first 4 bytes must read back as 0.
+    {
+        wgpu::Buffer buffer = CreateBuffer(kBufferSize, kBufferUsage);
+
+        constexpr uint32_t kCopyOffset = 4u;
+        EXPECT_LAZY_CLEAR(1u,
+                          queue.WriteBuffer(buffer, kCopyOffset, &kCopyValue, sizeof(kCopyValue)));
+
+        EXPECT_BUFFER_U32_EQ(0, buffer, 0);
+        EXPECT_BUFFER_U32_EQ(kCopyValue, buffer, kCopyOffset);
+    }
+}
+
+DAWN_INSTANTIATE_TEST(BufferZeroInitTest,  // Same two toggles on every listed backend.
+                      D3D12Backend({"nonzero_clear_resources_on_creation_for_testing",
+                                    "lazy_clear_buffer_on_first_use"}),
+                      MetalBackend({"nonzero_clear_resources_on_creation_for_testing",
+                                    "lazy_clear_buffer_on_first_use"}),
+                      OpenGLBackend({"nonzero_clear_resources_on_creation_for_testing",
+                                     "lazy_clear_buffer_on_first_use"}),
+                      VulkanBackend({"nonzero_clear_resources_on_creation_for_testing",
+                                     "lazy_clear_buffer_on_first_use"}));