Vulkan: Implement fast path for writeBuffer

Implement a fast path for writeBuffer in Vulkan where, instead of
writing data to a scratch buffer and then copying the data into the
destination buffer, the data is written directly into the destination
buffer. This reduces bandwidth use by avoiding the extra copy and
removes the transfer command along with the synchronization it would
otherwise require.
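
For illustration only, here is a minimal sketch of the two upload
strategies expressed against the raw Vulkan API. The real
implementation is in the Vulkan backend changes below; the function
names and handles in this sketch are hypothetical:

    #include <cstring>
    #include <vulkan/vulkan.h>

    // Slow path: write into a host-visible staging buffer, then record a GPU
    // copy into the destination. The copy must later be synchronized with any
    // use of the destination buffer. (VkResult checks omitted for brevity.)
    void WriteViaStaging(VkDevice device, VkCommandBuffer cmd,
                         VkDeviceMemory stagingMemory, VkBuffer stagingBuffer,
                         VkBuffer dstBuffer, VkDeviceSize dstOffset,
                         const void* data, VkDeviceSize size) {
        void* mapped = nullptr;
        vkMapMemory(device, stagingMemory, 0, size, 0, &mapped);
        std::memcpy(mapped, data, size);
        vkUnmapMemory(device, stagingMemory);

        VkBufferCopy region = {/*srcOffset=*/0, /*dstOffset=*/dstOffset, size};
        vkCmdCopyBuffer(cmd, stagingBuffer, dstBuffer, 1, &region);
    }

    // Fast path: map the destination allocation itself (requires host-visible
    // memory) and memcpy directly into it. Non-coherent memory would
    // additionally need a vkFlushMappedMemoryRanges call before unmapping.
    void WriteDirect(VkDevice device, VkDeviceMemory dstMemory,
                     VkDeviceSize dstOffset, const void* data,
                     VkDeviceSize size) {
        void* mapped = nullptr;
        vkMapMemory(device, dstMemory, dstOffset, size, 0, &mapped);
        std::memcpy(mapped, data, size);
        vkUnmapMemory(device, dstMemory);
    }

The rest of the change is about detecting when the direct write is
safe.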

To take the fast path:
1. The buffer's memory needs to be host visible. In practice this
   means the optimization mostly benefits integrated GPUs, where all
   memory is shared with the host.
2. The buffer must not currently be in use. In practice this means
   the application needs to double buffer its uniform buffers.
3. There must not be any pending GPU writes to the buffer.
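
A simplified sketch of how these conditions gate the fast path,
condensed from Buffer::UploadData in the BufferVk.cpp diff below
(WriteDirectlyToMappedMemory is a hypothetical helper standing in for
the map-and-memcpy code in the real change):

    MaybeError Buffer::UploadData(uint64_t bufferOffset, const void* data, size_t size) {
        Device* device = ToBackend(GetDevice());

        // 2. Not currently in use: all GPU work referencing the buffer has completed.
        const bool isInUse =
            GetLastUsageSerial() > device->GetQueue()->GetCompletedCommandSerial();
        // 3. No pending GPU writes that would still need a barrier to become available.
        const bool hasPendingWrites = !IsSubset(mLastWriteUsage, wgpu::BufferUsage::MapWrite);

        // 1. Host visible: the CPU can map the buffer's memory directly.
        if (mHostVisible && !isInUse && !hasPendingWrites) {
            return WriteDirectlyToMappedMemory(bufferOffset, data, size);  // fast path
        }

        // Fall back to the scratch-buffer upload implemented in BufferBase.
        return BufferBase::UploadData(bufferOffset, data, size);
    }

The real code additionally handles zero-size writes, lazy zero
initialization, and flushing of non-coherent memory; see the
BufferVk.cpp hunk below.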

If the fast path can't be taken, we fall back to the existing
implementation (writing to a scratch buffer and copying the data into
the destination buffer).

In a test Sponza scene, where the application does not manually
suballocate buffers, this reduces frame times by 11% on a Mali-G78
device.

Bug: dawn:774 dawn:851
Change-Id: Ib5f770297d5729df374d0f847a47ade6b0bf24ef
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/162780
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Austin Eng <enga@chromium.org>
diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp
index 6119b33..26440c1 100644
--- a/src/dawn/native/Buffer.cpp
+++ b/src/dawn/native/Buffer.cpp
@@ -869,6 +869,25 @@
     return mLastUsageSerial;
 }
 
+MaybeError BufferBase::UploadData(uint64_t bufferOffset, const void* data, size_t size) {
+    if (size == 0) {
+        return {};
+    }
+
+    DeviceBase* device = GetDevice();
+
+    UploadHandle uploadHandle;
+    DAWN_TRY_ASSIGN(uploadHandle, device->GetDynamicUploader()->Allocate(
+                                      size, device->GetQueue()->GetPendingCommandSerial(),
+                                      kCopyBufferToBufferOffsetAlignment));
+    DAWN_ASSERT(uploadHandle.mappedBuffer != nullptr);
+
+    memcpy(uploadHandle.mappedBuffer, data, size);
+
+    return device->CopyFromStagingToBuffer(uploadHandle.stagingBuffer, uploadHandle.startOffset,
+                                           this, bufferOffset, size);
+}
+
 void BufferBase::SetHasAccess(bool hasAccess) {
     mState = hasAccess ? BufferState::Unmapped : BufferState::SharedMemoryNoAccess;
 }
diff --git a/src/dawn/native/Buffer.h b/src/dawn/native/Buffer.h
index e87ee17..5ac26c6 100644
--- a/src/dawn/native/Buffer.h
+++ b/src/dawn/native/Buffer.h
@@ -102,6 +102,7 @@
     bool IsFullBufferRange(uint64_t offset, uint64_t size) const;
     bool NeedsInitialization() const;
     void MarkUsedInPendingCommands();
+    virtual MaybeError UploadData(uint64_t bufferOffset, const void* data, size_t size);
 
     // SharedResource impl.
     void SetHasAccess(bool hasAccess) override;
diff --git a/src/dawn/native/Queue.cpp b/src/dawn/native/Queue.cpp
index bbeee41..d2539ca 100644
--- a/src/dawn/native/Queue.cpp
+++ b/src/dawn/native/Queue.cpp
@@ -432,22 +432,7 @@
                                       uint64_t bufferOffset,
                                       const void* data,
                                       size_t size) {
-    if (size == 0) {
-        return {};
-    }
-
-    DeviceBase* device = GetDevice();
-
-    UploadHandle uploadHandle;
-    DAWN_TRY_ASSIGN(uploadHandle,
-                    device->GetDynamicUploader()->Allocate(size, GetPendingCommandSerial(),
-                                                           kCopyBufferToBufferOffsetAlignment));
-    DAWN_ASSERT(uploadHandle.mappedBuffer != nullptr);
-
-    memcpy(uploadHandle.mappedBuffer, data, size);
-
-    return device->CopyFromStagingToBuffer(uploadHandle.stagingBuffer, uploadHandle.startOffset,
-                                           buffer, bufferOffset, size);
+    return buffer->UploadData(bufferOffset, data, size);
 }
 
 void QueueBase::APIWriteTexture(const ImageCopyTexture* destination,
diff --git a/src/dawn/native/vulkan/BufferVk.cpp b/src/dawn/native/vulkan/BufferVk.cpp
index 65fdeb9..1853dc7 100644
--- a/src/dawn/native/vulkan/BufferVk.cpp
+++ b/src/dawn/native/vulkan/BufferVk.cpp
@@ -268,6 +268,15 @@
         }
     }
 
+    // Check whether the buffer is host visible and coherent. This can be the case even if it was
+    // not created with map usages, as on integrated GPUs all memory will typically be host visible.
+    const size_t memoryType = ToBackend(mMemoryAllocation.GetResourceHeap())->GetMemoryType();
+    const VkMemoryPropertyFlags memoryPropertyFlags =
+        device->GetDeviceInfo().memoryTypes[memoryType].propertyFlags;
+    mHostVisible = IsSubset(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, memoryPropertyFlags);
+    mHostCoherent = IsSubset(VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, memoryPropertyFlags);
+    mHasWriteTransitioned = false;
+
     SetLabelImpl();
 
     return {};
@@ -359,6 +368,10 @@
     mHostMappedDisposeCallback = hostMappedDesc->disposeCallback;
     mHostMappedDisposeUserdata = hostMappedDesc->userdata;
 
+    mHostVisible = false;
+    mHostCoherent = false;
+    mHasWriteTransitioned = false;
+
     SetLabelImpl();
 
     // Assume the data is initialized since an external pointer was provided.
@@ -539,6 +552,101 @@
     return memory;
 }
 
+MaybeError Buffer::UploadData(uint64_t bufferOffset, const void* data, size_t size) {
+    if (size == 0) {
+        return {};
+    }
+
+    Device* device = ToBackend(GetDevice());
+
+    const bool isInUse = GetLastUsageSerial() > device->GetQueue()->GetCompletedCommandSerial();
+    const bool isMappable = GetUsage() & kMappableBufferUsages;
+    // Check whether the buffer has pending writes on the GPU. Even if the write workload has
+    // finished, the write may still need a barrier to make the write available.
+    const bool hasPendingWrites = !IsSubset(mLastWriteUsage, wgpu::BufferUsage::MapWrite);
+
+    if (!isInUse && !hasPendingWrites && mHostVisible) {
+        // Buffer does not have any pending uses and is CPU writable. We can map the buffer directly
+        // and write the contents, skipping the scratch buffer.
+        VkDeviceMemory deviceMemory = ToBackend(mMemoryAllocation.GetResourceHeap())->GetMemory();
+        uint8_t* memory;
+        uint64_t realOffset = bufferOffset;
+        if (!isMappable) {
+            // TODO(crbug.com/dawn/774): Persistently map frequently updated buffers instead of
+            // mapping/unmapping each time.
+
+            VkDeviceSize offset = mMemoryAllocation.GetOffset();
+            VkDeviceSize mapSize = mAllocatedSize;
+            if (!NeedsInitialization() && mHostCoherent) {
+                // We can map only the part of the buffer we need to upload the data.
+                // We avoid this for non-coherent memory as the mapping needs to be aligned to
+                // nonCoherentAtomSize.
+                offset += bufferOffset;
+                mapSize = size;
+                realOffset = 0;
+            }
+
+            void* mappedPointer;
+            DAWN_TRY(CheckVkSuccess(device->fn.MapMemory(device->GetVkDevice(), deviceMemory,
+                                                         offset, mapSize, 0, &mappedPointer),
+                                    "vkMapMemory"));
+            memory = static_cast<uint8_t*>(mappedPointer);
+        } else {
+            // Mappable buffers are already persistently mapped.
+            memory = mMemoryAllocation.GetMappedPointer();
+        }
+
+        VkMappedMemoryRange mappedMemoryRange = {};
+        mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+        mappedMemoryRange.memory = deviceMemory;
+        mappedMemoryRange.offset = mMemoryAllocation.GetOffset();
+        mappedMemoryRange.size = mAllocatedSize;
+        if (!mHostCoherent) {
+            // For non-coherent memory we need to explicitly invalidate the memory range to make
+            // available GPU writes visible.
+            device->fn.InvalidateMappedMemoryRanges(device->GetVkDevice(), 1, &mappedMemoryRange);
+        }
+
+        if (NeedsInitialization()) {
+            memset(memory, 0, mAllocatedSize);
+            device->IncrementLazyClearCountForTesting();
+            SetInitialized(true);
+        }
+
+        // Copy data.
+        memcpy(memory + realOffset, data, size);
+
+        if (!mHostCoherent) {
+            // For non-coherent memory we need to explicitly flush the memory range to make the host
+            // write visible.
+            // TODO(crbug.com/dawn/774): Batch the flush calls instead of doing one per writeBuffer.
+            device->fn.FlushMappedMemoryRanges(device->GetVkDevice(), 1, &mappedMemoryRange);
+        }
+
+        if (!isMappable) {
+            device->fn.UnmapMemory(device->GetVkDevice(), deviceMemory);
+        }
+        return {};
+    }
+
+    // Write to scratch buffer and copy into final destination buffer.
+    MaybeError error = BufferBase::UploadData(bufferOffset, data, size);
+
+    if (mHostVisible && !mHasWriteTransitioned) {
+        // Transition to MapWrite so the next time we try to upload data to this buffer, we can take
+        // the fast path. This avoids the issue where the first write will take the slow path due to
+        // zero initialization. Only attempt this once to avoid transitioning a buffer many times
+        // despite never getting the fast path.
+        CommandRecordingContext* recordingContext =
+            ToBackend(device->GetQueue())->GetPendingRecordingContext();
+        TransitionUsageNow(recordingContext, wgpu::BufferUsage::MapWrite);
+        device->GetQueue()->ForceEventualFlushOfCommands();
+        mHasWriteTransitioned = true;
+    }
+
+    return error;
+}
+
 void Buffer::DestroyImpl() {
     // TODO(crbug.com/dawn/831): DestroyImpl is called from two places.
     // - It may be called if the buffer is explicitly destroyed with APIDestroy.
diff --git a/src/dawn/native/vulkan/BufferVk.h b/src/dawn/native/vulkan/BufferVk.h
index 8dfa8f5..3676539 100644
--- a/src/dawn/native/vulkan/BufferVk.h
+++ b/src/dawn/native/vulkan/BufferVk.h
@@ -96,6 +96,7 @@
     bool IsCPUWritableAtCreation() const override;
     MaybeError MapAtCreationImpl() override;
     void* GetMappedPointer() override;
+    MaybeError UploadData(uint64_t bufferOffset, const void* data, size_t size) override;
 
     VkBuffer mHandle = VK_NULL_HANDLE;
     ResourceMemoryAllocation mMemoryAllocation;
@@ -113,6 +114,10 @@
     // Track which usages have read the buffer since the last write.
     wgpu::BufferUsage mReadUsage = wgpu::BufferUsage::None;
     wgpu::ShaderStage mReadShaderStages = wgpu::ShaderStage::None;
+
+    bool mHostVisible : 1;
+    bool mHostCoherent : 1;
+    bool mHasWriteTransitioned : 1;
 };
 
 }  // namespace dawn::native::vulkan
diff --git a/src/dawn/native/vulkan/ResourceMemoryAllocatorVk.cpp b/src/dawn/native/vulkan/ResourceMemoryAllocatorVk.cpp
index 61c6352..b561059 100644
--- a/src/dawn/native/vulkan/ResourceMemoryAllocatorVk.cpp
+++ b/src/dawn/native/vulkan/ResourceMemoryAllocatorVk.cpp
@@ -157,7 +157,7 @@
     const VkMemoryRequirements& requirements,
     MemoryKind kind,
     bool forceDisableSubAllocation) {
-    // The Vulkan spec guarantees at least on memory type is valid.
+    // The Vulkan spec guarantees at least one memory type is valid.
     int memoryType = FindBestTypeIndex(requirements, kind);
     DAWN_ASSERT(memoryType >= 0);
 
@@ -180,9 +180,17 @@
         // linear (resp. opaque) resources can coexist in the same page. In particular Nvidia
         // GPUs often use a granularity of 64k which will lead to a lot of wasted spec. Revisit
         // with a more efficient algorithm later.
+        const VulkanDeviceInfo& info = mDevice->GetDeviceInfo();
         uint64_t alignment =
-            std::max(requirements.alignment,
-                     mDevice->GetDeviceInfo().properties.limits.bufferImageGranularity);
+            std::max(requirements.alignment, info.properties.limits.bufferImageGranularity);
+
+        if ((info.memoryTypes[memoryType].propertyFlags &
+             (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) ==
+            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+            // Host accesses to non-coherent memory are bounded by nonCoherentAtomSize. We may map
+            // host visible "non-mappable" memory when taking the fast path during buffer uploads.
+            alignment = std::max(alignment, info.properties.limits.nonCoherentAtomSize);
+        }
 
         ResourceMemoryAllocation subAllocation;
         DAWN_TRY_ASSIGN(subAllocation, mAllocatorsPerType[memoryType]->AllocateMemory(
diff --git a/src/dawn/tests/end2end/BufferTests.cpp b/src/dawn/tests/end2end/BufferTests.cpp
index 6a5a7ac..e6a1d78 100644
--- a/src/dawn/tests/end2end/BufferTests.cpp
+++ b/src/dawn/tests/end2end/BufferTests.cpp
@@ -528,13 +528,18 @@
         *static_cast<bool*>(userdata) = true;
     };
     // Calling MapAsync another time, will reject the callback with error status
-    // (but doesn't produce a validation error) and mMapOffset is not updated
-    // because the buffer is already being mapped and it doesn't allow multiple
-    // MapAsync requests.
+    // and mMapOffset is not updated because the buffer is already being mapped and it doesn't allow
+    // multiple MapAsync requests.
     auto cb2 = [](WGPUBufferMapAsyncStatus status, void* userdata) {
         ASSERT_EQ(WGPUBufferMapAsyncStatus_MappingAlreadyPending, status);
         *static_cast<bool*>(userdata) = true;
     };
+    // Calling MapAsync when the buffer is already mapped (as opposed to pending mapping) will cause
+    // a validation error.
+    auto cb2Mapped = [](WGPUBufferMapAsyncStatus status, void* userdata) {
+        ASSERT_EQ(WGPUBufferMapAsyncStatus_ValidationError, status);
+        *static_cast<bool*>(userdata) = true;
+    };
 
     // Legacy MapAsync
     if (!GetParam().mFutureCallbackMode) {
@@ -542,10 +547,13 @@
         buffer.MapAsync(wgpu::MapMode::Read, 8, 4, cb1, &done1);
 
         // Call MapAsync another time, the callback will be rejected with error status
-        // (but doesn't produce a validation error) and mMapOffset is not updated
-        // because the buffer is already being mapped and it doesn't allow multiple
-        // MapAsync requests.
-        buffer.MapAsync(wgpu::MapMode::Read, 0, 4, cb2, &done2);
+        // and mMapOffset is not updated because the buffer is already being mapped and it doesn't
+        // allow multiple MapAsync requests.
+        if (buffer.GetMapState() == wgpu::BufferMapState::Mapped) {
+            ASSERT_DEVICE_ERROR(buffer.MapAsync(wgpu::MapMode::Read, 0, 4, cb2Mapped, &done2));
+        } else {
+            buffer.MapAsync(wgpu::MapMode::Read, 0, 4, cb2, &done2);
+        }
 
         while (!done1 || !done2) {
             WaitABit();
@@ -556,11 +564,17 @@
                                           {nullptr, *GetParam().mFutureCallbackMode, cb1, &done1});
 
         // Call MapAsync another time, the callback will be rejected with error status
-        // (but doesn't produce a validation error) and mMapOffset is not updated
-        // because the buffer is already being mapped and it doesn't allow multiple
-        // MapAsync requests.
-        wgpu::Future f2 = buffer.MapAsync(wgpu::MapMode::Read, 0, 4,
-                                          {nullptr, *GetParam().mFutureCallbackMode, cb2, &done2});
+        // and mMapOffset is not updated because the buffer is already being mapped and it doesn't
+        // allow multiple MapAsync requests.
+        wgpu::Future f2;
+        if (buffer.GetMapState() == wgpu::BufferMapState::Mapped) {
+            ASSERT_DEVICE_ERROR(f2 = buffer.MapAsync(
+                                    wgpu::MapMode::Read, 0, 4,
+                                    {nullptr, *GetParam().mFutureCallbackMode, cb2Mapped, &done2}));
+        } else {
+            f2 = buffer.MapAsync(wgpu::MapMode::Read, 0, 4,
+                                 {nullptr, *GetParam().mFutureCallbackMode, cb2, &done2});
+        }
 
         switch (*GetParam().mFutureCallbackMode) {
             case wgpu::CallbackMode::WaitAnyOnly: {