Vulkan: Combine all pass barriers in a single call

This patch combines barriers in a render/compute pass into one
function call.

Previously, we needed to dispatch barrier(s) for each buffer/texture
in a pass, so delivering the barriers for a pass could take quite a
lot of function calls in real web applications. For example, we saw
that the large number of barrier calls in Aquarium (WebGPU port)
contributed to CPU usage and became a bottleneck.

Bug: dawn:441

Change-Id: Ibe44967fefd2e1e6e64df4587146c4fb7fbe8e73
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/22700
Commit-Queue: Yunchao He <yunchao.he@intel.com>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/src/dawn_native/vulkan/BufferVk.cpp b/src/dawn_native/vulkan/BufferVk.cpp
index 0b4584b..6bd2987 100644
--- a/src/dawn_native/vulkan/BufferVk.cpp
+++ b/src/dawn_native/vulkan/BufferVk.cpp
@@ -186,6 +186,25 @@
 
     void Buffer::TransitionUsageNow(CommandRecordingContext* recordingContext,
                                     wgpu::BufferUsage usage) {
+        std::vector<VkBufferMemoryBarrier> barriers;
+        VkPipelineStageFlags srcStages = 0;
+        VkPipelineStageFlags dstStages = 0;
+
+        TransitionUsageNow(recordingContext, usage, &barriers, &srcStages, &dstStages);
+
+        if (barriers.size() > 0) {
+            ASSERT(barriers.size() == 1);
+            ToBackend(GetDevice())
+                ->fn.CmdPipelineBarrier(recordingContext->commandBuffer, srcStages, dstStages, 0, 0,
+                                        nullptr, barriers.size(), barriers.data(), 0, nullptr);
+        }
+    }
+
+    void Buffer::TransitionUsageNow(CommandRecordingContext* recordingContext,
+                                    wgpu::BufferUsage usage,
+                                    std::vector<VkBufferMemoryBarrier>* bufferBarriers,
+                                    VkPipelineStageFlags* srcStages,
+                                    VkPipelineStageFlags* dstStages) {
         bool lastIncludesTarget = (mLastUsage & usage) == usage;
         bool lastReadOnly = (mLastUsage & kReadOnlyBufferUsages) == mLastUsage;
 
@@ -200,8 +219,8 @@
             return;
         }
 
-        VkPipelineStageFlags srcStages = VulkanPipelineStage(mLastUsage);
-        VkPipelineStageFlags dstStages = VulkanPipelineStage(usage);
+        *srcStages |= VulkanPipelineStage(mLastUsage);
+        *dstStages |= VulkanPipelineStage(usage);
 
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -214,9 +233,7 @@
         barrier.offset = 0;
         barrier.size = GetSize();
 
-        ToBackend(GetDevice())
-            ->fn.CmdPipelineBarrier(recordingContext->commandBuffer, srcStages, dstStages, 0, 0,
-                                    nullptr, 1, &barrier, 0, nullptr);
+        bufferBarriers->push_back(barrier);
 
         mLastUsage = usage;
     }
diff --git a/src/dawn_native/vulkan/BufferVk.h b/src/dawn_native/vulkan/BufferVk.h
index 021e4ab..93669b6 100644
--- a/src/dawn_native/vulkan/BufferVk.h
+++ b/src/dawn_native/vulkan/BufferVk.h
@@ -39,6 +39,11 @@
         // `commands`.
         // TODO(cwallez@chromium.org): coalesce barriers and do them early when possible.
         void TransitionUsageNow(CommandRecordingContext* recordingContext, wgpu::BufferUsage usage);
+        void TransitionUsageNow(CommandRecordingContext* recordingContext,
+                                wgpu::BufferUsage usage,
+                                std::vector<VkBufferMemoryBarrier>* bufferBarriers,
+                                VkPipelineStageFlags* srcStages,
+                                VkPipelineStageFlags* dstStages);
 
       private:
         ~Buffer() override;
diff --git a/src/dawn_native/vulkan/CommandBufferVk.cpp b/src/dawn_native/vulkan/CommandBufferVk.cpp
index c220176..54c0686 100644
--- a/src/dawn_native/vulkan/CommandBufferVk.cpp
+++ b/src/dawn_native/vulkan/CommandBufferVk.cpp
@@ -376,12 +376,19 @@
         VkCommandBuffer commands = recordingContext->commandBuffer;
 
         // Records the necessary barriers for the resource usage pre-computed by the frontend
-        auto TransitionForPass = [](CommandRecordingContext* recordingContext,
+        auto TransitionForPass = [](Device* device, CommandRecordingContext* recordingContext,
                                     const PassResourceUsage& usages) {
+            std::vector<VkBufferMemoryBarrier> bufferBarriers;
+            std::vector<VkImageMemoryBarrier> imageBarriers;
+            VkPipelineStageFlags srcStages = 0;
+            VkPipelineStageFlags dstStages = 0;
+
             for (size_t i = 0; i < usages.buffers.size(); ++i) {
                 Buffer* buffer = ToBackend(usages.buffers[i]);
-                buffer->TransitionUsageNow(recordingContext, usages.bufferUsages[i]);
+                buffer->TransitionUsageNow(recordingContext, usages.bufferUsages[i],
+                                           &bufferBarriers, &srcStages, &dstStages);
             }
+
             for (size_t i = 0; i < usages.textures.size(); ++i) {
                 Texture* texture = ToBackend(usages.textures[i]);
                 // Clear textures that are not output attachments. Output attachments will be
@@ -393,9 +400,18 @@
                                                                  texture->GetArrayLayers());
                 }
                 texture->TransitionUsageForPass(recordingContext,
-                                                usages.textureUsages[i].subresourceUsages);
+                                                usages.textureUsages[i].subresourceUsages,
+                                                &imageBarriers, &srcStages, &dstStages);
+            }
+
+            if (bufferBarriers.size() || imageBarriers.size()) {
+                device->fn.CmdPipelineBarrier(recordingContext->commandBuffer, srcStages, dstStages,
+                                              0, 0, nullptr, bufferBarriers.size(),
+                                              bufferBarriers.data(), imageBarriers.size(),
+                                              imageBarriers.data());
             }
         };
+
         const std::vector<PassResourceUsage>& passResourceUsages = GetResourceUsages().perPass;
         size_t nextPassNumber = 0;
 
@@ -562,7 +578,7 @@
                 case Command::BeginRenderPass: {
                     BeginRenderPassCmd* cmd = mCommands.NextCommand<BeginRenderPassCmd>();
 
-                    TransitionForPass(recordingContext, passResourceUsages[nextPassNumber]);
+                    TransitionForPass(device, recordingContext, passResourceUsages[nextPassNumber]);
 
                     LazyClearRenderPassAttachments(cmd);
                     DAWN_TRY(RecordRenderPass(recordingContext, cmd));
@@ -574,7 +590,7 @@
                 case Command::BeginComputePass: {
                     mCommands.NextCommand<BeginComputePassCmd>();
 
-                    TransitionForPass(recordingContext, passResourceUsages[nextPassNumber]);
+                    TransitionForPass(device, recordingContext, passResourceUsages[nextPassNumber]);
                     RecordComputePass(recordingContext);
 
                     nextPassNumber++;
diff --git a/src/dawn_native/vulkan/TextureVk.cpp b/src/dawn_native/vulkan/TextureVk.cpp
index c9ad0eb..15d7137 100644
--- a/src/dawn_native/vulkan/TextureVk.cpp
+++ b/src/dawn_native/vulkan/TextureVk.cpp
@@ -670,34 +670,41 @@
     }
 
     void Texture::TweakTransitionForExternalUsage(CommandRecordingContext* recordingContext,
-                                                  std::vector<VkImageMemoryBarrier>* barriers) {
+                                                  std::vector<VkImageMemoryBarrier>* barriers,
+                                                  size_t transitionBarrierStart) {
         ASSERT(GetNumMipLevels() == 1 && GetArrayLayers() == 1);
-        ASSERT(barriers->size() <= 1);
+
+        // transitionBarrierStart specifies the index where barriers for the current transition
+        // start in the vector. barriers->size() - transitionBarrierStart is the number of barriers
+        // that we have already added into the vector during the current transition.
+        ASSERT(barriers->size() - transitionBarrierStart <= 1);
 
         if (mExternalState == ExternalState::PendingAcquire) {
-            if (!barriers->size()) {
+            if (barriers->size() == transitionBarrierStart) {
                 barriers->push_back(BuildMemoryBarrier(GetFormat(), mHandle,
                                                        wgpu::TextureUsage::None,
                                                        wgpu::TextureUsage::None, 0, 0));
             }
 
             // Transfer texture from external queue to graphics queue
-            (*barriers)[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR;
-            (*barriers)[0].dstQueueFamilyIndex = ToBackend(GetDevice())->GetGraphicsQueueFamily();
+            (*barriers)[transitionBarrierStart].srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR;
+            (*barriers)[transitionBarrierStart].dstQueueFamilyIndex =
+                ToBackend(GetDevice())->GetGraphicsQueueFamily();
             // Don't override oldLayout to leave it as VK_IMAGE_LAYOUT_UNDEFINED
             // TODO(http://crbug.com/dawn/200)
             mExternalState = ExternalState::Acquired;
         } else if (mExternalState == ExternalState::PendingRelease) {
-            if (!barriers->size()) {
+            if (barriers->size() == transitionBarrierStart) {
                 barriers->push_back(BuildMemoryBarrier(GetFormat(), mHandle,
                                                        wgpu::TextureUsage::None,
                                                        wgpu::TextureUsage::None, 0, 0));
             }
 
             // Transfer texture from graphics queue to external queue
-            (*barriers)[0].srcQueueFamilyIndex = ToBackend(GetDevice())->GetGraphicsQueueFamily();
-            (*barriers)[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR;
-            (*barriers)[0].newLayout = VK_IMAGE_LAYOUT_GENERAL;
+            (*barriers)[transitionBarrierStart].srcQueueFamilyIndex =
+                ToBackend(GetDevice())->GetGraphicsQueueFamily();
+            (*barriers)[transitionBarrierStart].dstQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR;
+            (*barriers)[transitionBarrierStart].newLayout = VK_IMAGE_LAYOUT_GENERAL;
             mExternalState = ExternalState::Released;
         }
 
@@ -714,8 +721,11 @@
     }
 
     void Texture::TransitionUsageForPass(CommandRecordingContext* recordingContext,
-                                         const std::vector<wgpu::TextureUsage>& subresourceUsages) {
-        std::vector<VkImageMemoryBarrier> barriers;
+                                         const std::vector<wgpu::TextureUsage>& subresourceUsages,
+                                         std::vector<VkImageMemoryBarrier>* imageBarriers,
+                                         VkPipelineStageFlags* srcStages,
+                                         VkPipelineStageFlags* dstStages) {
+        size_t transitionBarrierStart = imageBarriers->size();
         const Format& format = GetFormat();
 
         wgpu::TextureUsage allUsages = wgpu::TextureUsage::None;
@@ -740,7 +750,7 @@
                     continue;
                 }
 
-                barriers.push_back(
+                imageBarriers->push_back(
                     BuildMemoryBarrier(format, mHandle, mLastSubresourceUsages[index],
                                        subresourceUsages[index], mipLevel, arrayLayer));
 
@@ -751,14 +761,12 @@
         }
 
         if (mExternalState != ExternalState::InternalOnly) {
-            TweakTransitionForExternalUsage(recordingContext, &barriers);
+            TweakTransitionForExternalUsage(recordingContext, imageBarriers,
+                                            transitionBarrierStart);
         }
 
-        VkPipelineStageFlags srcStages = VulkanPipelineStage(allLastUsages, format);
-        VkPipelineStageFlags dstStages = VulkanPipelineStage(allUsages, format);
-        ToBackend(GetDevice())
-            ->fn.CmdPipelineBarrier(recordingContext->commandBuffer, srcStages, dstStages, 0, 0,
-                                    nullptr, 0, nullptr, barriers.size(), barriers.data());
+        *srcStages |= VulkanPipelineStage(allLastUsages, format);
+        *dstStages |= VulkanPipelineStage(allUsages, format);
     }
 
     void Texture::TransitionUsageNow(CommandRecordingContext* recordingContext,
@@ -796,7 +804,7 @@
         }
 
         if (mExternalState != ExternalState::InternalOnly) {
-            TweakTransitionForExternalUsage(recordingContext, &barriers);
+            TweakTransitionForExternalUsage(recordingContext, &barriers, 0);
         }
 
         VkPipelineStageFlags srcStages = VulkanPipelineStage(allLastUsages, format);
diff --git a/src/dawn_native/vulkan/TextureVk.h b/src/dawn_native/vulkan/TextureVk.h
index 1ac37c4..a1df372 100644
--- a/src/dawn_native/vulkan/TextureVk.h
+++ b/src/dawn_native/vulkan/TextureVk.h
@@ -73,7 +73,10 @@
                                 uint32_t baseArrayLayer,
                                 uint32_t layerCount);
         void TransitionUsageForPass(CommandRecordingContext* recordingContext,
-                                    const std::vector<wgpu::TextureUsage>& subresourceUsages);
+                                    const std::vector<wgpu::TextureUsage>& subresourceUsages,
+                                    std::vector<VkImageMemoryBarrier>* imageBarriers,
+                                    VkPipelineStageFlags* srcStages,
+                                    VkPipelineStageFlags* dstStages);
 
         void EnsureSubresourceContentInitialized(CommandRecordingContext* recordingContext,
                                                  uint32_t baseMipLevel,
@@ -107,7 +110,8 @@
                                 TextureBase::ClearValue);
 
         void TweakTransitionForExternalUsage(CommandRecordingContext* recordingContext,
-                                             std::vector<VkImageMemoryBarrier>* barriers);
+                                             std::vector<VkImageMemoryBarrier>* barriers,
+                                             size_t transitionBarrierStart);
 
         VkImage mHandle = VK_NULL_HANDLE;
         ResourceMemoryAllocation mMemoryAllocation;