Reland "Add tests having multiple indirect dispatch within one pass"

This is a reland of commit 52d11797f66d0718643d5462954cf6d38c42c232

Original change's description:
> Add tests having multiple indirect dispatch within one pass
>
> Add more test coverage for multiple DispatchWorkgroupsIndirect
> calls within one compute shader pass to ensure
> DispatchWorkgroupsIndirect with non-zero indirect offset work
> correctly when duplicating numWorkgroups.
>
> Bug: dawn:2201, dawn:1262
> Change-Id: I598bfd076328b0baa9ef04e61c62c2a7b048ac48
> Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/158821
> Kokoro: Kokoro <noreply+kokoro@google.com>
> Reviewed-by: Austin Eng <enga@chromium.org>
> Reviewed-by: Loko Kung <lokokung@google.com>
> Commit-Queue: Shrek Shao <shrekshao@google.com>

Bug: dawn:2201, dawn:1262
Change-Id: I94465c9be6230a6fffc4511197edbd2f47e1edb0
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/160160
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Kai Ninomiya <kainino@chromium.org>
Commit-Queue: Kai Ninomiya <kainino@chromium.org>
diff --git a/src/dawn/tests/end2end/ComputeDispatchTests.cpp b/src/dawn/tests/end2end/ComputeDispatchTests.cpp
index 62c6cb5..2ad2361 100644
--- a/src/dawn/tests/end2end/ComputeDispatchTests.cpp
+++ b/src/dawn/tests/end2end/ComputeDispatchTests.cpp
@@ -25,9 +25,11 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#include <algorithm>
 #include <initializer_list>
 #include <vector>
 
+#include "dawn/common/Math.h"
 #include "dawn/tests/DawnTest.h"
 #include "dawn/utils/WGPUHelpers.h"
 
@@ -225,8 +227,6 @@
     // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
     DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
 #endif
-    // TODO(crbug.com/dawn/1262): Fails with the full validation turned on.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsFullBackendValidationEnabled());
 
     IndirectTest({2, 3, 4}, 0);
 }
@@ -257,8 +257,6 @@
     // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
     DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
 #endif
-    // TODO(crbug.com/dawn/1262): Fails with the full validation turned on.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsFullBackendValidationEnabled());
 
     IndirectTest({0, 0, 0, 2, 3, 4}, 3 * sizeof(uint32_t));
 }
@@ -274,8 +272,6 @@
     // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
     DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
 #endif
-    // TODO(crbug.com/dawn/1262): Fails with the full validation turned on.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsFullBackendValidationEnabled());
 
     // TODO(crbug.com/dawn/1165): Fails with WARP
     DAWN_SUPPRESS_TEST_IF(IsWARP());
@@ -336,5 +332,267 @@
                       OpenGLESBackend(),
                       VulkanBackend());
 
+namespace {
+using UseNumWorkgoups = bool;
+DAWN_TEST_PARAM_STRUCT(Params, UseNumWorkgoups);
+}  // namespace
+
+class ComputeMultipleDispatchesTests : public DawnTestWithParams<Params> {
+  protected:
+    void SetUp() override {
+        DawnTestWithParams<Params>::SetUp();
+
+        bindGroupLayout = utils::MakeBindGroupLayout(
+            device, {{0, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform,
+                      /* hasDynamicOffset = */ true},
+                     {1, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Storage}});
+
+        wgpu::PipelineLayoutDescriptor pipelineLayoutDescriptor;
+        pipelineLayoutDescriptor.bindGroupLayoutCount = 1;
+        pipelineLayoutDescriptor.bindGroupLayouts = &bindGroupLayout;
+        wgpu::PipelineLayout pipelineLayout =
+            device.CreatePipelineLayout(&pipelineLayoutDescriptor);
+
+        // Write workgroup number into the output buffer if we saw the biggest dispatch
+        // To make sure the dispatch was not called, write maximum u32 value for 0 dispatches
+        wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
+            @group(0) @binding(0) var<uniform> dispatchId : u32;
+            @group(0) @binding(1) var<storage, read_write> output : array<vec3u>;
+
+            @compute @workgroup_size(1, 1, 1)
+            fn main(@builtin(global_invocation_id) GlobalInvocationID : vec3u,
+                    @builtin(num_workgroups) dispatch : vec3u) {
+                if (dispatch.x == 0u || dispatch.y == 0u || dispatch.z == 0u) {
+                    output[dispatchId] = vec3u(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
+                    return;
+                }
+
+                if (all(GlobalInvocationID == dispatch - vec3u(1u, 1u, 1u))) {
+                    output[dispatchId] = dispatch;
+                }
+            })");
+
+        wgpu::ComputePipelineDescriptor csDesc;
+        //
+        csDesc.compute.module = module;
+        csDesc.compute.entryPoint = "main";
+        csDesc.layout = pipelineLayout;
+        pipeline = device.CreateComputePipeline(&csDesc);
+
+        // Test the use of the compute pipelines without using @num_workgroups
+        wgpu::ShaderModule moduleWithoutNumWorkgroups = utils::CreateShaderModule(device, R"(
+            // input.xyz = num_workgroups.xyz, input.w = dispatch call id (i.e. offset in output buffer)
+            @group(0) @binding(0) var<uniform> input : vec4u;
+            @group(0) @binding(1) var<storage, read_write> output : array<vec3u>;
+
+            @compute @workgroup_size(1, 1, 1)
+            fn main(@builtin(global_invocation_id) GlobalInvocationID : vec3u) {
+                let dispatch : vec3u = input.xyz;
+                let dispatchId = input.w;
+
+                if (dispatch.x == 0u || dispatch.y == 0u || dispatch.z == 0u) {
+                    output[dispatchId] = vec3u(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
+                    return;
+                }
+
+                if (all(GlobalInvocationID == dispatch - vec3u(1u, 1u, 1u))) {
+                    output[dispatchId] = dispatch;
+                }
+            })");
+        csDesc.compute.module = moduleWithoutNumWorkgroups;
+        csDesc.compute.entryPoint = "main";
+        csDesc.layout = pipelineLayout;
+        pipelineWithoutNumWorkgroups = device.CreateComputePipeline(&csDesc);
+    }
+
+    void IndirectTest(std::vector<uint32_t> indirectBufferData,
+                      std::vector<uint64_t> indirectOffsets) {
+        bool useNumWorkgroups = GetParam().mUseNumWorkgoups;
+        // Set up dst storage buffer to contain dispatch x, y, z
+        wgpu::BufferDescriptor dstBufDescriptor;
+        // array<vec3u> aligns to 16 bytes
+        dstBufDescriptor.size = Align(indirectOffsets.size() * 4u * sizeof(uint32_t), 16u);
+        dstBufDescriptor.usage =
+            wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+        wgpu::Buffer dst = device.CreateBuffer(&dstBufDescriptor);
+
+        wgpu::Buffer indirectBuffer = utils::CreateBufferFromData(
+            device, &indirectBufferData[0], indirectBufferData.size() * sizeof(uint32_t),
+            wgpu::BufferUsage::Indirect | wgpu::BufferUsage::CopySrc);
+
+        // dynamic offset requires a 256 byte alignment. So we store dispatch index every 256 byte
+        // instead of compactly
+        constexpr uint32_t kDynamicOffsetReq = 256;
+        constexpr uint32_t kIndexOffset = kDynamicOffsetReq / sizeof(uint32_t);
+
+        std::vector<uint32_t> dynamicOffsets(indirectOffsets.size());
+        for (size_t i = 0; i < indirectOffsets.size(); i++) {
+            dynamicOffsets[i] = i * kDynamicOffsetReq;
+        }
+
+        // Set up bind group and issue dispatch
+        wgpu::BindGroup bindGroup;
+        wgpu::ComputePipeline computePipelineForTest;
+
+        if (useNumWorkgroups) {
+            computePipelineForTest = pipeline;
+
+            std::vector<uint32_t> dispatchIds(indirectOffsets.size() * kIndexOffset);
+            for (size_t i = 0; i < indirectOffsets.size(); i++) {
+                size_t o = kIndexOffset * i;
+                dispatchIds[o] = i;
+            }
+
+            wgpu::Buffer uniformBuffer = utils::CreateBufferFromData(
+                device, dispatchIds.data(), dispatchIds.size() * sizeof(uint32_t),
+                wgpu::BufferUsage::Uniform);
+            bindGroup = utils::MakeBindGroup(device, bindGroupLayout,
+                                             {
+                                                 {0, uniformBuffer, 0, sizeof(uint32_t)},
+                                                 {1, dst, 0, dstBufDescriptor.size},
+                                             });
+        } else {
+            computePipelineForTest = pipelineWithoutNumWorkgroups;
+
+            std::vector<uint32_t> inputs(indirectOffsets.size() * kIndexOffset);
+            for (size_t i = 0; i < indirectOffsets.size(); i++) {
+                uint32_t indirectStart = indirectOffsets[i] / sizeof(uint32_t);
+                size_t o = kIndexOffset * i;
+                // numWorkgroups
+                inputs[o] = indirectBufferData[indirectStart];
+                inputs[o + 1] = indirectBufferData[indirectStart + 1];
+                inputs[o + 2] = indirectBufferData[indirectStart + 2];
+                // dispatchId
+                inputs[o + 3] = i;
+            }
+
+            wgpu::Buffer uniformBuffer =
+                utils::CreateBufferFromData(device, inputs.data(), inputs.size() * sizeof(uint32_t),
+                                            wgpu::BufferUsage::Uniform);
+            bindGroup = utils::MakeBindGroup(device, bindGroupLayout,
+                                             {
+                                                 {0, uniformBuffer, 0, 4 * sizeof(uint32_t)},
+                                                 {1, dst, 0, dstBufDescriptor.size},
+                                             });
+        }
+
+        wgpu::CommandBuffer commands;
+        {
+            wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+            wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+            pass.SetPipeline(computePipelineForTest);
+            for (size_t i = 0; i < indirectOffsets.size(); i++) {
+                uint64_t indirectOffset = indirectOffsets[i];
+                // Use dynamic binding offset to set dispatch Id (used as offset to output buffer)
+                // for each dispatch call
+                pass.SetBindGroup(0, bindGroup, 1, &dynamicOffsets[i]);
+                pass.DispatchWorkgroupsIndirect(indirectBuffer, indirectOffset);
+            }
+
+            pass.End();
+
+            commands = encoder.Finish();
+        }
+
+        queue.Submit(1, &commands);
+
+        uint32_t maxComputeWorkgroupsPerDimension =
+            GetSupportedLimits().limits.maxComputeWorkgroupsPerDimension;
+
+        std::vector<uint32_t> expected(4 * indirectOffsets.size(), 0);
+        for (size_t i = 0; i < indirectOffsets.size(); i++) {
+            uint32_t indirectStart = indirectOffsets[i] / sizeof(uint32_t);
+            size_t o = 4 * i;
+
+            if (indirectBufferData[indirectStart] == 0 ||
+                indirectBufferData[indirectStart + 1] == 0 ||
+                indirectBufferData[indirectStart + 2] == 0 ||
+                indirectBufferData[indirectStart] > maxComputeWorkgroupsPerDimension ||
+                indirectBufferData[indirectStart + 1] > maxComputeWorkgroupsPerDimension ||
+                indirectBufferData[indirectStart + 2] > maxComputeWorkgroupsPerDimension) {
+                std::copy(kSentinelData.begin(), kSentinelData.end(), expected.begin() + o);
+            } else {
+                expected[o] = indirectBufferData[indirectStart];
+                expected[o + 1] = indirectBufferData[indirectStart + 1];
+                expected[o + 2] = indirectBufferData[indirectStart + 2];
+            }
+        }
+
+        // Verify the indirect buffer is not modified
+        EXPECT_BUFFER_U32_RANGE_EQ(&indirectBufferData[0], indirectBuffer, 0,
+                                   indirectBufferData.size());
+        // Verify the dispatch got called with group counts in indirect buffer if all group counts
+        // are not zero
+        EXPECT_BUFFER_U32_RANGE_EQ(&expected[0], dst, 0, expected.size());
+    }
+
+  private:
+    wgpu::ComputePipeline pipeline;
+    wgpu::ComputePipeline pipelineWithoutNumWorkgroups;
+    wgpu::BindGroupLayout bindGroupLayout;
+};
+
+// Test indirect dispatches with buffer offset
+TEST_P(ComputeMultipleDispatchesTests, IndirectOffset) {
+#if DAWN_PLATFORM_IS(32_BIT)
+    // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
+    DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
+#endif
+
+    // Control case: One DispatchWorkgroupsIndirect call
+    IndirectTest({0, 0, 0, 2, 3, 4}, {3 * sizeof(uint32_t)});
+
+    // Two dispatches: first is no-op
+    IndirectTest({0, 0, 0, 2, 3, 4}, {0, 3 * sizeof(uint32_t)});
+
+    // Two dispatches
+    IndirectTest({9, 8, 7, 2, 3, 4}, {0, 3 * sizeof(uint32_t)});
+
+    // Indirect offsets not in order
+    IndirectTest({9, 8, 7, 2, 3, 4}, {3 * sizeof(uint32_t), 0});
+
+    // Multiple dispatches with duplicate indirect offsets
+    IndirectTest({9, 8, 7, 0, 0, 0, 2, 3, 4, 0xa, 0xb, 0xc, 0xf, 0xe, 0xd},
+                 {
+                     6 * sizeof(uint32_t),
+                     0,
+                     3 * sizeof(uint32_t),
+                     12 * sizeof(uint32_t),
+                     9 * sizeof(uint32_t),
+                     6 * sizeof(uint32_t),
+                     6 * sizeof(uint32_t),
+                 });
+}
+
+// Test indirect dispatches exceeding the max limit with an offset are noop-ed.
+TEST_P(ComputeMultipleDispatchesTests, ExceedsMaxWorkgroupsWithOffsetNoop) {
+    DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("skip_validation"));
+
+#if DAWN_PLATFORM_IS(32_BIT)
+    // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
+    DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
+#endif
+
+    // TODO(crbug.com/dawn/839): Investigate why this test fails with WARP.
+    DAWN_SUPPRESS_TEST_IF(IsWARP());
+
+    uint32_t max = GetSupportedLimits().limits.maxComputeWorkgroupsPerDimension;
+
+    // Two dispatches: first is no-op
+    IndirectTest({max + 1, 1, 1, 2, 3, 4}, {0, 3 * sizeof(uint32_t)});
+
+    // Two dispatches: second is no-op
+    IndirectTest({2, 3, 4, max + 1, 1, 1}, {0, 3 * sizeof(uint32_t)});
+
+    // Two dispatches: second is no-op
+    IndirectTest({max + 1, 1, 1, 2, 3, 4}, {3 * sizeof(uint32_t), 0});
+}
+
+DAWN_INSTANTIATE_TEST_P(ComputeMultipleDispatchesTests,
+                        {D3D11Backend(), D3D12Backend(), MetalBackend(), OpenGLBackend(),
+                         OpenGLESBackend(), VulkanBackend()},
+                        {true, false}  // useNumWorkgroups
+);
+
 }  // anonymous namespace
 }  // namespace dawn