Reland "Add tests having multiple indirect dispatch within one pass"
This is a reland of commit 52d11797f66d0718643d5462954cf6d38c42c232
Original change's description:
> Add tests having multiple indirect dispatch within one pass
>
> Add more test coverage for multiple DispatchWorkgroupsIndirect
> calls within one compute shader pass to ensure
> DispatchWorkgroupsIndirect with non-zero indirect offset work
> correctly when duplicating numWorkgroups.
>
> Bug: dawn:2201, dawn:1262
> Change-Id: I598bfd076328b0baa9ef04e61c62c2a7b048ac48
> Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/158821
> Kokoro: Kokoro <noreply+kokoro@google.com>
> Reviewed-by: Austin Eng <enga@chromium.org>
> Reviewed-by: Loko Kung <lokokung@google.com>
> Commit-Queue: Shrek Shao <shrekshao@google.com>
Bug: dawn:2201, dawn:1262
Change-Id: I94465c9be6230a6fffc4511197edbd2f47e1edb0
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/160160
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Kai Ninomiya <kainino@chromium.org>
Commit-Queue: Kai Ninomiya <kainino@chromium.org>
diff --git a/src/dawn/tests/end2end/ComputeDispatchTests.cpp b/src/dawn/tests/end2end/ComputeDispatchTests.cpp
index 62c6cb5..2ad2361 100644
--- a/src/dawn/tests/end2end/ComputeDispatchTests.cpp
+++ b/src/dawn/tests/end2end/ComputeDispatchTests.cpp
@@ -25,9 +25,11 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include <algorithm>
#include <initializer_list>
#include <vector>
+#include "dawn/common/Math.h"
#include "dawn/tests/DawnTest.h"
#include "dawn/utils/WGPUHelpers.h"
@@ -225,8 +227,6 @@
// TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
#endif
- // TODO(crbug.com/dawn/1262): Fails with the full validation turned on.
- DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsFullBackendValidationEnabled());
IndirectTest({2, 3, 4}, 0);
}
@@ -257,8 +257,6 @@
// TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
#endif
- // TODO(crbug.com/dawn/1262): Fails with the full validation turned on.
- DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsFullBackendValidationEnabled());
IndirectTest({0, 0, 0, 2, 3, 4}, 3 * sizeof(uint32_t));
}
@@ -274,8 +272,6 @@
// TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
#endif
- // TODO(crbug.com/dawn/1262): Fails with the full validation turned on.
- DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsFullBackendValidationEnabled());
// TODO(crbug.com/dawn/1165): Fails with WARP
DAWN_SUPPRESS_TEST_IF(IsWARP());
@@ -336,5 +332,267 @@
OpenGLESBackend(),
VulkanBackend());
+namespace {
+using UseNumWorkgoups = bool;
+DAWN_TEST_PARAM_STRUCT(Params, UseNumWorkgoups);
+} // namespace
+
+class ComputeMultipleDispatchesTests : public DawnTestWithParams<Params> {
+ protected:
+ void SetUp() override {
+ DawnTestWithParams<Params>::SetUp();
+
+ bindGroupLayout = utils::MakeBindGroupLayout(
+ device, {{0, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform,
+ /* hasDynamicOffset = */ true},
+ {1, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Storage}});
+
+ wgpu::PipelineLayoutDescriptor pipelineLayoutDescriptor;
+ pipelineLayoutDescriptor.bindGroupLayoutCount = 1;
+ pipelineLayoutDescriptor.bindGroupLayouts = &bindGroupLayout;
+ wgpu::PipelineLayout pipelineLayout =
+ device.CreatePipelineLayout(&pipelineLayoutDescriptor);
+
+        // The invocation with the largest global id writes the dispatch's workgroup counts
+        // into the output buffer; if any dimension is zero, the maximum u32 value is written.
+ wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
+ @group(0) @binding(0) var<uniform> dispatchId : u32;
+ @group(0) @binding(1) var<storage, read_write> output : array<vec3u>;
+
+ @compute @workgroup_size(1, 1, 1)
+ fn main(@builtin(global_invocation_id) GlobalInvocationID : vec3u,
+ @builtin(num_workgroups) dispatch : vec3u) {
+ if (dispatch.x == 0u || dispatch.y == 0u || dispatch.z == 0u) {
+ output[dispatchId] = vec3u(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
+ return;
+ }
+
+ if (all(GlobalInvocationID == dispatch - vec3u(1u, 1u, 1u))) {
+ output[dispatchId] = dispatch;
+ }
+ })");
+
+ wgpu::ComputePipelineDescriptor csDesc;
+ //
+ csDesc.compute.module = module;
+ csDesc.compute.entryPoint = "main";
+ csDesc.layout = pipelineLayout;
+ pipeline = device.CreateComputePipeline(&csDesc);
+
+        // Pipeline variant that does not rely on the num_workgroups builtin.
+ wgpu::ShaderModule moduleWithoutNumWorkgroups = utils::CreateShaderModule(device, R"(
+ // input.xyz = num_workgroups.xyz, input.w = dispatch call id (i.e. offset in output buffer)
+ @group(0) @binding(0) var<uniform> input : vec4u;
+ @group(0) @binding(1) var<storage, read_write> output : array<vec3u>;
+
+ @compute @workgroup_size(1, 1, 1)
+ fn main(@builtin(global_invocation_id) GlobalInvocationID : vec3u) {
+ let dispatch : vec3u = input.xyz;
+ let dispatchId = input.w;
+
+ if (dispatch.x == 0u || dispatch.y == 0u || dispatch.z == 0u) {
+ output[dispatchId] = vec3u(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
+ return;
+ }
+
+ if (all(GlobalInvocationID == dispatch - vec3u(1u, 1u, 1u))) {
+ output[dispatchId] = dispatch;
+ }
+ })");
+ csDesc.compute.module = moduleWithoutNumWorkgroups;
+ csDesc.compute.entryPoint = "main";
+ csDesc.layout = pipelineLayout;
+ pipelineWithoutNumWorkgroups = device.CreateComputePipeline(&csDesc);
+ }
+
+ void IndirectTest(std::vector<uint32_t> indirectBufferData,
+ std::vector<uint64_t> indirectOffsets) {
+ bool useNumWorkgroups = GetParam().mUseNumWorkgoups;
+ // Set up dst storage buffer to contain dispatch x, y, z
+ wgpu::BufferDescriptor dstBufDescriptor;
+ // array<vec3u> aligns to 16 bytes
+ dstBufDescriptor.size = Align(indirectOffsets.size() * 4u * sizeof(uint32_t), 16u);
+ dstBufDescriptor.usage =
+ wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+ wgpu::Buffer dst = device.CreateBuffer(&dstBufDescriptor);
+
+ wgpu::Buffer indirectBuffer = utils::CreateBufferFromData(
+ device, &indirectBufferData[0], indirectBufferData.size() * sizeof(uint32_t),
+ wgpu::BufferUsage::Indirect | wgpu::BufferUsage::CopySrc);
+
+        // Dynamic offsets require 256-byte alignment, so each dispatch's data is stored at a
+        // 256-byte stride rather than packed compactly.
+ constexpr uint32_t kDynamicOffsetReq = 256;
+ constexpr uint32_t kIndexOffset = kDynamicOffsetReq / sizeof(uint32_t);
+
+ std::vector<uint32_t> dynamicOffsets(indirectOffsets.size());
+ for (size_t i = 0; i < indirectOffsets.size(); i++) {
+ dynamicOffsets[i] = i * kDynamicOffsetReq;
+ }
+
+ // Set up bind group and issue dispatch
+ wgpu::BindGroup bindGroup;
+ wgpu::ComputePipeline computePipelineForTest;
+
+ if (useNumWorkgroups) {
+ computePipelineForTest = pipeline;
+
+ std::vector<uint32_t> dispatchIds(indirectOffsets.size() * kIndexOffset);
+ for (size_t i = 0; i < indirectOffsets.size(); i++) {
+ size_t o = kIndexOffset * i;
+ dispatchIds[o] = i;
+ }
+
+ wgpu::Buffer uniformBuffer = utils::CreateBufferFromData(
+ device, dispatchIds.data(), dispatchIds.size() * sizeof(uint32_t),
+ wgpu::BufferUsage::Uniform);
+ bindGroup = utils::MakeBindGroup(device, bindGroupLayout,
+ {
+ {0, uniformBuffer, 0, sizeof(uint32_t)},
+ {1, dst, 0, dstBufDescriptor.size},
+ });
+ } else {
+ computePipelineForTest = pipelineWithoutNumWorkgroups;
+
+ std::vector<uint32_t> inputs(indirectOffsets.size() * kIndexOffset);
+ for (size_t i = 0; i < indirectOffsets.size(); i++) {
+ uint32_t indirectStart = indirectOffsets[i] / sizeof(uint32_t);
+ size_t o = kIndexOffset * i;
+ // numWorkgroups
+ inputs[o] = indirectBufferData[indirectStart];
+ inputs[o + 1] = indirectBufferData[indirectStart + 1];
+ inputs[o + 2] = indirectBufferData[indirectStart + 2];
+ // dispatchId
+ inputs[o + 3] = i;
+ }
+
+ wgpu::Buffer uniformBuffer =
+ utils::CreateBufferFromData(device, inputs.data(), inputs.size() * sizeof(uint32_t),
+ wgpu::BufferUsage::Uniform);
+ bindGroup = utils::MakeBindGroup(device, bindGroupLayout,
+ {
+ {0, uniformBuffer, 0, 4 * sizeof(uint32_t)},
+ {1, dst, 0, dstBufDescriptor.size},
+ });
+ }
+
+ wgpu::CommandBuffer commands;
+ {
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+ pass.SetPipeline(computePipelineForTest);
+ for (size_t i = 0; i < indirectOffsets.size(); i++) {
+ uint64_t indirectOffset = indirectOffsets[i];
+ // Use dynamic binding offset to set dispatch Id (used as offset to output buffer)
+ // for each dispatch call
+ pass.SetBindGroup(0, bindGroup, 1, &dynamicOffsets[i]);
+ pass.DispatchWorkgroupsIndirect(indirectBuffer, indirectOffset);
+ }
+
+ pass.End();
+
+ commands = encoder.Finish();
+ }
+
+ queue.Submit(1, &commands);
+
+ uint32_t maxComputeWorkgroupsPerDimension =
+ GetSupportedLimits().limits.maxComputeWorkgroupsPerDimension;
+
+ std::vector<uint32_t> expected(4 * indirectOffsets.size(), 0);
+ for (size_t i = 0; i < indirectOffsets.size(); i++) {
+ uint32_t indirectStart = indirectOffsets[i] / sizeof(uint32_t);
+ size_t o = 4 * i;
+
+ if (indirectBufferData[indirectStart] == 0 ||
+ indirectBufferData[indirectStart + 1] == 0 ||
+ indirectBufferData[indirectStart + 2] == 0 ||
+ indirectBufferData[indirectStart] > maxComputeWorkgroupsPerDimension ||
+ indirectBufferData[indirectStart + 1] > maxComputeWorkgroupsPerDimension ||
+ indirectBufferData[indirectStart + 2] > maxComputeWorkgroupsPerDimension) {
+ std::copy(kSentinelData.begin(), kSentinelData.end(), expected.begin() + o);
+ } else {
+ expected[o] = indirectBufferData[indirectStart];
+ expected[o + 1] = indirectBufferData[indirectStart + 1];
+ expected[o + 2] = indirectBufferData[indirectStart + 2];
+ }
+ }
+
+ // Verify the indirect buffer is not modified
+ EXPECT_BUFFER_U32_RANGE_EQ(&indirectBufferData[0], indirectBuffer, 0,
+ indirectBufferData.size());
+ // Verify the dispatch got called with group counts in indirect buffer if all group counts
+ // are not zero
+ EXPECT_BUFFER_U32_RANGE_EQ(&expected[0], dst, 0, expected.size());
+ }
+
+ private:
+ wgpu::ComputePipeline pipeline;
+ wgpu::ComputePipeline pipelineWithoutNumWorkgroups;
+ wgpu::BindGroupLayout bindGroupLayout;
+};
+
+// Test indirect dispatches with buffer offset
+TEST_P(ComputeMultipleDispatchesTests, IndirectOffset) {
+#if DAWN_PLATFORM_IS(32_BIT)
+ // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
+ DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
+#endif
+
+ // Control case: One DispatchWorkgroupsIndirect call
+ IndirectTest({0, 0, 0, 2, 3, 4}, {3 * sizeof(uint32_t)});
+
+ // Two dispatches: first is no-op
+ IndirectTest({0, 0, 0, 2, 3, 4}, {0, 3 * sizeof(uint32_t)});
+
+ // Two dispatches
+ IndirectTest({9, 8, 7, 2, 3, 4}, {0, 3 * sizeof(uint32_t)});
+
+ // Indirect offsets not in order
+ IndirectTest({9, 8, 7, 2, 3, 4}, {3 * sizeof(uint32_t), 0});
+
+ // Multiple dispatches with duplicate indirect offsets
+ IndirectTest({9, 8, 7, 0, 0, 0, 2, 3, 4, 0xa, 0xb, 0xc, 0xf, 0xe, 0xd},
+ {
+ 6 * sizeof(uint32_t),
+ 0,
+ 3 * sizeof(uint32_t),
+ 12 * sizeof(uint32_t),
+ 9 * sizeof(uint32_t),
+ 6 * sizeof(uint32_t),
+ 6 * sizeof(uint32_t),
+ });
+}
+
+// Test indirect dispatches exceeding the max limit with an offset are noop-ed.
+TEST_P(ComputeMultipleDispatchesTests, ExceedsMaxWorkgroupsWithOffsetNoop) {
+ DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("skip_validation"));
+
+#if DAWN_PLATFORM_IS(32_BIT)
+ // TODO(crbug.com/dawn/1196): Fails on Chromium's Quadro P400 bots
+ DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsNvidia());
+#endif
+
+ // TODO(crbug.com/dawn/839): Investigate why this test fails with WARP.
+ DAWN_SUPPRESS_TEST_IF(IsWARP());
+
+ uint32_t max = GetSupportedLimits().limits.maxComputeWorkgroupsPerDimension;
+
+ // Two dispatches: first is no-op
+ IndirectTest({max + 1, 1, 1, 2, 3, 4}, {0, 3 * sizeof(uint32_t)});
+
+ // Two dispatches: second is no-op
+ IndirectTest({2, 3, 4, max + 1, 1, 1}, {0, 3 * sizeof(uint32_t)});
+
+    // Two dispatches with out-of-order indirect offsets: second is no-op
+ IndirectTest({max + 1, 1, 1, 2, 3, 4}, {3 * sizeof(uint32_t), 0});
+}
+
+DAWN_INSTANTIATE_TEST_P(ComputeMultipleDispatchesTests,
+ {D3D11Backend(), D3D12Backend(), MetalBackend(), OpenGLBackend(),
+ OpenGLESBackend(), VulkanBackend()},
+ {true, false} // useNumWorkgroups
+);
+
} // anonymous namespace
} // namespace dawn