src/dawn/tests/end2end/SubgroupsTests.cpp - dawn - Git at Google

 // Copyright 2023 The Dawn & Tint Authors
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // 1. Redistributions of source code must retain the above copyright notice, this
 //    list of conditions and the following disclaimer.
 //
 // 2. Redistributions in binary form must reproduce the above copyright notice,
 //    this list of conditions and the following disclaimer in the documentation
 //    and/or other materials provided with the distribution.
 //
 // 3. Neither the name of the copyright holder nor the names of its
 //    contributors may be used to endorse or promote products derived from
 //    this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <cmath>
 #include <cstdint>
 #include <string>
 #include <vector>

 #include "dawn/common/GPUInfo.h"
 #include "dawn/common/Math.h"
 #include "dawn/tests/DawnTest.h"
 #include "dawn/utils/ComboRenderPipelineDescriptor.h"
 #include "dawn/utils/WGPUHelpers.h"

 namespace dawn {
 namespace {

 enum class RequestSubgroups {
     WhenAvailable,
     Never,
 };
 std::ostream& operator<<(std::ostream& o, const RequestSubgroups& r) {
     switch (r) {
         case RequestSubgroups::WhenAvailable:
             o << "when_supported";
             break;
         case RequestSubgroups::Never:
             o << "never";
             break;
     }
     return o;
 }

 DAWN_TEST_PARAM_STRUCT(SubgroupsAdapterInfoParams, RequestSubgroups);

 template <class Params>
 class SubgroupsAdapterInfoTestBase : public DawnTestWithParams<Params> {
   public:
     using DawnTestWithParams<Params>::GetParam;
     using DawnTestWithParams<Params>::SupportsFeatures;

   protected:
     std::vector<wgpu::FeatureName> GetRequiredFeatures() override {
         if (GetParam().mRequestSubgroups == RequestSubgroups::WhenAvailable &&
             SupportsFeatures({wgpu::FeatureName::Subgroups})) {
             return {wgpu::FeatureName::Subgroups};
         }
         return {};
     }

     // Checks valid values for min and max subgroup sizes, per spec.
     void CheckValidSizes(uint32_t subgroupMinSize, uint32_t subgroupMaxSize) {
         EXPECT_GE(subgroupMinSize, 4u) << subgroupMinSize;
         EXPECT_TRUE(IsPowerOfTwo(subgroupMinSize)) << subgroupMinSize;

         EXPECT_LE(subgroupMaxSize, 128u) << subgroupMaxSize;
         EXPECT_TRUE(IsPowerOfTwo(subgroupMaxSize));

         EXPECT_LE(subgroupMinSize, subgroupMaxSize)
             << subgroupMinSize << " should be less equal than " << subgroupMaxSize;
     }
 };

 using SubgroupsAdapterInfoTests = SubgroupsAdapterInfoTestBase<SubgroupsAdapterInfoParams>;

 // Test that subgroupMinSize and subgroupMaxSize in wgpu::AdapterInfo are valid.
 TEST_P(SubgroupsAdapterInfoTests, FromAdapter) {
     wgpu::AdapterInfo info;
     adapter.GetInfo(&info);

     CheckValidSizes(info.subgroupMinSize, info.subgroupMaxSize);
 }

 // Test that subgroupMinSize and subgroupMaxSize in wgpu::AdapterInfo are valid.
 TEST_P(SubgroupsAdapterInfoTests, FromDevice) {
     wgpu::AdapterInfo info;
     device.GetAdapterInfo(&info);

     CheckValidSizes(info.subgroupMinSize, info.subgroupMaxSize);
 }

 // Test that subgroupMinSize and subgroupMaxSize in wgpu::AdapterInfo are the same between adapter
 // and device.
 TEST_P(SubgroupsAdapterInfoTests, DeviceAndAdapterAgree) {
     wgpu::AdapterInfo adapter_info;
     adapter.GetInfo(&adapter_info);

     wgpu::AdapterInfo device_info;
     device.GetAdapterInfo(&device_info);

     EXPECT_EQ(device_info.subgroupMinSize, adapter_info.subgroupMinSize);
     EXPECT_EQ(device_info.subgroupMaxSize, adapter_info.subgroupMaxSize);
 }

 DAWN_INSTANTIATE_TEST_P(SubgroupsAdapterInfoTests,
                         {D3D12Backend(), D3D12Backend({}, {"use_dxc"}), MetalBackend(),
                          VulkanBackend()},
                         {RequestSubgroups::WhenAvailable, RequestSubgroups::Never});

 template <class Params>
 class SubgroupsTestsBase : public DawnTestWithParams<Params> {
   public:
     using DawnTestWithParams<Params>::GetParam;
     using DawnTestWithParams<Params>::SupportsFeatures;

   protected:
     std::vector<wgpu::FeatureName> GetRequiredFeatures() override {
         // Always require related features if available.
         std::vector<wgpu::FeatureName> requiredFeatures;
         if (SupportsFeatures({wgpu::FeatureName::ShaderF16})) {
             mRequiredShaderF16Feature = true;
             requiredFeatures.push_back(wgpu::FeatureName::ShaderF16);
         }
         if (SupportsFeatures({wgpu::FeatureName::Subgroups})) {
             mRequiredSubgroupsFeature = true;
             requiredFeatures.push_back(wgpu::FeatureName::Subgroups);
         }
         return requiredFeatures;
     }

     // Helper function that write enable directives for all required features into WGSL code
     std::stringstream& EnableExtensions(std::stringstream& code) {
         if (mRequiredShaderF16Feature) {
             code << "enable f16;";
         }
         if (mRequiredSubgroupsFeature) {
             code << "enable subgroups;";
         }
         return code;
     }

     bool IsShaderF16EnabledInWGSL() const { return mRequiredShaderF16Feature; }
     bool IsSubgroupsEnabledInWGSL() const { return mRequiredSubgroupsFeature; }

   private:
     bool mRequiredShaderF16Feature = false;
     bool mRequiredSubgroupsFeature = false;
 };

 class SubgroupsShaderTests : public SubgroupsTestsBase<AdapterTestParam> {
   protected:
     // Testing reading subgroup_size. The shader declares a workgroup size of [workgroupSize, 1, 1],
     // in which each invocation read the workgroup_size built-in value and write back to output
     // buffer. It is expected that all output workgroup_size are equal and valid, i.e. between 1~128
     // and is a power of 2.
     void TestReadSubgroupSize(uint32_t workgroupSize) {
         auto shaderModule = CreateShaderModuleForReadSubgroupSize(workgroupSize);

         wgpu::ComputePipelineDescriptor csDesc;
         csDesc.compute.module = shaderModule;
         auto pipeline = device.CreateComputePipeline(&csDesc);

         uint32_t outputBufferSizeInBytes = workgroupSize * sizeof(uint32_t);
         wgpu::BufferDescriptor outputBufferDesc;
         outputBufferDesc.size = outputBufferSizeInBytes;
         outputBufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
         wgpu::Buffer outputBuffer = device.CreateBuffer(&outputBufferDesc);

         wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                          {
                                                              {0, outputBuffer},
                                                          });

         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
         wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
         pass.SetPipeline(pipeline);
         pass.SetBindGroup(0, bindGroup);
         pass.DispatchWorkgroups(1);
         pass.End();
         wgpu::CommandBuffer commands = encoder.Finish();
         queue.Submit(1, &commands);

         EXPECT_BUFFER(outputBuffer, 0, outputBufferSizeInBytes,
                       new ExpectReadSubgroupSizeOutput(workgroupSize));
     }

   private:
     // Helper function that create shader module for testing reading subgroup_size. The shader
     // declares a workgroup size of [workgroupSize, 1, 1], in which each invocation read the
     // workgroup_size built-in value and write back to output buffer. It is expected that all
     // output workgroup_size are equal and valid, i.e. between 1~128 and is a power of 2.
     wgpu::ShaderModule CreateShaderModuleForReadSubgroupSize(uint32_t workgroupSize) {
         DAWN_ASSERT((1 <= workgroupSize) && (workgroupSize <= 256));
         std::stringstream code;
         EnableExtensions(code) << R"(
 const workgroupSize = )" << workgroupSize
                                << R"(u;

 @group(0) @binding(0) var<storage, read_write> output : array<u32, workgroupSize>;

 @compute @workgroup_size(workgroupSize, 1, 1)
 fn main(
     @builtin(local_invocation_id) local_id : vec3u,
     @builtin(subgroup_size) sg_size : u32
 ) {
     output[local_id.x] = sg_size;
 }
 )";
         return utils::CreateShaderModule(device, code.str().c_str());
     }

     class ExpectReadSubgroupSizeOutput : public dawn::detail::Expectation {
       public:
         explicit ExpectReadSubgroupSizeOutput(uint32_t workgroupSize)
             : mWorkgroupSize(workgroupSize) {}
         testing::AssertionResult Check(const void* data, size_t size) override {
             DAWN_ASSERT(size == sizeof(int32_t) * mWorkgroupSize);
             const uint32_t* actual = static_cast<const uint32_t*>(data);

             const uint32_t& outputSubgroupSizeAt0 = actual[0];
             // Validate that output subgroup_size is valid
             if (!(
                     // subgroup_size should be at least 1
                     (1 <= outputSubgroupSizeAt0) &&
                     // subgroup_size should be no larger than 128
                     (outputSubgroupSizeAt0 <= 128) &&
                     // subgroup_size should be a power of 2
                     ((outputSubgroupSizeAt0 & (outputSubgroupSizeAt0 - 1)) == 0))) {
                 testing::AssertionResult result = testing::AssertionFailure()
                                                   << "Got invalid subgroup_size output: "
                                                   << outputSubgroupSizeAt0;
                 return result;
             }
             // Validate that subgroup_size of all invocation are identical.
             for (uint32_t i = 1; i < mWorkgroupSize; i++) {
                 const uint32_t& outputSubgroupSize = actual[i];
                 if (outputSubgroupSize != outputSubgroupSizeAt0) {
                     testing::AssertionResult result = testing::AssertionFailure()
                                                       << "Got inconsistent subgroup_size output: "
                                                          "subgroup_size of invocation 0 is "
                                                       << outputSubgroupSizeAt0
                                                       << ", while invocation " << i << " is "
                                                       << outputSubgroupSize;
                     return result;
                 }
             }

             return testing::AssertionSuccess();
         }

       private:
         uint32_t mWorkgroupSize;
     };
 };

 // Test that subgroup_size builtin attribute read by each invocation is valid and identical for any
 // workgroup size between 1 and 256.
 TEST_P(SubgroupsShaderTests, ReadSubgroupSize) {
     DAWN_TEST_UNSUPPORTED_IF(!IsSubgroupsEnabledInWGSL());

     for (uint32_t workgroupSize : {1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256}) {
         TestReadSubgroupSize(workgroupSize);
     }
 }

 // DawnTestBase::CreateDeviceImpl always enables allow_unsafe_apis toggle.
 DAWN_INSTANTIATE_TEST(SubgroupsShaderTests,
                       D3D12Backend(),
                       D3D12Backend({}, {"use_dxc"}),
                       MetalBackend(),
                       VulkanBackend());

 class SubgroupsShaderTestsFragment : public SubgroupsTestsBase<AdapterTestParam> {
   protected:
     // Testing reading subgroup_size in fragment shader. There is no workgroup size here and
     // subgroup_size is varying.
     void FragmentSubgroupSizeTest() {
         wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, R"(
             @vertex
             fn main(@builtin(vertex_index) VertexIndex : u32) -> @builtin(position) vec4f {
                 var pos = array(
                     vec2f(-1.0, -1.0),
                     vec2f(-1.0,  1.0),
                     vec2f( 1.0, -1.0),
                     vec2f( 1.0,  1.0),
                     vec2f(-1.0,  1.0),
                     vec2f( 1.0, -1.0));
                 return vec4f(pos[VertexIndex], 0.5, 1.0);
             })");

         std::stringstream fsCode;
         {
             EnableExtensions(fsCode);
             fsCode << R"(
             @group(0) @binding(0) var<storage, read_write> output : array<u32>;
             @fragment fn main(@builtin(subgroup_size) sg_size : u32) -> @location(0) vec4f {
                 output[0] = sg_size;
                 return vec4f(0.0, 1.0, 0.0, 1.0);
             })";
         }

         wgpu::ShaderModule fsModule = utils::CreateShaderModule(device, fsCode.str().c_str());

         utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 100, 100);
         utils::ComboRenderPipelineDescriptor descriptor;
         descriptor.vertex.module = vsModule;
         descriptor.cFragment.module = fsModule;
         descriptor.cTargets[0].format = renderPass.colorFormat;

         auto pipeline = device.CreateRenderPipeline(&descriptor);

         constexpr uint32_t kArrayNumElements = 1u;
         uint32_t outputBufferSizeInBytes = sizeof(uint32_t) * kArrayNumElements;
         wgpu::BufferDescriptor outputBufferDesc;
         outputBufferDesc.size = outputBufferSizeInBytes;
         outputBufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
         wgpu::Buffer outputBuffer = device.CreateBuffer(&outputBufferDesc);
         wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                          {
                                                              {0, outputBuffer},
                                                          });

         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
         {
             wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
             pass.SetPipeline(pipeline);
             pass.SetBindGroup(0, bindGroup);
             pass.Draw(6);
             pass.End();
         }

         wgpu::CommandBuffer commands = encoder.Finish();
         queue.Submit(1, &commands);

         // Check that the fragment shader ran.
         EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8::kGreen, renderPass.color, 0, 0);
         EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8::kGreen, renderPass.color, 0, 99);
         EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8::kGreen, renderPass.color, 99, 0);
         EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8::kGreen, renderPass.color, 99, 99);

         EXPECT_BUFFER(outputBuffer, 0, outputBufferSizeInBytes, new ExpectReadSubgroupSizeOutput());
     }

   private:
     class ExpectReadSubgroupSizeOutput : public dawn::detail::Expectation {
       public:
         ExpectReadSubgroupSizeOutput() {}
         testing::AssertionResult Check(const void* data, size_t size) override {
             DAWN_ASSERT(size == sizeof(int32_t));
             const uint32_t* actual = static_cast<const uint32_t*>(data);
             const uint32_t& outputSubgroupSizeAt0 = actual[0];
             // Subgroup size can vary across fragment invocation (unlike compute) but we could still
             // improve this check by using the min and max subgroup size for the device.
             if (!(
                     // subgroup_size should be at least 1
                     (1 <= outputSubgroupSizeAt0) &&
                     // subgroup_size should be no larger than 128
                     (outputSubgroupSizeAt0 <= 128) &&
                     // subgroup_size should be a power of 2
                     (IsPowerOfTwo(outputSubgroupSizeAt0)))) {
                 testing::AssertionResult result = testing::AssertionFailure()
                                                   << "Got invalid subgroup_size output: "
                                                   << outputSubgroupSizeAt0;
                 return result;
             }
             return testing::AssertionSuccess();
         }
     };
 };

 // Test that subgroup_size builtin attribute read by each invocation is valid and identical for any
 // workgroup size between 1 and 256.
 TEST_P(SubgroupsShaderTestsFragment, ReadSubgroupSize) {
     DAWN_TEST_UNSUPPORTED_IF(!IsSubgroupsEnabledInWGSL());
     FragmentSubgroupSizeTest();
 }

 // DawnTestBase::CreateDeviceImpl always enables allow_unsafe_apis toggle.
 DAWN_INSTANTIATE_TEST(SubgroupsShaderTestsFragment,
                       D3D12Backend(),
                       D3D12Backend({}, {"use_dxc"}),
                       MetalBackend(),
                       VulkanBackend());

 enum class BroadcastType {
     I32,
     U32,
     F32,
     F16,
 };

 std::ostream& operator<<(std::ostream& o, BroadcastType broadcastType) {
     switch (broadcastType) {
         case BroadcastType::I32:
             o << "i32";
             break;
         case BroadcastType::U32:
             o << "u32";
             break;
         case BroadcastType::F32:
             o << "f32";
             break;
         case BroadcastType::F16:
             o << "f16";
             break;
     }
     return o;
 }

 // Indicate which kind of value is the register of invocation 0 set to in subgroupBroadcast tests,
 // and it will be broadcast to its subgroup.
 enum class SubgroupBroadcastValueOfInvocation0 {
     Constant,      // Initialize reg of invocation 0 to
                    // SubgroupBroadcastConstantValueForInvocation0
     SubgroupSize,  // Initialize reg of invocation 0 to the value of subgroup_size
 };

 std::ostream& operator<<(std::ostream& o,
                          SubgroupBroadcastValueOfInvocation0 subgroupBroadcastValueOfInvocation0) {
     switch (subgroupBroadcastValueOfInvocation0) {
         case SubgroupBroadcastValueOfInvocation0::Constant:
             o << "Constant";
             break;
         case SubgroupBroadcastValueOfInvocation0::SubgroupSize:
             o << "SubgroupSize";
             break;
     }
     return o;
 }

 DAWN_TEST_PARAM_STRUCT(SubgroupsBroadcastTestsParams,
                        BroadcastType,
                        SubgroupBroadcastValueOfInvocation0);

 // These two constants should be different so that the broadcast results from invocation 0 can be
 // distinguished from other invocations, and both should not be 0 so that the broadcast results can
 // be distinguished from zero-initialized empty buffer. They should also be exactly-representable in
 // f16 type so we can expect the exact result values for f16 tests.
 constexpr int32_t SubgroupBroadcastConstantValueForInvocation0 = 1;
 constexpr int32_t SubgroupRegisterInitializer = 555;

 class SubgroupsBroadcastTests : public SubgroupsTestsBase<SubgroupsBroadcastTestsParams> {
   protected:
     // Testing subgroup broadcasting. The shader declares a workgroup size of [workgroupSize, 1, 1],
     // in which each invocation hold a register initialized to SubgroupRegisterInitializer, then
     // sets the register of invocation 0 to SubgroupBroadcastConstantValueForInvocation0 or value of
     // subgroup_size, broadcasts the register's value of subgroup_id 0 for all subgroups, and writes
     // back each invocation's register to buffer broadcastOutput. After dispatching, it is expected
     // that broadcastOutput contains exactly [subgroup_size] elements being of
     // SubgroupBroadcastConstantValueForInvocation0 of value [subgroup_size] and all other elements
     // being SubgroupRegisterInitializer. Note that although we assume invocation 0 of the workgroup
     // has a subgroup_id of 0 in its subgroup, we don't assume any other particular subgroups layout
     // property.
     void TestBroadcastSubgroupSize(uint32_t workgroupSize) {
         auto shaderModule = CreateShaderModuleForBroadcastSubgroupSize(workgroupSize);

         wgpu::ComputePipelineDescriptor csDesc;
         csDesc.compute.module = shaderModule;
         auto pipeline = device.CreateComputePipeline(&csDesc);

         uint32_t outputBufferSizeInBytes = (1 + workgroupSize) * sizeof(uint32_t);
         wgpu::BufferDescriptor outputBufferDesc;
         outputBufferDesc.size = outputBufferSizeInBytes;
         outputBufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
         wgpu::Buffer outputBuffer = device.CreateBuffer(&outputBufferDesc);

         wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                          {
                                                              {0, outputBuffer},
                                                          });

         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
         wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
         pass.SetPipeline(pipeline);
         pass.SetBindGroup(0, bindGroup);
         pass.DispatchWorkgroups(1);
         pass.End();
         wgpu::CommandBuffer commands = encoder.Finish();
         queue.Submit(1, &commands);

         EXPECT_BUFFER(outputBuffer, 0, outputBufferSizeInBytes,
                       new ExpectBroadcastSubgroupSizeOutput(workgroupSize));
     }

   private:
     // Helper function that create shader module for testing broadcasting subgroup_size. The shader
     // declares a workgroup size of [workgroupSize, 1, 1], in which each invocation hold a register
     // initialized to SubgroupRegisterInitializer, then sets the register of invocation 0 to
     // SubgroupBroadcastConstantValueForInvocation0 or value of subgroup_size, broadcasts the
     // register's value of subgroup_id 0 for all subgroups, and writes back each invocation's
     // register to buffer broadcastOutput.
     wgpu::ShaderModule CreateShaderModuleForBroadcastSubgroupSize(uint32_t workgroupSize) {
         DAWN_ASSERT((1 <= workgroupSize) && (workgroupSize <= 256));
         std::stringstream code;
         EnableExtensions(code) << R"(
 const workgroupSize = )" << workgroupSize
                                << R"(u;
 alias BroadcastType = )" << GetParam().mBroadcastType
                                << R"(;

 struct Output {
     subgroupSizeOutput : u32,
     broadcastOutput : array<i32, workgroupSize>,
 };
 @group(0) @binding(0) var<storage, read_write> output : Output;

 @compute @workgroup_size(workgroupSize, 1, 1)
 fn main(
     @builtin(local_invocation_id) local_id : vec3u,
     @builtin(subgroup_size) sg_size : u32
 ) {
     // Initialize the register of BroadcastType to SubgroupRegisterInitializer.
     var reg: BroadcastType = BroadcastType()"
                                << SubgroupRegisterInitializer << R"();
     // Set the register value to subgroup size for invocation 0, and also output the subgroup size.
     if (all(local_id == vec3u())) {
         reg = BroadcastType()";
         switch (GetParam().mSubgroupBroadcastValueOfInvocation0) {
             case SubgroupBroadcastValueOfInvocation0::Constant: {
                 code << SubgroupBroadcastConstantValueForInvocation0;
                 break;
             }
             case SubgroupBroadcastValueOfInvocation0::SubgroupSize: {
                 code << "sg_size";
                 break;
             }
         }
         code << R"();
         output.subgroupSizeOutput = sg_size;
     }
     workgroupBarrier();
     // Broadcast the register value of subgroup_id 0 in each subgroup.
     reg = subgroupBroadcast(reg, 0u);
     // Write back the register value in i32.
     output.broadcastOutput[local_id.x] = i32(reg);
 }
 )";
         return utils::CreateShaderModule(device, code.str().c_str());
     }

     class ExpectBroadcastSubgroupSizeOutput : public dawn::detail::Expectation {
       public:
         explicit ExpectBroadcastSubgroupSizeOutput(uint32_t workgroupSize)
             : mWorkgroupSize(workgroupSize) {}
         testing::AssertionResult Check(const void* data, size_t size) override {
             DAWN_ASSERT(size == sizeof(int32_t) * (1 + mWorkgroupSize));
             const int32_t* actual = static_cast<const int32_t*>(data);

             int32_t outputSubgroupSize = actual[0];
             if (!(
                     // subgroup_size should be at least 1
                     (1 <= outputSubgroupSize) &&
                     // subgroup_size should be no larger than 128
                     (outputSubgroupSize <= 128) &&
                     // subgroup_size should be a power of 2
                     ((outputSubgroupSize & (outputSubgroupSize - 1)) == 0))) {
                 testing::AssertionResult result = testing::AssertionFailure()
                                                   << "Got invalid subgroup_size output: "
                                                   << outputSubgroupSize;
                 return result;
             }

             int32_t valueFromInvocation0;
             switch (GetParam().mSubgroupBroadcastValueOfInvocation0) {
                 case SubgroupBroadcastValueOfInvocation0::Constant: {
                     valueFromInvocation0 = SubgroupBroadcastConstantValueForInvocation0;
                     break;
                 }
                 case SubgroupBroadcastValueOfInvocation0::SubgroupSize: {
                     valueFromInvocation0 = outputSubgroupSize;
                     break;
                 }
             }

             // Expected that broadcastOutput contains exactly [subgroup_size] elements being of
             // value [subgroup_size] and all other elements being -1 (placeholder). Note that
             // although we assume invocation 0 of the workgroup has a subgroup_id of 0 in its
             // subgroup, we don't assume any other particular subgroups layout property.
             uint32_t valueFromInvocation0Count = 0;
             uint32_t valueFromOtherInvocationCount = 0;
             for (uint32_t i = 0; i < mWorkgroupSize; i++) {
                 int32_t broadcastOutput = actual[i + 1];
                 if (broadcastOutput == valueFromInvocation0) {
                     valueFromInvocation0Count++;
                 } else if (broadcastOutput == SubgroupRegisterInitializer) {
                     valueFromOtherInvocationCount++;
                 } else {
                     testing::AssertionResult result = testing::AssertionFailure()
                                                       << "Got invalid broadcastOutput[" << i
                                                       << "] : " << broadcastOutput << ", expected "
                                                       << valueFromInvocation0 << " or "
                                                       << SubgroupRegisterInitializer << ".";
                     return result;
                 }
             }

             uint32_t expectedValueFromInvocation0Count =
                 (static_cast<int32_t>(mWorkgroupSize) < outputSubgroupSize) ? mWorkgroupSize
                                                                             : outputSubgroupSize;
             uint32_t expectedValueFromOtherInvocationCount =
                 mWorkgroupSize - expectedValueFromInvocation0Count;
             if ((valueFromInvocation0Count != expectedValueFromInvocation0Count) ||
                 (valueFromOtherInvocationCount != expectedValueFromOtherInvocationCount)) {
                 testing::AssertionResult result =
                     testing::AssertionFailure()
                     << "Unexpected broadcastOutput, got " << valueFromInvocation0Count
                     << " elements of value " << valueFromInvocation0 << " and "
                     << valueFromOtherInvocationCount << " elements of value "
                     << SubgroupRegisterInitializer << ", expected "
                     << expectedValueFromInvocation0Count << " elements of value "
                     << valueFromInvocation0 << " and " << expectedValueFromOtherInvocationCount
                     << " elements of value " << SubgroupRegisterInitializer << ".";
                 return result;
             }

             return testing::AssertionSuccess();
         }

       private:
         uint32_t mWorkgroupSize;
     };
 };

 // Test that subgroupBroadcast builtin function works as expected for any workgroup size between 1
 // and 256. Note that although we assume invocation 0 of the workgroup has a subgroup_id of 0 in its
 // subgroup, we don't assume any other particular subgroups layout property.
 TEST_P(SubgroupsBroadcastTests, SubgroupBroadcast) {
     DAWN_TEST_UNSUPPORTED_IF(!IsSubgroupsEnabledInWGSL());
     if (GetParam().mBroadcastType == BroadcastType::F16) {
         DAWN_TEST_UNSUPPORTED_IF(!IsShaderF16EnabledInWGSL());
     }

     for (uint32_t workgroupSize : {1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256}) {
         TestBroadcastSubgroupSize(workgroupSize);
     }
 }

 // DawnTestBase::CreateDeviceImpl always enables allow_unsafe_apis toggle.
 DAWN_INSTANTIATE_TEST_P(SubgroupsBroadcastTests,
                         {D3D12Backend(), D3D12Backend({}, {"use_dxc"}), MetalBackend(),
                          VulkanBackend()},
                         {
                             BroadcastType::I32,
                             BroadcastType::U32,
                             BroadcastType::F32,
                             BroadcastType::F16,
                         },  // BroadcastType
                         {SubgroupBroadcastValueOfInvocation0::Constant,
                          SubgroupBroadcastValueOfInvocation0::SubgroupSize}
                         // SubgroupBroadcastValueOfInvocation0
 );

 // Core functions that may be polyfilled
 enum class SubgroupIntrinsicOp : uint8_t {
     Add,
     Mul,
 };

 std::ostream& operator<<(std::ostream& o, SubgroupIntrinsicOp subgroup_op) {
     switch (subgroup_op) {
         case SubgroupIntrinsicOp::Add:
             o << "subgroupInclusiveAdd";
             break;
         case SubgroupIntrinsicOp::Mul:
             o << "subgroupInclusiveMul";
             break;
     }
     return o;
 }

 enum class SubgroupOpDataType : uint8_t {
     U32,
     I32,
     F32,
     F16,
 };

 std::ostream& operator<<(std::ostream& o, SubgroupOpDataType subgroupOpType) {
     switch (subgroupOpType) {
         case SubgroupOpDataType::F32:
             o << "f32";
             break;
         case SubgroupOpDataType::F16:
             o << "f16";
             break;
         case SubgroupOpDataType::U32:
             o << "u32";
             break;
         case SubgroupOpDataType::I32:
             o << "i32";
             break;
     }
     return o;
 }

 DAWN_TEST_PARAM_STRUCT(SubgroupsShaderInclusiveTestsParams,
                        SubgroupIntrinsicOp,
                        SubgroupOpDataType);

 // Subgroup inclusive operations are polyfilled on some backends (HLSL) so it is convenient to
 // execution test them here. This is only to establish that the polyfill is working as expected and
 // not to evaluate the accuracy of these subgroup operations.
 // As noted elsewhere, it is very difficult to test product (mul) for integers as subgroup sizes
 // greater than bits of the type (32) will overflow for even the smallest constant (2).
 class SubgroupsShaderInclusiveTest
     : public SubgroupsTestsBase<SubgroupsShaderInclusiveTestsParams> {
   protected:
     void TestReadSubgroupSize(uint32_t workgroupSize) {
         auto shaderModule = CreateShaderModuleForReadSubgroupSize(workgroupSize, kValueInShader);

         wgpu::ComputePipelineDescriptor csDesc;
         csDesc.compute.module = shaderModule;
         auto pipeline = device.CreateComputePipeline(&csDesc);

         uint32_t outputBufferSizeInBytes = workgroupSize * sizeof(float);
         wgpu::BufferDescriptor outputBufferDesc;
         outputBufferDesc.size = outputBufferSizeInBytes;
         outputBufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
         wgpu::Buffer outputBuffer = device.CreateBuffer(&outputBufferDesc);

         wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                          {
                                                              {0, outputBuffer},
                                                          });

         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
         wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
         pass.SetPipeline(pipeline);
         pass.SetBindGroup(0, bindGroup);
         pass.DispatchWorkgroups(1);
         pass.End();
         wgpu::CommandBuffer commands = encoder.Finish();
         queue.Submit(1, &commands);

         EXPECT_BUFFER(outputBuffer, 0, outputBufferSizeInBytes,
                       new ExpectReadSubgroupSizeOutput(workgroupSize));
     }

   private:
     static constexpr float kValueInShader = 1.05f;

     wgpu::ShaderModule CreateShaderModuleForReadSubgroupSize(uint32_t workgroupSize,
                                                              float common_value) {
         DAWN_ASSERT((1 <= workgroupSize) && (workgroupSize <= 256));
         std::stringstream code;
         EnableExtensions(code) << R"(
 const workgroupSize = )" << workgroupSize
                                << R"(u;

 @group(0) @binding(0) var<storage, read_write> output : array<f32, workgroupSize>;

 @compute @workgroup_size(workgroupSize, 1, 1)
 fn main(
     @builtin(local_invocation_id) local_id : vec3u
 ) {
     let a:)" << GetParam().mSubgroupOpDataType
                                << R"( =  )" << GetParam().mSubgroupOpDataType << R"( ( )"
                                << common_value << R"( );;
     output[local_id.x] = f32( )"
                                << GetParam().mSubgroupIntrinsicOp << R"( (a));
 }
 )";
         return utils::CreateShaderModule(device, code.str().c_str());
     }

     class ExpectReadSubgroupSizeOutput : public dawn::detail::Expectation {
       public:
         explicit ExpectReadSubgroupSizeOutput(uint32_t workgroupSize)
             : mWorkgroupSize(workgroupSize) {}
         testing::AssertionResult Check(const void* data, size_t size) override {
             DAWN_ASSERT(size == sizeof(float) * mWorkgroupSize);
             const float* actual = static_cast<const float*>(data);

             auto approximate_f = [](float actual, float expected) {
                 return abs(actual - expected) / abs(expected) < (kValueInShader / 2.0f);
             };

             // Floor initial value in shader to simulate integer casting.
             const float init_value = (GetParam().mSubgroupOpDataType == SubgroupOpDataType::U32 ||
                                       GetParam().mSubgroupOpDataType == SubgroupOpDataType::I32)
                                          ? std::floor(kValueInShader)
                                          : kValueInShader;
             float expected_value = init_value;
             for (uint32_t i = 0; i < mWorkgroupSize; i++) {
                 if (!approximate_f(actual[i], expected_value)) {
                     // This could be the next subgroup so retest with the initial value.
                     float old_expected_value = expected_value;
                     expected_value = init_value;
                     if (!approximate_f(actual[i], expected_value)) {
                         testing::AssertionResult result =
                             testing::AssertionFailure()
                             << "Unexpected result for " << GetParam().mSubgroupIntrinsicOp
                             << " for type " << GetParam().mSubgroupOpDataType << " for element "
                             << i << " of a workgroup size of " << mWorkgroupSize
                             << ". The actual value was " << actual[i] << " and the expected was "
                             << expected_value << " or " << old_expected_value;

                         return result;
                     }
                 }

                 if (GetParam().mSubgroupIntrinsicOp == SubgroupIntrinsicOp::Add) {
                     expected_value += kValueInShader;
                 } else if (GetParam().mSubgroupIntrinsicOp == SubgroupIntrinsicOp::Mul) {
                     expected_value *= kValueInShader;
                 }
             }

             return testing::AssertionSuccess();
         }

       private:
         uint32_t mWorkgroupSize;
     };
 };

 TEST_P(SubgroupsShaderInclusiveTest, InclusiveExecution) {
     DAWN_TEST_UNSUPPORTED_IF(!IsSubgroupsEnabledInWGSL());
     if (GetParam().mSubgroupOpDataType == SubgroupOpDataType::F16) {
         DAWN_TEST_UNSUPPORTED_IF(!IsShaderF16EnabledInWGSL());

         // TODO(361330160): The f16 implementation does not seem produce correct values in
         // execution. It is not clear if this is due to something wrong with the polyfill or the
         // underlying exclusive implementation.
         // This was observed for D3D12 NVIDIA GeForce GTX 1660
         DAWN_SUPPRESS_TEST_IF(gpu_info::IsNvidiaTuring(GetParam().adapterProperties.vendorID,
                                                        GetParam().adapterProperties.deviceID));
         // TODO(361330160): Also fails on MacBookPro16,1 with AMD Radeon Pro 5300M
         DAWN_SUPPRESS_TEST_IF(gpu_info::IsAMDRDNA1(GetParam().adapterProperties.vendorID,
                                                    GetParam().adapterProperties.deviceID));
     }

     for (uint32_t workgroupSize : {1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256}) {
         TestReadSubgroupSize(workgroupSize);
     }
 }

 DAWN_INSTANTIATE_TEST_P(SubgroupsShaderInclusiveTest,
                         {D3D12Backend(), D3D12Backend({}, {"use_dxc"}), MetalBackend(),
                          VulkanBackend()},
                         {SubgroupIntrinsicOp::Add, SubgroupIntrinsicOp::Mul},
                         {
                             SubgroupOpDataType::F32,
                             SubgroupOpDataType::F16,
                             SubgroupOpDataType::U32,
                             SubgroupOpDataType::I32,
                         });

 }  // anonymous namespace
 }  // namespace dawn