// src/tests/end2end/ComputeStorageBufferBarrierTests.cpp - dawn - Git at Google

 // Copyright 2019 The Dawn Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "tests/DawnTest.h"

 #include "utils/WGPUHelpers.h"

 // Fixture shared by the storage-buffer barrier tests in this file.
 class ComputeStorageBufferBarrierTests : public DawnTest {
   protected:
     // Number of u32 elements held by each buffer under test.
     static constexpr uint32_t kNumValues = 100;
     // Number of dispatches recorded by the increment / ping-pong tests.
     static constexpr uint32_t kIterations = 100;

     // The WGSL in this file hard-codes the buffer layout (array<u32, 100> and
     // array<vec4<u32>, 25>), so the element count cannot be changed on its own.
     static_assert(kNumValues == 100, "kNumValues must match the array sizes in the shaders");
     // The ping-pong tests expect the last dispatch to write buffer A, which only holds when
     // the number of dispatches is even.
     static_assert(kIterations % 2 == 0, "ping-pong tests require an even kIterations");
 };

 // Checks that successive dispatches which read-modify-write the same storage buffer are
 // synchronized: every dispatch must observe the writes of the one before it.
 TEST_P(ComputeStorageBufferBarrierTests, AddIncrement) {
     // Amount a single dispatch adds to every element (same literal as in the shader).
     constexpr uint32_t kIncrement = 0x1234;

     const std::vector<uint32_t> zeros(kNumValues, 0);
     const std::vector<uint32_t> expectedValues(kNumValues, kIncrement * kIterations);
     const uint64_t byteSize = static_cast<uint64_t>(zeros.size() * sizeof(uint32_t));

     wgpu::Buffer storage = utils::CreateBufferFromData(
         device, zeros.data(), byteSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

     // One workgroup per element; each adds 0x1234 to its own slot in place.
     wgpu::ComputePipelineDescriptor descriptor = {};
     descriptor.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<u32, 100>;
         };

         [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

         [[stage(compute), workgroup_size(1)]]
         fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
             buf.data[GlobalInvocationID.x] = buf.data[GlobalInvocationID.x] + 0x1234u;
         }
     )");
     descriptor.compute.entryPoint = "main";
     wgpu::ComputePipeline incrementPipeline = device.CreateComputePipeline(&descriptor);

     wgpu::BindGroup storageGroup = utils::MakeBindGroup(
         device, incrementPipeline.GetBindGroupLayout(0), {{0, storage, 0, byteSize}});

     // Record all dispatches in a single pass, reusing the same pipeline and bindings.
     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     wgpu::ComputePassEncoder computePass = encoder.BeginComputePass();
     computePass.SetPipeline(incrementPipeline);
     computePass.SetBindGroup(0, storageGroup);
     for (uint32_t remaining = kIterations; remaining > 0; --remaining) {
         computePass.Dispatch(kNumValues);
     }
     computePass.EndPass();

     wgpu::CommandBuffer recorded = encoder.Finish();
     queue.Submit(1, &recorded);

     EXPECT_BUFFER_U32_RANGE_EQ(expectedValues.data(), storage, 0, kNumValues);
 }

 // Checks that dispatches which alternate the source/destination roles of two storage buffers
 // are synchronized with each other inside a single compute pass.
 TEST_P(ComputeStorageBufferBarrierTests, AddPingPong) {
     // Amount a single dispatch adds on top of the source value.
     constexpr uint32_t kIncrement = 0x1234;

     const std::vector<uint32_t> zeros(kNumValues, 0);
     // The last dispatch writes A from B, so A ends up one increment ahead of B.
     const std::vector<uint32_t> finalA(kNumValues, kIncrement * kIterations);
     const std::vector<uint32_t> finalB(kNumValues, kIncrement * (kIterations - 1));
     const uint64_t byteSize = static_cast<uint64_t>(zeros.size() * sizeof(uint32_t));

     const wgpu::BufferUsage usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
     wgpu::Buffer bufA = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);
     wgpu::Buffer bufB = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);

     // dst[i] = src[i] + 0x1234, with both buffers bound as read_write storage.
     wgpu::ComputePipelineDescriptor descriptor = {};
     descriptor.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<u32, 100>;
         };

         [[group(0), binding(0)]] var<storage, read_write> src : Buf;
         [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

         [[stage(compute), workgroup_size(1)]]
         fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
             dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
         }
     )");
     descriptor.compute.entryPoint = "main";
     wgpu::ComputePipeline addPipeline = device.CreateComputePipeline(&descriptor);

     // Entry 0 reads A and writes B; entry 1 reads B and writes A.
     const wgpu::BindGroup alternating[2] = {
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufA, 0, byteSize}, {1, bufB, 0, byteSize}}),
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufB, 0, byteSize}, {1, bufA, 0, byteSize}}),
     };

     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     wgpu::ComputePassEncoder computePass = encoder.BeginComputePass();
     computePass.SetPipeline(addPipeline);

     // Each round performs one A->B dispatch followed by one B->A dispatch.
     for (uint32_t round = 0; round < kIterations / 2; ++round) {
         for (const wgpu::BindGroup& group : alternating) {
             computePass.SetBindGroup(0, group);
             computePass.Dispatch(kNumValues);
         }
     }
     computePass.EndPass();

     wgpu::CommandBuffer recorded = encoder.Finish();
     queue.Submit(1, &recorded);

     EXPECT_BUFFER_U32_RANGE_EQ(finalA.data(), bufA, 0, kNumValues);
     EXPECT_BUFFER_U32_RANGE_EQ(finalB.data(), bufB, 0, kNumValues);
 }

 // Checks that dispatches which alternate two buffers between read-only storage (source) and
 // read-write storage (destination) are synchronized inside a single compute pass.
 TEST_P(ComputeStorageBufferBarrierTests, StorageAndReadonlyStoragePingPongInOnePass) {
     // Amount a single dispatch adds on top of the source value.
     constexpr uint32_t kIncrement = 0x1234;

     const std::vector<uint32_t> zeros(kNumValues, 0);
     // The last dispatch writes A from B, so A ends up one increment ahead of B.
     const std::vector<uint32_t> finalA(kNumValues, kIncrement * kIterations);
     const std::vector<uint32_t> finalB(kNumValues, kIncrement * (kIterations - 1));
     const uint64_t byteSize = static_cast<uint64_t>(zeros.size() * sizeof(uint32_t));

     const wgpu::BufferUsage usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
     wgpu::Buffer bufA = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);
     wgpu::Buffer bufB = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);

     // dst[i] = src[i] + 0x1234, where src is declared with read-only access.
     wgpu::ComputePipelineDescriptor descriptor = {};
     descriptor.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<u32, 100>;
         };

         [[group(0), binding(0)]] var<storage, read> src : Buf;
         [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

         [[stage(compute), workgroup_size(1)]]
         fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
             dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
         }
     )");
     descriptor.compute.entryPoint = "main";
     wgpu::ComputePipeline addPipeline = device.CreateComputePipeline(&descriptor);

     // Entry 0 reads A and writes B; entry 1 reads B and writes A.
     const wgpu::BindGroup alternating[2] = {
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufA, 0, byteSize}, {1, bufB, 0, byteSize}}),
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufB, 0, byteSize}, {1, bufA, 0, byteSize}}),
     };

     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     wgpu::ComputePassEncoder computePass = encoder.BeginComputePass();
     computePass.SetPipeline(addPipeline);

     // Each round performs one A->B dispatch followed by one B->A dispatch.
     for (uint32_t round = 0; round < kIterations / 2; ++round) {
         for (const wgpu::BindGroup& group : alternating) {
             computePass.SetBindGroup(0, group);
             computePass.Dispatch(kNumValues);
         }
     }
     computePass.EndPass();

     wgpu::CommandBuffer recorded = encoder.Finish();
     queue.Submit(1, &recorded);

     EXPECT_BUFFER_U32_RANGE_EQ(finalA.data(), bufA, 0, kNumValues);
     EXPECT_BUFFER_U32_RANGE_EQ(finalB.data(), bufB, 0, kNumValues);
 }

 // Checks that a buffer can switch between Storage and Uniform usage across consecutive compute
 // passes, and that those passes are synchronized, by ping-ponging two buffers between the
 // uniform (source) and storage (destination) bindings.
 TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPong) {
     // Amount a single dispatch adds to every component.
     constexpr uint32_t kIncrement = 0x1234;

     const std::vector<uint32_t> zeros(kNumValues, 0);
     // The last pass writes A from B, so A ends up one increment ahead of B.
     const std::vector<uint32_t> finalA(kNumValues, kIncrement * kIterations);
     const std::vector<uint32_t> finalB(kNumValues, kIncrement * (kIterations - 1));
     const uint64_t byteSize = static_cast<uint64_t>(zeros.size() * sizeof(uint32_t));

     // Both buffers take turns being bound as uniform and as storage.
     const wgpu::BufferUsage usage =
         wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc;
     wgpu::Buffer bufA = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);
     wgpu::Buffer bufB = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);

     // The data is viewed as 25 vec4<u32> values; each workgroup handles one vec4.
     wgpu::ComputePipelineDescriptor descriptor = {};
     descriptor.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<vec4<u32>, 25>;
         };

         [[group(0), binding(0)]] var<uniform> src : Buf;
         [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

         [[stage(compute), workgroup_size(1)]]
         fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
             dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
                 vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
         }
     )");
     descriptor.compute.entryPoint = "main";
     wgpu::ComputePipeline addPipeline = device.CreateComputePipeline(&descriptor);

     // Entry 0 reads A and writes B; entry 1 reads B and writes A.
     const wgpu::BindGroup alternating[2] = {
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufA, 0, byteSize}, {1, bufB, 0, byteSize}}),
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufB, 0, byteSize}, {1, bufA, 0, byteSize}}),
     };

     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();

     // One compute pass per dispatch; even iterations use entry 0, odd iterations entry 1.
     for (uint32_t iteration = 0; iteration < kIterations; ++iteration) {
         wgpu::ComputePassEncoder computePass = encoder.BeginComputePass();
         computePass.SetPipeline(addPipeline);
         computePass.SetBindGroup(0, alternating[iteration % 2]);
         computePass.Dispatch(kNumValues / 4);
         computePass.EndPass();
     }

     wgpu::CommandBuffer recorded = encoder.Finish();
     queue.Submit(1, &recorded);

     EXPECT_BUFFER_U32_RANGE_EQ(finalA.data(), bufA, 0, kNumValues);
     EXPECT_BUFFER_U32_RANGE_EQ(finalB.data(), bufB, 0, kNumValues);
 }

 // Checks that a buffer can switch between Storage and Uniform usage between dispatches of a
 // single compute pass, and that those dispatches are synchronized, by ping-ponging two buffers
 // between the uniform (source) and storage (destination) bindings.
 TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPongInOnePass) {
     // Amount a single dispatch adds to every component.
     constexpr uint32_t kIncrement = 0x1234;

     const std::vector<uint32_t> zeros(kNumValues, 0);
     // The last dispatch writes A from B, so A ends up one increment ahead of B.
     const std::vector<uint32_t> finalA(kNumValues, kIncrement * kIterations);
     const std::vector<uint32_t> finalB(kNumValues, kIncrement * (kIterations - 1));
     const uint64_t byteSize = static_cast<uint64_t>(zeros.size() * sizeof(uint32_t));

     // Both buffers take turns being bound as uniform and as storage.
     const wgpu::BufferUsage usage =
         wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc;
     wgpu::Buffer bufA = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);
     wgpu::Buffer bufB = utils::CreateBufferFromData(device, zeros.data(), byteSize, usage);

     // The data is viewed as 25 vec4<u32> values; each workgroup handles one vec4.
     wgpu::ComputePipelineDescriptor descriptor = {};
     descriptor.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<vec4<u32>, 25>;
         };

         [[group(0), binding(0)]] var<uniform> src : Buf;
         [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

         [[stage(compute), workgroup_size(1)]]
         fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
             dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
                 vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
         }
     )");
     descriptor.compute.entryPoint = "main";
     wgpu::ComputePipeline addPipeline = device.CreateComputePipeline(&descriptor);

     // Entry 0 reads A and writes B; entry 1 reads B and writes A.
     const wgpu::BindGroup alternating[2] = {
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufA, 0, byteSize}, {1, bufB, 0, byteSize}}),
         utils::MakeBindGroup(device, addPipeline.GetBindGroupLayout(0),
                              {{0, bufB, 0, byteSize}, {1, bufA, 0, byteSize}}),
     };

     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     wgpu::ComputePassEncoder computePass = encoder.BeginComputePass();

     // All dispatches share one pass; the pipeline is re-set before each of them and the
     // bind group alternates (even iterations use entry 0, odd iterations entry 1).
     for (uint32_t iteration = 0; iteration < kIterations; ++iteration) {
         computePass.SetPipeline(addPipeline);
         computePass.SetBindGroup(0, alternating[iteration % 2]);
         computePass.Dispatch(kNumValues / 4);
     }
     computePass.EndPass();

     wgpu::CommandBuffer recorded = encoder.Finish();
     queue.Submit(1, &recorded);

     EXPECT_BUFFER_U32_RANGE_EQ(finalA.data(), bufA, 0, kNumValues);
     EXPECT_BUFFER_U32_RANGE_EQ(finalB.data(), bufB, 0, kNumValues);
 }

 // Checks that, in backends with explicit barriers, the barrier for a dispatch correctly
 // combines the Indirect and Storage usages of a single buffer. The test:
 //  1 - Creates an indirect buffer filled with zeros.
 //  2 - Overwrites it with ones from a compute shader.
 //  3 - Dispatches indirectly from that buffer while a shader also reads its contents.
 TEST_P(ComputeStorageBufferBarrierTests, IndirectBufferCorrectBarrier) {
     // SPIRV-Cross crashes while translating the step 3 shader to HLSL for an unknown reason.
     // The failure is suppressed because SPIRV-Cross is going to be removed at some point.
     DAWN_SUPPRESS_TEST_IF(IsD3D12() && !HasToggleEnabled("use_tint_generator"));

     // Step 2 pipeline: writes (1, 1, 1) into the bound buffer.
     wgpu::ComputePipelineDescriptor writerDesc;
     writerDesc.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<u32, 3>;
         };
         [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

         [[stage(compute), workgroup_size(1)]] fn main() {
             buf.data = array<u32, 3>(1u, 1u, 1u);
         }
     )");
     writerDesc.compute.entryPoint = "main";
     wgpu::ComputePipeline writerPipeline = device.CreateComputePipeline(&writerDesc);

     // Step 3 pipeline: stores 1 in the result when the buffer holds (1, 1, 1), otherwise 2.
     wgpu::ComputePipelineDescriptor checkerDesc;
     checkerDesc.compute.module = utils::CreateShaderModule(device, R"(
         [[block]] struct Buf {
             data : array<u32, 3>;
         };
         [[group(0), binding(0)]] var<storage, read> buf : Buf;

         [[block]] struct Result {
             data : u32;
         };
         [[group(0), binding(1)]] var<storage, read_write> result : Result;

         [[stage(compute), workgroup_size(1)]] fn main() {
             result.data = 2u;
             if (buf.data[0] == 1u && buf.data[1] == 1u && buf.data[2] == 1u) {
                 result.data = 1u;
             }
         }
     )");
     checkerDesc.compute.entryPoint = "main";
     wgpu::ComputePipeline checkerPipeline = device.CreateComputePipeline(&checkerDesc);

     //  1 - The indirect buffer starts out as all zeros.
     wgpu::Buffer indirectArgs = utils::CreateBufferFromData<uint32_t>(
         device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::Indirect, {0u, 0u, 0u});

     //  2 - A compute shader fills it with ones.
     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     wgpu::ComputePassEncoder computePass = encoder.BeginComputePass();

     wgpu::BindGroup writerGroup = utils::MakeBindGroup(
         device, writerPipeline.GetBindGroupLayout(0), {{0, indirectArgs}});

     computePass.SetPipeline(writerPipeline);
     computePass.SetBindGroup(0, writerGroup);
     computePass.Dispatch(1);

     //  3 - The same buffer supplies the indirect dispatch arguments and is read by the shader.
     wgpu::Buffer verdict = utils::CreateBufferFromData<uint32_t>(
         device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, {0u});
     wgpu::BindGroup checkerGroup = utils::MakeBindGroup(
         device, checkerPipeline.GetBindGroupLayout(0), {{0, indirectArgs}, {1, verdict}});

     computePass.SetPipeline(checkerPipeline);
     computePass.SetBindGroup(0, checkerGroup);
     computePass.DispatchIndirect(indirectArgs, 0);

     computePass.EndPass();
     wgpu::CommandBuffer recorded = encoder.Finish();
     queue.Submit(1, &recorded);

     EXPECT_BUFFER_U32_EQ(1u, verdict, 0);
 }

 // Instantiate the parameterized tests above for each of the listed backends.
 DAWN_INSTANTIATE_TEST(ComputeStorageBufferBarrierTests, D3D12Backend(), MetalBackend(),
                       OpenGLBackend(), OpenGLESBackend(), VulkanBackend());
	// Copyright 2019 The Dawn Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "tests/DawnTest.h"

	#include "utils/WGPUHelpers.h"

	// Fixture shared by the storage-buffer barrier tests in this file.
	class ComputeStorageBufferBarrierTests : public DawnTest {
	  protected:
	    // Number of u32 elements held by each buffer under test.
	    static constexpr uint32_t kNumValues = 100;
	    // Number of dispatches recorded by the increment / ping-pong tests.
	    static constexpr uint32_t kIterations = 100;

	    // The WGSL in this file hard-codes the buffer layout (array<u32, 100> and
	    // array<vec4<u32>, 25>), so the element count cannot be changed on its own.
	    static_assert(kNumValues == 100, "kNumValues must match the array sizes in the shaders");
	    // The ping-pong tests expect the last dispatch to write buffer A, which only holds when
	    // the number of dispatches is even.
	    static_assert(kIterations % 2 == 0, "ping-pong tests require an even kIterations");
	};

	// Test that multiple dispatches to increment values in a storage buffer are synchronized.
	// Every dispatch adds 0x1234 to each element in place, so after kIterations dispatches each
	// element must equal 0x1234 * kIterations.
	TEST_P(ComputeStorageBufferBarrierTests, AddIncrement) {
	    std::vector<uint32_t> data(kNumValues, 0);
	    std::vector<uint32_t> expected(kNumValues, 0x1234 * kIterations);

	    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));
	    wgpu::Buffer buffer = utils::CreateBufferFromData(
	        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

	    // One workgroup per element; each increments its own slot.
	    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<u32, 100>;
	        };

	        [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

	        [[stage(compute), workgroup_size(1)]]
	        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
	            buf.data[GlobalInvocationID.x] = buf.data[GlobalInvocationID.x] + 0x1234u;
	        }
	    )");

	    wgpu::ComputePipelineDescriptor pipelineDesc = {};
	    pipelineDesc.compute.module = module;
	    pipelineDesc.compute.entryPoint = "main";
	    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

	    wgpu::BindGroup bindGroup =
	        utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0), {{0, buffer, 0, bufferSize}});

	    // All dispatches are recorded in one pass with the same pipeline and bind group.
	    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
	    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
	    pass.SetPipeline(pipeline);
	    pass.SetBindGroup(0, bindGroup);
	    for (uint32_t i = 0; i < kIterations; ++i) {
	        pass.Dispatch(kNumValues);
	    }
	    pass.EndPass();
	    wgpu::CommandBuffer commands = encoder.Finish();
	    queue.Submit(1, &commands);

	    EXPECT_BUFFER_U32_RANGE_EQ(expected.data(), buffer, 0, kNumValues);
	}

	// Test that multiple dispatches to increment values by ping-ponging between two storage buffers
	// are synchronized.
	TEST_P(ComputeStorageBufferBarrierTests, AddPingPong) {
	    std::vector<uint32_t> data(kNumValues, 0);
	    // The last dispatch writes A from B, so A ends up one increment ahead of B.
	    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
	    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

	    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

	    wgpu::Buffer bufferA = utils::CreateBufferFromData(
	        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

	    wgpu::Buffer bufferB = utils::CreateBufferFromData(
	        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

	    // dst[i] = src[i] + 0x1234, with both buffers bound as read_write storage.
	    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<u32, 100>;
	        };

	        [[group(0), binding(0)]] var<storage, read_write> src : Buf;
	        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

	        [[stage(compute), workgroup_size(1)]]
	        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
	            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
	        }
	    )");

	    wgpu::ComputePipelineDescriptor pipelineDesc = {};
	    pipelineDesc.compute.module = module;
	    pipelineDesc.compute.entryPoint = "main";
	    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

	    // bindGroupA reads A and writes B.
	    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferA, 0, bufferSize},
	                                                          {1, bufferB, 0, bufferSize},
	                                                      });

	    // bindGroupB reads B and writes A.
	    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferB, 0, bufferSize},
	                                                          {1, bufferA, 0, bufferSize},
	                                                      });

	    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

	    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
	    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
	    pass.SetPipeline(pipeline);

	    // Each loop iteration records two dispatches: A->B followed by B->A.
	    for (uint32_t i = 0; i < kIterations / 2; ++i) {
	        pass.SetBindGroup(0, bindGroups[0]);
	        pass.Dispatch(kNumValues);
	        pass.SetBindGroup(0, bindGroups[1]);
	        pass.Dispatch(kNumValues);
	    }
	    pass.EndPass();
	    wgpu::CommandBuffer commands = encoder.Finish();
	    queue.Submit(1, &commands);

	    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
	    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
	}

	// Test that multiple dispatches to increment values by ping-ponging between storage buffers and
	// read-only storage buffers are synchronized in one compute pass.
	TEST_P(ComputeStorageBufferBarrierTests, StorageAndReadonlyStoragePingPongInOnePass) {
	    std::vector<uint32_t> data(kNumValues, 0);
	    // The last dispatch writes A from B, so A ends up one increment ahead of B.
	    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
	    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

	    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

	    wgpu::Buffer bufferA = utils::CreateBufferFromData(
	        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

	    wgpu::Buffer bufferB = utils::CreateBufferFromData(
	        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

	    // dst[i] = src[i] + 0x1234, where src is declared with read-only access.
	    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<u32, 100>;
	        };

	        [[group(0), binding(0)]] var<storage, read> src : Buf;
	        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

	        [[stage(compute), workgroup_size(1)]]
	        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
	            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
	        }
	    )");

	    wgpu::ComputePipelineDescriptor pipelineDesc = {};
	    pipelineDesc.compute.module = module;
	    pipelineDesc.compute.entryPoint = "main";
	    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

	    // bindGroupA reads A and writes B.
	    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferA, 0, bufferSize},
	                                                          {1, bufferB, 0, bufferSize},
	                                                      });

	    // bindGroupB reads B and writes A.
	    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferB, 0, bufferSize},
	                                                          {1, bufferA, 0, bufferSize},
	                                                      });

	    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

	    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
	    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
	    pass.SetPipeline(pipeline);

	    // Each loop iteration records two dispatches: A->B followed by B->A.
	    for (uint32_t i = 0; i < kIterations / 2; ++i) {
	        pass.SetBindGroup(0, bindGroups[0]);
	        pass.Dispatch(kNumValues);
	        pass.SetBindGroup(0, bindGroups[1]);
	        pass.Dispatch(kNumValues);
	    }
	    pass.EndPass();
	    wgpu::CommandBuffer commands = encoder.Finish();
	    queue.Submit(1, &commands);

	    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
	    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
	}

	// Test that Storage to Uniform buffer transitions work and synchronize correctly
	// by ping-ponging between Storage/Uniform usage in sequential compute passes.
	TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPong) {
	    std::vector<uint32_t> data(kNumValues, 0);
	    // The last pass writes A from B, so A ends up one increment ahead of B.
	    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
	    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

	    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

	    // Both buffers take turns being bound as uniform and as storage.
	    wgpu::Buffer bufferA = utils::CreateBufferFromData(
	        device, data.data(), bufferSize,
	        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

	    wgpu::Buffer bufferB = utils::CreateBufferFromData(
	        device, data.data(), bufferSize,
	        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

	    // The data is viewed as 25 vec4<u32> values; each workgroup handles one vec4.
	    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<vec4<u32>, 25>;
	        };

	        [[group(0), binding(0)]] var<uniform> src : Buf;
	        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

	        [[stage(compute), workgroup_size(1)]]
	        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
	            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
	                vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
	        }
	    )");

	    wgpu::ComputePipelineDescriptor pipelineDesc = {};
	    pipelineDesc.compute.module = module;
	    pipelineDesc.compute.entryPoint = "main";
	    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

	    // bindGroupA reads A (uniform) and writes B (storage).
	    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferA, 0, bufferSize},
	                                                          {1, bufferB, 0, bufferSize},
	                                                      });

	    // bindGroupB reads B (uniform) and writes A (storage).
	    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferB, 0, bufferSize},
	                                                          {1, bufferA, 0, bufferSize},
	                                                      });

	    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

	    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();

	    // One compute pass per dispatch; b toggles between 0 and 1 to alternate bind groups.
	    for (uint32_t i = 0, b = 0; i < kIterations; ++i, b = 1 - b) {
	        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
	        pass.SetPipeline(pipeline);
	        pass.SetBindGroup(0, bindGroups[b]);
	        pass.Dispatch(kNumValues / 4);
	        pass.EndPass();
	    }

	    wgpu::CommandBuffer commands = encoder.Finish();
	    queue.Submit(1, &commands);

	    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
	    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
	}

	// Test that Storage to Uniform buffer transitions work and synchronize correctly
	// by ping-ponging between Storage/Uniform usage in one compute pass.
	TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPongInOnePass) {
	    std::vector<uint32_t> data(kNumValues, 0);
	    // The last dispatch writes A from B, so A ends up one increment ahead of B.
	    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
	    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

	    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

	    // Both buffers take turns being bound as uniform and as storage.
	    wgpu::Buffer bufferA = utils::CreateBufferFromData(
	        device, data.data(), bufferSize,
	        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

	    wgpu::Buffer bufferB = utils::CreateBufferFromData(
	        device, data.data(), bufferSize,
	        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

	    // The data is viewed as 25 vec4<u32> values; each workgroup handles one vec4.
	    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<vec4<u32>, 25>;
	        };

	        [[group(0), binding(0)]] var<uniform> src : Buf;
	        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

	        [[stage(compute), workgroup_size(1)]]
	        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
	            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
	                vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
	        }
	    )");

	    wgpu::ComputePipelineDescriptor pipelineDesc = {};
	    pipelineDesc.compute.module = module;
	    pipelineDesc.compute.entryPoint = "main";
	    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

	    // bindGroupA reads A (uniform) and writes B (storage).
	    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferA, 0, bufferSize},
	                                                          {1, bufferB, 0, bufferSize},
	                                                      });

	    // bindGroupB reads B (uniform) and writes A (storage).
	    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
	                                                      {
	                                                          {0, bufferB, 0, bufferSize},
	                                                          {1, bufferA, 0, bufferSize},
	                                                      });

	    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

	    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
	    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
	    // All dispatches share one pass; b toggles between 0 and 1 to alternate bind groups.
	    for (uint32_t i = 0, b = 0; i < kIterations; ++i, b = 1 - b) {
	        pass.SetPipeline(pipeline);
	        pass.SetBindGroup(0, bindGroups[b]);
	        pass.Dispatch(kNumValues / 4);
	    }
	    pass.EndPass();

	    wgpu::CommandBuffer commands = encoder.Finish();
	    queue.Submit(1, &commands);

	    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
	    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
	}

	// Test that barriers for dispatches correctly combine Indirect | Storage in backends with explicit
	// barriers. Do this by:
	//  1 - Initializing an indirect buffer with zeros.
	//  2 - Write ones into it with a compute shader.
	//  3 - Use the indirect buffer in a Dispatch while also reading its data.
	TEST_P(ComputeStorageBufferBarrierTests, IndirectBufferCorrectBarrier) {
	    // For some reason SPIRV-Cross crashes when translating the step3 shader to HLSL. Suppress the
	    // failure since we'll remove SPIRV-Cross at some point.
	    DAWN_SUPPRESS_TEST_IF(IsD3D12() && !HasToggleEnabled("use_tint_generator"));

	    // Step 2 pipeline: writes (1, 1, 1) into the bound buffer.
	    wgpu::ComputePipelineDescriptor step2PipelineDesc;
	    step2PipelineDesc.compute.entryPoint = "main";
	    step2PipelineDesc.compute.module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<u32, 3>;
	        };
	        [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

	        [[stage(compute), workgroup_size(1)]] fn main() {
	            buf.data = array<u32, 3>(1u, 1u, 1u);
	        }
	    )");
	    wgpu::ComputePipeline step2Pipeline = device.CreateComputePipeline(&step2PipelineDesc);

	    // Step 3 pipeline: stores 1 in the result when the buffer holds (1, 1, 1), otherwise 2.
	    wgpu::ComputePipelineDescriptor step3PipelineDesc;
	    step3PipelineDesc.compute.entryPoint = "main";
	    step3PipelineDesc.compute.module = utils::CreateShaderModule(device, R"(
	        [[block]] struct Buf {
	            data : array<u32, 3>;
	        };
	        [[group(0), binding(0)]] var<storage, read> buf : Buf;

	        [[block]] struct Result {
	            data : u32;
	        };
	        [[group(0), binding(1)]] var<storage, read_write> result : Result;

	        [[stage(compute), workgroup_size(1)]] fn main() {
	            result.data = 2u;
	            if (buf.data[0] == 1u && buf.data[1] == 1u && buf.data[2] == 1u) {
	                result.data = 1u;
	            }
	        }
	    )");
	    wgpu::ComputePipeline step3Pipeline = device.CreateComputePipeline(&step3PipelineDesc);

	    //  1 - Initializing an indirect buffer with zeros.
	    wgpu::Buffer buf = utils::CreateBufferFromData<uint32_t>(
	        device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::Indirect, {0u, 0u, 0u});

	    //  2 - Write ones into it with a compute shader.
	    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
	    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();

	    wgpu::BindGroup step2Group =
	        utils::MakeBindGroup(device, step2Pipeline.GetBindGroupLayout(0), {{0, buf}});

	    pass.SetPipeline(step2Pipeline);
	    pass.SetBindGroup(0, step2Group);
	    pass.Dispatch(1);

	    //  3 - Use the indirect buffer in a Dispatch while also reading its data.
	    wgpu::Buffer resultBuffer = utils::CreateBufferFromData<uint32_t>(
	        device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, {0u});
	    wgpu::BindGroup step3Group = utils::MakeBindGroup(device, step3Pipeline.GetBindGroupLayout(0),
	                                                      {{0, buf}, {1, resultBuffer}});

	    pass.SetPipeline(step3Pipeline);
	    pass.SetBindGroup(0, step3Group);
	    pass.DispatchIndirect(buf, 0);

	    pass.EndPass();
	    wgpu::CommandBuffer commands = encoder.Finish();
	    queue.Submit(1, &commands);

	    // The step 3 shader writes 1 only if it read (1, 1, 1) from the indirect buffer.
	    EXPECT_BUFFER_U32_EQ(1u, resultBuffer, 0);
	}

	// Instantiate the parameterized tests above for each of the listed backends.
	DAWN_INSTANTIATE_TEST(ComputeStorageBufferBarrierTests, D3D12Backend(), MetalBackend(),
	                      OpenGLBackend(), OpenGLESBackend(), VulkanBackend());