[hlsl] transform: Zero init arrays with a loop
If the array size is greater than a threshold.
This is a work around for FXC stalling when initializing large arrays
with a single zero-init assignment.
Bug: tint:936
Fixed: tint:943
Fixed: tint:942
Change-Id: Ie93c8f373874b8d6d020d041fa48b38fb1352f71
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/56775
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/transform/hlsl.cc b/src/transform/hlsl.cc
index 823f2b5..baf8897 100644
--- a/src/transform/hlsl.cc
+++ b/src/transform/hlsl.cc
@@ -66,6 +66,10 @@
manager.Add<ExternalTextureTransform>();
manager.Add<PromoteInitializersToConstVar>();
manager.Add<PadArrayElements>();
+
+ ZeroInitWorkgroupMemory::Config zero_init_cfg;
+ zero_init_cfg.init_arrays_with_loop_size_threshold = 32; // 8 scalars
+ data.Add<ZeroInitWorkgroupMemory::Config>(zero_init_cfg);
data.Add<CanonicalizeEntryPointIO::Config>(
CanonicalizeEntryPointIO::BuiltinStyle::kStructMember);
auto out = manager.Run(in, data);
diff --git a/src/transform/zero_init_workgroup_memory.cc b/src/transform/zero_init_workgroup_memory.cc
index 80ba53a..600a02f 100644
--- a/src/transform/zero_init_workgroup_memory.cc
+++ b/src/transform/zero_init_workgroup_memory.cc
@@ -24,6 +24,7 @@
#include "src/utils/get_or_create.h"
TINT_INSTANTIATE_TYPEINFO(tint::transform::ZeroInitWorkgroupMemory);
+TINT_INSTANTIATE_TYPEINFO(tint::transform::ZeroInitWorkgroupMemory::Config);
namespace tint {
namespace transform {
@@ -32,14 +33,16 @@
struct ZeroInitWorkgroupMemory::State {
/// The clone context
CloneContext& ctx;
- /// The built statements
- ast::StatementList& stmts;
+ /// The config
+ Config cfg;
/// Zero() generates the statements required to zero initialize the workgroup
/// storage expression of type `ty`.
/// @param ty the expression type
+ /// @param stmts the built statements
/// @param get_expr a function that builds the AST nodes for the expression
void Zero(const sem::Type* ty,
+ ast::StatementList& stmts,
const std::function<ast::Expression*()>& get_expr) {
if (CanZero(ty)) {
auto* var = get_expr();
@@ -61,21 +64,32 @@
if (auto* str = ty->As<sem::Struct>()) {
for (auto* member : str->Members()) {
auto name = ctx.Clone(member->Declaration()->symbol());
- Zero(member->Type(),
+ Zero(member->Type(), stmts,
[&] { return ctx.dst->MemberAccessor(get_expr(), name); });
}
return;
}
if (auto* arr = ty->As<sem::Array>()) {
- // TODO(bclayton): If array sizes become pipeline-overridable then this
- // will need to emit code for a loop.
- // See https://github.com/gpuweb/gpuweb/pull/1792
- for (size_t i = 0; i < arr->Count(); i++) {
- Zero(arr->ElemType(), [&] {
- return ctx.dst->IndexAccessor(get_expr(),
- static_cast<ProgramBuilder::u32>(i));
- });
+ if (ShouldEmitForLoop(arr)) {
+ auto i = ctx.dst->Symbols().New("i");
+ auto* i_decl = ctx.dst->Decl(ctx.dst->Var(i, ctx.dst->ty.i32()));
+ auto* cond = ctx.dst->create<ast::BinaryExpression>(
+ ast::BinaryOp::kLessThan, ctx.dst->Expr(i),
+ ctx.dst->Expr(static_cast<int>(arr->Count())));
+ auto* inc = ctx.dst->Assign(i, ctx.dst->Add(i, 1));
+ ast::StatementList for_stmts;
+ Zero(arr->ElemType(), for_stmts,
+ [&] { return ctx.dst->IndexAccessor(get_expr(), i); });
+ auto* body = ctx.dst->Block(for_stmts);
+ stmts.emplace_back(ctx.dst->For(i_decl, cond, inc, body));
+ } else {
+ for (size_t i = 0; i < arr->Count(); i++) {
+ Zero(arr->ElemType(), stmts, [&] {
+ return ctx.dst->IndexAccessor(get_expr(),
+ static_cast<ProgramBuilder::u32>(i));
+ });
+ }
}
return;
}
@@ -89,7 +103,7 @@
/// CanZero() returns false, then the type needs to be initialized by
/// decomposing the initialization into multiple sub-initializations.
/// @param ty the type to inspect
- static bool CanZero(const sem::Type* ty) {
+ bool CanZero(const sem::Type* ty) {
if (ty->Is<sem::Atomic>()) {
return false;
}
@@ -101,21 +115,39 @@
}
}
if (auto* arr = ty->As<sem::Array>()) {
- if (!CanZero(arr->ElemType())) {
+ if (ShouldEmitForLoop(arr) || !CanZero(arr->ElemType())) {
return false;
}
}
return true;
}
+
+ /// @returns true if the array should be emitted as a for-loop instead of
+ /// using zero-initializer statements.
+ /// @param array the array
+ bool ShouldEmitForLoop(const sem::Array* array) {
+ // TODO(bclayton): If array sizes become pipeline-overridable then this
+ // we need to return true for these arrays.
+ // See https://github.com/gpuweb/gpuweb/pull/1792
+ return (cfg.init_arrays_with_loop_size_threshold != 0) &&
+ (array->SizeInBytes() >= cfg.init_arrays_with_loop_size_threshold);
+ }
};
ZeroInitWorkgroupMemory::ZeroInitWorkgroupMemory() = default;
ZeroInitWorkgroupMemory::~ZeroInitWorkgroupMemory() = default;
-void ZeroInitWorkgroupMemory::Run(CloneContext& ctx, const DataMap&, DataMap&) {
+void ZeroInitWorkgroupMemory::Run(CloneContext& ctx,
+ const DataMap& inputs,
+ DataMap&) {
auto& sem = ctx.src->Sem();
+ Config cfg;
+ if (auto* c = inputs.Get<Config>()) {
+ cfg = *c;
+ }
+
for (auto* ast_func : ctx.src->AST().Functions()) {
if (!ast_func->IsEntryPoint()) {
continue;
@@ -129,7 +161,7 @@
if (var->StorageClass() != ast::StorageClass::kWorkgroup) {
continue;
}
- State{ctx, stmts}.Zero(var->Type()->UnwrapRef(), [&] {
+ State{ctx, cfg}.Zero(var->Type()->UnwrapRef(), stmts, [&] {
auto var_name = ctx.Clone(var->Declaration()->symbol());
return ctx.dst->Expr(var_name);
});
@@ -193,5 +225,11 @@
ctx.Clone();
}
+ZeroInitWorkgroupMemory::Config::Config() = default;
+ZeroInitWorkgroupMemory::Config::Config(const Config&) = default;
+ZeroInitWorkgroupMemory::Config::~Config() = default;
+ZeroInitWorkgroupMemory::Config& ZeroInitWorkgroupMemory::Config::operator=(
+ const Config&) = default;
+
} // namespace transform
} // namespace tint
diff --git a/src/transform/zero_init_workgroup_memory.h b/src/transform/zero_init_workgroup_memory.h
index 1644b4d..e40a458 100644
--- a/src/transform/zero_init_workgroup_memory.h
+++ b/src/transform/zero_init_workgroup_memory.h
@@ -26,6 +26,27 @@
class ZeroInitWorkgroupMemory
: public Castable<ZeroInitWorkgroupMemory, Transform> {
public:
+ /// Configuration options for the transform
+ struct Config : public Castable<Config, Data> {
+ /// Constructor
+ Config();
+
+ /// Copy constructor
+ Config(const Config&);
+
+ /// Destructor
+ ~Config() override;
+
+ /// Assignment operator
+ /// @returns this Config
+ Config& operator=(const Config&);
+
+ /// If greater than 0, then arrays of at least this size in bytes will be
+ /// zero initialized using a for loop. If 0, then the array is assigned a
+ /// zero initialized array with a single statement.
+ uint32_t init_arrays_with_loop_size_threshold = 0;
+ };
+
/// Constructor
ZeroInitWorkgroupMemory();
diff --git a/src/transform/zero_init_workgroup_memory_test.cc b/src/transform/zero_init_workgroup_memory_test.cc
index ab1b305..6c318ab 100644
--- a/src/transform/zero_init_workgroup_memory_test.cc
+++ b/src/transform/zero_init_workgroup_memory_test.cc
@@ -558,6 +558,56 @@
EXPECT_EQ(expect, str(got));
}
+TEST_F(ZeroInitWorkgroupMemoryTest, WorkgroupArray_InitWithLoop) {
+ auto* src = R"(
+struct S {
+ a : array<i32, 3>; // size: 12, less than the loop threshold
+ b : array<i32, 4>; // size: 16, equal to the loop threshold
+ c : array<i32, 5>; // size: 20, greater than the loop threshold
+};
+
+var<workgroup> w : S;
+
+[[stage(compute), workgroup_size(1)]]
+fn f() {
+ ignore(w); // Initialization should be inserted above this statement
+}
+)";
+ auto* expect = R"(
+struct S {
+ a : array<i32, 3>;
+ b : array<i32, 4>;
+ c : array<i32, 5>;
+};
+
+var<workgroup> w : S;
+
+[[stage(compute), workgroup_size(1)]]
+fn f([[builtin(local_invocation_index)]] local_invocation_index : u32) {
+ if ((local_invocation_index == 0u)) {
+ w.a = array<i32, 3>();
+ for(var i : i32; (i < 4); i = (i + 1)) {
+ w.b[i] = i32();
+ }
+ for(var i_1 : i32; (i_1 < 5); i_1 = (i_1 + 1)) {
+ w.c[i_1] = i32();
+ }
+ }
+ workgroupBarrier();
+ ignore(w);
+}
+)";
+
+ ZeroInitWorkgroupMemory::Config cfg;
+ cfg.init_arrays_with_loop_size_threshold = 16;
+
+ DataMap data;
+ data.Add<ZeroInitWorkgroupMemory::Config>(cfg);
+ auto got = Run<ZeroInitWorkgroupMemory>(src, data);
+
+ EXPECT_EQ(expect, str(got));
+}
+
} // namespace
} // namespace transform
} // namespace tint
diff --git a/test/bug/tint/914.wgsl.expected.hlsl b/test/bug/tint/914.wgsl.expected.hlsl
index 4030d77..3de9408 100644
--- a/test/bug/tint/914.wgsl.expected.hlsl
+++ b/test/bug/tint/914.wgsl.expected.hlsl
@@ -69,10 +69,16 @@
const uint3 global_id = tint_symbol.global_id;
const uint local_invocation_index = tint_symbol.local_invocation_index;
if ((local_invocation_index == 0u)) {
- const float tint_symbol_5[64][64] = (float[64][64])0;
- mm_Asub = tint_symbol_5;
- const float tint_symbol_6[64][64] = (float[64][64])0;
- mm_Bsub = tint_symbol_6;
+ for(int i = 0; (i < 64); i = (i + 1)) {
+ for(int i_1 = 0; (i_1 < 64); i_1 = (i_1 + 1)) {
+ mm_Asub[i][i_1] = 0.0f;
+ }
+ }
+ for(int i_2 = 0; (i_2 < 64); i_2 = (i_2 + 1)) {
+ for(int i_3 = 0; (i_3 < 64); i_3 = (i_3 + 1)) {
+ mm_Bsub[i_2][i_3] = 0.0f;
+ }
+ }
}
GroupMemoryBarrierWithGroupSync();
const uint tileRow = (local_id.y * RowPerThread);
diff --git a/test/bug/tint/942.wgsl b/test/bug/tint/942.wgsl
new file mode 100644
index 0000000..aaf7549
--- /dev/null
+++ b/test/bug/tint/942.wgsl
@@ -0,0 +1,80 @@
+[[block]] struct Params {
+ filterDim : u32;
+ blockDim : u32;
+};
+
+[[group(0), binding(0)]] var samp : sampler;
+[[group(0), binding(1)]] var<uniform> params : Params;
+[[group(1), binding(1)]] var inputTex : texture_2d<f32>;
+[[group(1), binding(2)]] var outputTex : texture_storage_2d<rgba8unorm, write>;
+
+[[block]] struct Flip {
+ value : u32;
+};
+[[group(1), binding(3)]] var<uniform> flip : Flip;
+
+// This shader blurs the input texture in one direction, depending on whether
+// |flip.value| is 0 or 1.
+// It does so by running (256 / 4) threads per workgroup to load 256
+// texels into 4 rows of shared memory. Each thread loads a
+// 4 x 4 block of texels to take advantage of the texture sampling
+// hardware.
+// Then, each thread computes the blur result by averaging the adjacent texel values
+// in shared memory.
+// Because we're operating on a subset of the texture, we cannot compute all of the
+// results since not all of the neighbors are available in shared memory.
+// Specifically, with 256 x 256 tiles, we can only compute and write out
+// square blocks of size 256 - (filterSize - 1). We compute the number of blocks
+// needed in Javascript and dispatch that amount.
+
+var<workgroup> tile : array<array<vec3<f32>, 256>, 4>;
+
+[[stage(compute), workgroup_size(64, 1, 1)]]
+fn main(
+ [[builtin(workgroup_id)]] WorkGroupID : vec3<u32>,
+ [[builtin(local_invocation_id)]] LocalInvocationID : vec3<u32>
+) {
+ let filterOffset : u32 = (params.filterDim - 1u) / 2u;
+ let dims : vec2<i32> = textureDimensions(inputTex, 0);
+
+ let baseIndex = vec2<i32>(
+ WorkGroupID.xy * vec2<u32>(params.blockDim, 4u) +
+ LocalInvocationID.xy * vec2<u32>(4u, 1u)
+ ) - vec2<i32>(i32(filterOffset), 0);
+
+ for (var r : u32 = 0u; r < 4u; r = r + 1u) {
+ for (var c : u32 = 0u; c < 4u; c = c + 1u) {
+ var loadIndex = baseIndex + vec2<i32>(i32(c), i32(r));
+ if (flip.value != 0u) {
+ loadIndex = loadIndex.yx;
+ }
+
+ tile[r][4u * LocalInvocationID.x + c] =
+ textureSampleLevel(inputTex, samp,
+ (vec2<f32>(loadIndex) + vec2<f32>(0.25, 0.25)) / vec2<f32>(dims), 0.0).rgb;
+ }
+ }
+
+ workgroupBarrier();
+
+ for (var r : u32 = 0u; r < 4u; r = r + 1u) {
+ for (var c : u32 = 0u; c < 4u; c = c + 1u) {
+ var writeIndex = baseIndex + vec2<i32>(i32(c), i32(r));
+ if (flip.value != 0u) {
+ writeIndex = writeIndex.yx;
+ }
+
+ let center : u32 = 4u * LocalInvocationID.x + c;
+ if (center >= filterOffset &&
+ center < 256u - filterOffset &&
+ all(writeIndex < dims)) {
+ var acc : vec3<f32> = vec3<f32>(0.0, 0.0, 0.0);
+ for (var f : u32 = 0u; f < params.filterDim; f = f + 1u) {
+ var i : u32 = center + f - filterOffset;
+ acc = acc + (1.0 / f32(params.filterDim)) * tile[r][i];
+ }
+ textureStore(outputTex, writeIndex, vec4<f32>(acc, 1.0));
+ }
+ }
+ }
+}
diff --git a/test/bug/tint/942.wgsl.expected.hlsl b/test/bug/tint/942.wgsl.expected.hlsl
new file mode 100644
index 0000000..ff2739f
--- /dev/null
+++ b/test/bug/tint/942.wgsl.expected.hlsl
@@ -0,0 +1,96 @@
+SamplerState samp : register(s0, space0);
+cbuffer cbuffer_params : register(b1, space0) {
+ uint4 params[1];
+};
+Texture2D<float4> inputTex : register(t1, space1);
+RWTexture2D<float4> outputTex : register(u2, space1);
+
+cbuffer cbuffer_flip : register(b3, space1) {
+ uint4 flip[1];
+};
+groupshared float3 tile[4][256];
+
+struct tint_symbol_1 {
+ uint3 LocalInvocationID : SV_GroupThreadID;
+ uint local_invocation_index : SV_GroupIndex;
+ uint3 WorkGroupID : SV_GroupID;
+};
+
+[numthreads(64, 1, 1)]
+void main(tint_symbol_1 tint_symbol) {
+ const uint3 WorkGroupID = tint_symbol.WorkGroupID;
+ const uint3 LocalInvocationID = tint_symbol.LocalInvocationID;
+ const uint local_invocation_index = tint_symbol.local_invocation_index;
+ if ((local_invocation_index == 0u)) {
+ for(int i_1 = 0; (i_1 < 4); i_1 = (i_1 + 1)) {
+ for(int i_2 = 0; (i_2 < 256); i_2 = (i_2 + 1)) {
+ tile[i_1][i_2] = float3(0.0f, 0.0f, 0.0f);
+ }
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ const uint scalar_offset = (0u) / 4;
+ const uint filterOffset = ((params[scalar_offset / 4][scalar_offset % 4] - 1u) / 2u);
+ int3 tint_tmp;
+ inputTex.GetDimensions(0, tint_tmp.x, tint_tmp.y, tint_tmp.z);
+ const int2 dims = tint_tmp.xy;
+ const uint scalar_offset_1 = (4u) / 4;
+ const int2 baseIndex = (int2(((WorkGroupID.xy * uint2(params[scalar_offset_1 / 4][scalar_offset_1 % 4], 4u)) + (LocalInvocationID.xy * uint2(4u, 1u)))) - int2(int(filterOffset), 0));
+ {
+ uint r = 0u;
+ for(; !(!((r < 4u))); r = (r + 1u)) {
+ {
+ uint c = 0u;
+ for(; !(!((c < 4u))); c = (c + 1u)) {
+ int2 loadIndex = (baseIndex + int2(int(c), int(r)));
+ const uint scalar_offset_2 = (0u) / 4;
+ if ((flip[scalar_offset_2 / 4][scalar_offset_2 % 4] != 0u)) {
+ loadIndex = loadIndex.yx;
+ }
+ tile[r][((4u * LocalInvocationID.x) + c)] = inputTex.SampleLevel(samp, ((float2(loadIndex) + float2(0.25f, 0.25f)) / float2(dims)), 0.0f).rgb;
+ }
+ }
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ {
+ uint r = 0u;
+ for(; !(!((r < 4u))); r = (r + 1u)) {
+ {
+ uint c = 0u;
+ for(; !(!((c < 4u))); c = (c + 1u)) {
+ int2 writeIndex = (baseIndex + int2(int(c), int(r)));
+ const uint scalar_offset_3 = (0u) / 4;
+ if ((flip[scalar_offset_3 / 4][scalar_offset_3 % 4] != 0u)) {
+ writeIndex = writeIndex.yx;
+ }
+ const uint center = ((4u * LocalInvocationID.x) + c);
+ bool tint_tmp_2 = (center >= filterOffset);
+ if (tint_tmp_2) {
+ tint_tmp_2 = (center < (256u - filterOffset));
+ }
+ bool tint_tmp_1 = (tint_tmp_2);
+ if (tint_tmp_1) {
+ tint_tmp_1 = all((writeIndex < dims));
+ }
+ if ((tint_tmp_1)) {
+ float3 acc = float3(0.0f, 0.0f, 0.0f);
+ {
+ uint f = 0u;
+ while (true) {
+ const uint scalar_offset_4 = (0u) / 4;
+ if (!(!(!((f < params[scalar_offset_4 / 4][scalar_offset_4 % 4]))))) { break; }
+ uint i = ((center + f) - filterOffset);
+ const uint scalar_offset_5 = (0u) / 4;
+ acc = (acc + ((1.0f / float(params[scalar_offset_5 / 4][scalar_offset_5 % 4])) * tile[r][i]));
+ f = (f + 1u);
+ }
+ }
+ outputTex[writeIndex] = float4(acc, 1.0f);
+ }
+ }
+ }
+ }
+ }
+ return;
+}
diff --git a/test/bug/tint/942.wgsl.expected.msl b/test/bug/tint/942.wgsl.expected.msl
new file mode 100644
index 0000000..16e440d
--- /dev/null
+++ b/test/bug/tint/942.wgsl.expected.msl
@@ -0,0 +1,102 @@
+#include <metal_stdlib>
+
+using namespace metal;
+struct Params {
+ /* 0x0000 */ uint filterDim;
+ /* 0x0004 */ uint blockDim;
+};
+struct Flip {
+ /* 0x0000 */ uint value;
+};
+struct tint_array_wrapper_1 {
+ float3 arr[256];
+};
+struct tint_array_wrapper {
+ tint_array_wrapper_1 arr[4];
+};
+
+kernel void tint_symbol(texture2d<float, access::sample> tint_symbol_4 [[texture(1)]], sampler tint_symbol_5 [[sampler(0)]], texture2d<float, access::write> tint_symbol_6 [[texture(2)]], uint3 WorkGroupID [[threadgroup_position_in_grid]], uint3 LocalInvocationID [[thread_position_in_threadgroup]], uint local_invocation_index [[thread_index_in_threadgroup]], constant Params& params [[buffer(1)]], constant Flip& flip [[buffer(3)]]) {
+ threadgroup tint_array_wrapper tint_symbol_3;
+ if ((local_invocation_index == 0u)) {
+ tint_array_wrapper const tint_symbol_2 = {.arr={}};
+ tint_symbol_3 = tint_symbol_2;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ uint const filterOffset = ((params.filterDim - 1u) / 2u);
+ int2 const dims = int2(tint_symbol_4.get_width(0), tint_symbol_4.get_height(0));
+ int2 const baseIndex = (int2(((WorkGroupID.xy * uint2(params.blockDim, 4u)) + (LocalInvocationID.xy * uint2(4u, 1u)))) - int2(int(filterOffset), 0));
+ {
+ uint r = 0u;
+ while (true) {
+ if (!((r < 4u))) {
+ break;
+ }
+ {
+ uint c = 0u;
+ while (true) {
+ if (!((c < 4u))) {
+ break;
+ }
+ int2 loadIndex = (baseIndex + int2(int(c), int(r)));
+ if ((flip.value != 0u)) {
+ loadIndex = loadIndex.yx;
+ }
+ tint_symbol_3.arr[r].arr[((4u * LocalInvocationID.x) + c)] = tint_symbol_4.sample(tint_symbol_5, ((float2(loadIndex) + float2(0.25f, 0.25f)) / float2(dims)), level(0.0f)).rgb;
+ {
+ c = (c + 1u);
+ }
+ }
+ }
+ {
+ r = (r + 1u);
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ {
+ uint r = 0u;
+ while (true) {
+ if (!((r < 4u))) {
+ break;
+ }
+ {
+ uint c = 0u;
+ while (true) {
+ if (!((c < 4u))) {
+ break;
+ }
+ int2 writeIndex = (baseIndex + int2(int(c), int(r)));
+ if ((flip.value != 0u)) {
+ writeIndex = writeIndex.yx;
+ }
+ uint const center = ((4u * LocalInvocationID.x) + c);
+ if ((((center >= filterOffset) && (center < (256u - filterOffset))) && all((writeIndex < dims)))) {
+ float3 acc = float3(0.0f, 0.0f, 0.0f);
+ {
+ uint f = 0u;
+ while (true) {
+ if (!((f < params.filterDim))) {
+ break;
+ }
+ uint i = ((center + f) - filterOffset);
+ acc = (acc + ((1.0f / float(params.filterDim)) * tint_symbol_3.arr[r].arr[i]));
+ {
+ f = (f + 1u);
+ }
+ }
+ }
+ tint_symbol_6.write(float4(acc, 1.0f), uint2(writeIndex));
+ }
+ {
+ c = (c + 1u);
+ }
+ }
+ }
+ {
+ r = (r + 1u);
+ }
+ }
+ }
+ return;
+}
+
diff --git a/test/bug/tint/942.wgsl.expected.spvasm b/test/bug/tint/942.wgsl.expected.spvasm
new file mode 100644
index 0000000..ef1f0a9
--- /dev/null
+++ b/test/bug/tint/942.wgsl.expected.spvasm
@@ -0,0 +1,374 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google Tint Compiler; 0
+; Bound: 239
+; Schema: 0
+ OpCapability Shader
+ OpCapability ImageQuery
+ OpMemoryModel Logical GLSL450
+ OpEntryPoint GLCompute %main "main" %tint_symbol_2 %tint_symbol %tint_symbol_1
+ OpExecutionMode %main LocalSize 64 1 1
+ OpName %samp "samp"
+ OpName %Params "Params"
+ OpMemberName %Params 0 "filterDim"
+ OpMemberName %Params 1 "blockDim"
+ OpName %params "params"
+ OpName %inputTex "inputTex"
+ OpName %outputTex "outputTex"
+ OpName %Flip "Flip"
+ OpMemberName %Flip 0 "value"
+ OpName %flip "flip"
+ OpName %tile "tile"
+ OpName %tint_symbol "tint_symbol"
+ OpName %tint_symbol_1 "tint_symbol_1"
+ OpName %tint_symbol_2 "tint_symbol_2"
+ OpName %main "main"
+ OpName %r "r"
+ OpName %c "c"
+ OpName %loadIndex "loadIndex"
+ OpName %r_0 "r"
+ OpName %c_0 "c"
+ OpName %writeIndex "writeIndex"
+ OpName %acc "acc"
+ OpName %f "f"
+ OpName %i "i"
+ OpDecorate %samp DescriptorSet 0
+ OpDecorate %samp Binding 0
+ OpDecorate %Params Block
+ OpMemberDecorate %Params 0 Offset 0
+ OpMemberDecorate %Params 1 Offset 4
+ OpDecorate %params NonWritable
+ OpDecorate %params DescriptorSet 0
+ OpDecorate %params Binding 1
+ OpDecorate %inputTex DescriptorSet 1
+ OpDecorate %inputTex Binding 1
+ OpDecorate %outputTex NonReadable
+ OpDecorate %outputTex DescriptorSet 1
+ OpDecorate %outputTex Binding 2
+ OpDecorate %Flip Block
+ OpMemberDecorate %Flip 0 Offset 0
+ OpDecorate %flip NonWritable
+ OpDecorate %flip DescriptorSet 1
+ OpDecorate %flip Binding 3
+ OpDecorate %_arr_v3float_uint_256 ArrayStride 16
+ OpDecorate %_arr__arr_v3float_uint_256_uint_4 ArrayStride 4096
+ OpDecorate %tint_symbol BuiltIn WorkgroupId
+ OpDecorate %tint_symbol_1 BuiltIn LocalInvocationId
+ OpDecorate %tint_symbol_2 BuiltIn LocalInvocationIndex
+ %3 = OpTypeSampler
+%_ptr_UniformConstant_3 = OpTypePointer UniformConstant %3
+ %samp = OpVariable %_ptr_UniformConstant_3 UniformConstant
+ %uint = OpTypeInt 32 0
+ %Params = OpTypeStruct %uint %uint
+%_ptr_Uniform_Params = OpTypePointer Uniform %Params
+ %params = OpVariable %_ptr_Uniform_Params Uniform
+ %float = OpTypeFloat 32
+ %10 = OpTypeImage %float 2D 0 0 0 1 Unknown
+%_ptr_UniformConstant_10 = OpTypePointer UniformConstant %10
+ %inputTex = OpVariable %_ptr_UniformConstant_10 UniformConstant
+ %14 = OpTypeImage %float 2D 0 0 0 2 Rgba8
+%_ptr_UniformConstant_14 = OpTypePointer UniformConstant %14
+ %outputTex = OpVariable %_ptr_UniformConstant_14 UniformConstant
+ %Flip = OpTypeStruct %uint
+%_ptr_Uniform_Flip = OpTypePointer Uniform %Flip
+ %flip = OpVariable %_ptr_Uniform_Flip Uniform
+ %v3float = OpTypeVector %float 3
+ %uint_256 = OpConstant %uint 256
+%_arr_v3float_uint_256 = OpTypeArray %v3float %uint_256
+ %uint_4 = OpConstant %uint 4
+%_arr__arr_v3float_uint_256_uint_4 = OpTypeArray %_arr_v3float_uint_256 %uint_4
+%_ptr_Workgroup__arr__arr_v3float_uint_256_uint_4 = OpTypePointer Workgroup %_arr__arr_v3float_uint_256_uint_4
+ %tile = OpVariable %_ptr_Workgroup__arr__arr_v3float_uint_256_uint_4 Workgroup
+ %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%tint_symbol = OpVariable %_ptr_Input_v3uint Input
+%tint_symbol_1 = OpVariable %_ptr_Input_v3uint Input
+%_ptr_Input_uint = OpTypePointer Input %uint
+%tint_symbol_2 = OpVariable %_ptr_Input_uint Input
+ %void = OpTypeVoid
+ %31 = OpTypeFunction %void
+ %uint_0 = OpConstant %uint 0
+ %bool = OpTypeBool
+ %41 = OpConstantNull %_arr__arr_v3float_uint_256_uint_4
+ %uint_2 = OpConstant %uint 2
+ %uint_264 = OpConstant %uint 264
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+ %uint_1 = OpConstant %uint 1
+ %int = OpTypeInt 32 1
+ %v2int = OpTypeVector %int 2
+ %int_0 = OpConstant %int 0
+ %v2uint = OpTypeVector %uint 2
+ %66 = OpConstantComposite %v2uint %uint_4 %uint_1
+%_ptr_Function_uint = OpTypePointer Function %uint
+ %74 = OpConstantNull %uint
+%_ptr_Function_v2int = OpTypePointer Function %v2int
+ %102 = OpConstantNull %v2int
+%_ptr_Workgroup_v3float = OpTypePointer Workgroup %v3float
+ %v4float = OpTypeVector %float 4
+ %122 = OpTypeSampledImage %10
+ %v2float = OpTypeVector %float 2
+ %float_0_25 = OpConstant %float 0.25
+ %128 = OpConstantComposite %v2float %float_0_25 %float_0_25
+ %float_0 = OpConstant %float 0
+ %v2bool = OpTypeVector %bool 2
+ %193 = OpConstantComposite %v3float %float_0 %float_0 %float_0
+%_ptr_Function_v3float = OpTypePointer Function %v3float
+ %196 = OpConstantNull %v3float
+ %float_1 = OpConstant %float 1
+ %main = OpFunction %void None %31
+ %34 = OpLabel
+ %r = OpVariable %_ptr_Function_uint Function %74
+ %c = OpVariable %_ptr_Function_uint Function %74
+ %loadIndex = OpVariable %_ptr_Function_v2int Function %102
+ %r_0 = OpVariable %_ptr_Function_uint Function %74
+ %c_0 = OpVariable %_ptr_Function_uint Function %74
+ %writeIndex = OpVariable %_ptr_Function_v2int Function %102
+ %acc = OpVariable %_ptr_Function_v3float Function %196
+ %f = OpVariable %_ptr_Function_uint Function %74
+ %i = OpVariable %_ptr_Function_uint Function %74
+ %35 = OpLoad %uint %tint_symbol_2
+ %37 = OpIEqual %bool %35 %uint_0
+ OpSelectionMerge %39 None
+ OpBranchConditional %37 %40 %39
+ %40 = OpLabel
+ OpStore %tile %41
+ OpBranch %39
+ %39 = OpLabel
+ OpControlBarrier %uint_2 %uint_2 %uint_264
+ %46 = OpAccessChain %_ptr_Uniform_uint %params %uint_0
+ %47 = OpLoad %uint %46
+ %49 = OpISub %uint %47 %uint_1
+ %50 = OpUDiv %uint %49 %uint_2
+ %54 = OpLoad %10 %inputTex
+ %51 = OpImageQuerySizeLod %v2int %54 %int_0
+ %58 = OpLoad %v3uint %tint_symbol
+ %59 = OpVectorShuffle %v2uint %58 %58 0 1
+ %60 = OpAccessChain %_ptr_Uniform_uint %params %uint_1
+ %61 = OpLoad %uint %60
+ %62 = OpCompositeConstruct %v2uint %61 %uint_4
+ %63 = OpIMul %v2uint %59 %62
+ %64 = OpLoad %v3uint %tint_symbol_1
+ %65 = OpVectorShuffle %v2uint %64 %64 0 1
+ %67 = OpIMul %v2uint %65 %66
+ %68 = OpIAdd %v2uint %63 %67
+ %56 = OpBitcast %v2int %68
+ %69 = OpBitcast %int %50
+ %70 = OpCompositeConstruct %v2int %69 %int_0
+ %71 = OpISub %v2int %56 %70
+ OpStore %r %uint_0
+ OpBranch %75
+ %75 = OpLabel
+ OpLoopMerge %76 %77 None
+ OpBranch %78
+ %78 = OpLabel
+ %80 = OpLoad %uint %r
+ %81 = OpULessThan %bool %80 %uint_4
+ %79 = OpLogicalNot %bool %81
+ OpSelectionMerge %82 None
+ OpBranchConditional %79 %83 %82
+ %83 = OpLabel
+ OpBranch %76
+ %82 = OpLabel
+ OpStore %c %uint_0
+ OpBranch %85
+ %85 = OpLabel
+ OpLoopMerge %86 %87 None
+ OpBranch %88
+ %88 = OpLabel
+ %90 = OpLoad %uint %c
+ %91 = OpULessThan %bool %90 %uint_4
+ %89 = OpLogicalNot %bool %91
+ OpSelectionMerge %92 None
+ OpBranchConditional %89 %93 %92
+ %93 = OpLabel
+ OpBranch %86
+ %92 = OpLabel
+ %95 = OpLoad %uint %c
+ %94 = OpBitcast %int %95
+ %97 = OpLoad %uint %r
+ %96 = OpBitcast %int %97
+ %98 = OpCompositeConstruct %v2int %94 %96
+ %99 = OpIAdd %v2int %71 %98
+ OpStore %loadIndex %99
+ %103 = OpAccessChain %_ptr_Uniform_uint %flip %uint_0
+ %104 = OpLoad %uint %103
+ %105 = OpINotEqual %bool %104 %uint_0
+ OpSelectionMerge %106 None
+ OpBranchConditional %105 %107 %106
+ %107 = OpLabel
+ %108 = OpLoad %v2int %loadIndex
+ %109 = OpVectorShuffle %v2int %108 %108 1 0
+ OpStore %loadIndex %109
+ OpBranch %106
+ %106 = OpLabel
+ %110 = OpLoad %uint %r
+ %111 = OpAccessChain %_ptr_Input_uint %tint_symbol_1 %uint_0
+ %112 = OpLoad %uint %111
+ %113 = OpIMul %uint %uint_4 %112
+ %114 = OpLoad %uint %c
+ %115 = OpIAdd %uint %113 %114
+ %117 = OpAccessChain %_ptr_Workgroup_v3float %tile %110 %115
+ %120 = OpLoad %3 %samp
+ %121 = OpLoad %10 %inputTex
+ %123 = OpSampledImage %122 %121 %120
+ %126 = OpLoad %v2int %loadIndex
+ %124 = OpConvertSToF %v2float %126
+ %129 = OpFAdd %v2float %124 %128
+ %130 = OpConvertSToF %v2float %51
+ %131 = OpFDiv %v2float %129 %130
+ %118 = OpImageSampleExplicitLod %v4float %123 %131 Lod %float_0
+ %133 = OpVectorShuffle %v3float %118 %118 0 1 2
+ OpStore %117 %133
+ OpBranch %87
+ %87 = OpLabel
+ %134 = OpLoad %uint %c
+ %135 = OpIAdd %uint %134 %uint_1
+ OpStore %c %135
+ OpBranch %85
+ %86 = OpLabel
+ OpBranch %77
+ %77 = OpLabel
+ %136 = OpLoad %uint %r
+ %137 = OpIAdd %uint %136 %uint_1
+ OpStore %r %137
+ OpBranch %75
+ %76 = OpLabel
+ OpControlBarrier %uint_2 %uint_2 %uint_264
+ OpStore %r_0 %uint_0
+ OpBranch %140
+ %140 = OpLabel
+ OpLoopMerge %141 %142 None
+ OpBranch %143
+ %143 = OpLabel
+ %145 = OpLoad %uint %r_0
+ %146 = OpULessThan %bool %145 %uint_4
+ %144 = OpLogicalNot %bool %146
+ OpSelectionMerge %147 None
+ OpBranchConditional %144 %148 %147
+ %148 = OpLabel
+ OpBranch %141
+ %147 = OpLabel
+ OpStore %c_0 %uint_0
+ OpBranch %150
+ %150 = OpLabel
+ OpLoopMerge %151 %152 None
+ OpBranch %153
+ %153 = OpLabel
+ %155 = OpLoad %uint %c_0
+ %156 = OpULessThan %bool %155 %uint_4
+ %154 = OpLogicalNot %bool %156
+ OpSelectionMerge %157 None
+ OpBranchConditional %154 %158 %157
+ %158 = OpLabel
+ OpBranch %151
+ %157 = OpLabel
+ %160 = OpLoad %uint %c_0
+ %159 = OpBitcast %int %160
+ %162 = OpLoad %uint %r_0
+ %161 = OpBitcast %int %162
+ %163 = OpCompositeConstruct %v2int %159 %161
+ %164 = OpIAdd %v2int %71 %163
+ OpStore %writeIndex %164
+ %166 = OpAccessChain %_ptr_Uniform_uint %flip %uint_0
+ %167 = OpLoad %uint %166
+ %168 = OpINotEqual %bool %167 %uint_0
+ OpSelectionMerge %169 None
+ OpBranchConditional %168 %170 %169
+ %170 = OpLabel
+ %171 = OpLoad %v2int %writeIndex
+ %172 = OpVectorShuffle %v2int %171 %171 1 0
+ OpStore %writeIndex %172
+ OpBranch %169
+ %169 = OpLabel
+ %173 = OpAccessChain %_ptr_Input_uint %tint_symbol_1 %uint_0
+ %174 = OpLoad %uint %173
+ %175 = OpIMul %uint %uint_4 %174
+ %176 = OpLoad %uint %c_0
+ %177 = OpIAdd %uint %175 %176
+ %178 = OpUGreaterThanEqual %bool %177 %50
+ OpSelectionMerge %179 None
+ OpBranchConditional %178 %180 %179
+ %180 = OpLabel
+ %181 = OpISub %uint %uint_256 %50
+ %182 = OpULessThan %bool %177 %181
+ OpBranch %179
+ %179 = OpLabel
+ %183 = OpPhi %bool %178 %169 %182 %180
+ OpSelectionMerge %184 None
+ OpBranchConditional %183 %185 %184
+ %185 = OpLabel
+ %187 = OpLoad %v2int %writeIndex
+ %188 = OpSLessThan %v2bool %187 %51
+ %186 = OpAll %bool %188
+ OpBranch %184
+ %184 = OpLabel
+ %190 = OpPhi %bool %183 %179 %186 %185
+ OpSelectionMerge %191 None
+ OpBranchConditional %190 %192 %191
+ %192 = OpLabel
+ OpStore %acc %193
+ OpStore %f %uint_0
+ OpBranch %198
+ %198 = OpLabel
+ OpLoopMerge %199 %200 None
+ OpBranch %201
+ %201 = OpLabel
+ %203 = OpLoad %uint %f
+ %204 = OpAccessChain %_ptr_Uniform_uint %params %uint_0
+ %205 = OpLoad %uint %204
+ %206 = OpULessThan %bool %203 %205
+ %202 = OpLogicalNot %bool %206
+ OpSelectionMerge %207 None
+ OpBranchConditional %202 %208 %207
+ %208 = OpLabel
+ OpBranch %199
+ %207 = OpLabel
+ %209 = OpLoad %uint %f
+ %210 = OpIAdd %uint %177 %209
+ %211 = OpISub %uint %210 %50
+ OpStore %i %211
+ %213 = OpLoad %v3float %acc
+ %216 = OpAccessChain %_ptr_Uniform_uint %params %uint_0
+ %217 = OpLoad %uint %216
+ %215 = OpConvertUToF %float %217
+ %218 = OpFDiv %float %float_1 %215
+ %219 = OpLoad %uint %r_0
+ %220 = OpLoad %uint %i
+ %221 = OpAccessChain %_ptr_Workgroup_v3float %tile %219 %220
+ %222 = OpLoad %v3float %221
+ %223 = OpVectorTimesScalar %v3float %222 %218
+ %224 = OpFAdd %v3float %213 %223
+ OpStore %acc %224
+ OpBranch %200
+ %200 = OpLabel
+ %225 = OpLoad %uint %f
+ %226 = OpIAdd %uint %225 %uint_1
+ OpStore %f %226
+ OpBranch %198
+ %199 = OpLabel
+ %228 = OpLoad %14 %outputTex
+ %229 = OpLoad %v2int %writeIndex
+ %230 = OpLoad %v3float %acc
+ %231 = OpCompositeExtract %float %230 0
+ %232 = OpCompositeExtract %float %230 1
+ %233 = OpCompositeExtract %float %230 2
+ %234 = OpCompositeConstruct %v4float %231 %232 %233 %float_1
+ OpImageWrite %228 %229 %234
+ OpBranch %191
+ %191 = OpLabel
+ OpBranch %152
+ %152 = OpLabel
+ %235 = OpLoad %uint %c_0
+ %236 = OpIAdd %uint %235 %uint_1
+ OpStore %c_0 %236
+ OpBranch %150
+ %151 = OpLabel
+ OpBranch %142
+ %142 = OpLabel
+ %237 = OpLoad %uint %r_0
+ %238 = OpIAdd %uint %237 %uint_1
+ OpStore %r_0 %238
+ OpBranch %140
+ %141 = OpLabel
+ OpReturn
+ OpFunctionEnd
diff --git a/test/bug/tint/942.wgsl.expected.wgsl b/test/bug/tint/942.wgsl.expected.wgsl
new file mode 100644
index 0000000..d261497
--- /dev/null
+++ b/test/bug/tint/942.wgsl.expected.wgsl
@@ -0,0 +1,106 @@
+[[block]]
+struct Params {
+ filterDim : u32;
+ blockDim : u32;
+};
+
+[[group(0), binding(0)]] var samp : sampler;
+
+[[group(0), binding(1)]] var<uniform> params : Params;
+
+[[group(1), binding(1)]] var inputTex : texture_2d<f32>;
+
+[[group(1), binding(2)]] var outputTex : texture_storage_2d<rgba8unorm, write>;
+
+[[block]]
+struct Flip {
+ value : u32;
+};
+
+[[group(1), binding(3)]] var<uniform> flip : Flip;
+
+var<workgroup> tile : array<array<vec3<f32>, 256>, 4>;
+
+[[stage(compute), workgroup_size(64, 1, 1)]]
+fn main([[builtin(workgroup_id)]] WorkGroupID : vec3<u32>, [[builtin(local_invocation_id)]] LocalInvocationID : vec3<u32>) {
+ let filterOffset : u32 = ((params.filterDim - 1u) / 2u);
+ let dims : vec2<i32> = textureDimensions(inputTex, 0);
+ let baseIndex = (vec2<i32>(((WorkGroupID.xy * vec2<u32>(params.blockDim, 4u)) + (LocalInvocationID.xy * vec2<u32>(4u, 1u)))) - vec2<i32>(i32(filterOffset), 0));
+ {
+ var r : u32 = 0u;
+ loop {
+ if (!((r < 4u))) {
+ break;
+ }
+ {
+ var c : u32 = 0u;
+ loop {
+ if (!((c < 4u))) {
+ break;
+ }
+ var loadIndex = (baseIndex + vec2<i32>(i32(c), i32(r)));
+ if ((flip.value != 0u)) {
+ loadIndex = loadIndex.yx;
+ }
+ tile[r][((4u * LocalInvocationID.x) + c)] = textureSampleLevel(inputTex, samp, ((vec2<f32>(loadIndex) + vec2<f32>(0.25, 0.25)) / vec2<f32>(dims)), 0.0).rgb;
+
+ continuing {
+ c = (c + 1u);
+ }
+ }
+ }
+
+ continuing {
+ r = (r + 1u);
+ }
+ }
+ }
+ workgroupBarrier();
+ {
+ var r : u32 = 0u;
+ loop {
+ if (!((r < 4u))) {
+ break;
+ }
+ {
+ var c : u32 = 0u;
+ loop {
+ if (!((c < 4u))) {
+ break;
+ }
+ var writeIndex = (baseIndex + vec2<i32>(i32(c), i32(r)));
+ if ((flip.value != 0u)) {
+ writeIndex = writeIndex.yx;
+ }
+ let center : u32 = ((4u * LocalInvocationID.x) + c);
+ if ((((center >= filterOffset) && (center < (256u - filterOffset))) && all((writeIndex < dims)))) {
+ var acc : vec3<f32> = vec3<f32>(0.0, 0.0, 0.0);
+ {
+ var f : u32 = 0u;
+ loop {
+ if (!((f < params.filterDim))) {
+ break;
+ }
+ var i : u32 = ((center + f) - filterOffset);
+ acc = (acc + ((1.0 / f32(params.filterDim)) * tile[r][i]));
+
+ continuing {
+ f = (f + 1u);
+ }
+ }
+ }
+ textureStore(outputTex, writeIndex, vec4<f32>(acc, 1.0));
+ }
+
+ continuing {
+ c = (c + 1u);
+ }
+ }
+ }
+
+ continuing {
+ r = (r + 1u);
+ }
+ }
+ }
+}
diff --git a/test/bug/tint/943.spvasm.expected.hlsl b/test/bug/tint/943.spvasm.expected.hlsl
index a2192d2..47f46cd 100644
--- a/test/bug/tint/943.spvasm.expected.hlsl
+++ b/test/bug/tint/943.spvasm.expected.hlsl
@@ -336,10 +336,15 @@
const uint3 gl_GlobalInvocationID_param = tint_symbol.gl_GlobalInvocationID_param;
const uint local_invocation_index = tint_symbol.local_invocation_index;
if ((local_invocation_index == 0u)) {
- const float tint_symbol_6[64][64] = (float[64][64])0;
- mm_Asub = tint_symbol_6;
- const float tint_symbol_7[64][1] = (float[64][1])0;
- mm_Bsub = tint_symbol_7;
+ for(int i = 0; (i < 64); i = (i + 1)) {
+ for(int i_1 = 0; (i_1 < 64); i_1 = (i_1 + 1)) {
+ mm_Asub[i][i_1] = 0.0f;
+ }
+ }
+ for(int i_2 = 0; (i_2 < 64); i_2 = (i_2 + 1)) {
+ const float tint_symbol_6[1] = (float[1])0;
+ mm_Bsub[i_2] = tint_symbol_6;
+ }
}
GroupMemoryBarrierWithGroupSync();
gl_LocalInvocationID = gl_LocalInvocationID_param;