msl: Handle workgroup matrix allocations

Use a threadgroup memory argument for any workgroup variable that
contains a matrix.

The generator now provides a list of threadgroup memory arguments for
each entry point, so that the runtime knows how many bytes to allocate
for each argument.

Bug: tint:938
Change-Id: Ia4af33cd6a44c4f74258793443eb737c2931f5eb
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/64042
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Commit-Queue: James Price <jrprice@google.com>
diff --git a/src/transform/module_scope_var_to_entry_point_param.cc b/src/transform/module_scope_var_to_entry_point_param.cc
index 3567d69..e865cab 100644
--- a/src/transform/module_scope_var_to_entry_point_param.cc
+++ b/src/transform/module_scope_var_to_entry_point_param.cc
@@ -15,6 +15,7 @@
 #include "src/transform/module_scope_var_to_entry_point_param.h"
 
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -29,6 +30,24 @@
 
 namespace tint {
 namespace transform {
+namespace {
+// Returns `true` if `type` is or contains a matrix type.
+bool ContainsMatrix(const sem::Type* type) {
+  type = type->UnwrapRef();
+  if (type->Is<sem::Matrix>()) {
+    return true;
+  } else if (auto* ary = type->As<sem::Array>()) {
+    return ContainsMatrix(ary->ElemType());
+  } else if (auto* str = type->As<sem::Struct>()) {
+    for (auto* member : str->Members()) {
+      if (ContainsMatrix(member->Type())) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+}  // namespace
 
 ModuleScopeVarToEntryPointParam::ModuleScopeVarToEntryPointParam() = default;
 
@@ -105,6 +124,9 @@
 
       auto* store_type = CreateASTTypeFor(ctx, var->Type()->UnwrapRef());
 
+      // Track whether the new variable is a pointer or not.
+      bool is_pointer = false;
+
       if (is_entry_point) {
         if (store_type->is_handle()) {
           // For a texture or sampler variable, redeclare it as an entry point
@@ -117,17 +139,36 @@
           auto* param = ctx.dst->Param(new_var_symbol, store_type, decos);
           ctx.InsertFront(func_ast->params(), param);
         } else {
-          // For a private or workgroup variable, redeclare it at function
-          // scope. Disable storage class validation on this variable.
-          auto* disable_validation =
-              ctx.dst->ASTNodes().Create<ast::DisableValidationDecoration>(
-                  ctx.dst->ID(), ast::DisabledValidation::kIgnoreStorageClass);
-          auto* constructor = ctx.Clone(var->Declaration()->constructor());
-          auto* local_var = ctx.dst->Var(
-              new_var_symbol, store_type, var->StorageClass(), constructor,
-              ast::DecorationList{disable_validation});
-          ctx.InsertFront(func_ast->body()->statements(),
-                          ctx.dst->Decl(local_var));
+          if (var->StorageClass() == ast::StorageClass::kWorkgroup &&
+              ContainsMatrix(var->Type())) {
+            // Due to a bug in the MSL compiler, we use a threadgroup memory
+            // argument for any workgroup allocation that contains a matrix.
+            // See crbug.com/tint/938.
+            auto* disable_validation =
+                ctx.dst->ASTNodes().Create<ast::DisableValidationDecoration>(
+                    ctx.dst->ID(),
+                    ast::DisabledValidation::kEntryPointParameter);
+            auto* param_type =
+                ctx.dst->ty.pointer(store_type, var->StorageClass());
+            auto* param = ctx.dst->Param(new_var_symbol, param_type,
+                                         {disable_validation});
+            ctx.InsertFront(func_ast->params(), param);
+            is_pointer = true;
+          } else {
+            // For any other private or workgroup variable, redeclare it at
+            // function scope. Disable storage class validation on this
+            // variable.
+            auto* disable_validation =
+                ctx.dst->ASTNodes().Create<ast::DisableValidationDecoration>(
+                    ctx.dst->ID(),
+                    ast::DisabledValidation::kIgnoreStorageClass);
+            auto* constructor = ctx.Clone(var->Declaration()->constructor());
+            auto* local_var = ctx.dst->Var(
+                new_var_symbol, store_type, var->StorageClass(), constructor,
+                ast::DecorationList{disable_validation});
+            ctx.InsertFront(func_ast->body()->statements(),
+                            ctx.dst->Decl(local_var));
+          }
         }
       } else {
         // For a regular function, redeclare the variable as a parameter.
@@ -135,6 +176,7 @@
         auto* param_type = store_type;
         if (!store_type->is_handle()) {
           param_type = ctx.dst->ty.pointer(param_type, var->StorageClass());
+          is_pointer = true;
         }
         ctx.InsertBack(func_ast->params(),
                        ctx.dst->Param(new_var_symbol, param_type));
@@ -145,7 +187,7 @@
       for (auto* user : var->Users()) {
         if (user->Stmt()->Function() == func_ast) {
           ast::Expression* expr = ctx.dst->Expr(new_var_symbol);
-          if (!is_entry_point && !store_type->is_handle()) {
+          if (is_pointer) {
             // If this identifier is used by an address-of operator, just remove
             // the address-of instead of adding a deref, since we already have a
             // pointer.
@@ -172,11 +214,15 @@
       // Add new arguments for any variables that are needed by the callee.
       // For entry points, pass non-handle types as pointers.
       for (auto* target_var : target_sem->ReferencedModuleVariables()) {
+        bool is_handle = target_var->Type()->UnwrapRef()->is_handle();
+        bool is_workgroup_matrix =
+            target_var->StorageClass() == ast::StorageClass::kWorkgroup &&
+            ContainsMatrix(target_var->Type());
         if (target_var->StorageClass() == ast::StorageClass::kPrivate ||
             target_var->StorageClass() == ast::StorageClass::kWorkgroup ||
             target_var->StorageClass() == ast::StorageClass::kUniformConstant) {
           ast::Expression* arg = ctx.dst->Expr(var_to_symbol[target_var]);
-          if (is_entry_point && !target_var->Type()->UnwrapRef()->is_handle()) {
+          if (is_entry_point && !is_handle && !is_workgroup_matrix) {
             arg = ctx.dst->AddressOf(arg);
           }
           ctx.InsertBack(call->params(), arg);
diff --git a/src/transform/module_scope_var_to_entry_point_param_test.cc b/src/transform/module_scope_var_to_entry_point_param_test.cc
index 13119d5..cff9c81 100644
--- a/src/transform/module_scope_var_to_entry_point_param_test.cc
+++ b/src/transform/module_scope_var_to_entry_point_param_test.cc
@@ -329,6 +329,64 @@
   EXPECT_EQ(expect, str(got));
 }
 
+TEST_F(ModuleScopeVarToEntryPointParamTest, Matrix) {
+  auto* src = R"(
+var<workgroup> m : mat2x2<f32>;
+
+[[stage(compute), workgroup_size(1)]]
+fn main() {
+  let x = m;
+}
+)";
+
+  auto* expect = R"(
+[[stage(compute), workgroup_size(1)]]
+fn main([[internal(disable_validation__entry_point_parameter)]] tint_symbol : ptr<workgroup, mat2x2<f32>>) {
+  let x = *(tint_symbol);
+}
+)";
+
+  auto got = Run<ModuleScopeVarToEntryPointParam>(src);
+
+  EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(ModuleScopeVarToEntryPointParamTest, NestedMatrix) {
+  auto* src = R"(
+struct S1 {
+  m : mat2x2<f32>;
+};
+struct S2 {
+  s : S1;
+};
+var<workgroup> m : array<S2, 4>;
+
+[[stage(compute), workgroup_size(1)]]
+fn main() {
+  let x = m;
+}
+)";
+
+  auto* expect = R"(
+struct S1 {
+  m : mat2x2<f32>;
+};
+
+struct S2 {
+  s : S1;
+};
+
+[[stage(compute), workgroup_size(1)]]
+fn main([[internal(disable_validation__entry_point_parameter)]] tint_symbol : ptr<workgroup, array<S2, 4u>>) {
+  let x = *(tint_symbol);
+}
+)";
+
+  auto got = Run<ModuleScopeVarToEntryPointParam>(src);
+
+  EXPECT_EQ(expect, str(got));
+}
+
 TEST_F(ModuleScopeVarToEntryPointParamTest, EmtpyModule) {
   auto* src = "";
 
diff --git a/src/writer/msl/generator.cc b/src/writer/msl/generator.cc
index c898204..cc1aa5c 100644
--- a/src/writer/msl/generator.cc
+++ b/src/writer/msl/generator.cc
@@ -45,6 +45,7 @@
   result.error = impl->error();
   result.msl = impl->result();
   result.has_invariant_attribute = impl->HasInvariant();
+  result.workgroup_allocations = impl->DynamicWorkgroupAllocations();
 
   return result;
 }
diff --git a/src/writer/msl/generator.h b/src/writer/msl/generator.h
index aa8d88e..af812d7 100644
--- a/src/writer/msl/generator.h
+++ b/src/writer/msl/generator.h
@@ -17,6 +17,8 @@
 
 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <vector>
 
 #include "src/writer/text.h"
 
@@ -73,6 +75,11 @@
 
   /// True if the generated shader uses the invariant attribute.
   bool has_invariant_attribute = false;
+
+  /// A map from entry point name to a list of dynamic workgroup allocations.
+  /// Each entry in the vector is the size of the workgroup allocation that
+  /// should be created for that index.
+  std::unordered_map<std::string, std::vector<uint32_t>> workgroup_allocations;
 };
 
 /// Generate MSL for a program, according to a set of configuration options. The
diff --git a/src/writer/msl/generator_impl.cc b/src/writer/msl/generator_impl.cc
index b790c44..10d9620 100644
--- a/src/writer/msl/generator_impl.cc
+++ b/src/writer/msl/generator_impl.cc
@@ -1634,13 +1634,14 @@
 
 bool GeneratorImpl::EmitEntryPointFunction(ast::Function* func) {
   auto* func_sem = program_->Sem().Get(func);
+  auto func_name = program_->Symbols().NameFor(func->symbol());
 
   {
     auto out = line();
 
     EmitStage(out, func->pipeline_stage());
     out << " " << func->return_type()->FriendlyName(program_->Symbols());
-    out << " " << program_->Symbols().NameFor(func->symbol()) << "(";
+    out << " " << func_name << "(";
 
     // Emit entry point parameters.
     bool first = true;
@@ -1652,11 +1653,14 @@
 
       auto* type = program_->Sem().Get(var)->Type()->UnwrapRef();
 
-      if (!EmitType(out, type, "")) {
+      auto param_name = program_->Symbols().NameFor(var->symbol());
+      if (!EmitType(out, type, param_name)) {
         return false;
       }
-
-      out << " " << program_->Symbols().NameFor(var->symbol());
+      // Parameter name is output as part of the type for arrays and pointers.
+      if (!type->Is<sem::Array>() && !type->Is<sem::Pointer>()) {
+        out << " " << param_name;
+      }
 
       if (type->Is<sem::Struct>()) {
         out << " [[stage_in]]";
@@ -1682,6 +1686,16 @@
               << "invalid handle type entry point parameter";
           return false;
         }
+      } else if (auto* ptr = var->type()->As<ast::Pointer>()) {
+        if (ptr->storage_class() == ast::StorageClass::kWorkgroup) {
+          auto& allocations = workgroup_allocations_[func_name];
+          out << " [[threadgroup(" << allocations.size() << ")]]";
+          allocations.push_back(program_->Sem().Get(ptr->type())->Size());
+        } else {
+          TINT_ICE(Writer, diagnostics_)
+              << "invalid pointer storage class for entry point parameter";
+          return false;
+        }
       } else {
         auto& decos = var->decorations();
         bool builtin_found = false;
diff --git a/src/writer/msl/generator_impl.h b/src/writer/msl/generator_impl.h
index b2be21d..ee5b2bf 100644
--- a/src/writer/msl/generator_impl.h
+++ b/src/writer/msl/generator_impl.h
@@ -17,6 +17,7 @@
 
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 #include "src/ast/array_accessor_expression.h"
 #include "src/ast/assignment_statement.h"
@@ -84,6 +85,12 @@
   /// @returns true if an invariant attribute was generated
   bool HasInvariant() { return has_invariant_; }
 
+  /// @returns a map from entry point to list of required workgroup allocations
+  const std::unordered_map<std::string, std::vector<uint32_t>>&
+  DynamicWorkgroupAllocations() const {
+    return workgroup_allocations_;
+  }
+
   /// Handles generating a declared type
   /// @param ty the declared type to generate
   /// @returns true if the declared type was emitted
@@ -378,6 +385,11 @@
   /// True if matrix-packed_vector operator overloads have been generated.
   bool matrix_packed_vector_overloads_ = false;
 
+  /// A map from entry point name to a list of dynamic workgroup allocations.
+  /// Each entry in the vector is the size of the workgroup allocation that
+  /// should be created for that index.
+  std::unordered_map<std::string, std::vector<uint32_t>> workgroup_allocations_;
+
   std::unordered_map<const sem::Intrinsic*, std::string> intrinsics_;
   std::unordered_map<const sem::Type*, std::string> unary_minus_funcs_;
 };
diff --git a/src/writer/msl/generator_impl_test.cc b/src/writer/msl/generator_impl_test.cc
index 5ce939e..af43a53 100644
--- a/src/writer/msl/generator_impl_test.cc
+++ b/src/writer/msl/generator_impl_test.cc
@@ -130,6 +130,243 @@
 )");
 }
 
+TEST_F(MslGeneratorImplTest, WorkgroupMatrix) {
+  Global("m", ty.mat2x2<f32>(), ast::StorageClass::kWorkgroup);
+  Func("comp_main", ast::VariableList{}, ty.void_(),
+       {Decl(Const("x", nullptr, Expr("m")))},
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+
+  GeneratorImpl& gen = SanitizeAndBuild();
+
+  ASSERT_TRUE(gen.Generate()) << gen.error();
+  EXPECT_EQ(gen.result(), R"(#include <metal_stdlib>
+
+using namespace metal;
+void comp_main_inner(uint local_invocation_index, threadgroup float2x2* const tint_symbol) {
+  {
+    *(tint_symbol) = float2x2();
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  float2x2 const x = *(tint_symbol);
+}
+
+kernel void comp_main(threadgroup float2x2* tint_symbol_1 [[threadgroup(0)]], uint local_invocation_index [[thread_index_in_threadgroup]]) {
+  comp_main_inner(local_invocation_index, tint_symbol_1);
+  return;
+}
+
+)");
+
+  auto allocations = gen.DynamicWorkgroupAllocations();
+  ASSERT_TRUE(allocations.count("comp_main"));
+  ASSERT_EQ(allocations["comp_main"].size(), 1u);
+  EXPECT_EQ(allocations["comp_main"][0], 2u * 2u * sizeof(float));
+}
+
+TEST_F(MslGeneratorImplTest, WorkgroupMatrixInArray) {
+  Global("m", ty.array(ty.mat2x2<f32>(), 4), ast::StorageClass::kWorkgroup);
+  Func("comp_main", ast::VariableList{}, ty.void_(),
+       {Decl(Const("x", nullptr, Expr("m")))},
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+
+  GeneratorImpl& gen = SanitizeAndBuild();
+
+  ASSERT_TRUE(gen.Generate()) << gen.error();
+  EXPECT_EQ(gen.result(), R"(#include <metal_stdlib>
+
+using namespace metal;
+struct tint_array_wrapper {
+  float2x2 arr[4];
+};
+
+void comp_main_inner(uint local_invocation_index, threadgroup tint_array_wrapper* const tint_symbol) {
+  for(uint idx = local_invocation_index; (idx < 4u); idx = (idx + 1u)) {
+    uint const i = idx;
+    (*(tint_symbol)).arr[i] = float2x2();
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  tint_array_wrapper const x = *(tint_symbol);
+}
+
+kernel void comp_main(threadgroup tint_array_wrapper* tint_symbol_1 [[threadgroup(0)]], uint local_invocation_index [[thread_index_in_threadgroup]]) {
+  comp_main_inner(local_invocation_index, tint_symbol_1);
+  return;
+}
+
+)");
+
+  auto allocations = gen.DynamicWorkgroupAllocations();
+  ASSERT_TRUE(allocations.count("comp_main"));
+  ASSERT_EQ(allocations["comp_main"].size(), 1u);
+  EXPECT_EQ(allocations["comp_main"][0], 4u * 2u * 2u * sizeof(float));
+}
+
+TEST_F(MslGeneratorImplTest, WorkgroupMatrixInStruct) {
+  Structure("S1", {
+                      Member("m1", ty.mat2x2<f32>()),
+                      Member("m2", ty.mat4x4<f32>()),
+                  });
+  Structure("S2", {
+                      Member("s", ty.type_name("S1")),
+                  });
+  Global("s", ty.type_name("S2"), ast::StorageClass::kWorkgroup);
+  Func("comp_main", ast::VariableList{}, ty.void_(),
+       {Decl(Const("x", nullptr, Expr("s")))},
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+
+  GeneratorImpl& gen = SanitizeAndBuild();
+
+  ASSERT_TRUE(gen.Generate()) << gen.error();
+  EXPECT_EQ(gen.result(), R"(#include <metal_stdlib>
+
+using namespace metal;
+struct S1 {
+  float2x2 m1;
+  float4x4 m2;
+};
+struct S2 {
+  S1 s;
+};
+
+void comp_main_inner(uint local_invocation_index, threadgroup S2* const tint_symbol_1) {
+  {
+    S2 const tint_symbol = {};
+    *(tint_symbol_1) = tint_symbol;
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  S2 const x = *(tint_symbol_1);
+}
+
+kernel void comp_main(threadgroup S2* tint_symbol_2 [[threadgroup(0)]], uint local_invocation_index [[thread_index_in_threadgroup]]) {
+  comp_main_inner(local_invocation_index, tint_symbol_2);
+  return;
+}
+
+)");
+
+  auto allocations = gen.DynamicWorkgroupAllocations();
+  ASSERT_TRUE(allocations.count("comp_main"));
+  ASSERT_EQ(allocations["comp_main"].size(), 1u);
+  EXPECT_EQ(allocations["comp_main"][0],
+            (2 * 2 * sizeof(float)) + (4u * 4u * sizeof(float)));
+}
+
+TEST_F(MslGeneratorImplTest, WorkgroupMatrix_Multiples) {
+  Global("m1", ty.mat2x2<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m2", ty.mat2x3<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m3", ty.mat2x4<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m4", ty.mat3x2<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m5", ty.mat3x3<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m6", ty.mat3x4<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m7", ty.mat4x2<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m8", ty.mat4x3<f32>(), ast::StorageClass::kWorkgroup);
+  Global("m9", ty.mat4x4<f32>(), ast::StorageClass::kWorkgroup);
+  Func("main1", ast::VariableList{}, ty.void_(),
+       {
+           Decl(Const("a1", nullptr, Expr("m1"))),
+           Decl(Const("a2", nullptr, Expr("m2"))),
+           Decl(Const("a3", nullptr, Expr("m3"))),
+       },
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+  Func("main2", ast::VariableList{}, ty.void_(),
+       {
+           Decl(Const("a1", nullptr, Expr("m4"))),
+           Decl(Const("a2", nullptr, Expr("m5"))),
+           Decl(Const("a3", nullptr, Expr("m6"))),
+       },
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+  Func("main3", ast::VariableList{}, ty.void_(),
+       {
+           Decl(Const("a1", nullptr, Expr("m7"))),
+           Decl(Const("a2", nullptr, Expr("m8"))),
+           Decl(Const("a3", nullptr, Expr("m9"))),
+       },
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+  Func("main4_no_usages", ast::VariableList{}, ty.void_(), {},
+       {Stage(ast::PipelineStage::kCompute), WorkgroupSize(1)});
+
+  GeneratorImpl& gen = SanitizeAndBuild();
+
+  ASSERT_TRUE(gen.Generate()) << gen.error();
+  EXPECT_EQ(gen.result(), R"(#include <metal_stdlib>
+
+using namespace metal;
+void main1_inner(uint local_invocation_index, threadgroup float2x2* const tint_symbol, threadgroup float2x3* const tint_symbol_1, threadgroup float2x4* const tint_symbol_2) {
+  {
+    *(tint_symbol) = float2x2();
+    *(tint_symbol_1) = float2x3();
+    *(tint_symbol_2) = float2x4();
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  float2x2 const a1 = *(tint_symbol);
+  float2x3 const a2 = *(tint_symbol_1);
+  float2x4 const a3 = *(tint_symbol_2);
+}
+
+kernel void main1(threadgroup float2x2* tint_symbol_3 [[threadgroup(0)]], threadgroup float2x3* tint_symbol_4 [[threadgroup(1)]], threadgroup float2x4* tint_symbol_5 [[threadgroup(2)]], uint local_invocation_index [[thread_index_in_threadgroup]]) {
+  main1_inner(local_invocation_index, tint_symbol_3, tint_symbol_4, tint_symbol_5);
+  return;
+}
+
+void main2_inner(uint local_invocation_index_1, threadgroup float3x2* const tint_symbol_6, threadgroup float3x3* const tint_symbol_7, threadgroup float3x4* const tint_symbol_8) {
+  {
+    *(tint_symbol_6) = float3x2();
+    *(tint_symbol_7) = float3x3();
+    *(tint_symbol_8) = float3x4();
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  float3x2 const a1 = *(tint_symbol_6);
+  float3x3 const a2 = *(tint_symbol_7);
+  float3x4 const a3 = *(tint_symbol_8);
+}
+
+kernel void main2(threadgroup float3x2* tint_symbol_9 [[threadgroup(0)]], threadgroup float3x3* tint_symbol_10 [[threadgroup(1)]], threadgroup float3x4* tint_symbol_11 [[threadgroup(2)]], uint local_invocation_index_1 [[thread_index_in_threadgroup]]) {
+  main2_inner(local_invocation_index_1, tint_symbol_9, tint_symbol_10, tint_symbol_11);
+  return;
+}
+
+void main3_inner(uint local_invocation_index_2, threadgroup float4x2* const tint_symbol_12, threadgroup float4x3* const tint_symbol_13, threadgroup float4x4* const tint_symbol_14) {
+  {
+    *(tint_symbol_12) = float4x2();
+    *(tint_symbol_13) = float4x3();
+    *(tint_symbol_14) = float4x4();
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  float4x2 const a1 = *(tint_symbol_12);
+  float4x3 const a2 = *(tint_symbol_13);
+  float4x4 const a3 = *(tint_symbol_14);
+}
+
+kernel void main3(threadgroup float4x2* tint_symbol_15 [[threadgroup(0)]], threadgroup float4x3* tint_symbol_16 [[threadgroup(1)]], threadgroup float4x4* tint_symbol_17 [[threadgroup(2)]], uint local_invocation_index_2 [[thread_index_in_threadgroup]]) {
+  main3_inner(local_invocation_index_2, tint_symbol_15, tint_symbol_16, tint_symbol_17);
+  return;
+}
+
+kernel void main4_no_usages() {
+  return;
+}
+
+)");
+
+  auto allocations = gen.DynamicWorkgroupAllocations();
+  ASSERT_TRUE(allocations.count("main1"));
+  ASSERT_TRUE(allocations.count("main2"));
+  ASSERT_TRUE(allocations.count("main3"));
+  EXPECT_EQ(allocations.count("main4_no_usages"), 0u);
+  ASSERT_EQ(allocations["main1"].size(), 3u);
+  EXPECT_EQ(allocations["main1"][0], 2u * 2u * sizeof(float));
+  EXPECT_EQ(allocations["main1"][1], 2u * 4u * sizeof(float));
+  EXPECT_EQ(allocations["main1"][2], 2u * 4u * sizeof(float));
+  ASSERT_EQ(allocations["main2"].size(), 3u);
+  EXPECT_EQ(allocations["main2"][0], 3u * 2u * sizeof(float));
+  EXPECT_EQ(allocations["main2"][1], 3u * 4u * sizeof(float));
+  EXPECT_EQ(allocations["main2"][2], 3u * 4u * sizeof(float));
+  ASSERT_EQ(allocations["main3"].size(), 3u);
+  EXPECT_EQ(allocations["main3"][0], 4u * 2u * sizeof(float));
+  EXPECT_EQ(allocations["main3"][1], 4u * 4u * sizeof(float));
+  EXPECT_EQ(allocations["main3"][2], 4u * 4u * sizeof(float));
+}
+
 }  // namespace
 }  // namespace msl
 }  // namespace writer
diff --git a/test/var/initialization/workgroup/matrix.wgsl.expected.msl b/test/var/initialization/workgroup/matrix.wgsl.expected.msl
index 52f49c4..54bf7bb 100644
--- a/test/var/initialization/workgroup/matrix.wgsl.expected.msl
+++ b/test/var/initialization/workgroup/matrix.wgsl.expected.msl
@@ -1,192 +1,16 @@
-SKIP: crbug.com/tint/938
+#include <metal_stdlib>
 
+using namespace metal;
+void tint_symbol_inner(uint local_invocation_index, threadgroup float2x3* const tint_symbol_1) {
+  {
+    *(tint_symbol_1) = float2x3();
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  (void) *(tint_symbol_1);
+}
 
+kernel void tint_symbol(threadgroup float2x3* tint_symbol_2 [[threadgroup(0)]], uint local_invocation_index [[thread_index_in_threadgroup]]) {
+  tint_symbol_inner(local_invocation_index, tint_symbol_2);
+  return;
+}
 
-Validation Failure:
-
-Compilation failed:
-
-program_source:5:24: error: no matching constructor for initialization of 'threadgroup metal::float2x3' (aka 'threadgroup matrix<float, 2, 3>')
-  threadgroup float2x3 tint_symbol_2;
-                       ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:269:23: note: candidate template ignored: requirement 'sizeof...(U) == 2' was not satisfied [with U = <>]
-  METAL_FUNC explicit matrix(initializer_list<U>... cols) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:281:23: note: candidate template ignored: requirement '(sizeof...(U) == 2) || (sizeof...(U) == 2 * 3)' was not satisfied [with U = <>]
-  METAL_FUNC explicit matrix(U... vals) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:299:23: note: candidate template ignored: requirement 'sizeof...(U) == 2' was not satisfied [with U = <>]
-  METAL_FUNC explicit matrix(initializer_list<U>... cols) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:311:23: note: candidate template ignored: requirement '(sizeof...(U) == 2) || (sizeof...(U) == 2 * 3)' was not satisfied [with U = <>]
-  METAL_FUNC explicit matrix(U... vals) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:330:23: note: candidate template ignored: requirement 'sizeof...(U) == 2' was not satisfied [with U = <>]
-  METAL_FUNC explicit matrix(initializer_list<U>... cols) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:342:23: note: candidate template ignored: requirement '(sizeof...(U) == 2) || (sizeof...(U) == 2 * 3)' was not satisfied [with U = <>]
-  METAL_FUNC explicit matrix(U... vals) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:56:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(T val, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:62:23: note: candidate constructor template not viable: requires 3 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(cols_init_tag, initializer_list<vec<T, Rows>> cols, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:68:23: note: candidate constructor template not viable: requires at least 1 argument, but 0 were provided
-  METAL_FUNC explicit matrix(cols_all_tag, U... cols) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:75:23: note: candidate constructor template not viable: requires at least 1 argument, but 0 were provided
-  METAL_FUNC explicit matrix(elems_all_tag, U... elems) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:80:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(initializer_list<T> elems, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:86:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(T val, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:92:23: note: candidate constructor template not viable: requires 3 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(cols_init_tag, initializer_list<vec<T, Rows>> cols, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:98:23: note: candidate constructor template not viable: requires at least 1 argument, but 0 were provided
-  METAL_FUNC explicit matrix(cols_all_tag, U... cols) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:105:23: note: candidate constructor template not viable: requires at least 1 argument, but 0 were provided
-  METAL_FUNC explicit matrix(elems_all_tag, U... elems) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:110:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(initializer_list<T> elems, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:117:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(T val, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:123:23: note: candidate constructor template not viable: requires 3 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(cols_init_tag, initializer_list<vec<T, Rows>> cols, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:129:23: note: candidate constructor template not viable: requires at least 1 argument, but 0 were provided
-  METAL_FUNC explicit matrix(cols_all_tag, U... cols) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:136:23: note: candidate constructor template not viable: requires at least 1 argument, but 0 were provided
-  METAL_FUNC explicit matrix(elems_all_tag, U... elems) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:141:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(initializer_list<T> elems, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:149:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const thread matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:154:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const device matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:159:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const constant matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:164:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const threadgroup matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:170:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const threadgroup_imageblock matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:177:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const ray_data matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:183:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const thread matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:188:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const device matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:193:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const constant matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:198:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const threadgroup matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:204:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const threadgroup_imageblock matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:211:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const ray_data matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:218:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const thread matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:225:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const device matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:232:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const constant matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:239:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const threadgroup matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:247:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const threadgroup_imageblock matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:255:23: note: candidate constructor template not viable: requires 2 arguments, but 0 were provided
-  METAL_FUNC explicit matrix(const ray_data matrix<U, Cols, Rows> &that, _integer_sequence<int, C...>) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:358:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const thread matrix<U, Cols, Rows> &that) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:365:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const device matrix<U, Cols, Rows> &that) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:372:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const constant matrix<U, Cols, Rows> &that) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:379:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const threadgroup matrix<U, Cols, Rows> &that) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:387:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const threadgroup_imageblock matrix<U, Cols, Rows> &that) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:396:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const ray_data matrix<U, Cols, Rows> &that) thread
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:404:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const thread matrix<U, Cols, Rows> &that) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:411:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const device matrix<U, Cols, Rows> &that) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:418:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const constant matrix<U, Cols, Rows> &that) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:425:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const threadgroup matrix<U, Cols, Rows> &that) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:433:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const threadgroup_imageblock matrix<U, Cols, Rows> &that) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:442:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const ray_data matrix<U, Cols, Rows> &that) constant
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:451:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const thread matrix<U, Cols, Rows> &that) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:460:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const device matrix<U, Cols, Rows> &that) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:469:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const constant matrix<U, Cols, Rows> &that) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:478:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const threadgroup matrix<U, Cols, Rows> &that) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:488:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const threadgroup_imageblock matrix<U, Cols, Rows> &that) ray_data
-                      ^
-/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/31001/Libraries/lib/clang/31001.189/include/metal/metal_matrix:498:23: note: candidate constructor template not viable: requires single argument 'that', but no arguments were provided
-  METAL_FUNC explicit matrix(const ray_data matrix<U, Cols, Rows> &that) ray_data
-                      ^
-program_source:6:31: warning: equality comparison with extraneous parentheses
-  if ((local_invocation_index == 0u)) {
-       ~~~~~~~~~~~~~~~~~~~~~~~^~~~~
-program_source:6:31: note: remove extraneous parentheses around the comparison to silence this warning
-  if ((local_invocation_index == 0u)) {
-      ~                       ^    ~
-program_source:6:31: note: use '=' to turn this equality comparison into an assignment
-  if ((local_invocation_index == 0u)) {
-                              ^~
-                              =