Import Tint changes from Dawn

Changes:
  - 90b29e500a3ee0310c4f0df18eda68a2803fb2ac tint/transform/std140: Correctly handle nested / bare mat... by Ben Clayton <bclayton@google.com>
  - 84b43d61fa37b216e99f290e636d6dae7520c522 tint/writer: Handle unnested, uniform matCx2 matrices by Ben Clayton <bclayton@google.com>
  - a3f2bf6c60465c24e4ba706142cd47b7b357cfcb spirv-reader: phis as a particular case of hoisting to a ... by David Neto <dneto@google.com>
  - ebc5bba6718e5c1193fd116dac537b69dd1ec35b tint: const eval of binary XOR by Antonio Maiorano <amaiorano@google.com>
GitOrigin-RevId: 90b29e500a3ee0310c4f0df18eda68a2803fb2ac
Change-Id: I2247f4366934bb709291f09fade1c0f2888f50a5
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/102700
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
diff --git a/src/tint/intrinsics.def b/src/tint/intrinsics.def
index e32a0f4..23e0dc8 100644
--- a/src/tint/intrinsics.def
+++ b/src/tint/intrinsics.def
@@ -920,8 +920,8 @@
 op % <T: fiu32_f16, N: num> (vec<N, T>, T) -> vec<N, T>
 op % <T: fiu32_f16, N: num> (T, vec<N, T>) -> vec<N, T>
 
-op ^ <T: iu32>(T, T) -> T
-op ^ <T: iu32, N: num> (vec<N, T>, vec<N, T>) -> vec<N, T>
+@const op ^ <T: ia_iu32>(T, T) -> T
+@const op ^ <T: ia_iu32, N: num> (vec<N, T>, vec<N, T>) -> vec<N, T>
 
 @const op & (bool, bool) -> bool
 @const op & <N: num> (vec<N, bool>, vec<N, bool>) -> vec<N, bool>
diff --git a/src/tint/reader/spirv/function.cc b/src/tint/reader/spirv/function.cc
index b16d08d..59a98e1 100644
--- a/src/tint/reader/spirv/function.cc
+++ b/src/tint/reader/spirv/function.cc
@@ -37,6 +37,8 @@
 #include "src/tint/sem/depth_texture.h"
 #include "src/tint/sem/sampled_texture.h"
 #include "src/tint/transform/spirv_atomic.h"
+#include "src/tint/utils/hashmap.h"
+#include "src/tint/utils/hashset.h"
 
 // Terms:
 //    CFG: the control flow graph of the function, where basic blocks are the
@@ -3356,16 +3358,6 @@
         auto* type = ty_.Reference(storage_type, ast::StorageClass::kNone);
         identifier_types_.emplace(id, type);
     }
-    // Emit declarations of phi state variables, in index order.
-    for (auto id : sorted_by_index(block_info.phis_needing_state_vars)) {
-        const auto* def_inst = def_use_mgr_->GetDef(id);
-        TINT_ASSERT(Reader, def_inst);
-        const auto phi_var_name = GetDefInfo(id)->phi_var;
-        TINT_ASSERT(Reader, !phi_var_name.empty());
-        auto* var = builder_.Var(phi_var_name,
-                                 parser_impl_.ConvertType(def_inst->type_id())->Build(builder_));
-        AddStatement(create<ast::VariableDeclStatement>(Source{}, var));
-    }
 
     // Emit regular statements.
     const spvtools::opt::BasicBlock& bb = *(block_info.basic_block);
@@ -3384,22 +3376,55 @@
     // Emit assignments to carry values to phi nodes in potential destinations.
     // Do it in index order.
     if (!block_info.phi_assignments.IsEmpty()) {
-        auto sorted = block_info.phi_assignments;
+        // Keep only the phis that are used.
+        utils::Vector<BlockInfo::PhiAssignment, 4> worklist;
+        worklist.Reserve(block_info.phi_assignments.Length());
+        for (const auto assignment : block_info.phi_assignments) {
+            if (GetDefInfo(assignment.phi_id)->num_uses > 0) {
+                worklist.Push(assignment);
+            }
+        }
+        // Sort them.
         std::stable_sort(
-            sorted.begin(), sorted.end(),
+            worklist.begin(), worklist.end(),
             [this](const BlockInfo::PhiAssignment& lhs, const BlockInfo::PhiAssignment& rhs) {
                 return GetDefInfo(lhs.phi_id)->index < GetDefInfo(rhs.phi_id)->index;
             });
-        for (auto assignment : block_info.phi_assignments) {
-            const auto var_name = GetDefInfo(assignment.phi_id)->phi_var;
-            auto expr = MakeExpression(assignment.value);
-            if (!expr) {
-                return false;
+
+        // Generate assignments to the phi variables being fed by this
+        // block.  It must act as a parallel assignment. So first capture the
+        // current value of any phi that will be overwritten, then generate
+        // the assignments.
+
+        // The set of IDs that are read by the assignments.
+        utils::Hashset<uint32_t, 8> read_set;
+        for (const auto assignment : worklist) {
+            read_set.Add(assignment.value_id);
+        }
+        // Generate a let-declaration to capture the current value of each phi
+        // that will be both read and written.
+        utils::Hashmap<uint32_t, Symbol, 8> copied_phis;
+        for (const auto assignment : worklist) {
+            const auto phi_id = assignment.phi_id;
+            if (read_set.Find(phi_id)) {
+                auto copy_name = namer_.MakeDerivedName(namer_.Name(phi_id) + "_c" +
+                                                        std::to_string(block_info.id));
+                auto copy_sym = builder_.Symbols().Register(copy_name);
+                copied_phis.GetOrCreate(phi_id, [copy_sym]() { return copy_sym; });
+                AddStatement(builder_.WrapInStatement(
+                    builder_.Let(copy_sym, builder_.Expr(namer_.Name(phi_id)))));
             }
-            AddStatement(create<ast::AssignmentStatement>(
-                Source{},
-                create<ast::IdentifierExpression>(Source{}, builder_.Symbols().Register(var_name)),
-                expr.expr));
+        }
+
+        // Generate assignments to the phi vars.
+        for (const auto assignment : worklist) {
+            const auto phi_id = assignment.phi_id;
+            auto* const lhs_expr = builder_.Expr(namer_.Name(phi_id));
+            // If RHS value is actually a phi we just captured, then use it.
+            auto* const copy_sym = copied_phis.Find(assignment.value_id);
+            auto* const rhs_expr =
+                copy_sym ? builder_.Expr(*copy_sym) : MakeExpression(assignment.value_id).expr;
+            AddStatement(builder_.Assign(lhs_expr, rhs_expr));
         }
     }
 
@@ -3692,11 +3717,8 @@
         }
 
         case SpvOpPhi: {
-            // Emit a read from the associated state variable.
-            TypedExpression expr{parser_impl_.ConvertType(inst.type_id()),
-                                 create<ast::IdentifierExpression>(
-                                     Source{}, builder_.Symbols().Register(def_info->phi_var))};
-            return EmitConstDefOrWriteToHoistedVar(inst, expr);
+            // The value will be in scope, available for reading from the phi ID.
+            return true;
         }
 
         case SpvOpOuterProduct:
@@ -4884,60 +4906,80 @@
         }
     }
 
-    // Scan uses of locally defined IDs, in function block order.
+    // Scan uses of locally defined IDs, finding their first and last uses, in
+    // block order.
+
+    // Updates the span of block positions that this value is used in.
+    // Ignores values defined outside this function.
+    auto record_value_use = [this](uint32_t id, const BlockInfo* block_info) {
+        if (auto* def_info = GetDefInfo(id)) {
+            // Update usage count.
+            def_info->num_uses++;
+            // Update usage span.
+            def_info->first_use_pos = std::min(def_info->first_use_pos, block_info->pos);
+            def_info->last_use_pos = std::max(def_info->last_use_pos, block_info->pos);
+
+            // Determine whether this ID is defined in a different construct
+            // from this use.
+            const auto defining_block = block_order_[def_info->block_pos];
+            const auto* def_in_construct = GetBlockInfo(defining_block)->construct;
+            if (def_in_construct != block_info->construct) {
+                def_info->used_in_another_construct = true;
+            }
+        }
+    };
     for (auto block_id : block_order_) {
         const auto* block_info = GetBlockInfo(block_id);
-        const auto block_pos = block_info->pos;
         for (const auto& inst : *(block_info->basic_block)) {
             // Update bookkeeping for locally-defined IDs used by this instruction.
-            inst.ForEachInId([this, block_pos, block_info](const uint32_t* id_ptr) {
-                auto* def_info = GetDefInfo(*id_ptr);
-                if (def_info) {
-                    // Update usage count.
-                    def_info->num_uses++;
-                    // Update usage span.
-                    def_info->last_use_pos = std::max(def_info->last_use_pos, block_pos);
-
-                    // Determine whether this ID is defined in a different construct
-                    // from this use.
-                    const auto defining_block = block_order_[def_info->block_pos];
-                    const auto* def_in_construct = GetBlockInfo(defining_block)->construct;
-                    if (def_in_construct != block_info->construct) {
-                        def_info->used_in_another_construct = true;
-                    }
-                }
-            });
-
             if (inst.opcode() == SpvOpPhi) {
-                // Declare a name for the variable used to carry values to a phi.
+                // For an OpPhi defining value P, an incoming value V from parent block B is
+                // counted as being "used" at block B, not at the block containing the Phi.
+                // That's because we will create a variable PHI_P to hold the phi value, and
+                // in the code generated for block B, create assignment `PHI_P = V`.
+                // To make the WGSL scopes work, both P and V are counted as being "used"
+                // in the parent block B.
+
                 const auto phi_id = inst.result_id();
                 auto* phi_def_info = GetDefInfo(phi_id);
-                phi_def_info->phi_var = namer_.MakeDerivedName(namer_.Name(phi_id) + "_phi");
+                phi_def_info->is_phi = true;
+
                 // Track all the places where we need to mention the variable,
                 // so we can place its declaration.  First, record the location of
                 // the read from the variable.
-                uint32_t first_pos = block_pos;
-                uint32_t last_pos = block_pos;
                 // Record the assignments that will propagate values from predecessor
                 // blocks.
                 for (uint32_t i = 0; i + 1 < inst.NumInOperands(); i += 2) {
-                    const uint32_t value_id = inst.GetSingleWordInOperand(i);
+                    const uint32_t incoming_value_id = inst.GetSingleWordInOperand(i);
                     const uint32_t pred_block_id = inst.GetSingleWordInOperand(i + 1);
                     auto* pred_block_info = GetBlockInfo(pred_block_id);
                     // The predecessor might not be in the block order at all, so we
                     // need this guard.
                     if (IsInBlockOrder(pred_block_info)) {
+                        // Track where the incoming value needs to be in scope.
+                        record_value_use(incoming_value_id, block_info);
+
+                        // Track where P needs to be in scope.  It's not an ordinary use, so don't
+                        // count it as one.
+                        const auto pred_pos = pred_block_info->pos;
+                        phi_def_info->first_use_pos =
+                            std::min(phi_def_info->first_use_pos, pred_pos);
+                        phi_def_info->last_use_pos = std::max(phi_def_info->last_use_pos, pred_pos);
+
                         // Record the assignment that needs to occur at the end
                         // of the predecessor block.
-                        pred_block_info->phi_assignments.Push({phi_id, value_id});
-                        first_pos = std::min(first_pos, pred_block_info->pos);
-                        last_pos = std::max(last_pos, pred_block_info->pos);
+                        pred_block_info->phi_assignments.Push({phi_id, incoming_value_id});
                     }
                 }
 
                 // Schedule the declaration of the state variable.
-                const auto* enclosing_construct = GetEnclosingScope(first_pos, last_pos);
+                const auto* enclosing_construct =
+                    GetEnclosingScope(phi_def_info->first_use_pos, phi_def_info->last_use_pos);
                 GetBlockInfo(enclosing_construct->begin_id)->phis_needing_state_vars.Push(phi_id);
+            } else {
+                inst.ForEachInId([block_info, &record_value_use](const uint32_t* id_ptr) {
+                    record_value_use(*id_ptr, block_info);
+                });
             }
         }
     }
@@ -4967,41 +5009,47 @@
             continue;
         }
 
-        // The first use must be the at the SSA definition, because block order
-        // respects dominance.
-        const auto first_pos = def_info->block_pos;
-        const auto last_use_pos = def_info->last_use_pos;
-
-        const auto* def_in_construct = GetBlockInfo(block_order_[first_pos])->construct;
+        const auto* def_in_construct = GetBlockInfo(block_order_[def_info->block_pos])->construct;
         // A definition in the first block of an kIfSelection or kSwitchSelection
         // occurs before the branch, and so that definition should count as
         // having been defined at the scope of the parent construct.
-        if (first_pos == def_in_construct->begin_pos) {
+        if (def_info->block_pos == def_in_construct->begin_pos) {
             if ((def_in_construct->kind == Construct::kIfSelection) ||
                 (def_in_construct->kind == Construct::kSwitchSelection)) {
                 def_in_construct = def_in_construct->parent;
             }
         }
 
-        bool should_hoist = false;
-        if (!def_in_construct->ContainsPos(last_use_pos)) {
+        // We care about the earliest between the place of definition, and the first
+        // use of the value.
+        const auto first_pos = std::min(def_info->block_pos, def_info->first_use_pos);
+        const auto last_use_pos = def_info->last_use_pos;
+
+        bool should_hoist_to_let = false;
+        bool should_hoist_to_var = false;
+        if (def_info->is_phi) {
+            // We need to generate a variable, and assignments to that variable in
+            // all the phi parent blocks.
+            should_hoist_to_var = true;
+        } else if (!def_in_construct->ContainsPos(first_pos) ||
+                   !def_in_construct->ContainsPos(last_use_pos)) {
             // To satisfy scoping, we have to hoist the definition out to an enclosing
             // construct.
-            should_hoist = true;
+            should_hoist_to_var = true;
         } else {
             // Avoid moving combinatorial values across constructs.  This is a
             // simple heuristic to avoid changing the cost of an operation
             // by moving it into or out of a loop, for example.
             if ((def_info->storage_class == ast::StorageClass::kInvalid) &&
                 def_info->used_in_another_construct) {
-                should_hoist = true;
+                should_hoist_to_let = true;
             }
         }
 
-        if (should_hoist) {
+        if (should_hoist_to_var || should_hoist_to_let) {
             const auto* enclosing_construct = GetEnclosingScope(first_pos, last_use_pos);
-            if (enclosing_construct == def_in_construct) {
-                // We can use a plain 'const' definition.
+            if (should_hoist_to_let && (enclosing_construct == def_in_construct)) {
+                // We can use a plain 'let' declaration.
                 def_info->requires_named_const_def = true;
             } else {
                 // We need to make a hoisted variable definition.
diff --git a/src/tint/reader/spirv/function.h b/src/tint/reader/spirv/function.h
index d779c93..ff7336e 100644
--- a/src/tint/reader/spirv/function.h
+++ b/src/tint/reader/spirv/function.h
@@ -15,6 +15,7 @@
 #ifndef SRC_TINT_READER_SPIRV_FUNCTION_H_
 #define SRC_TINT_READER_SPIRV_FUNCTION_H_
 
+#include <limits>
 #include <memory>
 #include <optional>
 #include <string>
@@ -166,8 +167,8 @@
     struct PhiAssignment {
         /// The ID of an OpPhi receiving a value from this basic block.
         uint32_t phi_id;
-        /// The the value carried to the given OpPhi.
-        uint32_t value;
+        /// The ID of the value carried to the given OpPhi.
+        uint32_t value_id;
     };
     /// If this basic block branches to a visited basic block containing phis,
     /// then this is the list of writes to the variables associated those phis.
@@ -258,10 +259,9 @@
     /// True if the definition of this ID is inside the function.
     const bool locally_defined = true;
 
-    /// The position of the first block in which this ID is visible, in function
-    /// block order.  For IDs defined outside of the function, it is 0.
-    /// For IDs defined in the function, it is the position of the block
-    /// containing the definition of the ID.
+    /// For IDs defined in the function, this is the position of the block
+    /// containing the definition of the ID, in function block order.
+    /// For IDs defined outside of the function, it is 0.
     /// See method `FunctionEmitter::ComputeBlockOrderAndPositions`
     const uint32_t block_pos = 0;
 
@@ -272,8 +272,17 @@
     /// The number of uses of this ID.
     uint32_t num_uses = 0;
 
+    /// The block position of the first use of this ID, or UINT32_MAX if it is not
+    /// used at all.  The "first" ordering is determined by the function block
+    /// order.  The first use of an ID might be in an OpPhi that precedes the
+    /// definition of the ID.
+    /// The ID defined by an OpPhi is counted as being "used" in each of its
+    /// parent blocks.
+    uint32_t first_use_pos = std::numeric_limits<uint32_t>::max();
     /// The block position of the last use of this ID, or 0 if it is not used
     /// at all.  The "last" ordering is determined by the function block order.
+    /// The ID defined by an OpPhi is counted as being "used" in each of its
+    /// parent blocks.
     uint32_t last_use_pos = 0;
 
     /// Is this value used in a construct other than the one in which it was
@@ -288,8 +297,8 @@
     /// corresponding position of the ID definition in SPIR-V.  This compensates
     /// for the difference between dominance and scoping. An SSA definition can
     /// dominate all its uses, but the construct where it is defined does not
-    /// enclose all the uses, and so if it were declared as a WGSL constant
-    /// definition at the point of its SPIR-V definition, then the WGSL name
+    /// enclose all the uses, and so if it were declared as a WGSL let-
+    /// declaration at the point of its SPIR-V definition, then the WGSL name
     /// would go out of scope too early. Fix that by creating a variable at the
     /// top of the smallest construct that encloses both the definition and all
     /// its uses. Then the original SPIR-V definition maps to a WGSL assignment
@@ -299,10 +308,8 @@
     /// example, pointers. crbug.com/tint/98
     bool requires_hoisted_def = false;
 
-    /// If the definition is an OpPhi, then `phi_var` is the name of the
-    /// variable that stores the value carried from parent basic blocks into
-    /// the basic block containing the OpPhi. Otherwise this is the empty string.
-    std::string phi_var;
+    /// Is this ID an OpPhi?
+    bool is_phi = false;
 
     /// The storage class to use for this value, if it is of pointer type.
     /// This is required to carry a storage class override from a storage
@@ -332,11 +339,11 @@
       << " inst.result_id: " << di.inst.result_id()
       << " locally_defined: " << (di.locally_defined ? "true" : "false")
       << " block_pos: " << di.block_pos << " num_uses: " << di.num_uses
-      << " last_use_pos: " << di.last_use_pos
+      << " first_use_pos: " << di.first_use_pos << " last_use_pos: " << di.last_use_pos
       << " used_in_another_construct: " << (di.used_in_another_construct ? "true" : "false")
       << " requires_named_const_def: " << (di.requires_named_const_def ? "true" : "false")
-      << " requires_hoisted_def: " << (di.requires_hoisted_def ? "true" : "false") << " phi_var: '"
-      << di.phi_var << "'";
+      << " requires_hoisted_def: " << (di.requires_hoisted_def ? "true" : "false")
+      << " is_phi: " << (di.is_phi ? "true" : "false") << "";
     if (di.storage_class != ast::StorageClass::kNone) {
         o << " sc:" << int(di.storage_class);
     }
@@ -603,7 +610,7 @@
     /// @returns an possibly updated type
     const Type* RemapStorageClass(const Type* type, uint32_t result_id);
 
-    /// Marks locally defined values when they should get a 'const'
+    /// Marks locally defined values when they should get a 'let'
     /// definition in WGSL, or a 'var' definition at an outer scope.
     /// This occurs in several cases:
     ///  - When a SPIR-V instruction might use the dynamically computed value
diff --git a/src/tint/reader/spirv/function_var_test.cc b/src/tint/reader/spirv/function_var_test.cc
index ff28299..5d156eb 100644
--- a/src/tint/reader/spirv/function_var_test.cc
+++ b/src/tint/reader/spirv/function_var_test.cc
@@ -922,6 +922,52 @@
     EXPECT_EQ(expect, got);
 }
 
+TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_SimultaneousAssignment) {
+    // Phis must act as if they are simultaneously assigned.
+    // %101 and %102 should exchange values on each iteration, and never have
+    // the same value.
+    auto assembly = Preamble() + R"(
+%100 = OpFunction %void None %voidfn
+
+%10 = OpLabel
+OpBranch %20
+
+%20 = OpLabel
+%101 = OpPhi %bool %true %10 %102 %20
+%102 = OpPhi %bool %false %10 %101 %20
+OpLoopMerge %99 %20 None
+OpBranchConditional %true %99 %20
+
+%99 = OpLabel
+OpReturn
+
+OpFunctionEnd
+  )";
+    auto p = parser(test::Assemble(assembly));
+    ASSERT_TRUE(p->BuildAndParseInternalModuleExceptFunctions()) << assembly;
+    auto fe = p->function_emitter(100);
+    EXPECT_TRUE(fe.EmitBody()) << p->error();
+
+    auto ast_body = fe.ast_body();
+    auto got = test::ToString(p->program(), ast_body);
+    auto* expect = R"(var x_101 : bool;
+var x_102 : bool;
+x_101 = true;
+x_102 = false;
+loop {
+  let x_101_c20 = x_101;
+  let x_102_c20 = x_102;
+  x_101 = x_102_c20;
+  x_102 = x_101_c20;
+  if (true) {
+    break;
+  }
+}
+return;
+)";
+    EXPECT_EQ(expect, got);
+}
+
 TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_SingleBlockLoopIndex) {
     auto assembly = Preamble() + R"(
      %pty = OpTypePointer Private %uint
@@ -969,20 +1015,19 @@
     auto ast_body = fe.ast_body();
     auto got = test::ToString(p->program(), ast_body);
     auto* expect = R"(loop {
-  var x_2_phi : u32;
-  var x_3_phi : u32;
+  var x_2 : u32;
+  var x_3 : u32;
   let x_101 : bool = x_7;
   let x_102 : bool = x_8;
-  x_2_phi = 0u;
-  x_3_phi = 1u;
+  x_2 = 0u;
+  x_3 = 1u;
   if (x_101) {
     break;
   }
   loop {
-    let x_2 : u32 = x_2_phi;
-    let x_3 : u32 = x_3_phi;
-    x_2_phi = (x_2 + 1u);
-    x_3_phi = x_3;
+    let x_3_c20 = x_3;
+    x_2 = (x_2 + 1u);
+    x_3 = x_3_c20;
     if (x_102) {
       break;
     }
@@ -1043,27 +1088,26 @@
     auto ast_body = fe.ast_body();
     auto got = test::ToString(p->program(), ast_body);
     auto* expect = R"(loop {
-  var x_2_phi : u32;
-  var x_3_phi : u32;
+  var x_2 : u32;
+  var x_3 : u32;
   let x_101 : bool = x_7;
   let x_102 : bool = x_8;
-  x_2_phi = 0u;
-  x_3_phi = 1u;
+  x_2 = 0u;
+  x_3 = 1u;
   if (x_101) {
     break;
   }
   loop {
     var x_4 : u32;
-    let x_2 : u32 = x_2_phi;
-    let x_3 : u32 = x_3_phi;
     if (x_102) {
       break;
     }
 
     continuing {
       x_4 = (x_2 + 1u);
-      x_2_phi = x_4;
-      x_3_phi = x_3;
+      let x_3_c30 = x_3;
+      x_2 = x_4;
+      x_3 = x_3_c30;
     }
   }
 }
@@ -1101,6 +1145,7 @@
 
      %30 = OpLabel
      %7 = OpIAdd %uint %4 %6 ; use %4 again
+     %8 = OpCopyObject %uint %5 ; use %5
      OpBranch %20
 
      %79 = OpLabel
@@ -1123,24 +1168,25 @@
     auto got = test::ToString(p->program(), ast_body);
     auto* expect = R"(let x_101 : bool = x_17;
 loop {
-  var x_2_phi : u32;
-  var x_5_phi : u32;
-  x_2_phi = 0u;
-  x_5_phi = 1u;
+  var x_2 : u32;
+  var x_5 : u32;
+  x_2 = 0u;
+  x_5 = 1u;
   loop {
+    var x_4 : u32;
+    var x_6 : u32;
     var x_7 : u32;
-    let x_2 : u32 = x_2_phi;
-    let x_5 : u32 = x_5_phi;
-    let x_4 : u32 = (x_2 + 1u);
-    let x_6 : u32 = (x_4 + 1u);
+    x_4 = (x_2 + 1u);
+    x_6 = (x_4 + 1u);
     if (x_101) {
       break;
     }
 
     continuing {
       x_7 = (x_4 + x_6);
-      x_2_phi = x_4;
-      x_5_phi = x_7;
+      let x_8 : u32 = x_5;
+      x_2 = x_4;
+      x_5 = x_7;
     }
   }
 }
@@ -1203,21 +1249,20 @@
     auto* expect = R"(let x_101 : bool = x_7;
 let x_102 : bool = x_8;
 loop {
-  var x_2_phi : u32;
+  var x_2 : u32;
   if (x_101) {
     break;
   }
   if (x_102) {
-    x_2_phi = 0u;
+    x_2 = 0u;
     continue;
   } else {
-    x_2_phi = 1u;
+    x_2 = 1u;
     continue;
   }
-  x_2_phi = 0u;
+  x_2 = 0u;
 
   continuing {
-    let x_2 : u32 = x_2_phi;
     x_1 = x_2;
   }
 }
@@ -1277,13 +1322,13 @@
     auto* expect = R"(let x_101 : bool = x_7;
 let x_102 : bool = x_8;
 loop {
-  var x_2_phi : u32;
+  var x_2 : u32;
   if (x_101) {
     break;
   }
-  x_2_phi = 0u;
+  x_2 = 0u;
   if (x_102) {
-    x_2_phi = 1u;
+    x_2 = 1u;
     continue;
   } else {
     continue;
@@ -1291,7 +1336,6 @@
   return;
 
   continuing {
-    let x_2 : u32 = x_2_phi;
     x_1 = x_2;
   }
 }
@@ -1334,7 +1378,8 @@
 
      %99 = OpLabel
      ; predecessors are all dominated by case construct head at %30
-     %phi = OpPhi %uint %uint_0 %45 %uint_1 %50
+     %41 = OpPhi %uint %uint_0 %45 %uint_1 %50
+     %101 = OpCopyObject %uint %41 ; give it a use so it's emitted
      OpReturn
 
      OpFunctionEnd
@@ -1346,7 +1391,7 @@
 
     auto ast_body = fe.ast_body();
     auto got = test::ToString(p->program(), ast_body);
-    auto* expect = R"(var x_41_phi : u32;
+    auto* expect = R"(var x_41 : u32;
 switch(1u) {
   default: {
     fallthrough;
@@ -1357,19 +1402,19 @@
   case 1u: {
     if (true) {
     } else {
-      x_41_phi = 0u;
+      x_41 = 0u;
       break;
     }
-    x_41_phi = 1u;
+    x_41 = 1u;
   }
 }
-let x_41 : u32 = x_41_phi;
+let x_101 : u32 = x_41;
 return;
 )";
     EXPECT_EQ(expect, got) << got << assembly;
 }
 
-TEST_F(SpvParserFunctionVarTest, EmitStatement_UseInPhiCountsAsUse) {
+TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_UseInPhiCountsAsUse) {
     // From crbug.com/215
     // If the only use of a combinatorially computed ID is as the value
     // in an OpPhi, then we still have to emit it.  The algorithm fix
@@ -1393,6 +1438,7 @@
 
          %99 = OpLabel
         %101 = OpPhi %bool %11 %10 %12 %20
+        %102 = OpCopyObject %bool %101  ;; ensure a use of %101
                OpReturn
 
                OpFunctionEnd
@@ -1405,14 +1451,330 @@
 
     auto ast_body = fe.ast_body();
     auto got = test::ToString(p->program(), ast_body);
-    auto* expect = R"(var x_101_phi : bool;
+    auto* expect = R"(var x_101 : bool;
 let x_11 : bool = (true & true);
 let x_12 : bool = !(x_11);
-x_101_phi = x_11;
+x_101 = x_11;
 if (true) {
-  x_101_phi = x_12;
+  x_101 = x_12;
 }
-let x_101 : bool = x_101_phi;
+let x_102 : bool = x_101;
+return;
+)";
+    EXPECT_EQ(expect, got);
+}
+
+TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_PhiInLoopHeader_FedByHoistedVar_PhiUnused) {
+    // From investigation into crbug.com/1649
+    //
+    // Value %999 is defined deep in control flow, then we arrange for
+    // it to dominate the backedge of the outer loop. The %999 value is then
+    // fed back into the phi in the loop header.  So %999 needs to be hoisted
+    // out of the loop.  The phi assignment needs to use the hoisted variable.
+    // The hoisted variable needs to be placed such that its scope encloses
+    // that phi in the header of the outer loop. The compiler needs
+    // to "see" that there is an implicit use of %999 in the backedge block
+    // of that outer loop.
+    auto assembly = Preamble() + R"(
+%100 = OpFunction %void None %voidfn
+
+%10 = OpLabel
+OpBranch %20
+
+%20 = OpLabel
+%101 = OpPhi %bool %true %10 %999 %80
+OpLoopMerge %99 %80 None
+OpBranchConditional %true %30 %99
+
+  %30 = OpLabel
+  OpSelectionMerge %50 None
+  OpBranchConditional %true %40 %50
+
+    %40 = OpLabel
+    %999 = OpCopyObject %bool %true
+    OpBranch %60
+
+    %50 = OpLabel
+    OpReturn
+
+  %60 = OpLabel ; if merge
+  OpBranch %80
+
+  %80 = OpLabel ; continue target
+  OpBranch %20
+
+%99 = OpLabel
+OpReturn
+
+OpFunctionEnd
+
+  )";
+    auto p = parser(test::Assemble(assembly));
+    ASSERT_TRUE(p->BuildAndParseInternalModuleExceptFunctions()) << assembly;
+    auto fe = p->function_emitter(100);
+    EXPECT_TRUE(fe.EmitBody()) << p->error();
+
+    auto ast_body = fe.ast_body();
+    auto got = test::ToString(p->program(), ast_body);
+    auto* expect = R"(loop {
+  var x_999 : bool;
+  if (true) {
+  } else {
+    break;
+  }
+  if (true) {
+    x_999 = true;
+    continue;
+  }
+  return;
+}
+return;
+)";
+    EXPECT_EQ(expect, got);
+}
+
+TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_PhiInLoopHeader_FedByHoistedVar_PhiUsed) {
+    // From investigation into crbug.com/1649
+    //
+    // Value %999 is defined deep in control flow, then we arrange for
+    // it to dominate the backedge of the outer loop. The %999 value is then
+    // fed back into the phi in the loop header.  So %999 needs to be hoisted
+    // out of the loop.  The phi assignment needs to use the hoisted variable.
+    // The hoisted variable needs to be placed such that its scope encloses
+    // that phi in the header of the outer loop. The compiler needs
+    // to "see" that there is an implicit use of %999 in the backedge block
+    // of that outer loop.
+    auto assembly = Preamble() + R"(
+%100 = OpFunction %void None %voidfn
+
+%10 = OpLabel
+OpBranch %20
+
+%20 = OpLabel
+%101 = OpPhi %bool %true %10 %999 %80
+OpLoopMerge %99 %80 None
+OpBranchConditional %true %30 %99
+
+  %30 = OpLabel
+  OpSelectionMerge %50 None
+  OpBranchConditional %true %40 %50
+
+    %40 = OpLabel
+    %999 = OpCopyObject %bool %true
+    OpBranch %60
+
+    %50 = OpLabel
+    OpReturn
+
+  %60 = OpLabel ; if merge
+  OpBranch %80
+
+  %80 = OpLabel ; continue target
+  OpBranch %20
+
+%99 = OpLabel
+%1000 = OpCopyObject %bool %101
+OpReturn
+
+OpFunctionEnd
+
+  )";
+    auto p = parser(test::Assemble(assembly));
+    ASSERT_TRUE(p->BuildAndParseInternalModuleExceptFunctions()) << assembly;
+    auto fe = p->function_emitter(100);
+    EXPECT_TRUE(fe.EmitBody()) << p->error();
+
+    auto ast_body = fe.ast_body();
+    auto got = test::ToString(p->program(), ast_body);
+    auto* expect = R"(var x_101 : bool;
+x_101 = true;
+loop {
+  var x_999 : bool;
+  if (true) {
+  } else {
+    break;
+  }
+  if (true) {
+    x_999 = true;
+    continue;
+  }
+  return;
+
+  continuing {
+    x_101 = x_999;
+  }
+}
+let x_1000 : bool = x_101;
+return;
+)";
+    EXPECT_EQ(expect, got);
+}
+
+TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_PhiInLoopHeader_FedByPhi_PhiUnused) {
+    // From investigation into crbug.com/1649
+    //
+    // This is a reduction of one of the hard parts of test case
+    // vk-gl-cts/graphicsfuzz/stable-binarysearch-tree-false-if-discard-loop/1.spvasm
+    // In particular, see the data flow around %114 in that case.
+    //
+    // Here value %999 is a *phi* defined deep in control flow, then we
+    // arrange for it to dominate the backedge of the outer loop. The %999
+    // value is then fed back into the phi in the loop header.  The variable
+    // generated to hold the %999 value needs to be placed such that its scope
+    // encloses that phi in the header of the outer loop. The compiler needs
+    // to "see" that there is an implicit use of %999 in the backedge block
+    // of that outer loop.
+    auto assembly = Preamble() + R"(
+%100 = OpFunction %void None %voidfn
+
+%10 = OpLabel
+OpBranch %20
+
+%20 = OpLabel
+%101 = OpPhi %bool %true %10 %999 %80
+OpLoopMerge %99 %80 None
+OpBranchConditional %true %99 %30
+
+  %30 = OpLabel
+  OpLoopMerge %70 %60 None
+  OpBranch %40
+
+    %40 = OpLabel
+    OpBranchConditional %true %60 %50
+
+      %50 = OpLabel
+      OpBranch %60
+
+    %60 = OpLabel ; inner continue
+    %999 = OpPhi %bool %true %40 %false %50
+    OpBranchConditional %true %70 %30
+
+  %70 = OpLabel  ; inner merge
+  OpBranch %80
+
+  %80 = OpLabel ; outer continue target
+  OpBranch %20
+
+%99 = OpLabel
+OpReturn
+
+OpFunctionEnd
+  )";
+    auto p = parser(test::Assemble(assembly));
+    ASSERT_TRUE(p->BuildAndParseInternalModuleExceptFunctions()) << assembly;
+    auto fe = p->function_emitter(100);
+    EXPECT_TRUE(fe.EmitBody()) << p->error();
+
+    auto ast_body = fe.ast_body();
+    auto got = test::ToString(p->program(), ast_body);
+    auto* expect = R"(loop {
+  var x_999 : bool;
+  if (true) {
+    break;
+  }
+  loop {
+    x_999 = true;
+    if (true) {
+      continue;
+    }
+    x_999 = false;
+
+    continuing {
+      if (true) {
+        break;
+      }
+    }
+  }
+}
+return;
+)";
+    EXPECT_EQ(expect, got);
+}
+
+TEST_F(SpvParserFunctionVarTest, EmitStatement_Phi_PhiInLoopHeader_FedByPhi_PhiUsed) {
+    // From investigation into crbug.com/1649
+    //
+    // This is a reduction of one of the hard parts of test case
+    // vk-gl-cts/graphicsfuzz/stable-binarysearch-tree-false-if-discard-loop/1.spvasm
+    // In particular, see the data flow around %114 in that case.
+    //
+    // Here value %999 is a *phi* defined deep in control flow, then we
+    // arrange for it to dominate the backedge of the outer loop. The %999
+    // value is then fed back into the phi in the loop header.  The variable
+    // generated to hold the %999 value needs to be placed such that its scope
+    // encloses that phi in the header of the outer loop. The compiler needs
+    // to "see" that there is an implicit use of %999 in the backedge block
+    // of that outer loop.
+    auto assembly = Preamble() + R"(
+%100 = OpFunction %void None %voidfn
+
+%10 = OpLabel
+OpBranch %20
+
+%20 = OpLabel
+%101 = OpPhi %bool %true %10 %999 %80
+OpLoopMerge %99 %80 None
+OpBranchConditional %true %99 %30
+
+  %30 = OpLabel
+  OpLoopMerge %70 %60 None
+  OpBranch %40
+
+    %40 = OpLabel
+    OpBranchConditional %true %60 %50
+
+      %50 = OpLabel
+      OpBranch %60
+
+    %60 = OpLabel ; inner continue
+    %999 = OpPhi %bool %true %40 %false %50
+    OpBranchConditional %true %70 %30
+
+  %70 = OpLabel  ; inner merge
+  OpBranch %80
+
+  %80 = OpLabel ; outer continue target
+  OpBranch %20
+
+%99 = OpLabel
+%1000 = OpCopyObject %bool %101
+OpReturn
+
+OpFunctionEnd
+  )";
+    auto p = parser(test::Assemble(assembly));
+    ASSERT_TRUE(p->BuildAndParseInternalModuleExceptFunctions()) << assembly;
+    auto fe = p->function_emitter(100);
+    EXPECT_TRUE(fe.EmitBody()) << p->error();
+
+    auto ast_body = fe.ast_body();
+    auto got = test::ToString(p->program(), ast_body);
+    auto* expect = R"(var x_101 : bool;
+x_101 = true;
+loop {
+  var x_999 : bool;
+  if (true) {
+    break;
+  }
+  loop {
+    x_999 = true;
+    if (true) {
+      continue;
+    }
+    x_999 = false;
+
+    continuing {
+      if (true) {
+        break;
+      }
+    }
+  }
+
+  continuing {
+    x_101 = x_999;
+  }
+}
+let x_1000 : bool = x_101;
 return;
 )";
     EXPECT_EQ(expect, got);
diff --git a/src/tint/reader/spirv/parser_impl.cc b/src/tint/reader/spirv/parser_impl.cc
index caa2c69..683dd59 100644
--- a/src/tint/reader/spirv/parser_impl.cc
+++ b/src/tint/reader/spirv/parser_impl.cc
@@ -2053,7 +2053,8 @@
         Fail() << "internal error: RectifyOperandSignedness given a null expr\n";
         return {};
     }
-    auto* type = expr.type;
+    // TODO(crbug.com/tint/1669) should this unpack aliases too?
+    auto* type = expr.type->UnwrapRef();
     if (!type) {
         Fail() << "internal error: unmapped type for: " << expr.expr->TypeInfo().name << "\n";
         return {};
@@ -2078,12 +2079,12 @@
 TypedExpression ParserImpl::RectifySecondOperandSignedness(const spvtools::opt::Instruction& inst,
                                                            const Type* first_operand_type,
                                                            TypedExpression&& second_operand_expr) {
-    if ((first_operand_type != second_operand_expr.type) &&
+    const Type* target_type = first_operand_type->UnwrapRef();
+    if ((target_type != second_operand_expr.type->UnwrapRef()) &&
         AssumesSecondOperandSignednessMatchesFirstOperand(inst.opcode())) {
         // Conversion is required.
-        return {first_operand_type,
-                create<ast::BitcastExpression>(Source{}, first_operand_type->Build(builder_),
-                                               second_operand_expr.expr)};
+        return {target_type, create<ast::BitcastExpression>(Source{}, target_type->Build(builder_),
+                                                            second_operand_expr.expr)};
     }
     // No conversion necessary.
     return std::move(second_operand_expr);
@@ -2091,6 +2092,7 @@
 
 const Type* ParserImpl::ForcedResultType(const spvtools::opt::Instruction& inst,
                                          const Type* first_operand_type) {
+    first_operand_type = first_operand_type->UnwrapRef();
     const auto opcode = inst.opcode();
     if (AssumesResultSignednessMatchesFirstOperand(opcode)) {
         return first_operand_type;
diff --git a/src/tint/reader/spirv/parser_impl_handle_test.cc b/src/tint/reader/spirv/parser_impl_handle_test.cc
index a165f7b..cbad954 100644
--- a/src/tint/reader/spirv/parser_impl_handle_test.cc
+++ b/src/tint/reader/spirv/parser_impl_handle_test.cc
@@ -3984,14 +3984,11 @@
     auto ast_body = fe.ast_body();
     const auto got = test::ToString(p->program(), ast_body);
     auto* expect = R"(var x_24 : vec2<f32>;
-var x_24_phi_1 : vec2<f32>;
-var x_26_phi_1 : i32;
-x_24_phi_1 = vec2<f32>(0.0f, 0.0f);
-x_26_phi_1 = 0i;
+var x_26 : i32;
+x_24 = vec2<f32>(0.0f, 0.0f);
+x_26 = 0i;
 loop {
   var x_27 : i32;
-  x_24 = x_24_phi_1;
-  let x_26 : i32 = x_26_phi_1;
   if ((x_26 < 2i)) {
   } else {
     break;
@@ -3999,8 +3996,8 @@
 
   continuing {
     x_27 = (x_26 + 1i);
-    x_24_phi_1 = vec2<f32>(1.0f, 1.0f);
-    x_26_phi_1 = x_27;
+    x_24 = vec2<f32>(1.0f, 1.0f);
+    x_26 = x_27;
   }
 }
 textureStore(Output2Texture2D, vec2<i32>(vec2<u32>(1u, 1u)), vec4<f32>(x_24, 0.0f, 0.0f));
@@ -4060,14 +4057,11 @@
     auto ast_body = fe.ast_body();
     const auto got = test::ToString(p->program(), ast_body);
     auto* expect = R"(var x_14 : f32;
-var x_14_phi_1 : f32;
-var x_15_phi_1 : f32;
-x_14_phi_1 = 0.0f;
-x_15_phi_1 = 0.0f;
+var x_15 : f32;
+x_14 = 0.0f;
+x_15 = 0.0f;
 loop {
   var x_17 : f32;
-  x_14 = x_14_phi_1;
-  let x_15 : f32 = x_15_phi_1;
   if ((x_15 < 1.0f)) {
   } else {
     break;
@@ -4075,8 +4069,9 @@
 
   continuing {
     x_17 = (x_15 + 1.0f);
-    x_14_phi_1 = x_15;
-    x_15_phi_1 = x_17;
+    let x_15_c16_1 = x_15;
+    x_14 = x_15_c16_1;
+    x_15 = x_17;
   }
 }
 let x_21 : f32 = select(0.0f, x_14, (x_14 > 1.0f));
diff --git a/src/tint/resolver/const_eval.cc b/src/tint/resolver/const_eval.cc
index 8e46f9a..7e33244 100644
--- a/src/tint/resolver/const_eval.cc
+++ b/src/tint/resolver/const_eval.cc
@@ -1445,6 +1445,23 @@
     return r;
 }
 
+ConstEval::ConstantResult ConstEval::OpXor(const sem::Type* ty,
+                                           utils::VectorRef<const sem::Constant*> args,
+                                           const Source&) {  // source location is unused
+    auto transform = [&](const sem::Constant* c0, const sem::Constant* c1) {  // xors c0 with c1
+        auto create = [&](auto i, auto j) -> const Constant* {
+            return CreateElement(builder, sem::Type::DeepestElementOf(ty), decltype(i){i ^ j});
+        };
+        return Dispatch_ia_iu32(create, c0, c1);
+    };
+
+    auto r = TransformElements(builder, ty, transform, args[0], args[1]);
+    if (builder.Diagnostics().contains_errors()) {
+        return utils::Failure;  // the builder's diagnostics contain an error
+    }
+    return r;
+}
+
 ConstEval::ConstantResult ConstEval::atan2(const sem::Type* ty,
                                            utils::VectorRef<const sem::Constant*> args,
                                            const Source&) {
diff --git a/src/tint/resolver/const_eval.h b/src/tint/resolver/const_eval.h
index 6b57556..04e2282 100644
--- a/src/tint/resolver/const_eval.h
+++ b/src/tint/resolver/const_eval.h
@@ -356,6 +356,15 @@
                         utils::VectorRef<const sem::Constant*> args,
                         const Source& source);
 
+    /// Bitwise xor operator '^'
+    /// @param ty the expression type
+    /// @param args the two operands, args[0] and args[1]
+    /// @param source the source location (not used by the implementation)
+    /// @return the result value, or utils::Failure if the diagnostics contain errors
+    ConstantResult OpXor(const sem::Type* ty,
+                         utils::VectorRef<const sem::Constant*> args,
+                         const Source& source);
+
     ////////////////////////////////////////////////////////////////////////////
     // Builtins
     ////////////////////////////////////////////////////////////////////////////
diff --git a/src/tint/resolver/const_eval_test.cc b/src/tint/resolver/const_eval_test.cc
index f06fa32..3b1d7ee 100644
--- a/src/tint/resolver/const_eval_test.cc
+++ b/src/tint/resolver/const_eval_test.cc
@@ -3675,6 +3675,43 @@
     });
 }
 
+template <typename T>
+std::vector<Case> XorCases() {
+    using B = BitValues<T>;  // shorthand for the bit-pattern constants used below
+    return {  // each case: C(lhs, rhs, expected lhs ^ rhs)
+        C(T{0b1010}, T{0b1111}, T{0b0101}),
+        C(T{0b1010}, T{0b0000}, T{0b1010}),
+        C(T{0b1010}, T{0b0011}, T{0b1001}),
+        C(T{0b1010}, T{0b1100}, T{0b0110}),
+        C(T{0b1010}, T{0b0101}, T{0b1111}),
+        C(B::All, B::All, T{0}),
+        C(B::LeftMost, B::LeftMost, T{0}),
+        C(B::RightMost, B::RightMost, T{0}),
+        C(B::All, T{0}, B::All),
+        C(T{0}, B::All, B::All),
+        C(B::LeftMost, B::AllButLeftMost, B::All),
+        C(B::AllButLeftMost, B::LeftMost, B::All),
+        C(B::RightMost, B::AllButRightMost, B::All),
+        C(B::AllButRightMost, B::RightMost, B::All),
+        C(Vec(B::All, B::LeftMost, B::RightMost),             //
+          Vec(B::All, B::All, B::All),                        //
+          Vec(T{0}, B::AllButLeftMost, B::AllButRightMost)),  //
+        C(Vec(B::All, B::LeftMost, B::RightMost),             //
+          Vec(T{0}, T{0}, T{0}),                              //
+          Vec(B::All, B::LeftMost, B::RightMost)),            //
+        C(Vec(B::LeftMost, B::RightMost),                     //
+          Vec(B::AllButLeftMost, B::AllButRightMost),         //
+          Vec(B::All, B::All)),
+    };
+}
+INSTANTIATE_TEST_SUITE_P(Xor,  // kXor with the cases built for AInt, i32 and u32
+                         ResolverConstEvalBinaryOpTest,
+                         testing::Combine(  //
+                             testing::Values(ast::BinaryOp::kXor),
+                             testing::ValuesIn(Concat(XorCases<AInt>(),  //
+                                                      XorCases<i32>(),   //
+                                                      XorCases<u32>()))));
+
 // Tests for errors on overflow/underflow of binary operations with abstract numbers
 struct OverflowCase {
     ast::BinaryOp op;
diff --git a/src/tint/resolver/intrinsic_table.inl b/src/tint/resolver/intrinsic_table.inl
index 559466d..f74cb25 100644
--- a/src/tint/resolver/intrinsic_table.inl
+++ b/src/tint/resolver/intrinsic_table.inl
@@ -13122,24 +13122,24 @@
     /* num parameters */ 2,
     /* num template types */ 1,
     /* num template numbers */ 0,
-    /* template types */ &kTemplateTypes[14],
+    /* template types */ &kTemplateTypes[10],
     /* template numbers */ &kTemplateNumbers[10],
     /* parameters */ &kParameters[689],
     /* return matcher indices */ &kMatcherIndices[1],
     /* flags */ OverloadFlags(OverloadFlag::kIsOperator, OverloadFlag::kSupportsVertexPipeline, OverloadFlag::kSupportsFragmentPipeline, OverloadFlag::kSupportsComputePipeline),
-    /* const eval */ nullptr,
+    /* const eval */ &ConstEval::OpXor,
   },
   {
     /* [413] */
     /* num parameters */ 2,
     /* num template types */ 1,
     /* num template numbers */ 1,
-    /* template types */ &kTemplateTypes[14],
+    /* template types */ &kTemplateTypes[10],
     /* template numbers */ &kTemplateNumbers[6],
     /* parameters */ &kParameters[687],
     /* return matcher indices */ &kMatcherIndices[30],
     /* flags */ OverloadFlags(OverloadFlag::kIsOperator, OverloadFlag::kSupportsVertexPipeline, OverloadFlag::kSupportsFragmentPipeline, OverloadFlag::kSupportsComputePipeline),
-    /* const eval */ nullptr,
+    /* const eval */ &ConstEval::OpXor,
   },
   {
     /* [414] */
@@ -14703,8 +14703,8 @@
   },
   {
     /* [5] */
-    /* op ^<T : iu32>(T, T) -> T */
-    /* op ^<T : iu32, N : num>(vec<N, T>, vec<N, T>) -> vec<N, T> */
+    /* op ^<T : ia_iu32>(T, T) -> T */
+    /* op ^<T : ia_iu32, N : num>(vec<N, T>, vec<N, T>) -> vec<N, T> */
     /* num overloads */ 2,
     /* overloads */ &kOverloads[412],
   },
diff --git a/src/tint/transform/std140.cc b/src/tint/transform/std140.cc
index e6f070b..06b4f5d 100644
--- a/src/tint/transform/std140.cc
+++ b/src/tint/transform/std140.cc
@@ -34,6 +34,14 @@
 
 namespace {
 
+/// UniformVariable is used by Std140::State::AccessIndex to indicate the root uniform variable
+struct UniformVariable {};  // carries no data
+
+/// Inequality operator for UniformVariable. Always returns false: the type has no state.
+bool operator!=(const UniformVariable&, const UniformVariable&) {
+    return false;
+}
+
 /// DynamicIndex is used by Std140::State::AccessIndex to indicate a runtime-expression index
 struct DynamicIndex {
     size_t slot;  // The index of the expression in Std140::State::AccessChain::dynamic_indices
@@ -48,6 +56,14 @@
 
 namespace tint::utils {
 
+/// Hasher specialization for UniformVariable
+template <>
+struct Hasher<UniformVariable> {
+    /// The hash function for the UniformVariable
+    /// @return always 0; the argument is not inspected
+    size_t operator()(const UniformVariable&) const { return 0; }
+};
+
 /// Hasher specialization for DynamicIndex
 template <>
 struct Hasher<DynamicIndex> {
@@ -69,9 +85,9 @@
 
     /// Runs the transform
     void Run() {
-        // Begin by creating forked structures for any struct that is used as a uniform buffer, that
+        // Begin by creating forked types for any type that is used as a uniform buffer, that
         // either directly or transitively contains a matrix that needs splitting for std140 layout.
-        ForkStructs();
+        ForkTypes();
 
         // Next, replace all the uniform variables to use the forked types.
         ReplaceUniformVarTypes();
@@ -105,19 +121,43 @@
     /// @returns true if this transform should be run for the given program
     /// @param program the program to inspect
     static bool ShouldRun(const Program* program) {
+        // Returns true if ty is a matrix, or (nested) array of matrix, that needs decomposing.
+        auto needs_fork = [&](const sem::Type* ty) {
+            while (auto* arr = ty->As<sem::Array>()) {  // look through (nested) arrays
+                ty = arr->ElemType();
+            }
+            if (auto* mat = ty->As<sem::Matrix>()) {
+                if (MatrixNeedsDecomposing(mat)) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        // Scan structures used in the uniform storage class for members that need forking
         for (auto* ty : program->Types()) {
             if (auto* str = ty->As<sem::Struct>()) {
                 if (str->UsedAs(ast::StorageClass::kUniform)) {
                     for (auto* member : str->Members()) {
-                        if (auto* mat = member->Type()->As<sem::Matrix>()) {
-                            if (MatrixNeedsDecomposing(mat)) {
-                                return true;
-                            }
+                        if (needs_fork(member->Type())) {
+                            return true;
                         }
                     }
                 }
             }
         }
+
+        // Scan uniform variables whose own (reference-unwrapped) type needs forking
+        for (auto* decl : program->AST().GlobalVariables()) {
+            auto* global = program->Sem().Get(decl);
+            if (global->StorageClass() == ast::StorageClass::kUniform) {
+                if (needs_fork(global->Type()->UnwrapRef())) {
+                    return true;
+                }
+            }
+        }
+
+        // If we reach here, no uniform variables use a type that needs forking for std140 layout
         return false;
     }
 
@@ -127,12 +167,11 @@
 
     /// AccessIndex describes a single access in an access chain.
     /// The access is one of:
-    /// u32          - a static member index on a struct, static array index, static matrix column
-    ///                index, static vector element index.
-    /// DynamicIndex - a runtime-expression index on an array, matrix column selection, or vector
-    ///                element index.
-    /// Swizzle      - a static vector swizzle.
-    using AccessIndex = std::variant<u32, DynamicIndex, Swizzle>;
+    /// UniformVariable - the root uniform variable.
+    /// u32             - a static index on a struct, array index, matrix column or vector element.
+    /// DynamicIndex    - a runtime index on an array, matrix column, or vector element.
+    /// Swizzle         - a static vector swizzle.
+    using AccessIndex = std::variant<UniformVariable, u32, DynamicIndex, Swizzle>;
 
     /// A vector of AccessIndex.
     using AccessIndices = utils::Vector<AccessIndex, 8>;
@@ -182,7 +221,19 @@
     // Map of structure member in ctx.src of a matrix type, to list of decomposed column
     // members in ctx.dst.
     utils::Hashmap<const sem::StructMember*, utils::Vector<const ast::StructMember*, 4>, 8>
-        std140_mats;
+        std140_mat_members;
+
+    /// Describes a matrix that has been forked to a std140-structure holding the decomposed column
+    /// vectors of the matrix.
+    struct Std140Matrix {
+        /// The decomposed structure name (in ctx.dst)
+        Symbol name;
+        /// The symbols of the structure's column vector members (in ctx.dst)
+        utils::Vector<Symbol, 4> columns;
+    };
+
+    // Map of matrix type in ctx.src, to decomposed column structure in ctx.dst.
+    utils::Hashmap<const sem::Matrix*, Std140Matrix, 8> std140_mats;
 
     /// AccessChain describes a chain of access expressions to uniform buffer variable.
     struct AccessChain {
@@ -209,11 +260,11 @@
     /// TODO(crbug.com/tint/1502): This may need adjusting for `f16` matrices.
     static bool MatrixNeedsDecomposing(const sem::Matrix* mat) { return mat->ColumnStride() == 8; }
 
-    /// ForkStructs walks the structures in dependency order, forking structures that are used as
-    /// uniform buffers which (transitively) use matrices that need std140 decomposition to column
-    /// vectors.
-    /// Populates the #std140_mats map and #std140_structs set.
-    void ForkStructs() {
+    /// ForkTypes walks the user-declared types in dependency order, forking structures that are
+    /// used as uniform buffers which (transitively) use matrices that need std140 decomposition to
+    /// column vectors. Populates the #std140_mat_members map, #std140_structs set and #std140_mats
+    /// map (via Std140Type()).
+    void ForkTypes() {
         // For each module scope declaration...
         for (auto* global : ctx.src->Sem().Module()->DependencyOrderedDeclarations()) {
             // Check to see if this is a structure used by a uniform buffer...
@@ -229,51 +280,30 @@
                             // Structure member of matrix type needs decomposition.
                             fork_std140 = true;
                             // Replace the member with column vectors.
-                            const auto num_columns = mat->columns();
                             const auto name_prefix = PrefixForUniqueNames(
-                                str->Declaration(), member->Name(), num_columns);
+                                str->Declaration(), member->Name(), mat->columns());
+
                             // Build a struct member for each column of the matrix
-                            utils::Vector<const ast::StructMember*, 4> column_members;
-                            for (uint32_t i = 0; i < num_columns; i++) {
-                                utils::Vector<const ast::Attribute*, 1> attributes;
-                                if ((i == 0) && mat->Align() != member->Align()) {
-                                    // The matrix was @align() annotated with a larger alignment
-                                    // than the natural alignment for the matrix. This extra padding
-                                    // needs to be applied to the first column vector.
-                                    attributes.Push(b.MemberAlign(u32(member->Align())));
-                                }
-                                if ((i == num_columns - 1) && mat->Size() != member->Size()) {
-                                    // The matrix was @size() annotated with a larger size than the
-                                    // natural size for the matrix. This extra padding needs to be
-                                    // applied to the last column vector.
-                                    attributes.Push(b.MemberSize(
-                                        AInt(member->Size() -
-                                             mat->ColumnType()->Size() * (num_columns - 1))));
-                                }
+                            auto column_members = DecomposedMatrixStructMembers(
+                                mat, name_prefix, member->Align(), member->Size());
 
-                                // Build the member
-                                const auto col_name = name_prefix + std::to_string(i);
-                                const auto* col_ty = CreateASTTypeFor(ctx, mat->ColumnType());
-                                const auto* col_member =
-                                    ctx.dst->Member(col_name, col_ty, std::move(attributes));
-                                // Add the member to the forked structure
-                                members.Push(col_member);
-                                // Record the member for std140_mats
-                                column_members.Push(col_member);
+                            // Add the member to the forked structure
+                            for (auto* column_member : column_members) {
+                                members.Push(column_member);
                             }
-                            std140_mats.Add(member, std::move(column_members));
-                            continue;
-                        }
-                    }
+                            // Record that this matrix member was replaced with the N column
+                            // members.
+                            std140_mat_members.Add(member, std::move(column_members));
 
-                    // Is the member part of a struct that has been forked for std140-layout?
-                    if (auto* std140_ty = Std140Type(member->Type())) {
-                        // Yes - use this type for the forked structure member.
+                            continue;  // Next member
+                        }
+                    } else if (auto* std140_ty = Std140Type(member->Type())) {
+                        // Member is of a type that requires forking for std140-layout
                         fork_std140 = true;
                         auto attrs = ctx.Clone(member->Declaration()->attributes);
                         members.Push(
                             b.Member(sym.NameFor(member->Name()), std140_ty, std::move(attrs)));
-                        continue;
+                        continue;  // Next member
                     }
 
                     // Nothing special about this member.
@@ -314,6 +344,7 @@
                     if (auto* std140_ty = Std140Type(v->Type()->UnwrapRef())) {
                         ctx.Replace(global->type, std140_ty);
                         std140_uniforms.Add(v);
+                        continue;
                     }
                 }
             }
@@ -355,10 +386,11 @@
         }
     }
 
-    /// @returns a new, forked std140 AST type for the corresponding non-forked semantic type. If
-    /// the
-    ///          semantic type is not split for std140-layout, then nullptr is returned.
-    const ast::Type* Std140Type(const sem::Type* ty) const {
+    /// @returns a new, forked std140 AST type for the corresponding non-forked semantic type.
+    ///          If the semantic type is not split for std140-layout, then nullptr is returned.
+    /// @note will construct new std140 structures to hold decomposed matrices, populating
+    ///       #std140_mats.
+    const ast::Type* Std140Type(const sem::Type* ty) {
         return Switch(
             ty,  //
             [&](const sem::Struct* str) -> const ast::Type* {
@@ -367,6 +399,24 @@
                 }
                 return nullptr;
             },
+            [&](const sem::Matrix* mat) -> const ast::Type* {
+                if (MatrixNeedsDecomposing(mat)) {
+                    auto std140_mat = std140_mats.GetOrCreate(mat, [&] {
+                        auto name = b.Symbols().New("mat" + std::to_string(mat->columns()) + "x" +
+                                                    std::to_string(mat->rows()) + "_" +
+                                                    ctx.src->FriendlyName(mat->type()));
+                        auto members =
+                            DecomposedMatrixStructMembers(mat, "col", mat->Align(), mat->Size());
+                        b.Structure(name, members);
+                        return Std140Matrix{
+                            name,
+                            utils::Transform(members, [&](auto* member) { return member->symbol; }),
+                        };
+                    });
+                    return b.ty.type_name(std140_mat.name);
+                }
+                return nullptr;
+            },
             [&](const sem::Array* arr) -> const ast::Type* {
                 if (auto* std140 = Std140Type(arr->ElemType())) {
                     utils::Vector<const ast::Attribute*, 1> attrs;
@@ -380,6 +430,46 @@
             });
     }
 
+    /// @param mat the matrix to decompose (in ctx.src)
+    /// @param name_prefix prefix of each returned member's name; the column index is appended.
+    /// @param align the matrix alignment in bytes; set on column 0 if it is not mat->Align().
+    /// @param size the matrix size in bytes; pads the last column if it is not mat->Size().
+    /// @returns a vector of decomposed matrix column vectors as structure members (in ctx.dst).
+    utils::Vector<const ast::StructMember*, 4> DecomposedMatrixStructMembers(
+        const sem::Matrix* mat,
+        const std::string& name_prefix,
+        uint32_t align,
+        uint32_t size) {
+        // The number of column vectors to build members for.
+        const auto num_columns = mat->columns();
+        // Build a struct member for each column of the matrix
+        utils::Vector<const ast::StructMember*, 4> out;
+        for (uint32_t i = 0; i < num_columns; i++) {
+            utils::Vector<const ast::Attribute*, 1> attributes;
+            if ((i == 0) && mat->Align() != align) {
+                // The matrix was @align() annotated with a larger alignment
+                // than the natural alignment for the matrix. This extra padding
+                // needs to be applied to the first column vector.
+                attributes.Push(b.MemberAlign(u32(align)));
+            }
+            if ((i == num_columns - 1) && mat->Size() != size) {
+                // The matrix was @size() annotated with a larger size than the
+                // natural size for the matrix. This extra padding needs to be
+                // applied to the last column vector.
+                attributes.Push(
+                    b.MemberSize(AInt(size - mat->ColumnType()->Size() * (num_columns - 1))));
+            }
+
+            // Build the member, named name_prefix followed by the column index
+            const auto col_name = name_prefix + std::to_string(i);
+            const auto* col_ty = CreateASTTypeFor(ctx, mat->ColumnType());
+            const auto* col_member = ctx.dst->Member(col_name, col_ty, std::move(attributes));
+            // Append the column member to the returned list
+            out.Push(col_member);
+        }
+        return out;
+    }
+
     /// Walks the @p ast_expr, constructing and returning an AccessChain.
     /// @returns an AccessChain if the expression is an access to a std140-forked uniform buffer,
     ///          otherwise returns a std::nullopt.
@@ -406,11 +496,13 @@
                 [&](const sem::VariableUser* user) {
                     if (user->Variable() == access.var) {
                         // Walked all the way to the source variable. We're done traversing.
+                        access.indices.Push(UniformVariable{});
                         return Action::kStop;
                     }
                     if (user->Variable()->Type()->Is<sem::Pointer>()) {
                         // Found a pointer. As the source variable is a uniform buffer variable,
-                        // this must be a pointer-let. Continue traversing from the let initializer.
+                        // this must be a pointer-let. Continue traversing from the let
+                        // initializer.
                         expr = user->Variable()->Constructor();
                         return Action::kContinue;
                     }
@@ -421,7 +513,7 @@
                 },
                 [&](const sem::StructMemberAccess* a) {
                     // Is this a std140 decomposed matrix?
-                    if (!access.std140_mat_ty && std140_mats.Contains(a->Member())) {
+                    if (std140_mat_members.Contains(a->Member())) {
                         // Record this on the access.
                         access.std140_mat_idx = access.indices.Length();
                         access.std140_mat_ty = expr->Type()->UnwrapRef()->As<sem::Matrix>();
@@ -440,6 +532,15 @@
                         access.dynamic_indices.Push(a->Index());
                     }
                     expr = a->Object();
+
+                    // Is the object a std140 decomposed matrix?
+                    if (auto* mat = expr->Type()->UnwrapRef()->As<sem::Matrix>()) {
+                        if (std140_mats.Contains(mat)) {
+                            // Record this on the access.
+                            access.std140_mat_idx = access.indices.Length();
+                            access.std140_mat_ty = mat;
+                        }
+                    }
                     return Action::kContinue;
                 },
                 [&](const sem::Swizzle* s) {
@@ -512,8 +613,13 @@
             ty,  //
             [&](const sem::Struct* str) { return sym.NameFor(str->Name()); },
             [&](const sem::Array* arr) {
-                return "arr_" + std::to_string(arr->Count()) + "_" + ConvertSuffix(arr->ElemType());
+                return "arr" + std::to_string(arr->Count()) + "_" + ConvertSuffix(arr->ElemType());
             },
+            [&](const sem::Matrix* mat) {
+                return "mat" + std::to_string(mat->columns()) + "x" + std::to_string(mat->rows()) +
+                       "_" + ConvertSuffix(mat->type());
+            },
+            [&](const sem::F32*) { return "f32"; },
             [&](Default) {
                 TINT_ICE(Transform, b.Diagnostics())
                     << "unhandled type for conversion name: " << ctx.src->FriendlyName(ty);
@@ -523,15 +629,15 @@
 
     /// Generates and returns an expression that loads the value from a std140 uniform buffer,
     /// converting the final result to a non-std140 type.
-    /// @param access the access chain from a uniform buffer to the value to load.
-    const ast::Expression* LoadWithConvert(const AccessChain& access) {
-        const ast::Expression* expr = b.Expr(sym.NameFor(access.var->Declaration()->symbol));
-        const sem::Type* ty = access.var->Type()->UnwrapRef();
+    /// @param chain the access chain from a uniform buffer to the value to load.
+    const ast::Expression* LoadWithConvert(const AccessChain& chain) {
+        const ast::Expression* expr = nullptr;
+        const sem::Type* ty = nullptr;
         auto dynamic_index = [&](size_t idx) {
-            return ctx.Clone(access.dynamic_indices[idx]->Declaration());
+            return ctx.Clone(chain.dynamic_indices[idx]->Declaration());
         };
-        for (auto index : access.indices) {
-            auto [new_expr, new_ty, _] = BuildAccessExpr(expr, ty, index, dynamic_index);
+        for (size_t i = 0; i < chain.indices.Length(); i++) {
+            auto [new_expr, new_ty, _] = BuildAccessExpr(expr, ty, chain, i, dynamic_index);
             expr = new_expr;
             ty = new_ty;
         }
@@ -559,11 +665,11 @@
             Switch(
                 ty,  //
                 [&](const sem::Struct* str) {
-                    // Convert each of the structure members using either a converter function call,
-                    // or by reassembling a std140 matrix from column vector members.
+                    // Convert each of the structure members using either a converter function
+                    // call, or by reassembling a std140 matrix from column vector members.
                     utils::Vector<const ast::Expression*, 8> args;
                     for (auto* member : str->Members()) {
-                        if (auto* col_members = std140_mats.Find(member)) {
+                        if (auto* col_members = std140_mat_members.Find(member)) {
                             // std140 decomposed matrix. Reassemble.
                             auto* mat_ty = CreateASTTypeFor(ctx, member->Type());
                             auto mat_args =
@@ -578,13 +684,28 @@
                                         b.MemberAccessor(param, sym.NameFor(member->Name()))));
                         }
                     }
-                    auto* converted = b.Construct(CreateASTTypeFor(ctx, ty), std::move(args));
-                    stmts.Push(b.Return(converted));
+                    stmts.Push(b.Return(b.Construct(CreateASTTypeFor(ctx, ty), std::move(args))));
+                },  //
+                [&](const sem::Matrix* mat) {
+                    // Reassemble a std140 matrix from the structure of column vector members.
+                    if (auto std140_mat = std140_mats.Get(mat)) {
+                        utils::Vector<const ast::Expression*, 8> args;
+                        // std140 decomposed matrix. Reassemble.
+                        auto* mat_ty = CreateASTTypeFor(ctx, mat);
+                        auto mat_args = utils::Transform(std140_mat->columns, [&](Symbol name) {
+                            return b.MemberAccessor(param, name);
+                        });
+                        stmts.Push(b.Return(b.Construct(mat_ty, std::move(mat_args))));
+                    } else {
+                        TINT_ICE(Transform, b.Diagnostics())
+                            << "failed to find std140 matrix info for: "
+                            << ctx.src->FriendlyName(ty);
+                    }
                 },  //
                 [&](const sem::Array* arr) {
-                    // Converting an array. Create a function var for the converted array, and loop
-                    // over the input elements, converting each and assigning the result to the
-                    // local array.
+                    // Converting an array. Create a function var for the converted array, and
+                    // loop over the input elements, converting each and assigning the result to
+                    // the local array.
                     auto* var = b.Var("arr", CreateASTTypeFor(ctx, ty));
                     auto* i = b.Var("i", b.ty.u32());
                     auto* dst_el = b.IndexAccessor(var, i);
@@ -646,38 +767,57 @@
 
     /// Loads a part of a std140-decomposed matrix from a uniform buffer, inline (without calling a
     /// helper function).
-    /// @param access the access chain from the uniform buffer to part of the matrix (column,
+    /// @param chain the access chain from the uniform buffer to part of the matrix (column,
     ///               column-swizzle, or element).
     /// @note The matrix column must be statically indexed to use this method.
     /// @returns the loaded value expression.
-    const ast::Expression* LoadSubMatrixInline(const AccessChain& access) {
-        const ast::Expression* expr = b.Expr(ctx.Clone(access.var->Declaration()->symbol));
-        const sem::Type* ty = access.var->Type()->UnwrapRef();
+    const ast::Expression* LoadSubMatrixInline(const AccessChain& chain) {
         // Method for generating dynamic index expressions.
         // As this is inline, we can just clone the expression.
         auto dynamic_index = [&](size_t idx) {
-            return ctx.Clone(access.dynamic_indices[idx]->Declaration());
+            return ctx.Clone(chain.dynamic_indices[idx]->Declaration());
         };
-        for (size_t i = 0; i < access.indices.Length(); i++) {
-            if (i == access.std140_mat_idx) {
-                // Access is to the std140 decomposed matrix.
-                // As this is accessing only part of the matrix, we just need to pick the right
-                // column vector member.
-                auto mat_member_idx = std::get<u32>(access.indices[i]);
-                auto* mat_member = ty->As<sem::Struct>()->Members()[mat_member_idx];
-                auto mat_columns = *std140_mats.Get(mat_member);
-                auto column_idx = std::get<u32>(access.indices[i + 1]);
-                expr = b.MemberAccessor(expr, mat_columns[column_idx]->symbol);
-                ty = mat_member->Type()->As<sem::Matrix>()->ColumnType();
-                // We've consumed both the matrix member access and the column access. Increment i.
-                i++;
-            } else {
-                // Access is to something that is not a decomposed matrix.
-                auto [new_expr, new_ty, _] =
-                    BuildAccessExpr(expr, ty, access.indices[i], dynamic_index);
-                expr = new_expr;
-                ty = new_ty;
-            }
+
+        const ast::Expression* expr = nullptr;
+        const sem::Type* ty = nullptr;
+
+        // Build the expression up to, but not including the matrix member
+        auto std140_mat_idx = *chain.std140_mat_idx;
+        for (size_t i = 0; i < std140_mat_idx; i++) {
+            auto [new_expr, new_ty, _] = BuildAccessExpr(expr, ty, chain, i, dynamic_index);
+            expr = new_expr;
+            ty = new_ty;
+        }
+
+        // Access is to the std140 decomposed matrix.
+        // As this is accessing only part of the matrix, we just need to pick the right column
+        // vector member.
+        auto column_idx = std::get<u32>(chain.indices[std140_mat_idx + 1]);
+        if (auto* str = tint::As<sem::Struct>(ty)) {
+            // Structure member matrix. The columns are decomposed into the structure.
+            auto mat_member_idx = std::get<u32>(chain.indices[std140_mat_idx]);
+            auto* mat_member = str->Members()[mat_member_idx];
+            auto mat_columns = *std140_mat_members.Get(mat_member);
+            expr = b.MemberAccessor(expr, mat_columns[column_idx]->symbol);
+            ty = mat_member->Type()->As<sem::Matrix>()->ColumnType();
+        } else {
+            // Non-structure-member matrix. The columns are decomposed into a new, bespoke std140
+            // structure.
+            auto [new_expr, new_ty, _] =
+                BuildAccessExpr(expr, ty, chain, std140_mat_idx, dynamic_index);
+            expr = new_expr;
+            ty = new_ty;
+            auto* mat = ty->As<sem::Matrix>();
+            auto std140_mat = std140_mats.Get(mat);
+            expr = b.MemberAccessor(expr, std140_mat->columns[column_idx]);
+            ty = mat->ColumnType();
+        }
+
+        // Build any remaining accesses into the column
+        for (size_t i = std140_mat_idx + 2; i < chain.indices.Length(); i++) {
+            auto [new_expr, new_ty, _] = BuildAccessExpr(expr, ty, chain, i, dynamic_index);
+            expr = new_expr;
+            ty = new_ty;
         }
         return expr;
     }
@@ -687,27 +827,27 @@
     /// access chain.
     /// The generated function uses a WGSL switch statement to dynamically select the decomposed
     /// matrix column.
-    /// @param access the access chain from the uniform buffer to part of the matrix (column,
+    /// @param chain the access chain from the uniform buffer to part of the matrix (column,
     ///               column-swizzle, or element).
     /// @note The matrix column must be dynamically indexed to use this method.
     /// @returns the generated function name.
-    Symbol BuildLoadPartialMatrixFn(const AccessChain& access) {
+    Symbol BuildLoadPartialMatrixFn(const AccessChain& chain) {
         // Build the dynamic index parameters
-        auto dynamic_index_params = utils::Transform(access.dynamic_indices, [&](auto*, size_t i) {
+        auto dynamic_index_params = utils::Transform(chain.dynamic_indices, [&](auto*, size_t i) {
             return b.Param("p" + std::to_string(i), b.ty.u32());
         });
         // Method for generating dynamic index expressions.
         // These are passed in as arguments to the function.
         auto dynamic_index = [&](size_t idx) { return b.Expr(dynamic_index_params[idx]->symbol); };
 
-        // Fetch the access chain indices of the matrix access and the parameter index that holds
-        // the matrix column index.
-        auto std140_mat_idx = *access.std140_mat_idx;
-        auto column_param_idx = std::get<DynamicIndex>(access.indices[std140_mat_idx + 1]).slot;
+        // Fetch the access chain indices of the matrix access and the parameter index that
+        // holds the matrix column index.
+        auto std140_mat_idx = *chain.std140_mat_idx;
+        auto column_param_idx = std::get<DynamicIndex>(chain.indices[std140_mat_idx + 1]).slot;
 
         // Begin building the function name. This is extended with logic in the loop below
         // (when column_idx == 0).
-        std::string name = "load_" + sym.NameFor(access.var->Declaration()->symbol);
+        std::string name = "load";
 
         // The switch cases
         utils::Vector<const ast::CaseStatement*, 4> cases;
@@ -716,41 +856,57 @@
         const sem::Type* ret_ty = nullptr;
 
         // Build switch() cases for each column of the matrix
-        auto num_columns = access.std140_mat_ty->columns();
+        auto num_columns = chain.std140_mat_ty->columns();
         for (uint32_t column_idx = 0; column_idx < num_columns; column_idx++) {
-            const ast::Expression* expr = b.Expr(ctx.Clone(access.var->Declaration()->symbol));
-            const sem::Type* ty = access.var->Type()->UnwrapRef();
-            // Build the expression up to, but not including the matrix member
-            for (size_t i = 0; i < access.std140_mat_idx; i++) {
+            const ast::Expression* expr = nullptr;
+            const sem::Type* ty = nullptr;
+
+            // Build the expression up to, but not including the matrix
+            for (size_t i = 0; i < std140_mat_idx; i++) {
                 auto [new_expr, new_ty, access_name] =
-                    BuildAccessExpr(expr, ty, access.indices[i], dynamic_index);
+                    BuildAccessExpr(expr, ty, chain, i, dynamic_index);
                 expr = new_expr;
                 ty = new_ty;
                 if (column_idx == 0) {
-                    name = name + "_" + access_name;
+                    name += "_" + access_name;
                 }
             }
 
-            // Get the matrix member that was dynamically accessed.
-            auto mat_member_idx = std::get<u32>(access.indices[std140_mat_idx]);
-            auto* mat_member = ty->As<sem::Struct>()->Members()[mat_member_idx];
-            auto mat_columns = *std140_mats.Get(mat_member);
-            if (column_idx == 0) {
-                name = name + +"_" + sym.NameFor(mat_member->Name()) + "_p" +
-                       std::to_string(column_param_idx);
-            }
-
-            // Build the expression to the column vector member.
-            expr = b.MemberAccessor(expr, mat_columns[column_idx]->symbol);
-            ty = mat_member->Type()->As<sem::Matrix>()->ColumnType();
-            // Build the rest of the expression, skipping over the column index.
-            for (size_t i = std140_mat_idx + 2; i < access.indices.Length(); i++) {
-                auto [new_expr, new_ty, access_name] =
-                    BuildAccessExpr(expr, ty, access.indices[i], dynamic_index);
+            if (auto* str = tint::As<sem::Struct>(ty)) {
+                // Structure member matrix. The columns are decomposed into the structure.
+                auto mat_member_idx = std::get<u32>(chain.indices[std140_mat_idx]);
+                auto* mat_member = str->Members()[mat_member_idx];
+                if (column_idx == 0) {
+                    name += "_" + sym.NameFor(mat_member->Name()) + "_p" +
+                            std::to_string(column_param_idx);
+                }
+                auto mat_columns = *std140_mat_members.Get(mat_member);
+                expr = b.MemberAccessor(expr, mat_columns[column_idx]->symbol);
+                ty = mat_member->Type()->As<sem::Matrix>()->ColumnType();
+            } else {
+                // Non-structure-member matrix. The columns are decomposed into a new, bespoke
+                // std140 structure.
+                auto [new_expr, new_ty, mat_name] =
+                    BuildAccessExpr(expr, ty, chain, std140_mat_idx, dynamic_index);
                 expr = new_expr;
                 ty = new_ty;
                 if (column_idx == 0) {
-                    name = name + "_" + access_name;
+                    name += "_" + mat_name + "_p" + std::to_string(column_param_idx);
+                }
+                auto* mat = ty->As<sem::Matrix>();
+                auto std140_mat = std140_mats.Get(ty->As<sem::Matrix>());
+                expr = b.MemberAccessor(expr, std140_mat->columns[column_idx]);
+                ty = mat->ColumnType();
+            }
+
+            // Build the rest of the expression, skipping over the column index.
+            for (size_t i = std140_mat_idx + 2; i < chain.indices.Length(); i++) {
+                auto [new_expr, new_ty, access_name] =
+                    BuildAccessExpr(expr, ty, chain, i, dynamic_index);
+                expr = new_expr;
+                ty = new_ty;
+                if (column_idx == 0) {
+                    name += "_" + access_name;
                 }
             }
 
@@ -764,7 +920,8 @@
         }
 
         // Build the default case (required in WGSL).
-        // This just returns a zero value of the return type, as the index must be out of bounds.
+        // This just returns a zero value of the return type, as the index must be out of
+        // bounds.
         cases.Push(b.DefaultCase(b.Block(b.Return(b.Construct(CreateASTTypeFor(ctx, ret_ty))))));
 
         auto* column_selector = dynamic_index(column_param_idx);
@@ -779,30 +936,30 @@
     /// Generates a function to load a whole std140-decomposed matrix from a uniform buffer.
     /// The generated function will have a parameter per dynamic (runtime-evaluated) index in the
     /// access chain.
-    /// @param access the access chain from the uniform buffer to the whole std140-decomposed
+    /// @param chain the access chain from the uniform buffer to the whole std140-decomposed
     ///        matrix.
     /// @returns the generated function name.
-    Symbol BuildLoadWholeMatrixFn(const AccessChain& access) {
+    Symbol BuildLoadWholeMatrixFn(const AccessChain& chain) {
         // Build the dynamic index parameters
-        auto dynamic_index_params = utils::Transform(access.dynamic_indices, [&](auto*, size_t i) {
+        auto dynamic_index_params = utils::Transform(chain.dynamic_indices, [&](auto*, size_t i) {
             return b.Param("p" + std::to_string(i), b.ty.u32());
         });
         // Method for generating dynamic index expressions.
         // These are passed in as arguments to the function.
         auto dynamic_index = [&](size_t idx) { return b.Expr(dynamic_index_params[idx]->symbol); };
 
-        const ast::Expression* expr = b.Expr(ctx.Clone(access.var->Declaration()->symbol));
-        std::string name = sym.NameFor(access.var->Declaration()->symbol);
-        const sem::Type* ty = access.var->Type()->UnwrapRef();
+        const ast::Expression* expr = nullptr;
+        const sem::Type* ty = nullptr;
+        std::string name = "load";
 
         // Build the expression up to, but not including the matrix member
-        auto std140_mat_idx = *access.std140_mat_idx;
+        auto std140_mat_idx = *chain.std140_mat_idx;
         for (size_t i = 0; i < std140_mat_idx; i++) {
             auto [new_expr, new_ty, access_name] =
-                BuildAccessExpr(expr, ty, access.indices[i], dynamic_index);
+                BuildAccessExpr(expr, ty, chain, i, dynamic_index);
             expr = new_expr;
             ty = new_ty;
-            name = name + "_" + access_name;
+            name += "_" + access_name;
         }
 
         utils::Vector<const ast::Statement*, 2> stmts;
@@ -811,25 +968,41 @@
         auto* let = b.Let("s", b.AddressOf(expr));
         stmts.Push(b.Decl(let));
 
-        // Gather the decomposed matrix columns
-        auto mat_member_idx = std::get<u32>(access.indices[std140_mat_idx]);
-        auto* mat_member = ty->As<sem::Struct>()->Members()[mat_member_idx];
-        auto mat_columns = *std140_mats.Get(mat_member);
-        auto columns = utils::Transform(mat_columns, [&](auto* column_member) {
-            return b.MemberAccessor(b.Deref(let), column_member->symbol);
-        });
+        utils::Vector<const ast::MemberAccessorExpression*, 4> columns;
+        if (auto* str = tint::As<sem::Struct>(ty)) {
+            // Structure member matrix. The columns are decomposed into the structure.
+            auto mat_member_idx = std::get<u32>(chain.indices[std140_mat_idx]);
+            auto* mat_member = str->Members()[mat_member_idx];
+            auto mat_columns = *std140_mat_members.Get(mat_member);
+            columns = utils::Transform(mat_columns, [&](auto* column_member) {
+                return b.MemberAccessor(b.Deref(let), column_member->symbol);
+            });
+            ty = mat_member->Type();
+            name += "_" + sym.NameFor(mat_member->Name());
+        } else {
+            // Non-structure-member matrix; columns live in a new, bespoke std140 structure.
+            // NOTE(review): `ty` and `s` are not updated for this access (new_ty unused).
+            auto [new_expr, new_ty, mat_name] =
+                BuildAccessExpr(expr, ty, chain, std140_mat_idx, dynamic_index);
+            expr = new_expr;
+            auto* mat = ty->As<sem::Matrix>();
+            auto std140_mat = std140_mats.Get(ty->As<sem::Matrix>());
+            columns = utils::Transform(std140_mat->columns, [&](auto column_name) {
+                return b.MemberAccessor(b.Deref(let), column_name);
+            });
+            ty = mat;
+            name += "_" + mat_name;
+        }
 
         // Reconstruct the matrix from the columns
-        expr = b.Construct(CreateASTTypeFor(ctx, access.std140_mat_ty), std::move(columns));
-        ty = mat_member->Type();
-        name = name + "_" + sym.NameFor(mat_member->Name());
+        expr = b.Construct(CreateASTTypeFor(ctx, chain.std140_mat_ty), std::move(columns));
 
         // Have the function return the constructed matrix
         stmts.Push(b.Return(expr));
 
         // Build the function
         auto* ret_ty = CreateASTTypeFor(ctx, ty);
-        auto fn_sym = b.Symbols().New("load_" + name);
+        auto fn_sym = b.Symbols().New(name);
         b.Func(fn_sym, std::move(dynamic_index_params), ret_ty, std::move(stmts));
         return fn_sym;
     }
@@ -847,14 +1020,24 @@
     /// Builds a single access in an access chain.
     /// @param lhs the expression to index using @p access
     /// @param ty the type of the expression @p lhs
-    /// @param access the access index to perform on @p lhs
+    /// @param chain the access chain holding the access (at @p index) to perform on @p lhs
     /// @param dynamic_index a function that obtains the i'th dynamic index
     /// @returns a ExprTypeName which holds the new expression, new type and a name segment which
     ///          can be used for creating helper function names.
     ExprTypeName BuildAccessExpr(const ast::Expression* lhs,
                                  const sem::Type* ty,
-                                 AccessIndex access,
+                                 const AccessChain& chain,
+                                 size_t index,
                                  std::function<const ast::Expression*(size_t)> dynamic_index) {
+        auto& access = chain.indices[index];
+
+        if (std::get_if<UniformVariable>(&access)) {
+            const auto* expr = b.Expr(ctx.Clone(chain.var->Declaration()->symbol));
+            const auto name = ctx.src->Symbols().NameFor(chain.var->Declaration()->symbol);
+            ty = chain.var->Type()->UnwrapRef();
+            return {expr, ty, name};
+        }
+
         if (auto* dyn_idx = std::get_if<DynamicIndex>(&access)) {
             /// The access uses a dynamic (runtime-expression) index.
             auto name = "p" + std::to_string(dyn_idx->slot);
diff --git a/src/tint/transform/std140.h b/src/tint/transform/std140.h
index f987805..f41b1e3 100644
--- a/src/tint/transform/std140.h
+++ b/src/tint/transform/std140.h
@@ -19,13 +19,12 @@
 
 namespace tint::transform {
 
-/// Std140 is a transform that forks structures used in the uniform storage class that contain
-/// `matNx2<f32>` matrices into `N`x`vec2<f32>` column vectors. Structure types that transitively
-/// use these forked structures as members are also forked. `var<uniform>` variables will use these
-/// forked structures, and expressions loading from these variables will do appropriate conversions
-/// to the regular WGSL types. As `matNx2<f32>` matrices are the only type that violate
-/// std140-layout, this transformation is sufficient to have any WGSL structure be std140-layout
-/// conformant.
+/// Std140 is a transform that forks types used in the uniform storage class that contain
+/// `matNx2<f32>` matrices into `N`x`vec2<f32>` column vectors. Types that transitively use these
+/// forked types are also forked. `var<uniform>` variables will use these forked types, and
+/// expressions loading from these variables will do appropriate conversions to the regular WGSL
+/// types. As `matNx2<f32>` matrices are the only type that violates std140-layout, this
+/// transformation is sufficient to have any WGSL structure be std140-layout conformant.
 ///
 /// @note This transform requires the PromoteSideEffectsToDecl transform to have been run first.
 class Std140 final : public Castable<Std140, Transform> {
diff --git a/src/tint/transform/std140_test.cc b/src/tint/transform/std140_test.cc
index 57e23e6..97936ad 100644
--- a/src/tint/transform/std140_test.cc
+++ b/src/tint/transform/std140_test.cc
@@ -109,7 +109,7 @@
 
     src = utils::ReplaceAll(src, "${mat}", GetParam().Mat());
 
-    EXPECT_FALSE(ShouldRun<Std140>(src));
+    EXPECT_EQ(ShouldRun<Std140>(src), GetParam().should_run);
 }
 
 INSTANTIATE_TEST_SUITE_P(Std140TestShouldRun,
@@ -129,7 +129,7 @@
 TEST_F(Std140Test, EmptyModule) {
     auto* src = R"()";
 
-    auto* expect = R"()";
+    auto* expect = src;
 
     auto got = Run<Std140>(src);
 
@@ -1037,7 +1037,7 @@
   return S(mat3x2<f32>(val.m_0, val.m_1, val.m_2));
 }
 
-fn conv_arr_3_S(val : array<S_std140, 3u>) -> array<S, 3u> {
+fn conv_arr3_S(val : array<S_std140, 3u>) -> array<S, 3u> {
   var arr : array<S, 3u>;
   for(var i : u32; (i < 3u); i = (i + 1)) {
     arr[i] = conv_S(val[i]);
@@ -1046,7 +1046,7 @@
 }
 
 fn f() {
-  let l = conv_arr_3_S(a);
+  let l = conv_arr3_S(a);
 }
 )";
 
@@ -1563,6 +1563,1330 @@
     EXPECT_EQ(expect, str(got));
 }
 
+TEST_F(Std140Test, Mat4x2Uniform_LoadMatrix) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> m : mat4x2<f32>;
+
+fn f() {
+  let l = m;
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> m : mat4x2_f32;
+
+fn conv_mat4x2_f32(val : mat4x2_f32) -> mat4x2<f32> {
+  return mat4x2<f32>(val.col0, val.col1, val.col2, val.col3);
+}
+
+fn f() {
+  let l = conv_mat4x2_f32(m);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat2x2Uniform_LoadColumn0) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat2x2<f32>;
+
+fn f() {
+  let l = a[0];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat2x2_f32;
+
+fn f() {
+  let l = a.col0;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat4x2Uniform_LoadColumn1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat4x2<f32>;
+
+fn f() {
+  let l = a[1];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat4x2_f32;
+
+fn f() {
+  let l = a.col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat2x2Uniform_LoadColumnI) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat2x2<f32>;
+
+fn f() {
+  let I = 1;
+
+  let l = a[I];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat2x2_f32;
+
+fn load_a_p0(p0 : u32) -> vec2<f32> {
+  switch(p0) {
+    case 0u: {
+      return a.col0;
+    }
+    case 1u: {
+      return a.col1;
+    }
+    default: {
+      return vec2<f32>();
+    }
+  }
+}
+
+fn f() {
+  let I = 1;
+  let l = load_a_p0(u32(I));
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat2x2Uniform_LoadColumn1Swizzle) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat2x2<f32>;
+
+fn f() {
+  let l = a[1].yx;
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat2x2_f32;
+
+fn f() {
+  let l = a.col1.yx;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat4x2Uniform_LoadColumnISwizzle) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat4x2<f32>;
+
+fn f() {
+  let I = 1;
+
+  let l = a[I].yx;
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat4x2_f32;
+
+fn load_a_p0_yx(p0 : u32) -> vec2<f32> {
+  switch(p0) {
+    case 0u: {
+      return a.col0.yx;
+    }
+    case 1u: {
+      return a.col1.yx;
+    }
+    case 2u: {
+      return a.col2.yx;
+    }
+    case 3u: {
+      return a.col3.yx;
+    }
+    default: {
+      return vec2<f32>();
+    }
+  }
+}
+
+fn f() {
+  let I = 1;
+  let l = load_a_p0_yx(u32(I));
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat2x2Uniform_LoadColumn1Element1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat2x2<f32>;
+
+fn f() {
+  let l = a[1][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat2x2_f32;
+
+fn f() {
+  let l = a.col1[1u];
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, Mat4x2Uniform_LoadColumnIElementI) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : mat4x2<f32>;
+
+fn f() {
+  let I = 1;
+
+  let l = a[I][I];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : mat4x2_f32;
+
+fn load_a_p0_p1(p0 : u32, p1 : u32) -> f32 {
+  switch(p0) {
+    case 0u: {
+      return a.col0[p1];
+    }
+    case 1u: {
+      return a.col1[p1];
+    }
+    case 2u: {
+      return a.col2[p1];
+    }
+    case 3u: {
+      return a.col3[p1];
+    }
+    default: {
+      return f32();
+    }
+  }
+}
+
+fn f() {
+  let I = 1;
+  let l = load_a_p0_p1(u32(I), u32(I));
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat2x2Uniform_LoadArray) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat2x2<f32>, 3>;
+
+fn f() {
+  let l = a;
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat2x2_f32, 3u>;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn conv_arr3_mat2x2_f32(val : array<mat2x2_f32, 3u>) -> array<mat2x2<f32>, 3u> {
+  var arr : array<mat2x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn f() {
+  let l = conv_arr3_mat2x2_f32(a);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat4x2Uniform_LoadMatrix0) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat4x2<f32>, 3>;
+
+fn f() {
+  let l = a[0];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat4x2_f32, 3u>;
+
+fn conv_mat4x2_f32(val : mat4x2_f32) -> mat4x2<f32> {
+  return mat4x2<f32>(val.col0, val.col1, val.col2, val.col3);
+}
+
+fn f() {
+  let l = conv_mat4x2_f32(a[0u]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat2x2Uniform_LoadMatrix1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat2x2<f32>, 3>;
+
+fn f() {
+  let l = a[1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat2x2_f32, 3u>;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn f() {
+  let l = conv_mat2x2_f32(a[1u]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat4x2Uniform_LoadMatrixI) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat4x2<f32>, 3>;
+
+fn f() {
+  let I = 1;
+  let l = a[I];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat4x2_f32, 3u>;
+
+fn conv_mat4x2_f32(val : mat4x2_f32) -> mat4x2<f32> {
+  return mat4x2<f32>(val.col0, val.col1, val.col2, val.col3);
+}
+
+fn f() {
+  let I = 1;
+  let l = conv_mat4x2_f32(a[I]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat2x2Uniform_LoadMatrix1Column0) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat2x2<f32>, 3>;
+
+fn f() {
+  let l = a[1][0];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat2x2_f32, 3u>;
+
+fn f() {
+  let l = a[1u].col0;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat4x2Uniform_LoadMatrix0Column1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat4x2<f32>, 3>;
+
+fn f() {
+  let l = a[0][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat4x2_f32, 3u>;
+
+fn f() {
+  let l = a[0u].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat2x2Uniform_LoadMatrixIColumn1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat2x2<f32>, 3>;
+
+fn f() {
+  let I = 1;
+  let l = a[I][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat2x2_f32, 3u>;
+
+fn f() {
+  let I = 1;
+  let l = a[I].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayMat4x2Uniform_LoadMatrix1ColumnI) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<mat4x2<f32>, 3>;
+
+fn f() {
+  let I = 1;
+  let l = a[1][I];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<mat4x2_f32, 3u>;
+
+fn load_a_1_p0(p0 : u32) -> vec2<f32> {
+  switch(p0) {
+    case 0u: {
+      return a[1u].col0;
+    }
+    case 1u: {
+      return a[1u].col1;
+    }
+    case 2u: {
+      return a[1u].col2;
+    }
+    case 3u: {
+      return a[1u].col3;
+    }
+    default: {
+      return vec2<f32>();
+    }
+  }
+}
+
+fn f() {
+  let I = 1;
+  let l = load_a_1_p0(u32(I));
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat2x2Uniform_LoadArrays) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2<f32>, 3>, 4>;
+
+fn f() {
+  let l = a;
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2_f32, 3u>, 4u>;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn conv_arr3_mat2x2_f32(val : array<mat2x2_f32, 3u>) -> array<mat2x2<f32>, 3u> {
+  var arr : array<mat2x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn conv_arr4_arr3_mat2x2_f32(val : array<array<mat2x2_f32, 3u>, 4u>) -> array<array<mat2x2<f32>, 3u>, 4u> {
+  var arr : array<array<mat2x2<f32>, 3u>, 4u>;
+  for(var i : u32; (i < 4u); i = (i + 1)) {
+    arr[i] = conv_arr3_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn f() {
+  let l = conv_arr4_arr3_mat2x2_f32(a);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat4x2Uniform_LoadArray0) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat4x2<f32>, 3>, 4>;
+
+fn f() {
+  let l = a[0];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat4x2_f32, 3u>, 4u>;
+
+fn conv_mat4x2_f32(val : mat4x2_f32) -> mat4x2<f32> {
+  return mat4x2<f32>(val.col0, val.col1, val.col2, val.col3);
+}
+
+fn conv_arr3_mat4x2_f32(val : array<mat4x2_f32, 3u>) -> array<mat4x2<f32>, 3u> {
+  var arr : array<mat4x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat4x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn f() {
+  let l = conv_arr3_mat4x2_f32(a[0u]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat2x2Uniform_LoadArray1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2<f32>, 3>,4>;
+
+fn f() {
+  let l = a[1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2_f32, 3u>, 4u>;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn conv_arr3_mat2x2_f32(val : array<mat2x2_f32, 3u>) -> array<mat2x2<f32>, 3u> {
+  var arr : array<mat2x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn f() {
+  let l = conv_arr3_mat2x2_f32(a[1u]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat2x2Uniform_LoadArrayI) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2<f32>, 3>,4>;
+
+fn f() {
+  let I = 1;
+  let l = a[I];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2_f32, 3u>, 4u>;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn conv_arr3_mat2x2_f32(val : array<mat2x2_f32, 3u>) -> array<mat2x2<f32>, 3u> {
+  var arr : array<mat2x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn f() {
+  let I = 1;
+  let l = conv_arr3_mat2x2_f32(a[I]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+TEST_F(Std140Test, ArrayArrayMat2x2Uniform_LoadMatrix12Column0) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2<f32>, 3>, 4>;
+
+fn f() {
+  let l = a[1][2][0];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2_f32, 3u>, 4u>;
+
+fn f() {
+  let l = a[1u][2u].col0;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat4x2Uniform_LoadMatrix2IColumn1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat4x2<f32>, 3>, 4>;
+
+fn f() {
+  let I = 1;
+  let l = a[2][I][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat4x2_f32, 3u>, 4u>;
+
+fn f() {
+  let I = 1;
+  let l = a[2u][I].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat2x2Uniform_LoadMatrixI2Column1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2<f32>, 3>, 4>;
+
+fn f() {
+  let I = 1;
+  let l = a[I][2][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2_f32, 3u>, 4u>;
+
+fn f() {
+  let I = 1;
+  let l = a[I][2u].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat2x2Uniform_LoadMatrixIIColumn1) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2<f32>, 3>, 4>;
+
+fn f() {
+  let I = 1;
+  let l = a[I][I][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat2x2_f32, 3u>, 4u>;
+
+fn f() {
+  let I = 1;
+  let l = a[I][I].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, ArrayArrayMat4x2Uniform_LoadMatrix12ColumnI) {
+    auto* src = R"(
+@group(0) @binding(0) var<uniform> a : array<array<mat4x2<f32>, 3>, 4>;
+
+fn f() {
+  let I = 1;
+  let l = a[1][2][I];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> a : array<array<mat4x2_f32, 3u>, 4u>;
+
+fn load_a_1_2_p0(p0 : u32) -> vec2<f32> {
+  switch(p0) {
+    case 0u: {
+      return a[1u][2u].col0;
+    }
+    case 1u: {
+      return a[1u][2u].col1;
+    }
+    case 2u: {
+      return a[1u][2u].col2;
+    }
+    case 3u: {
+      return a[1u][2u].col3;
+    }
+    default: {
+      return vec2<f32>();
+    }
+  }
+}
+
+fn f() {
+  let I = 1;
+  let l = load_a_1_2_p0(u32(I));
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat2x2Uniform_LoadStruct) {
+    auto* src = R"(
+struct S {
+  a : array<mat2x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let l = s;
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat2x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat2x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn conv_arr3_mat2x2_f32(val : array<mat2x2_f32, 3u>) -> array<mat2x2<f32>, 3u> {
+  var arr : array<mat2x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn conv_S(val : S_std140) -> S {
+  return S(conv_arr3_mat2x2_f32(val.a));
+}
+
+fn f() {
+  let l = conv_S(s);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat2x2Uniform_LoadArray) {
+    auto* src = R"(
+struct S {
+  a : array<mat2x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let l = s.a;
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat2x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat2x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn conv_arr3_mat2x2_f32(val : array<mat2x2_f32, 3u>) -> array<mat2x2<f32>, 3u> {
+  var arr : array<mat2x2<f32>, 3u>;
+  for(var i : u32; (i < 3u); i = (i + 1)) {
+    arr[i] = conv_mat2x2_f32(val[i]);
+  }
+  return arr;
+}
+
+fn f() {
+  let l = conv_arr3_mat2x2_f32(s.a);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat4x2Uniform_LoadMatrix0) {
+    auto* src = R"(
+struct S {
+  a : array<mat4x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let l = s.a[0];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat4x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat4x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn conv_mat4x2_f32(val : mat4x2_f32) -> mat4x2<f32> {
+  return mat4x2<f32>(val.col0, val.col1, val.col2, val.col3);
+}
+
+fn f() {
+  let l = conv_mat4x2_f32(s.a[0u]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat2x2Uniform_LoadMatrix1) {
+    auto* src = R"(
+struct S {
+  a : array<mat2x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let l = s.a[1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat2x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat2x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn conv_mat2x2_f32(val : mat2x2_f32) -> mat2x2<f32> {
+  return mat2x2<f32>(val.col0, val.col1);
+}
+
+fn f() {
+  let l = conv_mat2x2_f32(s.a[1u]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat4x2Uniform_LoadMatrixI) {
+    auto* src = R"(
+struct S {
+  a : array<mat4x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let I = 1;
+  let l = s.a[I];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat4x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat4x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn conv_mat4x2_f32(val : mat4x2_f32) -> mat4x2<f32> {
+  return mat4x2<f32>(val.col0, val.col1, val.col2, val.col3);
+}
+
+fn f() {
+  let I = 1;
+  let l = conv_mat4x2_f32(s.a[I]);
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat2x2Uniform_LoadMatrix1Column0) {
+    auto* src = R"(
+struct S {
+  a : array<mat2x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let l = s.a[1][0];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat2x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat2x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn f() {
+  let l = s.a[1u].col0;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat4x2Uniform_LoadMatrix0Column1) {
+    auto* src = R"(
+struct S {
+  a : array<mat4x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let l = s.a[0][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat4x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat4x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn f() {
+  let l = s.a[0u].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat2x2Uniform_LoadMatrixIColumn1) {
+    auto* src = R"(
+struct S {
+  a : array<mat2x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let I = 1;
+  let l = s.a[I][1];
+}
+)";
+
+    auto* expect = R"(
+struct mat2x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat2x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat2x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn f() {
+  let I = 1;
+  let l = s.a[I].col1;
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(Std140Test, StructArrayMat4x2Uniform_LoadMatrix1ColumnI) {
+    auto* src = R"(
+struct S {
+  a : array<mat4x2<f32>, 3>,
+};
+
+@group(0) @binding(0) var<uniform> s : S;
+
+fn f() {
+  let I = 1;
+  let l = s.a[1][I];
+}
+)";
+
+    auto* expect = R"(
+struct mat4x2_f32 {
+  col0 : vec2<f32>,
+  col1 : vec2<f32>,
+  col2 : vec2<f32>,
+  col3 : vec2<f32>,
+}
+
+struct S {
+  a : array<mat4x2<f32>, 3>,
+}
+
+struct S_std140 {
+  a : array<mat4x2_f32, 3u>,
+}
+
+@group(0) @binding(0) var<uniform> s : S_std140;
+
+fn load_s_a_1_p0(p0 : u32) -> vec2<f32> {
+  switch(p0) {
+    case 0u: {
+      return s.a[1u].col0;
+    }
+    case 1u: {
+      return s.a[1u].col1;
+    }
+    case 2u: {
+      return s.a[1u].col2;
+    }
+    case 3u: {
+      return s.a[1u].col3;
+    }
+    default: {
+      return vec2<f32>();
+    }
+  }
+}
+
+fn f() {
+  let I = 1;
+  let l = load_s_a_1_p0(u32(I));
+}
+)";
+
+    auto got = Run<Std140>(src);
+
+    EXPECT_EQ(expect, str(got));
+}
+
 TEST_F(Std140Test, ArrayStructArrayStructMat4x2Uniform_Loads) {
     auto* src = R"(
 struct Inner {
@@ -1614,7 +2938,7 @@
   return Inner(mat4x2<f32>(val.m_0, val.m_1, val.m_2, val.m_3));
 }
 
-fn conv_arr_4_Inner(val : array<Inner_std140, 4u>) -> array<Inner, 4u> {
+fn conv_arr4_Inner(val : array<Inner_std140, 4u>) -> array<Inner, 4u> {
   var arr : array<Inner, 4u>;
   for(var i : u32; (i < 4u); i = (i + 1)) {
     arr[i] = conv_Inner(val[i]);
@@ -1623,10 +2947,10 @@
 }
 
 fn conv_Outer(val : Outer_std140) -> Outer {
-  return Outer(conv_arr_4_Inner(val.a));
+  return Outer(conv_arr4_Inner(val.a));
 }
 
-fn conv_arr_4_Outer(val : array<Outer_std140, 4u>) -> array<Outer, 4u> {
+fn conv_arr4_Outer(val : array<Outer_std140, 4u>) -> array<Outer, 4u> {
   var arr : array<Outer, 4u>;
   for(var i : u32; (i < 4u); i = (i + 1)) {
     arr[i] = conv_Outer(val[i]);
@@ -1641,9 +2965,9 @@
 
 fn f() {
   let I = 1;
-  let l_a : array<Outer, 4> = conv_arr_4_Outer(a);
+  let l_a : array<Outer, 4> = conv_arr4_Outer(a);
   let l_a_1 : Outer = conv_Outer(a[1u]);
-  let l_a_2_a : array<Inner, 4> = conv_arr_4_Inner(a[2u].a);
+  let l_a_2_a : array<Inner, 4> = conv_arr4_Inner(a[2u].a);
   let l_a_3_a_1 : Inner = conv_Inner(a[3u].a[1u]);
   let l_a_0_a_2_m : mat4x2<f32> = load_a_0_a_2_m();
   let l_a_1_a_3_m_0 : vec2<f32> = a[1u].a[3u].m_0;
@@ -1717,7 +3041,7 @@
   return Inner(mat4x2<f32>(val.m_0, val.m_1, val.m_2, val.m_3));
 }
 
-fn conv_arr_4_Inner(val : array<Inner_std140, 4u>) -> array<Inner, 4u> {
+fn conv_arr4_Inner(val : array<Inner_std140, 4u>) -> array<Inner, 4u> {
   var arr : array<Inner, 4u>;
   for(var i : u32; (i < 4u); i = (i + 1)) {
     arr[i] = conv_Inner(val[i]);
@@ -1726,10 +3050,10 @@
 }
 
 fn conv_Outer(val : Outer_std140) -> Outer {
-  return Outer(conv_arr_4_Inner(val.a));
+  return Outer(conv_arr4_Inner(val.a));
 }
 
-fn conv_arr_4_Outer(val : array<Outer_std140, 4u>) -> array<Outer, 4u> {
+fn conv_arr4_Outer(val : array<Outer_std140, 4u>) -> array<Outer, 4u> {
   var arr : array<Outer, 4u>;
   for(var i : u32; (i < 4u); i = (i + 1)) {
     arr[i] = conv_Outer(val[i]);
@@ -1744,15 +3068,15 @@
 
 fn f() {
   let I = 1;
-  let p_a = conv_arr_4_Outer(a);
+  let p_a = conv_arr4_Outer(a);
   let p_a_3 = conv_Outer(a[3u]);
-  let p_a_3_a = conv_arr_4_Inner(a[3u].a);
+  let p_a_3_a = conv_arr4_Inner(a[3u].a);
   let p_a_3_a_2 = conv_Inner(a[3u].a[2u]);
   let p_a_3_a_2_m = load_a_3_a_2_m();
   let p_a_3_a_2_m_1 = a[3u].a[2u].m_1;
-  let l_a : array<Outer, 4> = conv_arr_4_Outer(a);
+  let l_a : array<Outer, 4> = conv_arr4_Outer(a);
   let l_a_3 : Outer = conv_Outer(a[3u]);
-  let l_a_3_a : array<Inner, 4> = conv_arr_4_Inner(a[3u].a);
+  let l_a_3_a : array<Inner, 4> = conv_arr4_Inner(a[3u].a);
   let l_a_3_a_2 : Inner = conv_Inner(a[3u].a[2u]);
   let l_a_3_a_2_m : mat4x2<f32> = load_a_3_a_2_m();
   let l_a_3_a_2_m_1 : vec2<f32> = a[3u].a[2u].m_1;
@@ -1801,7 +3125,7 @@
   return S(mat3x2<f32>(val.m_0, val.m_1, val.m_2));
 }
 
-fn conv_arr_4_S(val : array<S_std140, 4u>) -> array<S, 4u> {
+fn conv_arr4_S(val : array<S_std140, 4u>) -> array<S, 4u> {
   var arr : array<S, 4u>;
   for(var i : u32; (i < 4u); i = (i + 1)) {
     arr[i] = conv_S(val[i]);
@@ -1810,7 +3134,7 @@
 }
 
 fn f() {
-  s = conv_arr_4_S(u);
+  s = conv_arr4_S(u);
 }
 )";
 
@@ -1827,7 +3151,7 @@
 }
 
 @group(0) @binding(0) var<uniform> u : array<S, 4>;
-@group(0) @binding(1) var<workgroup> w : array<S, 4>;
+var<workgroup> w : array<S, 4>;
 
 fn f() {
     w[0] = u[1];
@@ -1835,9 +3159,32 @@
 )";
 
     auto* expect =
-        R"(test:8:38 error: non-resource variables must not have @group or @binding attributes
-@group(0) @binding(1) var<workgroup> w : array<S, 4>;
-                                     ^
+        R"(
+struct S {
+  v : vec4<i32>,
+  @size(64)
+  m : mat3x2<f32>,
+}
+
+struct S_std140 {
+  v : vec4<i32>,
+  m_0 : vec2<f32>,
+  m_1 : vec2<f32>,
+  @size(48)
+  m_2 : vec2<f32>,
+}
+
+@group(0) @binding(0) var<uniform> u : array<S_std140, 4u>;
+
+var<workgroup> w : array<S, 4>;
+
+fn conv_S(val : S_std140) -> S {
+  return S(val.v, mat3x2<f32>(val.m_0, val.m_1, val.m_2));
+}
+
+fn f() {
+  w[0] = conv_S(u[1u]);
+}
 )";
 
     auto got = Run<Std140>(src);
diff --git a/src/tint/writer/glsl/generator_impl.cc b/src/tint/writer/glsl/generator_impl.cc
index 0771da9..d051981 100644
--- a/src/tint/writer/glsl/generator_impl.cc
+++ b/src/tint/writer/glsl/generator_impl.cc
@@ -221,10 +221,8 @@
     manager.Add<transform::CanonicalizeEntryPointIO>();
     manager.Add<transform::ExpandCompoundAssignment>();
     manager.Add<transform::PromoteSideEffectsToDecl>();
-    manager.Add<transform::Std140>();  // Must come after PromoteSideEffectsToDecl
     manager.Add<transform::PadStructs>();
     manager.Add<transform::UnwindDiscardFunctions>();
-    manager.Add<transform::SimplifyPointers>();
 
     manager.Add<transform::RemovePhonies>();
 
@@ -245,6 +243,12 @@
     manager.Add<transform::PromoteInitializersToLet>();
     manager.Add<transform::AddEmptyEntryPoint>();
     manager.Add<transform::AddBlockAttribute>();
+
+    // Std140 must come after PromoteSideEffectsToDecl and before SimplifyPointers.
+    manager.Add<transform::Std140>();
+
+    manager.Add<transform::SimplifyPointers>();
+
     data.Add<transform::CanonicalizeEntryPointIO::Config>(
         transform::CanonicalizeEntryPointIO::ShaderStyle::kGlsl);
 
diff --git a/src/tint/writer/spirv/generator_impl.cc b/src/tint/writer/spirv/generator_impl.cc
index f303324..bf92bf9 100644
--- a/src/tint/writer/spirv/generator_impl.cc
+++ b/src/tint/writer/spirv/generator_impl.cc
@@ -77,19 +77,26 @@
     manager.Add<transform::RemoveUnreachableStatements>();
     manager.Add<transform::ExpandCompoundAssignment>();
     manager.Add<transform::PromoteSideEffectsToDecl>();
-    manager.Add<transform::Std140>();  // Must come after PromoteSideEffectsToDecl
     manager.Add<transform::UnwindDiscardFunctions>();
     manager.Add<transform::SimplifyPointers>();  // Required for arrayLength()
     manager.Add<transform::RemovePhonies>();
     manager.Add<transform::VectorizeScalarMatrixConstructors>();
     manager.Add<transform::VectorizeMatrixConversions>();
-    manager.Add<transform::ForLoopToLoop>();  // Must come after
-    manager.Add<transform::WhileToLoop>();    // ZeroInitWorkgroupMemory
+    manager.Add<transform::WhileToLoop>();  // Must come after ZeroInitWorkgroupMemory
     manager.Add<transform::CanonicalizeEntryPointIO>();
     manager.Add<transform::AddEmptyEntryPoint>();
     manager.Add<transform::AddBlockAttribute>();
+
+    // Std140 must come after PromoteSideEffectsToDecl.
+    // Std140 must come before VarForDynamicIndex and ForLoopToLoop.
+    manager.Add<transform::Std140>();
+
+    // VarForDynamicIndex must come after Std140
     manager.Add<transform::VarForDynamicIndex>();
 
+    // ForLoopToLoop must come after Std140, ZeroInitWorkgroupMemory
+    manager.Add<transform::ForLoopToLoop>();
+
     data.Add<transform::CanonicalizeEntryPointIO::Config>(
         transform::CanonicalizeEntryPointIO::Config(
             transform::CanonicalizeEntryPointIO::ShaderStyle::kSpirv, 0xFFFFFFFF,