src/tint/lang/wgsl/ast/transform/vertex_pulling.cc - dawn - Git at Google

 // Copyright 2020 The Dawn & Tint Authors
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // 1. Redistributions of source code must retain the above copyright notice, this
 //    list of conditions and the following disclaimer.
 //
 // 2. Redistributions in binary form must reproduce the above copyright notice,
 //    this list of conditions and the following disclaimer in the documentation
 //    and/or other materials provided with the distribution.
 //
 // 3. Neither the name of the copyright holder nor the names of its
 //    contributors may be used to endorse or promote products derived from
 //    this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "src/tint/lang/wgsl/ast/transform/vertex_pulling.h"

 #include <algorithm>
 #include <utility>

 #include "src/tint/lang/core/builtin_value.h"
 #include "src/tint/lang/wgsl/ast/assignment_statement.h"
 #include "src/tint/lang/wgsl/ast/bitcast_expression.h"
 #include "src/tint/lang/wgsl/ast/variable_decl_statement.h"
 #include "src/tint/lang/wgsl/program/clone_context.h"
 #include "src/tint/lang/wgsl/program/program_builder.h"
 #include "src/tint/lang/wgsl/resolver/resolve.h"
 #include "src/tint/lang/wgsl/sem/variable.h"
 #include "src/tint/utils/containers/map.h"
 #include "src/tint/utils/macros/compiler.h"
 #include "src/tint/utils/math/math.h"
 #include "src/tint/utils/rtti/switch.h"
 #include "src/tint/utils/text/string_stream.h"

 TINT_INSTANTIATE_TYPEINFO(tint::ast::transform::VertexPulling);
 TINT_INSTANTIATE_TYPEINFO(tint::ast::transform::VertexPulling::Config);

 namespace tint::ast::transform {

 using namespace tint::core::fluent_types;     // NOLINT
 using namespace tint::core::number_suffixes;  // NOLINT

 namespace {

 /// The base WGSL type of a component.
 /// The format type is either this type or a vector of this type.
 enum class BaseWGSLType {
     kInvalid,
     kU32,
     kI32,
     kF32,
     kF16,
 };

 /// The data type of a vertex format.
 /// The format type is either this type or a vector of this type.
 enum class VertexDataType {
     kInvalid,
     kUInt,   // unsigned int
     kSInt,   // signed int
     kFloat,  // unsigned normalized, signed normalized, and float
 };

 /// Writes the VertexFormat to the stream.
 /// @param out the stream to write to
 /// @param format the VertexFormat to write
 /// @returns out so calls can be chained
 StringStream& operator<<(StringStream& out, VertexFormat format) {
     switch (format) {
         case VertexFormat::kUint8x2:
             return out << "uint8x2";
         case VertexFormat::kUint8x4:
             return out << "uint8x4";
         case VertexFormat::kSint8x2:
             return out << "sint8x2";
         case VertexFormat::kSint8x4:
             return out << "sint8x4";
         case VertexFormat::kUnorm8x2:
             return out << "unorm8x2";
         case VertexFormat::kUnorm8x4:
             return out << "unorm8x4";
         case VertexFormat::kSnorm8x2:
             return out << "snorm8x2";
         case VertexFormat::kSnorm8x4:
             return out << "snorm8x4";
         case VertexFormat::kUint16x2:
             return out << "uint16x2";
         case VertexFormat::kUint16x4:
             return out << "uint16x4";
         case VertexFormat::kSint16x2:
             return out << "sint16x2";
         case VertexFormat::kSint16x4:
             return out << "sint16x4";
         case VertexFormat::kUnorm16x2:
             return out << "unorm16x2";
         case VertexFormat::kUnorm16x4:
             return out << "unorm16x4";
         case VertexFormat::kSnorm16x2:
             return out << "snorm16x2";
         case VertexFormat::kSnorm16x4:
             return out << "snorm16x4";
         case VertexFormat::kFloat16x2:
             return out << "float16x2";
         case VertexFormat::kFloat16x4:
             return out << "float16x4";
         case VertexFormat::kFloat32:
             return out << "float32";
         case VertexFormat::kFloat32x2:
             return out << "float32x2";
         case VertexFormat::kFloat32x3:
             return out << "float32x3";
         case VertexFormat::kFloat32x4:
             return out << "float32x4";
         case VertexFormat::kUint32:
             return out << "uint32";
         case VertexFormat::kUint32x2:
             return out << "uint32x2";
         case VertexFormat::kUint32x3:
             return out << "uint32x3";
         case VertexFormat::kUint32x4:
             return out << "uint32x4";
         case VertexFormat::kSint32:
             return out << "sint32";
         case VertexFormat::kSint32x2:
             return out << "sint32x2";
         case VertexFormat::kSint32x3:
             return out << "sint32x3";
         case VertexFormat::kSint32x4:
             return out << "sint32x4";
         case VertexFormat::kUnorm10_10_10_2:
             return out << "unorm10-10-10-2";
     }
     return out << "<unknown>";
 }

 /// Type information of a vertex input attribute.
 struct AttributeWGSLType {
     BaseWGSLType base_type;
     uint32_t width;  // 1 for scalar, 2+ for a vector
 };

 /// Type information of a vertex format.
 struct VertexFormatType {
     VertexDataType base_type;
     uint32_t width;  // 1 for scalar, 2+ for a vector
 };

 // Check if base types match between the WGSL variable and the vertex format
 bool IsTypeCompatible(AttributeWGSLType wgslType, VertexFormatType vertexFormatType) {
     switch (wgslType.base_type) {
         case BaseWGSLType::kF32:
         case BaseWGSLType::kF16:
             return (vertexFormatType.base_type == VertexDataType::kFloat);
         case BaseWGSLType::kU32:
             return (vertexFormatType.base_type == VertexDataType::kUInt);
         case BaseWGSLType::kI32:
             return (vertexFormatType.base_type == VertexDataType::kSInt);
         default:
             return false;
     }
 }

 AttributeWGSLType WGSLTypeOf(const core::type::Type* ty) {
     return Switch(
         ty,
         [](const core::type::I32*) -> AttributeWGSLType {
             return {BaseWGSLType::kI32, 1};
         },
         [](const core::type::U32*) -> AttributeWGSLType {
             return {BaseWGSLType::kU32, 1};
         },
         [](const core::type::F32*) -> AttributeWGSLType {
             return {BaseWGSLType::kF32, 1};
         },
         [](const core::type::F16*) -> AttributeWGSLType {
             return {BaseWGSLType::kF16, 1};
         },
         [](const core::type::Vector* vec) -> AttributeWGSLType {
             return {WGSLTypeOf(vec->type()).base_type, vec->Width()};
         },
         [](Default) -> AttributeWGSLType {
             return {BaseWGSLType::kInvalid, 0};
         });
 }

 VertexFormatType VertexFormatTypeOf(VertexFormat format) {
     switch (format) {
         case VertexFormat::kUint32:
             return {VertexDataType::kUInt, 1};
         case VertexFormat::kUint8x2:
         case VertexFormat::kUint16x2:
         case VertexFormat::kUint32x2:
             return {VertexDataType::kUInt, 2};
         case VertexFormat::kUint32x3:
             return {VertexDataType::kUInt, 3};
         case VertexFormat::kUint8x4:
         case VertexFormat::kUint16x4:
         case VertexFormat::kUint32x4:
             return {VertexDataType::kUInt, 4};
         case VertexFormat::kSint32:
             return {VertexDataType::kSInt, 1};
         case VertexFormat::kSint8x2:
         case VertexFormat::kSint16x2:
         case VertexFormat::kSint32x2:
             return {VertexDataType::kSInt, 2};
         case VertexFormat::kSint32x3:
             return {VertexDataType::kSInt, 3};
         case VertexFormat::kSint8x4:
         case VertexFormat::kSint16x4:
         case VertexFormat::kSint32x4:
             return {VertexDataType::kSInt, 4};
         case VertexFormat::kFloat32:
             return {VertexDataType::kFloat, 1};
         case VertexFormat::kUnorm8x2:
         case VertexFormat::kSnorm8x2:
         case VertexFormat::kUnorm16x2:
         case VertexFormat::kSnorm16x2:
         case VertexFormat::kFloat16x2:
         case VertexFormat::kFloat32x2:
             return {VertexDataType::kFloat, 2};
         case VertexFormat::kFloat32x3:
             return {VertexDataType::kFloat, 3};
         case VertexFormat::kUnorm8x4:
         case VertexFormat::kSnorm8x4:
         case VertexFormat::kUnorm16x4:
         case VertexFormat::kSnorm16x4:
         case VertexFormat::kFloat16x4:
         case VertexFormat::kFloat32x4:
         case VertexFormat::kUnorm10_10_10_2:
             return {VertexDataType::kFloat, 4};
     }
     return {VertexDataType::kInvalid, 0};
 }

 }  // namespace

 /// PIMPL state for the transform
 struct VertexPulling::State {
     /// Constructor
     /// @param program the source program
     /// @param c the VertexPulling config
     State(const Program& program, const VertexPulling::Config& c) : src(program), cfg(c) {}

     /// Runs the transform
     /// @returns the new program or SkipTransform if the transform is not required
     ApplyResult Run() {
         // Find entry point
         const Function* func = nullptr;
         for (auto* fn : src.AST().Functions()) {
             if (fn->PipelineStage() == PipelineStage::kVertex) {
                 if (func != nullptr) {
                     b.Diagnostics().add_error(
                         diag::System::Transform,
                         "VertexPulling found more than one vertex entry point");
                     return resolver::Resolve(b);
                 }
                 func = fn;
             }
         }
         if (func == nullptr) {
             b.Diagnostics().add_error(diag::System::Transform,
                                       "Vertex stage entry point not found");
             return resolver::Resolve(b);
         }

         AddVertexStorageBuffers();
         Process(func);

         ctx.Clone();
         return resolver::Resolve(b);
     }

   private:
     /// LocationReplacement describes an Variable replacement for a location input.
     struct LocationReplacement {
         /// The variable to replace in the source Program
         Variable* from;
         /// The replacement to use in the target ProgramBuilder
         Variable* to;
     };

     /// LocationInfo describes an input location
     struct LocationInfo {
         /// A builder that builds the expression that resolves to the (transformed) input location
         std::function<const Expression*()> expr;
         /// The store type of the location variable
         const core::type::Type* type;
     };

     /// The source program
     const Program& src;
     /// The transform config
     VertexPulling::Config const cfg;
     /// The target program builder
     ProgramBuilder b;
     /// The clone context
     program::CloneContext ctx = {&b, &src, /* auto_clone_symbols */ true};
     std::unordered_map<uint32_t, LocationInfo> location_info;
     std::function<const Expression*()> vertex_index_expr = nullptr;
     std::function<const Expression*()> instance_index_expr = nullptr;
     Symbol pulling_position_name;
     Symbol struct_buffer_name;
     std::unordered_map<uint32_t, Symbol> vertex_buffer_names;
     tint::Vector<const Parameter*, 8> new_function_parameters;

     /// Generate the vertex buffer binding name
     /// @param index index to append to buffer name
     Symbol GetVertexBufferName(uint32_t index) {
         return tint::GetOrCreate(vertex_buffer_names, index, [&] {
             static const char kVertexBufferNamePrefix[] = "tint_pulling_vertex_buffer_";
             return b.Symbols().New(kVertexBufferNamePrefix + std::to_string(index));
         });
     }

     /// Lazily generates the structure buffer symbol
     Symbol GetStructBufferName() {
         if (!struct_buffer_name.IsValid()) {
             static const char kStructBufferName[] = "tint_vertex_data";
             struct_buffer_name = b.Symbols().New(kStructBufferName);
         }
         return struct_buffer_name;
     }

     /// Adds storage buffer decorated variables for the vertex buffers
     void AddVertexStorageBuffers() {
         // Creating the struct type
         static const char kStructName[] = "TintVertexData";
         auto* struct_type = b.Structure(b.Symbols().New(kStructName),
                                         tint::Vector{
                                             b.Member(GetStructBufferName(), b.ty.array<u32>()),
                                         });
         for (uint32_t i = 0; i < cfg.vertex_state.size(); ++i) {
             // The decorated variable with struct type
             b.GlobalVar(GetVertexBufferName(i), b.ty.Of(struct_type), core::AddressSpace::kStorage,
                         core::Access::kRead, b.Binding(AInt(i)), b.Group(AInt(cfg.pulling_group)));
         }
     }

     /// Creates and returns the assignment to the variables from the buffers
     const BlockStatement* CreateVertexPullingPreamble() {
         // Assign by looking at the vertex descriptor to find attributes with
         // matching location.

         tint::Vector<const Statement*, 8> stmts;

         for (uint32_t buffer_idx = 0; buffer_idx < cfg.vertex_state.size(); ++buffer_idx) {
             const VertexBufferLayoutDescriptor& buffer_layout = cfg.vertex_state[buffer_idx];

             if ((buffer_layout.array_stride & 3) != 0) {
                 b.Diagnostics().add_error(
                     diag::System::Transform,
                     "WebGPU requires that vertex stride must be a multiple of 4 bytes, "
                     "but VertexPulling array stride for buffer " +
                         std::to_string(buffer_idx) + " was " +
                         std::to_string(buffer_layout.array_stride) + " bytes");
                 return nullptr;
             }

             auto* index_expr = buffer_layout.step_mode == VertexStepMode::kVertex
                                    ? vertex_index_expr()
                                    : instance_index_expr();

             // buffer_array_base is the base array offset for all the vertex
             // attributes. These are units of uint (4 bytes).
             auto buffer_array_base =
                 b.Symbols().New("buffer_array_base_" + std::to_string(buffer_idx));

             auto* attribute_offset = index_expr;
             if (buffer_layout.array_stride != 4) {
                 attribute_offset = b.Mul(index_expr, u32(buffer_layout.array_stride / 4u));
             }

             // let pulling_offset_n = <attribute_offset>
             stmts.Push(b.Decl(b.Let(buffer_array_base, attribute_offset)));

             for (const VertexAttributeDescriptor& attribute_desc : buffer_layout.attributes) {
                 auto it = location_info.find(attribute_desc.shader_location);
                 if (it == location_info.end()) {
                     continue;
                 }
                 auto& var = it->second;

                 // Data type of the target WGSL variable
                 auto var_dt = WGSLTypeOf(var.type);
                 // Data type of the vertex stream attribute
                 auto fmt_dt = VertexFormatTypeOf(attribute_desc.format);

                 // Base types must match between the vertex stream and the WGSL variable
                 if (!IsTypeCompatible(var_dt, fmt_dt)) {
                     StringStream err;
                     err << "VertexAttributeDescriptor for location "
                         << std::to_string(attribute_desc.shader_location) << " has format "
                         << attribute_desc.format << " but shader expects "
                         << var.type->FriendlyName();
                     b.Diagnostics().add_error(diag::System::Transform, err.str());
                     return nullptr;
                 }

                 // Load the attribute value according to vertex format and convert the element type
                 // of result to match target WGSL variable. The result of `Fetch` should be of WGSL
                 // types `f32`, `i32`, `u32`, and their vectors, while WGSL variable can be of
                 // `f16`.
                 auto* fetch = Fetch(buffer_array_base, attribute_desc.offset, buffer_idx,
                                     attribute_desc.format);
                 // Convert the fetched scalar/vector if WGSL variable is of `f16` types
                 if (var_dt.base_type == BaseWGSLType::kF16) {
                     // The type of the same element number of base type of target WGSL variable
                     Type loaded_data_target_type;
                     if (fmt_dt.width == 1) {
                         loaded_data_target_type = b.ty.f16();
                     } else {
                         loaded_data_target_type = b.ty.vec(b.ty.f16(), fmt_dt.width);
                     }

                     fetch = b.Call(loaded_data_target_type, fetch);
                 }

                 // The attribute value may not be of the desired vector width. If it is not, we'll
                 // need to either reduce the width with a swizzle, or append 0's and / or a 1.
                 auto* value = fetch;
                 if (var_dt.width < fmt_dt.width) {
                     // WGSL variable vector width is smaller than the loaded vector width
                     switch (var_dt.width) {
                         case 1:
                             value = b.MemberAccessor(fetch, "x");
                             break;
                         case 2:
                             value = b.MemberAccessor(fetch, "xy");
                             break;
                         case 3:
                             value = b.MemberAccessor(fetch, "xyz");
                             break;
                         default:
                             TINT_UNREACHABLE() << var_dt.width;
                             return nullptr;
                     }
                 } else if (var_dt.width > fmt_dt.width) {
                     // WGSL variable vector width is wider than the loaded vector width, do padding.

                     // The components of result vector variable, initialized with type-converted
                     // loaded data vector.
                     tint::Vector<const Expression*, 8> values{fetch};

                     // Add padding elements. The result must be of vector types of signed/unsigned
                     // integer or float, so use the abstract integer or abstract float value to do
                     // padding.
                     for (uint32_t i = fmt_dt.width; i < var_dt.width; i++) {
                         if (var_dt.base_type == BaseWGSLType::kI32 ||
                             var_dt.base_type == BaseWGSLType::kU32) {
                             values.Push(b.Expr((i == 3) ? 1_a : 0_a));
                         } else {
                             values.Push(b.Expr((i == 3) ? 1.0_a : 0.0_a));
                         }
                     }

                     value = b.Call(CreateASTTypeFor(ctx, var.type), values);
                 }

                 // Assign the value to the WGSL variable
                 stmts.Push(b.Assign(var.expr(), value));
             }
         }

         if (stmts.IsEmpty()) {
             return nullptr;
         }

         return b.Block(std::move(stmts));
     }

     /// Generates an expression reading a specific vertex format from a buffer. Any vertex format of
     /// signed normailized, unsigned normailized, or float will result in `f32` or `vecN<f32>` WGSL
     /// type.
     /// @param array_base the symbol of the variable holding the base array offset
     /// of the vertex array (each index is 4-bytes).
     /// @param offset the byte offset of the data from `buffer_base`
     /// @param buffer the index of the vertex buffer
     /// @param format the vertex format to read
     const Expression* Fetch(Symbol array_base,
                             uint32_t offset,
                             uint32_t buffer,
                             VertexFormat format) {
         // Returns a u32 loaded from buffer_base + offset.
         auto load_u32 = [&] {
             return LoadPrimitive(array_base, offset, buffer, VertexFormat::kUint32);
         };

         // Returns a i32 loaded from buffer_base + offset.
         auto load_i32 = [&] { return b.Bitcast<i32>(load_u32()); };

         // Returns a u32 loaded from buffer_base + offset + 4.
         auto load_next_u32 = [&] {
             return LoadPrimitive(array_base, offset + 4, buffer, VertexFormat::kUint32);
         };

         // Returns a i32 loaded from buffer_base + offset + 4.
         auto load_next_i32 = [&] { return b.Bitcast<i32>(load_next_u32()); };

         // Returns a u16 loaded from offset, packed in the high 16 bits of a u32.
         // The low 16 bits are 0.
         // `min_alignment` must be a power of two.
         // `offset` must be `min_alignment` bytes aligned.
         auto load_u16_h = [&] {
             auto low_u32_offset = offset & ~3u;
             auto* low_u32 =
                 LoadPrimitive(array_base, low_u32_offset, buffer, VertexFormat::kUint32);
             switch (offset & 3) {
                 case 0:
                     return b.Shl(low_u32, 16_u);
                 case 1:
                     return b.And(b.Shl(low_u32, 8_u), 0xffff0000_u);
                 case 2:
                     return b.And(low_u32, 0xffff0000_u);
                 default: {  // 3:
                     auto* high_u32 = LoadPrimitive(array_base, low_u32_offset + 4, buffer,
                                                    VertexFormat::kUint32);
                     auto* shr = b.Shr(low_u32, 8_u);
                     auto* shl = b.Shl(high_u32, 24_u);
                     return b.And(b.Or(shl, shr), 0xffff0000_u);
                 }
             }
         };

         // Returns a u16 loaded from offset, packed in the low 16 bits of a u32.
         // The high 16 bits are 0.
         auto load_u16_l = [&] {
             auto low_u32_offset = offset & ~3u;
             auto* low_u32 =
                 LoadPrimitive(array_base, low_u32_offset, buffer, VertexFormat::kUint32);
             switch (offset & 3) {
                 case 0:
                     return b.And(low_u32, 0xffff_u);
                 case 1:
                     return b.And(b.Shr(low_u32, 8_u), 0xffff_u);
                 case 2:
                     return b.Shr(low_u32, 16_u);
                 default: {  // 3:
                     auto* high_u32 = LoadPrimitive(array_base, low_u32_offset + 4, buffer,
                                                    VertexFormat::kUint32);
                     auto* shr = b.Shr(low_u32, 24_u);
                     auto* shl = b.Shl(high_u32, 8_u);
                     return b.And(b.Or(shl, shr), 0xffff_u);
                 }
             }
         };

         // Returns a i16 loaded from offset, packed in the high 16 bits of a u32.
         // The low 16 bits are 0.
         auto load_i16_h = [&] { return b.Bitcast<i32>(load_u16_h()); };

         // Assumptions are made that alignment must be at least as large as the size
         // of a single component.
         switch (format) {
             // Basic primitives
             case VertexFormat::kUint32:
             case VertexFormat::kSint32:
             case VertexFormat::kFloat32:
                 return LoadPrimitive(array_base, offset, buffer, format);

                 // Vectors of basic primitives
             case VertexFormat::kUint32x2:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.u32(), VertexFormat::kUint32, 2);
             case VertexFormat::kUint32x3:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.u32(), VertexFormat::kUint32, 3);
             case VertexFormat::kUint32x4:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.u32(), VertexFormat::kUint32, 4);
             case VertexFormat::kSint32x2:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.i32(), VertexFormat::kSint32, 2);
             case VertexFormat::kSint32x3:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.i32(), VertexFormat::kSint32, 3);
             case VertexFormat::kSint32x4:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.i32(), VertexFormat::kSint32, 4);
             case VertexFormat::kFloat32x2:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.f32(), VertexFormat::kFloat32,
                                2);
             case VertexFormat::kFloat32x3:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.f32(), VertexFormat::kFloat32,
                                3);
             case VertexFormat::kFloat32x4:
                 return LoadVec(array_base, offset, buffer, 4, b.ty.f32(), VertexFormat::kFloat32,
                                4);

             case VertexFormat::kUint8x2: {
                 // yyxx0000, yyxx0000
                 auto* u16s = b.Call<vec2<u32>>(load_u16_h());
                 // xx000000, yyxx0000
                 auto* shl = b.Shl(u16s, b.Call<vec2<u32>>(8_u, 0_u));
                 // 000000xx, 000000yy
                 return b.Shr(shl, b.Call<vec2<u32>>(24_u));
             }
             case VertexFormat::kUint8x4: {
                 // wwzzyyxx, wwzzyyxx, wwzzyyxx, wwzzyyxx
                 auto* u32s = b.Call<vec4<u32>>(load_u32());
                 // xx000000, yyxx0000, zzyyxx00, wwzzyyxx
                 auto* shl = b.Shl(u32s, b.Call<vec4<u32>>(24_u, 16_u, 8_u, 0_u));
                 // 000000xx, 000000yy, 000000zz, 000000ww
                 return b.Shr(shl, b.Call<vec4<u32>>(24_u));
             }
             case VertexFormat::kUint16x2: {
                 // yyyyxxxx, yyyyxxxx
                 auto* u32s = b.Call<vec2<u32>>(load_u32());
                 // xxxx0000, yyyyxxxx
                 auto* shl = b.Shl(u32s, b.Call<vec2<u32>>(16_u, 0_u));
                 // 0000xxxx, 0000yyyy
                 return b.Shr(shl, b.Call<vec2<u32>>(16_u));
             }
             case VertexFormat::kUint16x4: {
                 // yyyyxxxx, wwwwzzzz
                 auto* u32s = b.Call<vec2<u32>>(load_u32(), load_next_u32());
                 // yyyyxxxx, yyyyxxxx, wwwwzzzz, wwwwzzzz
                 auto* xxyy = b.MemberAccessor(u32s, "xxyy");
                 // xxxx0000, yyyyxxxx, zzzz0000, wwwwzzzz
                 auto* shl = b.Shl(xxyy, b.Call<vec4<u32>>(16_u, 0_u, 16_u, 0_u));
                 // 0000xxxx, 0000yyyy, 0000zzzz, 0000wwww
                 return b.Shr(shl, b.Call<vec4<u32>>(16_u));
             }
             case VertexFormat::kSint8x2: {
                 // yyxx0000, yyxx0000
                 auto* i16s = b.Call<vec2<i32>>(load_i16_h());
                 // xx000000, yyxx0000
                 auto* shl = b.Shl(i16s, b.Call<vec2<u32>>(8_u, 0_u));
                 // ssssssxx, ssssssyy
                 return b.Shr(shl, b.Call<vec2<u32>>(24_u));
             }
             case VertexFormat::kSint8x4: {
                 // wwzzyyxx, wwzzyyxx, wwzzyyxx, wwzzyyxx
                 auto* i32s = b.Call<vec4<i32>>(load_i32());
                 // xx000000, yyxx0000, zzyyxx00, wwzzyyxx
                 auto* shl = b.Shl(i32s, b.Call<vec4<u32>>(24_u, 16_u, 8_u, 0_u));
                 // ssssssxx, ssssssyy, sssssszz, ssssssww
                 return b.Shr(shl, b.Call<vec4<u32>>(24_u));
             }
             case VertexFormat::kSint16x2: {
                 // yyyyxxxx, yyyyxxxx
                 auto* i32s = b.Call<vec2<i32>>(load_i32());
                 // xxxx0000, yyyyxxxx
                 auto* shl = b.Shl(i32s, b.Call<vec2<u32>>(16_u, 0_u));
                 // ssssxxxx, ssssyyyy
                 return b.Shr(shl, b.Call<vec2<u32>>(16_u));
             }
             case VertexFormat::kSint16x4: {
                 // yyyyxxxx, wwwwzzzz
                 auto* i32s = b.Call<vec2<i32>>(load_i32(), load_next_i32());
                 // yyyyxxxx, yyyyxxxx, wwwwzzzz, wwwwzzzz
                 auto* xxyy = b.MemberAccessor(i32s, "xxyy");
                 // xxxx0000, yyyyxxxx, zzzz0000, wwwwzzzz
                 auto* shl = b.Shl(xxyy, b.Call<vec4<u32>>(16_u, 0_u, 16_u, 0_u));
                 // ssssxxxx, ssssyyyy, sssszzzz, sssswwww
                 return b.Shr(shl, b.Call<vec4<u32>>(16_u));
             }
             case VertexFormat::kUnorm8x2:
                 return b.MemberAccessor(b.Call("unpack4x8unorm", load_u16_l()), "xy");
             case VertexFormat::kSnorm8x2:
                 return b.MemberAccessor(b.Call("unpack4x8snorm", load_u16_l()), "xy");
             case VertexFormat::kUnorm8x4:
                 return b.Call("unpack4x8unorm", load_u32());
             case VertexFormat::kSnorm8x4:
                 return b.Call("unpack4x8snorm", load_u32());
             case VertexFormat::kUnorm16x2:
                 return b.Call("unpack2x16unorm", load_u32());
             case VertexFormat::kSnorm16x2:
                 return b.Call("unpack2x16snorm", load_u32());
             case VertexFormat::kFloat16x2:
                 return b.Call("unpack2x16float", load_u32());
             case VertexFormat::kUnorm16x4:
                 return b.Call<vec4<f32>>(b.Call("unpack2x16unorm", load_u32()),
                                          b.Call("unpack2x16unorm", load_next_u32()));
             case VertexFormat::kSnorm16x4:
                 return b.Call<vec4<f32>>(b.Call("unpack2x16snorm", load_u32()),
                                          b.Call("unpack2x16snorm", load_next_u32()));
             case VertexFormat::kFloat16x4:
                 return b.Call<vec4<f32>>(b.Call("unpack2x16float", load_u32()),
                                          b.Call("unpack2x16float", load_next_u32()));
             case VertexFormat::kUnorm10_10_10_2:
                 auto* u32s = b.Call<vec4<u32>>(load_u32());
                 // shr = u32s >> vec4u(0, 10, 20, 30);
                 auto* shr = b.Shr(u32s, b.Call<vec4<u32>>(0_u, 10_u, 20_u, 30_u));
                 // mask = shr & vec4u(0x3FF, 0x3FF, 0x3FF, 0x3);
                 auto* mask = b.And(shr, b.Call<vec4<u32>>(0x3FF_u, 0x3FF_u, 0x3FF_u, 0x3_u));
                 // return vec4f(mask) / vec4f(1023, 1023, 1023, 3);
                 return b.Div(b.Call<vec4<f32>>(mask),
                              b.Call<vec4<f32>>(1023_f, 1023_f, 1023_f, 3_f));
         }

         TINT_UNREACHABLE() << "format " << static_cast<int>(format);
         return nullptr;
     }

     /// Generates an expression reading an aligned basic type (u32, i32, f32) from
     /// a vertex buffer.
     /// @param array_base the symbol of the variable holding the base array offset
     /// of the vertex array (each index is 4-bytes).
     /// @param offset the byte offset of the data from `buffer_base`
     /// @param buffer the index of the vertex buffer
     /// @param format VertexFormat::kUint32, VertexFormat::kSint32 or
     /// VertexFormat::kFloat32
     const Expression* LoadPrimitive(Symbol array_base,
                                     uint32_t offset,
                                     uint32_t buffer,
                                     VertexFormat format) {
         const Expression* u = nullptr;
         if ((offset & 3) == 0) {
             // Aligned load.

             const ast ::Expression* index = nullptr;
             if (offset > 0) {
                 index = b.Add(array_base, u32(offset / 4));
             } else {
                 index = b.Expr(array_base);
             }
             u = b.IndexAccessor(
                 b.MemberAccessor(GetVertexBufferName(buffer), GetStructBufferName()), index);

         } else {
             // Unaligned load
             uint32_t offset_aligned = offset & ~3u;
             auto* low = LoadPrimitive(array_base, offset_aligned, buffer, VertexFormat::kUint32);
             auto* high =
                 LoadPrimitive(array_base, offset_aligned + 4u, buffer, VertexFormat::kUint32);

             uint32_t shift = 8u * (offset & 3u);

             auto* low_shr = b.Shr(low, u32(shift));
             auto* high_shl = b.Shl(high, u32(32u - shift));
             u = b.Or(low_shr, high_shl);
         }

         switch (format) {
             case VertexFormat::kUint32:
                 return u;
             case VertexFormat::kSint32:
                 return b.Bitcast(b.ty.i32(), u);
             case VertexFormat::kFloat32:
                 return b.Bitcast(b.ty.f32(), u);
             default:
                 break;
         }
         TINT_UNREACHABLE() << "invalid format for LoadPrimitive" << static_cast<int>(format);
         return nullptr;
     }

     /// Generates an expression reading a vec2/3/4 from a vertex buffer.
     /// @param array_base the symbol of the variable holding the base array offset
     /// of the vertex array (each index is 4-bytes).
     /// @param offset the byte offset of the data from `buffer_base`
     /// @param buffer the index of the vertex buffer
     /// @param element_stride stride between elements, in bytes
     /// @param base_type underlying AST type
     /// @param base_format underlying vertex format
     /// @param count how many elements the vector has
     const Expression* LoadVec(Symbol array_base,
                               uint32_t offset,
                               uint32_t buffer,
                               uint32_t element_stride,
                               Type base_type,
                               VertexFormat base_format,
                               uint32_t count) {
         tint::Vector<const Expression*, 8> expr_list;
         for (uint32_t i = 0; i < count; ++i) {
             // Offset read position by element_stride for each component
             uint32_t primitive_offset = offset + element_stride * i;
             expr_list.Push(LoadPrimitive(array_base, primitive_offset, buffer, base_format));
         }

         return b.Call(b.ty.vec(base_type, count), std::move(expr_list));
     }

     /// Process a non-struct entry point parameter.
     /// Generate function-scope variables for location parameters, and record
     /// vertex_index and instance_index builtins if present.
     /// @param func the entry point function
     /// @param param the parameter to process
     void ProcessNonStructParameter(const Function* func, const Parameter* param) {
         if (HasAttribute<LocationAttribute>(param->attributes)) {
             // Create a function-scope variable to replace the parameter.
             auto func_var_sym = ctx.Clone(param->name->symbol);
             auto func_var_type = ctx.Clone(param->type);
             auto* func_var = b.Var(func_var_sym, func_var_type);
             ctx.InsertFront(func->body->statements, b.Decl(func_var));
             // Capture mapping from location to the new variable.
             LocationInfo info;
             info.expr = [this, func_var] { return b.Expr(func_var); };

             auto* sem = src.Sem().Get(param);
             info.type = sem->Type();

             if (TINT_UNLIKELY(!sem->Attributes().location.has_value())) {
                 TINT_ICE() << "Location missing value";
                 return;
             }
             location_info[sem->Attributes().location.value()] = info;
         } else {
             auto* builtin_attr = GetAttribute<BuiltinAttribute>(param->attributes);
             if (TINT_UNLIKELY(!builtin_attr)) {
                 TINT_ICE() << "Invalid entry point parameter";
                 return;
             }
             auto builtin = src.Sem().Get(builtin_attr)->Value();
             // Check for existing vertex_index and instance_index builtins.
             if (builtin == core::BuiltinValue::kVertexIndex) {
                 vertex_index_expr = [this, param] {
                     return b.Expr(ctx.Clone(param->name->symbol));
                 };
             } else if (builtin == core::BuiltinValue::kInstanceIndex) {
                 instance_index_expr = [this, param] {
                     return b.Expr(ctx.Clone(param->name->symbol));
                 };
             }
             new_function_parameters.Push(ctx.Clone(param));
         }
     }

     /// Process a struct entry point parameter.
     /// If the struct has members with location attributes, push the parameter to
     /// a function-scope variable and create a new struct parameter without those
     /// attributes. Record expressions for members that are vertex_index and
     /// instance_index builtins.
     /// @param func the entry point function
     /// @param param the parameter to process
     /// @param struct_ty the structure type
     void ProcessStructParameter(const Function* func,
                                 const Parameter* param,
                                 const Struct* struct_ty) {
         auto param_sym = ctx.Clone(param->name->symbol);

         // Process the struct members.
         bool has_locations = false;
         tint::Vector<const StructMember*, 8> members_to_clone;
         for (auto* member : struct_ty->members) {
             auto member_sym = ctx.Clone(member->name->symbol);
             std::function<const Expression*()> member_expr = [this, param_sym, member_sym] {
                 return b.MemberAccessor(param_sym, member_sym);
             };

             if (HasAttribute<LocationAttribute>(member->attributes)) {
                 // Capture mapping from location to struct member.
                 LocationInfo info;
                 info.expr = member_expr;

                 auto* sem = src.Sem().Get(member);
                 info.type = sem->Type();

                 TINT_ASSERT(sem->Attributes().location.has_value());
                 location_info[sem->Attributes().location.value()] = info;
                 has_locations = true;
             } else {
                 auto* builtin_attr = GetAttribute<BuiltinAttribute>(member->attributes);
                 if (TINT_UNLIKELY(!builtin_attr)) {
                     TINT_ICE() << "Invalid entry point parameter";
                     return;
                 }
                 auto builtin = src.Sem().Get(builtin_attr)->Value();
                 // Check for existing vertex_index and instance_index builtins.
                 if (builtin == core::BuiltinValue::kVertexIndex) {
                     vertex_index_expr = member_expr;
                 } else if (builtin == core::BuiltinValue::kInstanceIndex) {
                     instance_index_expr = member_expr;
                 }
                 members_to_clone.Push(member);
             }
         }

         if (!has_locations) {
             // Nothing to do.
             new_function_parameters.Push(ctx.Clone(param));
             return;
         }

         // Create a function-scope variable to replace the parameter.
         auto* func_var = b.Var(param_sym, ctx.Clone(param->type));
         ctx.InsertFront(func->body->statements, b.Decl(func_var));

         if (!members_to_clone.IsEmpty()) {
             // Create a new struct without the location attributes.
             tint::Vector<const StructMember*, 8> new_members;
             for (auto* member : members_to_clone) {
                 auto member_name = ctx.Clone(member->name);
                 auto member_type = ctx.Clone(member->type);
                 auto member_attrs = ctx.Clone(member->attributes);
                 new_members.Push(b.Member(member_name, member_type, std::move(member_attrs)));
             }
             auto* new_struct = b.Structure(b.Sym(), new_members);

             // Create a new function parameter with this struct.
             auto* new_param = b.Param(b.Sym(), b.ty.Of(new_struct));
             new_function_parameters.Push(new_param);

             // Copy values from the new parameter to the function-scope variable.
             for (auto* member : members_to_clone) {
                 auto member_name = ctx.Clone(member->name->symbol);
                 ctx.InsertFront(func->body->statements,
                                 b.Assign(b.MemberAccessor(func_var, member_name),
                                          b.MemberAccessor(new_param, member_name)));
             }
         }
     }

     /// Process an entry point function.
     /// @param func the entry point function
     void Process(const Function* func) {
         if (func->body->Empty()) {
             return;
         }

         // Process entry point parameters.
         for (auto* param : func->params) {
             auto* sem = src.Sem().Get(param);
             if (auto* str = sem->Type()->As<sem::Struct>()) {
                 ProcessStructParameter(func, param, str->Declaration());
             } else {
                 ProcessNonStructParameter(func, param);
             }
         }

         // Insert new parameters for vertex_index and instance_index if needed.
         if (!vertex_index_expr) {
             for (const VertexBufferLayoutDescriptor& layout : cfg.vertex_state) {
                 if (layout.step_mode == VertexStepMode::kVertex) {
                     auto name = b.Symbols().New("tint_pulling_vertex_index");
                     new_function_parameters.Push(
                         b.Param(name, b.ty.u32(),
                                 tint::Vector{b.Builtin(core::BuiltinValue::kVertexIndex)}));
                     vertex_index_expr = [this, name] { return b.Expr(name); };
                     break;
                 }
             }
         }
         if (!instance_index_expr) {
             for (const VertexBufferLayoutDescriptor& layout : cfg.vertex_state) {
                 if (layout.step_mode == VertexStepMode::kInstance) {
                     auto name = b.Symbols().New("tint_pulling_instance_index");
                     new_function_parameters.Push(
                         b.Param(name, b.ty.u32(),
                                 tint::Vector{b.Builtin(core::BuiltinValue::kInstanceIndex)}));
                     instance_index_expr = [this, name] { return b.Expr(name); };
                     break;
                 }
             }
         }

         // Generate vertex pulling preamble.
         if (auto* block = CreateVertexPullingPreamble()) {
             ctx.InsertFront(func->body->statements, block);
         }

         // Rewrite the function header with the new parameters.
         auto func_sym = ctx.Clone(func->name->symbol);
         auto ret_type = ctx.Clone(func->return_type);
         auto* body = ctx.Clone(func->body);
         auto attrs = ctx.Clone(func->attributes);
         auto ret_attrs = ctx.Clone(func->return_type_attributes);
         auto* new_func =
             b.create<Function>(func->source, b.Ident(func_sym), new_function_parameters, ret_type,
                                body, std::move(attrs), std::move(ret_attrs));
         ctx.Replace(func, new_func);
     }
 };

 VertexPulling::VertexPulling() = default;
 VertexPulling::~VertexPulling() = default;

 Transform::ApplyResult VertexPulling::Apply(const Program& src,
                                             const DataMap& inputs,
                                             DataMap&) const {
     auto cfg = cfg_;
     if (auto* cfg_data = inputs.Get<Config>()) {
         cfg = *cfg_data;
     }

     return State{src, cfg}.Run();
 }

 VertexPulling::Config::Config() = default;
 VertexPulling::Config::Config(const Config&) = default;
 VertexPulling::Config::~Config() = default;
 VertexPulling::Config& VertexPulling::Config::operator=(const Config&) = default;

 VertexBufferLayoutDescriptor::VertexBufferLayoutDescriptor() = default;

 VertexBufferLayoutDescriptor::VertexBufferLayoutDescriptor(
     uint32_t in_array_stride,
     VertexStepMode in_step_mode,
     std::vector<VertexAttributeDescriptor> in_attributes)
     : array_stride(in_array_stride),
       step_mode(in_step_mode),
       attributes(std::move(in_attributes)) {}

 VertexBufferLayoutDescriptor::VertexBufferLayoutDescriptor(
     const VertexBufferLayoutDescriptor& other) = default;

 VertexBufferLayoutDescriptor& VertexBufferLayoutDescriptor::operator=(
     const VertexBufferLayoutDescriptor& other) = default;

 VertexBufferLayoutDescriptor::~VertexBufferLayoutDescriptor() = default;

 }  // namespace tint::ast::transform