Add polyfill for pack/unpack 4x8 snorm/unorm

This CL adds a polyfill for the 4x8 pack and unpack normalized methods.

Bug: 379551588
Change-Id: I5f3823c3fd63fb09f828f8af7fd0d7293ac6a727
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/222215
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: James Price <jrprice@google.com>
Commit-Queue: dan sinclair <dsinclair@chromium.org>
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index d6208f2..3732ca4 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -516,6 +516,11 @@
      {"polyfill_packed_4x8_dot_product",
       "Always use the polyfill version of dot4I8Packed() and dot4U8Packed().",
       "https://crbug.com/tint/1497", ToggleStage::Device}},
+    {Toggle::PolyfillPackUnpack4x8Norm,
+     {"polyfill_pack_unpack_4x8_norm",
+      "Always use the polyfill version of pack4x8snorm, pack4x8unorm, unpack4x8snorm, "
+      "unpack4x8unorm.",
+      "https://crbug.com/379551588", ToggleStage::Device}},
     {Toggle::D3D12PolyFillPackUnpack4x8,
      {"d3d12_polyfill_pack_unpack_4x8",
       "Always use the polyfill version of pack4xI8(), pack4xU8(), pack4xI8Clamp(), unpack4xI8() "
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index 93ed488..07dcbae 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -127,6 +127,7 @@
     UseTintIR,
     D3DDisableIEEEStrictness,
     PolyFillPacked4x8DotProduct,
+    PolyfillPackUnpack4x8Norm,
     D3D12PolyFillPackUnpack4x8,
     ExposeWGSLTestingFeatures,
     ExposeWGSLExperimentalFeatures,
diff --git a/src/dawn/native/vulkan/PhysicalDeviceVk.cpp b/src/dawn/native/vulkan/PhysicalDeviceVk.cpp
index 6c41216..7bffff1 100644
--- a/src/dawn/native/vulkan/PhysicalDeviceVk.cpp
+++ b/src/dawn/native/vulkan/PhysicalDeviceVk.cpp
@@ -692,6 +692,11 @@
         // forced to store the multisampled targets and do the resolves as separate passes injected
         // after the original one.
         deviceToggles->Default(Toggle::ResolveMultipleAttachmentInSeparatePasses, true);
+
+        // dawn:379551588: Using the `pack4x8snorm`, `pack4x8unorm`, `unpack4x8snorm` and
+        // `unpack4x8unorm` methods can have issues on ARM. To work around the issue we re-write the
+        // pack/unpack calls and do the packing manually.
+        deviceToggles->Default(Toggle::PolyfillPackUnpack4x8Norm, true);
     }
 
     if (IsAndroidSamsung() || IsAndroidQualcomm()) {
diff --git a/src/dawn/native/vulkan/ShaderModuleVk.cpp b/src/dawn/native/vulkan/ShaderModuleVk.cpp
index 1a2b969..4fd2907 100644
--- a/src/dawn/native/vulkan/ShaderModuleVk.cpp
+++ b/src/dawn/native/vulkan/ShaderModuleVk.cpp
@@ -370,6 +370,8 @@
         GetDevice()->IsToggleEnabled(Toggle::VulkanUseBufferRobustAccess2);
     req.tintOptions.polyfill_dot_4x8_packed =
         GetDevice()->IsToggleEnabled(Toggle::PolyFillPacked4x8DotProduct);
+    req.tintOptions.polyfill_pack_unpack_4x8_norm =
+        GetDevice()->IsToggleEnabled(Toggle::PolyfillPackUnpack4x8Norm);
     req.tintOptions.disable_polyfill_integer_div_mod =
         GetDevice()->IsToggleEnabled(Toggle::DisablePolyfillsOnIntegerDivisonAndModulo);
 
diff --git a/src/dawn/tests/BUILD.gn b/src/dawn/tests/BUILD.gn
index 24a8e91..94f4529 100644
--- a/src/dawn/tests/BUILD.gn
+++ b/src/dawn/tests/BUILD.gn
@@ -642,6 +642,7 @@
     "end2end/NonzeroBufferCreationTests.cpp",
     "end2end/NonzeroTextureCreationTests.cpp",
     "end2end/OpArrayLengthTests.cpp",
+    "end2end/PackUnpack4x8NormTests.cpp",
     "end2end/Packed4x8IntegerDotProductTests.cpp",
     "end2end/PipelineCachingTests.cpp",
     "end2end/PipelineLayoutTests.cpp",
diff --git a/src/dawn/tests/end2end/PackUnpack4x8NormTests.cpp b/src/dawn/tests/end2end/PackUnpack4x8NormTests.cpp
new file mode 100644
index 0000000..002de45
--- /dev/null
+++ b/src/dawn/tests/end2end/PackUnpack4x8NormTests.cpp
@@ -0,0 +1,311 @@
+// Copyright 2025 The Dawn & Tint Authors
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <limits>
+#include <vector>
+
+#include "dawn/tests/DawnTest.h"
+#include "dawn/utils/WGPUHelpers.h"
+
+namespace dawn {
+namespace {
+
+class PackUnpack4x8NormTests : public DawnTest {};
+
+TEST_P(PackUnpack4x8NormTests, Pack4x8Snorm) {
+    const char* computeShader = R"(
+        @group(0) @binding(0) var<storage, read_write> buf : array<u32>;
+        @group(0) @binding(1) var<storage, read> inputBuf : array<vec4f>;
+
+        @compute @workgroup_size(1)
+        fn main() {
+            var r: vec2<u32>;
+            for (var i = 0; i < 8; i++) {
+                r.x = pack4x8snorm(inputBuf[i]);
+                buf[i] = r.x;
+            }
+        }
+)";
+
+    static uint32_t kNumTests = 8;
+
+    wgpu::BufferDescriptor bufferDesc;
+    bufferDesc.size = kNumTests * sizeof(uint32_t);
+    bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+    wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+    wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+        device,
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+        {
+            // clang-format off
+            0.f, 0.f, 0.f, 0.f,  //
+            0.f, 0.f, 0.f, -1.f,  //
+            0.f, 0.f, 0.f, 1.f,  //
+            0.f, 0.f, -1.f, 0.f,  //
+            0.f, 1.f, 0.f, 0.f,  //
+            -1.f, 0.f, 0.f, 0.f,  //
+            1.f, -1.f, 1.f, -1.f,  //
+            std::numeric_limits<float>::max(), -0.495f, 0.5f, std::numeric_limits<float>::lowest(),
+            // clang-format on
+        });
+
+    wgpu::ComputePipelineDescriptor csDesc;
+    csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+    wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+                                                     {
+                                                         {0, bufferOut},
+                                                         {1, inputBuffer},
+                                                     });
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+    pass.SetPipeline(pipeline);
+    pass.SetBindGroup(0, bindGroup);
+    pass.DispatchWorkgroups(1);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    uint32_t expected[] = {0u,           0x8100'0000u, 0x7f00'0000u, 0x0081'0000u,
+                           0x0000'7f00u, 0x0000'0081u, 0x817f'817fu, 0x8140'c17fu};
+    EXPECT_BUFFER_U32_RANGE_EQ(reinterpret_cast<uint32_t*>(expected), bufferOut, 0, kNumTests);
+}
+
+TEST_P(PackUnpack4x8NormTests, Pack4x8Unorm) {
+    const char* computeShader = R"(
+        @group(0) @binding(0) var<storage, read_write> buf : array<u32>;
+        @group(0) @binding(1) var<storage, read> inputBuf : array<vec4f>;
+
+        @compute @workgroup_size(1)
+        fn main() {
+            var r: vec2<u32>;
+            for (var i = 0; i < 7; i++) {
+                r.x = pack4x8unorm(inputBuf[i]);
+                buf[i] = r.x;
+            }
+        }
+)";
+
+    static uint32_t kNumTests = 7;
+
+    wgpu::BufferDescriptor bufferDesc;
+    bufferDesc.size = kNumTests * sizeof(uint32_t);
+    bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+    wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+    wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+        device,
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+        {
+            // clang-format off
+            0.f, 0.f, 0.f, 0.f,  //
+            0.f, 0.f, 0.f, 1.f,  //
+            0.f, 0.f, 1.f, 0.f,  //
+            0.f, 1.f, 0.f, 0.f,  //
+            1.f, 0.f, 0.f, 0.f,  //
+            1.f, 0.f, 1.f, 0.f,  //
+            std::numeric_limits<float>::max(), 0.f, 0.5f, std::numeric_limits<float>::lowest(),
+            // clang-format on
+        });
+
+    wgpu::ComputePipelineDescriptor csDesc;
+    csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+    wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+                                                     {
+                                                         {0, bufferOut},
+                                                         {1, inputBuffer},
+                                                     });
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+    pass.SetPipeline(pipeline);
+    pass.SetBindGroup(0, bindGroup);
+    pass.DispatchWorkgroups(1);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    uint32_t expected[] = {0u,           0xff00'0000u, 0x00ff'0000u, 0x0000'ff00u,
+                           0x0000'00ffu, 0x00ff'00ffu, 0x0080'00ffu};
+    EXPECT_BUFFER_U32_RANGE_EQ(reinterpret_cast<uint32_t*>(expected), bufferOut, 0, kNumTests);
+}
+
+TEST_P(PackUnpack4x8NormTests, Unpack4x8Unorm) {
+    const char* computeShader = R"(
+        @group(0) @binding(0) var<storage, read_write> buf : array<vec4f>;
+        @group(0) @binding(1) var<storage, read> inputBuf : array<u32>;
+
+        @compute @workgroup_size(1)
+        fn main() {
+            var r: vec2<u32>;
+            for (var i = 0; i < 7; i++) {
+                r.x = inputBuf[i];
+                buf[i] = unpack4x8unorm(r.x);
+            }
+        }
+)";
+
+    static uint32_t kNumTests = 7;
+
+    wgpu::BufferDescriptor bufferDesc;
+    bufferDesc.size = kNumTests * 4 * sizeof(float);
+    bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+    wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+    wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+        device,
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+        {
+            // clang-format off
+            0x0000'0000u,
+            0xff00'0000u,
+            0x00ff'0000u,
+            0x0000'ff00u,
+            0x0000'00ffu,
+            0x00ff'00ffu,
+            0x0066'00ffu
+            // clang-format on
+        });
+
+    wgpu::ComputePipelineDescriptor csDesc;
+    csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+    wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+                                                     {
+                                                         {0, bufferOut},
+                                                         {1, inputBuffer},
+                                                     });
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+    pass.SetPipeline(pipeline);
+    pass.SetBindGroup(0, bindGroup);
+    pass.DispatchWorkgroups(1);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    float expected[] = {
+        // clang-format off
+        0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 1.f,
+        0.f, 0.f, 1.f, 0.f,
+        0.f, 1.f, 0.f, 0.f,
+        1.f, 0.f, 0.f, 0.f,
+        1.f, 0.f, 1.f, 0.f,
+        1.f, 0.f, 0.4f, 0.f
+        // clang-format on
+    };
+    EXPECT_BUFFER_FLOAT_RANGE_EQ(expected, bufferOut, 0, kNumTests * 4);
+}
+
+TEST_P(PackUnpack4x8NormTests, Unpack4x8Snorm) {
+    const char* computeShader = R"(
+        @group(0) @binding(0) var<storage, read_write> buf : array<vec4f>;
+        @group(0) @binding(1) var<storage, read> inputBuf : array<u32>;
+
+        @compute @workgroup_size(1)
+        fn main() {
+            var r: vec2<u32>;
+            for (var i = 0; i < 8; i++) {
+                r.x = inputBuf[i];
+                buf[i] = unpack4x8snorm(r.x);
+            }
+        }
+)";
+
+    static uint32_t kNumTests = 8;
+
+    wgpu::BufferDescriptor bufferDesc;
+    bufferDesc.size = kNumTests * 4 * sizeof(float);
+    bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+    wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+    wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+        device,
+        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+        {
+            // clang-format off
+            0x0000'0000u,
+            0x8100'0000u,
+            0x7f00'0000u,
+            0x0081'0000u,
+            0x0000'7f00u,
+            0x0000'0081u,
+            0x817f'817fu,
+            0x816d'937fu
+            // clang-format on
+        });
+
+    wgpu::ComputePipelineDescriptor csDesc;
+    csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+    wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+                                                     {
+                                                         {0, bufferOut},
+                                                         {1, inputBuffer},
+                                                     });
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+    pass.SetPipeline(pipeline);
+    pass.SetBindGroup(0, bindGroup);
+    pass.DispatchWorkgroups(1);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    float expected[] = {
+        // clang-format off
+        0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, -1.f,
+        0.f, 0.f, 0.f, 1.f,
+        0.f, 0.f, -1.f, 0.f,
+        0.f, 1.f, 0.f, 0.f,
+        -1.f, 0.f, 0.f, 0.f,
+        1.f, -1.f, 1.f, -1.f,
+        // 0x816d'937f: bytes 0x7f, 0x93 (-109), 0x6d (109), 0x81 -> +/-109/127 = +/-0.858267...
+        1.f, -0.8582677165354f, 0.8582677165354f, -1.f
+        // clang-format on
+    };
+    EXPECT_BUFFER_FLOAT_RANGE_EQ(expected, bufferOut, 0, kNumTests * 4);
+}
+
+// DawnTestBase::CreateDeviceImpl always enables allow_unsafe_apis toggle.
+DAWN_INSTANTIATE_TEST(PackUnpack4x8NormTests,
+                      VulkanBackend(),
+                      VulkanBackend({"polyfill_pack_unpack_4x8_norm"}));
+
+}  // anonymous namespace
+}  // namespace dawn
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill.cc b/src/tint/lang/core/ir/transform/builtin_polyfill.cc
index da8c722..c012a30 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill.cc
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill.cc
@@ -174,6 +174,14 @@
                         }
                         break;
                     }
+                    case core::BuiltinFn::kPack4X8Snorm:
+                    case core::BuiltinFn::kPack4X8Unorm:
+                    case core::BuiltinFn::kUnpack4X8Snorm:
+                    case core::BuiltinFn::kUnpack4X8Unorm:
+                        if (config.pack_unpack_4x8_norm) {
+                            worklist.Push(builtin);
+                        }
+                        break;
                     default:
                         break;
                 }
@@ -252,12 +260,132 @@
                 case core::BuiltinFn::kUnpack4XU8:
                     Unpack4xU8(builtin);
                     break;
+                case core::BuiltinFn::kPack4X8Snorm:
+                    Pack4x8Snorm(builtin);
+                    break;
+                case core::BuiltinFn::kPack4X8Unorm:
+                    Pack4x8Unorm(builtin);
+                    break;
+                case core::BuiltinFn::kUnpack4X8Snorm:
+                    Unpack4x8Snorm(builtin);
+                    break;
+                case core::BuiltinFn::kUnpack4X8Unorm:
+                    Unpack4x8Unorm(builtin);
+                    break;
                 default:
                     break;
             }
         }
     }
 
+    /// Polyfill a `pack4x8snorm` builtin call
+    void Pack4x8Snorm(ir::CoreBuiltinCall* call) {
+        auto* arg = call->Args()[0];
+
+        b.InsertBefore(call, [&] {
+            auto* vec4f = ty.vec4<f32>();
+            auto* vec4u = ty.vec4<u32>();
+
+            auto* neg_one = b.Splat(vec4f, -1_f);
+            auto* one = b.Splat(vec4f, 1_f);
+
+            core::ir::Value* v =
+                b.Call(vec4f, core::BuiltinFn::kClamp, Vector{arg, neg_one, one})->Result(0);
+            v = b.Multiply(vec4f, b.Splat(vec4f, 127_f), v)->Result(0);
+            v = b.Add(vec4f, b.Splat(vec4f, 0.5_f), v)->Result(0);
+            v = b.Call(vec4f, core::BuiltinFn::kFloor, Vector{v})->Result(0);
+            v = b.Convert(ty.vec4<i32>(), v)->Result(0);
+            v = b.Bitcast(vec4u, v)->Result(0);
+            v = b.And(vec4u, v, b.Splat(vec4u, 0xff_u))->Result(0);
+            v = b.ShiftLeft(vec4u, v, b.Construct(vec4u, 0_u, 8_u, 16_u, 24_u))->Result(0);
+
+            auto* x = b.Access(ty.u32(), v, 0_u);
+            auto* y = b.Access(ty.u32(), v, 1_u);
+            auto* z = b.Access(ty.u32(), v, 2_u);
+            auto* w = b.Access(ty.u32(), v, 3_u);
+
+            v = b.Or(ty.u32(), x, b.Or(ty.u32(), y, b.Or(ty.u32(), z, w)))->Result(0);
+
+            call->Result(0)->ReplaceAllUsesWith(v);
+        });
+        call->Destroy();
+    }
+
+    /// Polyfill a `pack4x8unorm` builtin call
+    void Pack4x8Unorm(ir::CoreBuiltinCall* call) {
+        auto* arg = call->Args()[0];
+
+        b.InsertBefore(call, [&] {
+            auto* vec4f = ty.vec4<f32>();
+            auto* vec4u = ty.vec4<u32>();
+
+            auto* zero = b.Zero(vec4f);
+            auto* one = b.Splat(vec4f, 1_f);
+
+            auto* v = b.Call(vec4f, core::BuiltinFn::kClamp, Vector{arg, zero, one})->Result(0);
+            v = b.Multiply(vec4f, b.Splat(vec4f, 255_f), v)->Result(0);
+            v = b.Add(vec4f, b.Splat(vec4f, 0.5_f), v)->Result(0);
+            v = b.Call(vec4f, core::BuiltinFn::kFloor, Vector{v})->Result(0);
+            v = b.Convert(vec4u, v)->Result(0);
+            v = b.And(vec4u, v, b.Splat(vec4u, 0xff_u))->Result(0);
+            v = b.ShiftLeft(vec4u, v, b.Construct(vec4u, 0_u, 8_u, 16_u, 24_u))->Result(0);
+
+            auto* x = b.Access(ty.u32(), v, 0_u);
+            auto* y = b.Access(ty.u32(), v, 1_u);
+            auto* z = b.Access(ty.u32(), v, 2_u);
+            auto* w = b.Access(ty.u32(), v, 3_u);
+
+            v = b.Or(ty.u32(), x, b.Or(ty.u32(), y, b.Or(ty.u32(), z, w)))->Result(0);
+
+            call->Result(0)->ReplaceAllUsesWith(v);
+        });
+        call->Destroy();
+    }
+
+    /// Polyfill a `unpack4x8snorm` builtin call
+    void Unpack4x8Snorm(ir::CoreBuiltinCall* call) {
+        auto* arg = call->Args()[0];
+
+        b.InsertBefore(call, [&] {
+            auto* vec4f = ty.vec4<f32>();
+            auto* vec4u = ty.vec4<u32>();
+            auto* vec4i = ty.vec4<i32>();
+
+            auto* v = b.Construct(vec4u, arg)->Result(0);
+            // Shift left to put the 8th bit of each number into the sign bit location, we then
+            // convert to an i32 and shift back, so the sign bit will be set as needed. The bits
+            // outside the bottom 8 are then masked off.
+            v = b.ShiftLeft(vec4u, v, b.Construct(vec4u, 24_u, 16_u, 8_u, 0_u))->Result(0);
+            v = b.Bitcast(vec4i, v)->Result(0);
+            v = b.ShiftRight(vec4i, v, b.Splat(vec4u, 24_u))->Result(0);
+            v = b.Convert(vec4f, v)->Result(0);
+            v = b.Divide(vec4f, v, b.Splat(vec4f, 127_f))->Result(0);
+            v = b.Call(vec4f, core::BuiltinFn::kMax, v, b.Splat(vec4f, -1_f))->Result(0);
+
+            call->Result(0)->ReplaceAllUsesWith(v);
+        });
+        call->Destroy();
+    }
+
+    /// Polyfill a `unpack4x8unorm` builtin call
+    void Unpack4x8Unorm(ir::CoreBuiltinCall* call) {
+        auto* arg = call->Args()[0];
+
+        b.InsertBefore(call, [&] {
+            auto* vec4f = ty.vec4<f32>();
+            auto* vec4u = ty.vec4<u32>();
+
+            auto* v = b.Construct(vec4u, arg)->Result(0);
+            v = b.ShiftRight(vec4u, v, b.Construct(vec4u, 0_u, 8_u, 16_u, 24_u))->Result(0);
+            v = b.And(vec4u, v, b.Splat(vec4u, 0xff_u))->Result(0);
+            v = b.Convert(vec4f, v)->Result(0);
+            v = b.Divide(vec4f, v, b.Splat(vec4f, 255_f))->Result(0);
+
+            call->Result(0)->ReplaceAllUsesWith(v);
+        });
+        call->Destroy();
+    }
+
     /// Polyfill a `clamp()` builtin call for integers.
     /// @param call the builtin call instruction
     void ClampInt(ir::CoreBuiltinCall* call) {
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill.h b/src/tint/lang/core/ir/transform/builtin_polyfill.h
index 5e68e24..825b550 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill.h
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill.h
@@ -86,6 +86,8 @@
     /// Should `pack4xU8Clamp()` be polyfilled?
     /// TODO(tint:1497): remove the option once the bug in DXC is fixed.
     bool pack_4xu8_clamp = false;
+    /// Should `pack4x8snorm`, `pack4x8unorm`, `unpack4x8snorm` and `unpack4x8unorm` be polyfilled?
+    bool pack_unpack_4x8_norm = false;
 
     /// Reflection for this class
     TINT_REFLECT(BuiltinPolyfillConfig,
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc b/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
index 163b943..44e65e9 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
@@ -2739,5 +2739,154 @@
     EXPECT_EQ(expect, str());
 }
 
+TEST_F(IR_BuiltinPolyfillTest, Pack4x8snorm) {
+    Build(core::BuiltinFn::kPack4X8Snorm, ty.u32(), Vector{ty.vec4<f32>()});
+    auto* src = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+  $B1: {
+    %result:u32 = pack4x8snorm %arg
+    ret %result
+  }
+}
+)";
+    auto* expect = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+  $B1: {
+    %3:vec4<f32> = clamp %arg, vec4<f32>(-1.0f), vec4<f32>(1.0f)
+    %4:vec4<f32> = mul vec4<f32>(127.0f), %3
+    %5:vec4<f32> = add vec4<f32>(0.5f), %4
+    %6:vec4<f32> = floor %5
+    %7:vec4<i32> = convert %6
+    %8:vec4<u32> = bitcast %7
+    %9:vec4<u32> = and %8, vec4<u32>(255u)
+    %10:vec4<u32> = construct 0u, 8u, 16u, 24u
+    %11:vec4<u32> = shl %9, %10
+    %12:u32 = access %11, 0u
+    %13:u32 = access %11, 1u
+    %14:u32 = access %11, 2u
+    %15:u32 = access %11, 3u
+    %16:u32 = or %14, %15
+    %17:u32 = or %13, %16
+    %18:u32 = or %12, %17
+    ret %18
+  }
+}
+)";
+
+    EXPECT_EQ(src, str());
+
+    BuiltinPolyfillConfig config;
+    config.pack_unpack_4x8_norm = true;
+    Run(BuiltinPolyfill, config);
+    EXPECT_EQ(expect, str());
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Pack4x8unorm) {
+    Build(core::BuiltinFn::kPack4X8Unorm, ty.u32(), Vector{ty.vec4<f32>()});
+    auto* src = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+  $B1: {
+    %result:u32 = pack4x8unorm %arg
+    ret %result
+  }
+}
+)";
+    auto* expect = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+  $B1: {
+    %3:vec4<f32> = clamp %arg, vec4<f32>(0.0f), vec4<f32>(1.0f)
+    %4:vec4<f32> = mul vec4<f32>(255.0f), %3
+    %5:vec4<f32> = add vec4<f32>(0.5f), %4
+    %6:vec4<f32> = floor %5
+    %7:vec4<u32> = convert %6
+    %8:vec4<u32> = and %7, vec4<u32>(255u)
+    %9:vec4<u32> = construct 0u, 8u, 16u, 24u
+    %10:vec4<u32> = shl %8, %9
+    %11:u32 = access %10, 0u
+    %12:u32 = access %10, 1u
+    %13:u32 = access %10, 2u
+    %14:u32 = access %10, 3u
+    %15:u32 = or %13, %14
+    %16:u32 = or %12, %15
+    %17:u32 = or %11, %16
+    ret %17
+  }
+}
+)";
+
+    EXPECT_EQ(src, str());
+
+    BuiltinPolyfillConfig config;
+    config.pack_unpack_4x8_norm = true;
+    Run(BuiltinPolyfill, config);
+    EXPECT_EQ(expect, str());
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Unpack4x8snorm) {
+    Build(core::BuiltinFn::kUnpack4X8Snorm, ty.vec4<f32>(), Vector{ty.u32()});
+    auto* src = R"(
+%foo = func(%arg:u32):vec4<f32> {
+  $B1: {
+    %result:vec4<f32> = unpack4x8snorm %arg
+    ret %result
+  }
+}
+)";
+    auto* expect = R"(
+%foo = func(%arg:u32):vec4<f32> {
+  $B1: {
+    %3:vec4<u32> = construct %arg
+    %4:vec4<u32> = construct 24u, 16u, 8u, 0u
+    %5:vec4<u32> = shl %3, %4
+    %6:vec4<i32> = bitcast %5
+    %7:vec4<i32> = shr %6, vec4<u32>(24u)
+    %8:vec4<f32> = convert %7
+    %9:vec4<f32> = div %8, vec4<f32>(127.0f)
+    %10:vec4<f32> = max %9, vec4<f32>(-1.0f)
+    ret %10
+  }
+}
+)";
+
+    EXPECT_EQ(src, str());
+
+    BuiltinPolyfillConfig config;
+    config.pack_unpack_4x8_norm = true;
+    Run(BuiltinPolyfill, config);
+    EXPECT_EQ(expect, str());
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Unpack4x8unorm) {
+    Build(core::BuiltinFn::kUnpack4X8Unorm, ty.vec4<f32>(), Vector{ty.u32()});
+    auto* src = R"(
+%foo = func(%arg:u32):vec4<f32> {
+  $B1: {
+    %result:vec4<f32> = unpack4x8unorm %arg
+    ret %result
+  }
+}
+)";
+    auto* expect = R"(
+%foo = func(%arg:u32):vec4<f32> {
+  $B1: {
+    %3:vec4<u32> = construct %arg
+    %4:vec4<u32> = construct 0u, 8u, 16u, 24u
+    %5:vec4<u32> = shr %3, %4
+    %6:vec4<u32> = and %5, vec4<u32>(255u)
+    %7:vec4<f32> = convert %6
+    %8:vec4<f32> = div %7, vec4<f32>(255.0f)
+    ret %8
+  }
+}
+)";
+
+    EXPECT_EQ(src, str());
+
+    BuiltinPolyfillConfig config;
+    config.pack_unpack_4x8_norm = true;
+    Run(BuiltinPolyfill, config);
+    EXPECT_EQ(expect, str());
+}
+
 }  // namespace
 }  // namespace tint::core::ir::transform
diff --git a/src/tint/lang/spirv/writer/common/options.h b/src/tint/lang/spirv/writer/common/options.h
index 264f2bd..9ccb595 100644
--- a/src/tint/lang/spirv/writer/common/options.h
+++ b/src/tint/lang/spirv/writer/common/options.h
@@ -191,6 +191,10 @@
     /// Set to `true` to generate polyfill for `dot4I8Packed` and `dot4U8Packed` builtins
     bool polyfill_dot_4x8_packed = false;
 
+    /// Set to `true` to generate polyfill for `pack4x8snorm`, `pack4x8unorm`, `unpack4x8snorm` and
+    /// `unpack4x8unorm` builtins
+    bool polyfill_pack_unpack_4x8_norm = false;
+
     /// Set to `true` to disable the polyfills on integer division and modulo.
     bool disable_polyfill_integer_div_mod = false;
 
@@ -217,6 +221,7 @@
                  clamp_frag_depth,
                  pass_matrix_by_pointer,
                  polyfill_dot_4x8_packed,
+                 polyfill_pack_unpack_4x8_norm,
                  disable_polyfill_integer_div_mod,
                  use_vulkan_memory_model,
                  depth_range_offsets);
diff --git a/src/tint/lang/spirv/writer/raise/raise.cc b/src/tint/lang/spirv/writer/raise/raise.cc
index 5cd98e5..85df674 100644
--- a/src/tint/lang/spirv/writer/raise/raise.cc
+++ b/src/tint/lang/spirv/writer/raise/raise.cc
@@ -114,6 +114,7 @@
     core_polyfills.dot_4x8_packed = options.polyfill_dot_4x8_packed;
     core_polyfills.pack_unpack_4x8 = true;
     core_polyfills.pack_4xu8_clamp = true;
+    core_polyfills.pack_unpack_4x8_norm = options.polyfill_pack_unpack_4x8_norm;
     RUN_TRANSFORM(core::ir::transform::BuiltinPolyfill, module, core_polyfills);
 
     core::ir::transform::ConversionPolyfillConfig conversion_polyfills;