Add polyfill for pack/unpack 4x8 snorm/unorm
This CL adds a polyfill for the pack4x8snorm, pack4x8unorm, unpack4x8snorm,
and unpack4x8unorm builtins. The polyfill is enabled by default on ARM
Vulkan devices, where these builtins can have issues.
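
For reference, the expansion is equivalent to the following scalar C++
sketch (helper names are illustrative only; the actual transform builds
Tint IR that operates on whole vec4s, see builtin_polyfill.cc):

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // pack4x8snorm: floor(0.5 + 127 * clamp(e, -1, 1)) per component, with
  // the low byte of each result packed into successive octets.
  uint32_t Pack4x8SnormRef(const float e[4]) {
      uint32_t result = 0;
      for (int i = 0; i < 4; i++) {
          float c = std::clamp(e[i], -1.0f, 1.0f);
          auto q = static_cast<int32_t>(std::floor(0.5f + 127.0f * c));
          result |= (static_cast<uint32_t>(q) & 0xffu) << (8 * i);
      }
      return result;
  }

  // unpack4x8snorm: shift each byte into the top octet, sign-extend with
  // an arithmetic shift, normalize by 127, and clamp -128 up to -1.
  void Unpack4x8SnormRef(uint32_t v, float out[4]) {
      for (int i = 0; i < 4; i++) {
          auto s = static_cast<int32_t>(v << (24 - 8 * i)) >> 24;
          out[i] = std::max(static_cast<float>(s) / 127.0f, -1.0f);
      }
  }

The unorm variants are analogous, using clamp(e, 0, 1), a scale of 255,
and a plain mask instead of sign extension.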
Bug: 379551588
Change-Id: I5f3823c3fd63fb09f828f8af7fd0d7293ac6a727
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/222215
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: James Price <jrprice@google.com>
Commit-Queue: dan sinclair <dsinclair@chromium.org>
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index d6208f2..3732ca4 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -516,6 +516,11 @@
{"polyfill_packed_4x8_dot_product",
"Always use the polyfill version of dot4I8Packed() and dot4U8Packed().",
"https://crbug.com/tint/1497", ToggleStage::Device}},
+ {Toggle::PolyfillPackUnpack4x8Norm,
+ {"polyfill_pack_unpack_4x8_norm",
+ "Always use the polyfill version of pack4x8snorm, pack4x8unorm, unpack4x8snorm, "
+ "unpack4x8unorm.",
+ "https://crbug.com/379551588", ToggleStage::Device}},
{Toggle::D3D12PolyFillPackUnpack4x8,
{"d3d12_polyfill_pack_unpack_4x8",
"Always use the polyfill version of pack4xI8(), pack4xU8(), pack4xI8Clamp(), unpack4xI8() "
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index 93ed488..07dcbae 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -127,6 +127,7 @@
UseTintIR,
D3DDisableIEEEStrictness,
PolyFillPacked4x8DotProduct,
+ PolyfillPackUnpack4x8Norm,
D3D12PolyFillPackUnpack4x8,
ExposeWGSLTestingFeatures,
ExposeWGSLExperimentalFeatures,
diff --git a/src/dawn/native/vulkan/PhysicalDeviceVk.cpp b/src/dawn/native/vulkan/PhysicalDeviceVk.cpp
index 6c41216..7bffff1 100644
--- a/src/dawn/native/vulkan/PhysicalDeviceVk.cpp
+++ b/src/dawn/native/vulkan/PhysicalDeviceVk.cpp
@@ -692,6 +692,11 @@
// forced to store the multisampled targets and do the resolves as separate passes injected
// after the original one.
deviceToggles->Default(Toggle::ResolveMultipleAttachmentInSeparatePasses, true);
+
+ // dawn:379551588: The `pack4x8snorm`, `pack4x8unorm`, `unpack4x8snorm` and
+ // `unpack4x8unorm` builtins can have issues on ARM. To work around the issue, we
+ // rewrite the pack/unpack calls and do the (un)packing manually.
+ deviceToggles->Default(Toggle::PolyfillPackUnpack4x8Norm, true);
}
if (IsAndroidSamsung() || IsAndroidQualcomm()) {
diff --git a/src/dawn/native/vulkan/ShaderModuleVk.cpp b/src/dawn/native/vulkan/ShaderModuleVk.cpp
index 1a2b969..4fd2907 100644
--- a/src/dawn/native/vulkan/ShaderModuleVk.cpp
+++ b/src/dawn/native/vulkan/ShaderModuleVk.cpp
@@ -370,6 +370,8 @@
GetDevice()->IsToggleEnabled(Toggle::VulkanUseBufferRobustAccess2);
req.tintOptions.polyfill_dot_4x8_packed =
GetDevice()->IsToggleEnabled(Toggle::PolyFillPacked4x8DotProduct);
+ req.tintOptions.polyfill_pack_unpack_4x8_norm =
+ GetDevice()->IsToggleEnabled(Toggle::PolyfillPackUnpack4x8Norm);
req.tintOptions.disable_polyfill_integer_div_mod =
GetDevice()->IsToggleEnabled(Toggle::DisablePolyfillsOnIntegerDivisonAndModulo);
diff --git a/src/dawn/tests/BUILD.gn b/src/dawn/tests/BUILD.gn
index 24a8e91..94f4529 100644
--- a/src/dawn/tests/BUILD.gn
+++ b/src/dawn/tests/BUILD.gn
@@ -642,6 +642,7 @@
"end2end/NonzeroBufferCreationTests.cpp",
"end2end/NonzeroTextureCreationTests.cpp",
"end2end/OpArrayLengthTests.cpp",
+ "end2end/PackUnpack4x8NormTests.cpp",
"end2end/Packed4x8IntegerDotProductTests.cpp",
"end2end/PipelineCachingTests.cpp",
"end2end/PipelineLayoutTests.cpp",
diff --git a/src/dawn/tests/end2end/PackUnpack4x8NormTests.cpp b/src/dawn/tests/end2end/PackUnpack4x8NormTests.cpp
new file mode 100644
index 0000000..002de45
--- /dev/null
+++ b/src/dawn/tests/end2end/PackUnpack4x8NormTests.cpp
@@ -0,0 +1,311 @@
+// Copyright 2025 The Dawn & Tint Authors
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <limits>
+#include <vector>
+
+#include "dawn/tests/DawnTest.h"
+#include "dawn/utils/WGPUHelpers.h"
+
+namespace dawn {
+namespace {
+
+class PackUnpack4x8NormTests : public DawnTest {};
+
+TEST_P(PackUnpack4x8NormTests, Pack4x8Snorm) {
+ const char* computeShader = R"(
+ @group(0) @binding(0) var<storage, read_write> buf : array<u32>;
+ @group(0) @binding(1) var<storage, read> inputBuf : array<vec4f>;
+
+ @compute @workgroup_size(1)
+ fn main() {
+ var r: vec2<u32>;
+ for (var i = 0; i < 8; i++) {
+ r.x = pack4x8snorm(inputBuf[i]);
+ buf[i] = r.x;
+ }
+ }
+)";
+
+ static uint32_t kNumTests = 8;
+
+ wgpu::BufferDescriptor bufferDesc;
+ bufferDesc.size = kNumTests * sizeof(uint32_t);
+ bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+ wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+ wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+ device,
+ wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+ {
+ // clang-format off
+ 0.f, 0.f, 0.f, 0.f, //
+ 0.f, 0.f, 0.f, -1.f, //
+ 0.f, 0.f, 0.f, 1.f, //
+ 0.f, 0.f, -1.f, 0.f, //
+ 0.f, 1.f, 0.f, 0.f, //
+ -1.f, 0.f, 0.f, 0.f, //
+ 1.f, -1.f, 1.f, -1.f, //
+ std::numeric_limits<float>::max(), -0.495f, 0.5f, std::numeric_limits<float>::lowest(),
+ // clang-format on
+ });
+
+ wgpu::ComputePipelineDescriptor csDesc;
+ csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+ wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+ wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+ {
+ {0, bufferOut},
+ {1, inputBuffer},
+ });
+
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+ pass.SetPipeline(pipeline);
+ pass.SetBindGroup(0, bindGroup);
+ pass.DispatchWorkgroups(1);
+ pass.End();
+ wgpu::CommandBuffer commands = encoder.Finish();
+ queue.Submit(1, &commands);
+
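+ // Note: the last case clamps to (1.f, -0.495f, 0.5f, -1.f), which quantizes
+ // to the bytes (0x7f, 0xc1, 0x40, 0x81).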
+ uint32_t expected[] = {0u, 0x8100'0000u, 0x7f00'0000u, 0x0081'0000u,
+ 0x0000'7f00u, 0x0000'0081u, 0x817f'817fu, 0x8140'c17fu};
+ EXPECT_BUFFER_U32_RANGE_EQ(expected, bufferOut, 0, kNumTests);
+}
+
+TEST_P(PackUnpack4x8NormTests, Pack4x8Unorm) {
+ const char* computeShader = R"(
+ @group(0) @binding(0) var<storage, read_write> buf : array<u32>;
+ @group(0) @binding(1) var<storage, read> inputBuf : array<vec4f>;
+
+ @compute @workgroup_size(1)
+ fn main() {
+ var r: vec2<u32>;
+ for (var i = 0; i < 7; i++) {
+ r.x = pack4x8unorm(inputBuf[i]);
+ buf[i] = r.x;
+ }
+ }
+)";
+
+ static uint32_t kNumTests = 7;
+
+ wgpu::BufferDescriptor bufferDesc;
+ bufferDesc.size = kNumTests * sizeof(uint32_t);
+ bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+ wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+ wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+ device,
+ wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+ {
+ // clang-format off
+ 0.f, 0.f, 0.f, 0.f, //
+ 0.f, 0.f, 0.f, 1.f, //
+ 0.f, 0.f, 1.f, 0.f, //
+ 0.f, 1.f, 0.f, 0.f, //
+ 1.f, 0.f, 0.f, 0.f, //
+ 1.f, 0.f, 1.f, 0.f, //
+ std::numeric_limits<float>::max(), 0.f, 0.5f, std::numeric_limits<float>::lowest(),
+ // clang-format on
+ });
+
+ wgpu::ComputePipelineDescriptor csDesc;
+ csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+ wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+ wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+ {
+ {0, bufferOut},
+ {1, inputBuffer},
+ });
+
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+ pass.SetPipeline(pipeline);
+ pass.SetBindGroup(0, bindGroup);
+ pass.DispatchWorkgroups(1);
+ pass.End();
+ wgpu::CommandBuffer commands = encoder.Finish();
+ queue.Submit(1, &commands);
+
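+ // Note: the last case clamps to (1.f, 0.f, 0.5f, 0.f), which quantizes to
+ // the bytes (0xff, 0x00, 0x80, 0x00).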
+ uint32_t expected[] = {0u, 0xff00'0000u, 0x00ff'0000u, 0x0000'ff00u,
+ 0x0000'00ffu, 0x00ff'00ffu, 0x0080'00ffu};
+ EXPECT_BUFFER_U32_RANGE_EQ(expected, bufferOut, 0, kNumTests);
+}
+
+TEST_P(PackUnpack4x8NormTests, Unpack4x8Unorm) {
+ const char* computeShader = R"(
+ @group(0) @binding(0) var<storage, read_write> buf : array<vec4f>;
+ @group(0) @binding(1) var<storage, read> inputBuf : array<u32>;
+
+ @compute @workgroup_size(1)
+ fn main() {
+ var r: vec2<u32>;
+ for (var i = 0; i < 7; i++) {
+ r.x = inputBuf[i];
+ buf[i] = unpack4x8unorm(r.x);
+ }
+ }
+)";
+
+ static uint32_t kNumTests = 7;
+
+ wgpu::BufferDescriptor bufferDesc;
+ bufferDesc.size = kNumTests * 4 * sizeof(float);
+ bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+ wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+ wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+ device,
+ wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+ {
+ // clang-format off
+ 0x0000'0000u,
+ 0xff00'0000u,
+ 0x00ff'0000u,
+ 0x0000'ff00u,
+ 0x0000'00ffu,
+ 0x00ff'00ffu,
+ 0x0066'00ffu
+ // clang-format on
+ });
+
+ wgpu::ComputePipelineDescriptor csDesc;
+ csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+ wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+ wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+ {
+ {0, bufferOut},
+ {1, inputBuffer},
+ });
+
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+ pass.SetPipeline(pipeline);
+ pass.SetBindGroup(0, bindGroup);
+ pass.DispatchWorkgroups(1);
+ pass.End();
+ wgpu::CommandBuffer commands = encoder.Finish();
+ queue.Submit(1, &commands);
+
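+ // Note: 0x0066'00ff unpacks to bytes (0xff, 0x00, 0x66, 0x00), normalized
+ // by 255 to (1.f, 0.f, 0.4f, 0.f).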
+ float expected[] = {
+ // clang-format off
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 1.f,
+ 0.f, 0.f, 1.f, 0.f,
+ 0.f, 1.f, 0.f, 0.f,
+ 1.f, 0.f, 0.f, 0.f,
+ 1.f, 0.f, 1.f, 0.f,
+ 1.f, 0.f, 0.4f, 0.f
+ // clang-format on
+ };
+ EXPECT_BUFFER_FLOAT_RANGE_EQ(expected, bufferOut, 0, kNumTests * 4);
+}
+
+TEST_P(PackUnpack4x8NormTests, Unpack4x8Snorm) {
+ const char* computeShader = R"(
+ @group(0) @binding(0) var<storage, read_write> buf : array<vec4f>;
+ @group(0) @binding(1) var<storage, read> inputBuf : array<u32>;
+
+ @compute @workgroup_size(1)
+ fn main() {
+ var r: vec2<u32>;
+ for (var i = 0; i < 8; i++) {
+ r.x = inputBuf[i];
+ buf[i] = unpack4x8snorm(r.x);
+ }
+ }
+)";
+
+ static uint32_t kNumTests = 8;
+
+ wgpu::BufferDescriptor bufferDesc;
+ bufferDesc.size = kNumTests * 4 * sizeof(float);
+ bufferDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc;
+ wgpu::Buffer bufferOut = device.CreateBuffer(&bufferDesc);
+
+ wgpu::Buffer inputBuffer = utils::CreateBufferFromData(
+ device,
+ wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Storage,
+ {
+ // clang-format off
+ 0x0000'0000u,
+ 0x8100'0000u,
+ 0x7f00'0000u,
+ 0x0081'0000u,
+ 0x0000'7f00u,
+ 0x0000'0081u,
+ 0x817f'817fu,
+ 0x816d'937fu
+ // clang-format on
+ });
+
+ wgpu::ComputePipelineDescriptor csDesc;
+ csDesc.compute.module = utils::CreateShaderModule(device, computeShader);
+ wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
+
+ wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
+ {
+ {0, bufferOut},
+ {1, inputBuffer},
+ });
+
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+ pass.SetPipeline(pipeline);
+ pass.SetBindGroup(0, bindGroup);
+ pass.DispatchWorkgroups(1);
+ pass.End();
+ wgpu::CommandBuffer commands = encoder.Finish();
+ queue.Submit(1, &commands);
+
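+ // Note: 0x816d'937f unpacks to bytes (0x7f, 0x93, 0x6d, 0x81), i.e.
+ // (127, -109, 109, -127) / 127.f after sign extension.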
+ float expected[] = {
+ // clang-format off
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, -1.f,
+ 0.f, 0.f, 0.f, 1.f,
+ 0.f, 0.f, -1.f, 0.f,
+ 0.f, 1.f, 0.f, 0.f,
+ -1.f, 0.f, 0.f, 0.f,
+ 1.f, -1.f, 1.f, -1.f,
+ 1.f, -0.8582677165354f, 0.8582677165354f, -1.f
+ // clang-format on
+ };
+ EXPECT_BUFFER_FLOAT_RANGE_EQ(expected, bufferOut, 0, kNumTests * 4);
+}
+
+// DawnTestBase::CreateDeviceImpl always enables the allow_unsafe_apis toggle.
+DAWN_INSTANTIATE_TEST(PackUnpack4x8NormTests,
+ VulkanBackend(),
+ VulkanBackend({"polyfill_pack_unpack_4x8_norm"}));
+
+} // anonymous namespace
+} // namespace dawn
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill.cc b/src/tint/lang/core/ir/transform/builtin_polyfill.cc
index da8c722..c012a30 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill.cc
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill.cc
@@ -174,6 +174,14 @@
}
break;
}
+ case core::BuiltinFn::kPack4X8Snorm:
+ case core::BuiltinFn::kPack4X8Unorm:
+ case core::BuiltinFn::kUnpack4X8Snorm:
+ case core::BuiltinFn::kUnpack4X8Unorm:
+ if (config.pack_unpack_4x8_norm) {
+ worklist.Push(builtin);
+ }
+ break;
default:
break;
}
@@ -252,12 +260,132 @@
case core::BuiltinFn::kUnpack4XU8:
Unpack4xU8(builtin);
break;
+ case core::BuiltinFn::kPack4X8Snorm:
+ Pack4x8Snorm(builtin);
+ break;
+ case core::BuiltinFn::kPack4X8Unorm:
+ Pack4x8Unorm(builtin);
+ break;
+ case core::BuiltinFn::kUnpack4X8Snorm:
+ Unpack4x8Snorm(builtin);
+ break;
+ case core::BuiltinFn::kUnpack4X8Unorm:
+ Unpack4x8Unorm(builtin);
+ break;
default:
break;
}
}
}
+ /// Polyfill a `pack4x8snorm` builtin call
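+ /// @param call the builtin call instruction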
+ void Pack4x8Snorm(ir::CoreBuiltinCall* call) {
+ auto* arg = call->Args()[0];
+
+ b.InsertBefore(call, [&] {
+ auto* vec4f = ty.vec4<f32>();
+ auto* vec4u = ty.vec4<u32>();
+
+ auto* neg_one = b.Splat(vec4f, -1_f);
+ auto* one = b.Splat(vec4f, 1_f);
+
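+ // floor(0.5 + 127.0 * clamp(e, -1.0, 1.0)) for each component, matching the
+ // WGSL definition of pack4x8snorm; the low bytes are then ORed together.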
+ core::ir::Value* v =
+ b.Call(vec4f, core::BuiltinFn::kClamp, Vector{arg, neg_one, one})->Result(0);
+ v = b.Multiply(vec4f, b.Splat(vec4f, 127_f), v)->Result(0);
+ v = b.Add(vec4f, b.Splat(vec4f, 0.5_f), v)->Result(0);
+ v = b.Call(vec4f, core::BuiltinFn::kFloor, Vector{v})->Result(0);
+ v = b.Convert(ty.vec4<i32>(), v)->Result(0);
+ v = b.Bitcast(vec4u, v)->Result(0);
+ v = b.And(vec4u, v, b.Splat(vec4u, 0xff_u))->Result(0);
+ v = b.ShiftLeft(vec4u, v, b.Construct(vec4u, 0_u, 8_u, 16_u, 24_u))->Result(0);
+
+ auto* x = b.Access(ty.u32(), v, 0_u);
+ auto* y = b.Access(ty.u32(), v, 1_u);
+ auto* z = b.Access(ty.u32(), v, 2_u);
+ auto* w = b.Access(ty.u32(), v, 3_u);
+
+ v = b.Or(ty.u32(), x, b.Or(ty.u32(), y, b.Or(ty.u32(), z, w)))->Result(0);
+
+ call->Result(0)->ReplaceAllUsesWith(v);
+ });
+ call->Destroy();
+ }
+
+ /// Polyfill a `pack4x8unorm` builtin call
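+ /// @param call the builtin call instruction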
+ void Pack4x8Unorm(ir::CoreBuiltinCall* call) {
+ auto* arg = call->Args()[0];
+
+ b.InsertBefore(call, [&] {
+ auto* vec4f = ty.vec4<f32>();
+ auto* vec4u = ty.vec4<u32>();
+
+ auto* zero = b.Zero(vec4f);
+ auto* one = b.Splat(vec4f, 1_f);
+
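+ // floor(0.5 + 255.0 * clamp(e, 0.0, 1.0)) for each component, matching the
+ // WGSL definition of pack4x8unorm.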
+ auto* v = b.Call(vec4f, core::BuiltinFn::kClamp, Vector{arg, zero, one})->Result(0);
+ v = b.Multiply(vec4f, b.Splat(vec4f, 255_f), v)->Result(0);
+ v = b.Add(vec4f, b.Splat(vec4f, 0.5_f), v)->Result(0);
+ v = b.Call(vec4f, core::BuiltinFn::kFloor, Vector{v})->Result(0);
+ v = b.Convert(vec4u, v)->Result(0);
+ v = b.And(vec4u, v, b.Splat(vec4u, 0xff_u))->Result(0);
+ v = b.ShiftLeft(vec4u, v, b.Construct(vec4u, 0_u, 8_u, 16_u, 24_u))->Result(0);
+
+ auto* x = b.Access(ty.u32(), v, 0_u);
+ auto* y = b.Access(ty.u32(), v, 1_u);
+ auto* z = b.Access(ty.u32(), v, 2_u);
+ auto* w = b.Access(ty.u32(), v, 3_u);
+
+ v = b.Or(ty.u32(), x, b.Or(ty.u32(), y, b.Or(ty.u32(), z, w)))->Result(0);
+
+ call->Result(0)->ReplaceAllUsesWith(v);
+ });
+ call->Destroy();
+ }
+
+ /// Polyfill an `unpack4x8snorm` builtin call
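+ /// @param call the builtin call instruction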
+ void Unpack4x8Snorm(ir::CoreBuiltinCall* call) {
+ auto* arg = call->Args()[0];
+
+ b.InsertBefore(call, [&] {
+ auto* vec4f = ty.vec4<f32>();
+ auto* vec4u = ty.vec4<u32>();
+ auto* vec4i = ty.vec4<i32>();
+
+ auto* v = b.Construct(vec4u, arg)->Result(0);
+ // Shift left so each byte's top bit lands in the sign bit position, bitcast
+ // to i32, then arithmetic-shift right by 24: this sign-extends the value and
+ // drops the other three bytes in one step.
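+ // For example, arg = 0x0000'0081: lane 0 shifts to 0x8100'0000, bitcasts to
+ // i32, and shifts right by 24 to give -127, so -127 / 127.0 = -1.0. The final
+ // max() clamps -128 (0x80) up to -1.0, since -128 / 127 is just below -1.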
+ v = b.ShiftLeft(vec4u, v, b.Construct(vec4u, 24_u, 16_u, 8_u, 0_u))->Result(0);
+ v = b.Bitcast(vec4i, v)->Result(0);
+ v = b.ShiftRight(vec4i, v, b.Splat(vec4u, 24_u))->Result(0);
+ v = b.Convert(vec4f, v)->Result(0);
+ v = b.Divide(vec4f, v, b.Splat(vec4f, 127_f))->Result(0);
+ v = b.Call(vec4f, core::BuiltinFn::kMax, v, b.Splat(vec4f, -1_f))->Result(0);
+
+ call->Result(0)->ReplaceAllUsesWith(v);
+ });
+ call->Destroy();
+ }
+
+ /// Polyfill an `unpack4x8unorm` builtin call
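+ /// @param call the builtin call instruction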
+ void Unpack4x8Unorm(ir::CoreBuiltinCall* call) {
+ auto* arg = call->Args()[0];
+
+ b.InsertBefore(call, [&] {
+ auto* vec4f = ty.vec4<f32>();
+ auto* vec4u = ty.vec4<u32>();
+
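+ // Shift each byte down to the low 8 bits, mask it off, and normalize by 255.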
+ auto* v = b.Construct(vec4u, arg)->Result(0);
+ v = b.ShiftRight(vec4u, v, b.Construct(vec4u, 0_u, 8_u, 16_u, 24_u))->Result(0);
+ v = b.And(vec4u, v, b.Splat(vec4u, 0xff_u))->Result(0);
+ v = b.Convert(vec4f, v)->Result(0);
+ v = b.Divide(vec4f, v, b.Splat(vec4f, 255_f))->Result(0);
+
+ call->Result(0)->ReplaceAllUsesWith(v);
+ });
+ call->Destroy();
+ }
+
/// Polyfill a `clamp()` builtin call for integers.
/// @param call the builtin call instruction
void ClampInt(ir::CoreBuiltinCall* call) {
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill.h b/src/tint/lang/core/ir/transform/builtin_polyfill.h
index 5e68e24..825b550 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill.h
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill.h
@@ -86,6 +86,8 @@
/// Should `pack4xU8Clamp()` be polyfilled?
/// TODO(tint:1497): remove the option once the bug in DXC is fixed.
bool pack_4xu8_clamp = false;
+ /// Should `pack4x8snorm`, `pack4x8unorm`, `unpack4x8snorm` and `unpack4x8unorm` be polyfilled?
+ bool pack_unpack_4x8_norm = false;
/// Reflection for this class
TINT_REFLECT(BuiltinPolyfillConfig,
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc b/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
index 163b943..44e65e9 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
@@ -2739,5 +2739,154 @@
EXPECT_EQ(expect, str());
}
+TEST_F(IR_BuiltinPolyfillTest, Pack4x8snorm) {
+ Build(core::BuiltinFn::kPack4X8Snorm, ty.u32(), Vector{ty.vec4<f32>()});
+ auto* src = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+ $B1: {
+ %result:u32 = pack4x8snorm %arg
+ ret %result
+ }
+}
+)";
+ auto* expect = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+ $B1: {
+ %3:vec4<f32> = clamp %arg, vec4<f32>(-1.0f), vec4<f32>(1.0f)
+ %4:vec4<f32> = mul vec4<f32>(127.0f), %3
+ %5:vec4<f32> = add vec4<f32>(0.5f), %4
+ %6:vec4<f32> = floor %5
+ %7:vec4<i32> = convert %6
+ %8:vec4<u32> = bitcast %7
+ %9:vec4<u32> = and %8, vec4<u32>(255u)
+ %10:vec4<u32> = construct 0u, 8u, 16u, 24u
+ %11:vec4<u32> = shl %9, %10
+ %12:u32 = access %11, 0u
+ %13:u32 = access %11, 1u
+ %14:u32 = access %11, 2u
+ %15:u32 = access %11, 3u
+ %16:u32 = or %14, %15
+ %17:u32 = or %13, %16
+ %18:u32 = or %12, %17
+ ret %18
+ }
+}
+)";
+
+ EXPECT_EQ(src, str());
+
+ BuiltinPolyfillConfig config;
+ config.pack_unpack_4x8_norm = true;
+ Run(BuiltinPolyfill, config);
+ EXPECT_EQ(expect, str());
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Pack4x8unorm) {
+ Build(core::BuiltinFn::kPack4X8Unorm, ty.u32(), Vector{ty.vec4<f32>()});
+ auto* src = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+ $B1: {
+ %result:u32 = pack4x8unorm %arg
+ ret %result
+ }
+}
+)";
+ auto* expect = R"(
+%foo = func(%arg:vec4<f32>):u32 {
+ $B1: {
+ %3:vec4<f32> = clamp %arg, vec4<f32>(0.0f), vec4<f32>(1.0f)
+ %4:vec4<f32> = mul vec4<f32>(255.0f), %3
+ %5:vec4<f32> = add vec4<f32>(0.5f), %4
+ %6:vec4<f32> = floor %5
+ %7:vec4<u32> = convert %6
+ %8:vec4<u32> = and %7, vec4<u32>(255u)
+ %9:vec4<u32> = construct 0u, 8u, 16u, 24u
+ %10:vec4<u32> = shl %8, %9
+ %11:u32 = access %10, 0u
+ %12:u32 = access %10, 1u
+ %13:u32 = access %10, 2u
+ %14:u32 = access %10, 3u
+ %15:u32 = or %13, %14
+ %16:u32 = or %12, %15
+ %17:u32 = or %11, %16
+ ret %17
+ }
+}
+)";
+
+ EXPECT_EQ(src, str());
+
+ BuiltinPolyfillConfig config;
+ config.pack_unpack_4x8_norm = true;
+ Run(BuiltinPolyfill, config);
+ EXPECT_EQ(expect, str());
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Unpack4x8snorm) {
+ Build(core::BuiltinFn::kUnpack4X8Snorm, ty.vec4<f32>(), Vector{ty.u32()});
+ auto* src = R"(
+%foo = func(%arg:u32):vec4<f32> {
+ $B1: {
+ %result:vec4<f32> = unpack4x8snorm %arg
+ ret %result
+ }
+}
+)";
+ auto* expect = R"(
+%foo = func(%arg:u32):vec4<f32> {
+ $B1: {
+ %3:vec4<u32> = construct %arg
+ %4:vec4<u32> = construct 24u, 16u, 8u, 0u
+ %5:vec4<u32> = shl %3, %4
+ %6:vec4<i32> = bitcast %5
+ %7:vec4<i32> = shr %6, vec4<u32>(24u)
+ %8:vec4<f32> = convert %7
+ %9:vec4<f32> = div %8, vec4<f32>(127.0f)
+ %10:vec4<f32> = max %9, vec4<f32>(-1.0f)
+ ret %10
+ }
+}
+)";
+
+ EXPECT_EQ(src, str());
+
+ BuiltinPolyfillConfig config;
+ config.pack_unpack_4x8_norm = true;
+ Run(BuiltinPolyfill, config);
+ EXPECT_EQ(expect, str());
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Unpack4x8unorm) {
+ Build(core::BuiltinFn::kUnpack4X8Unorm, ty.vec4<f32>(), Vector{ty.u32()});
+ auto* src = R"(
+%foo = func(%arg:u32):vec4<f32> {
+ $B1: {
+ %result:vec4<f32> = unpack4x8unorm %arg
+ ret %result
+ }
+}
+)";
+ auto* expect = R"(
+%foo = func(%arg:u32):vec4<f32> {
+ $B1: {
+ %3:vec4<u32> = construct %arg
+ %4:vec4<u32> = construct 0u, 8u, 16u, 24u
+ %5:vec4<u32> = shr %3, %4
+ %6:vec4<u32> = and %5, vec4<u32>(255u)
+ %7:vec4<f32> = convert %6
+ %8:vec4<f32> = div %7, vec4<f32>(255.0f)
+ ret %8
+ }
+}
+)";
+
+ EXPECT_EQ(src, str());
+
+ BuiltinPolyfillConfig config;
+ config.pack_unpack_4x8_norm = true;
+ Run(BuiltinPolyfill, config);
+ EXPECT_EQ(expect, str());
+}
+
} // namespace
} // namespace tint::core::ir::transform
diff --git a/src/tint/lang/spirv/writer/common/options.h b/src/tint/lang/spirv/writer/common/options.h
index 264f2bd..9ccb595 100644
--- a/src/tint/lang/spirv/writer/common/options.h
+++ b/src/tint/lang/spirv/writer/common/options.h
@@ -191,6 +191,10 @@
/// Set to `true` to generate polyfill for `dot4I8Packed` and `dot4U8Packed` builtins
bool polyfill_dot_4x8_packed = false;
+ /// Set to `true` to generate polyfill for `pack4x8snorm`, `pack4x8unorm`, `unpack4x8snorm` and
+ /// `unpack4x8unorm` builtins
+ bool polyfill_pack_unpack_4x8_norm = false;
+
/// Set to `true` to disable the polyfills on integer division and modulo.
bool disable_polyfill_integer_div_mod = false;
@@ -217,6 +221,7 @@
clamp_frag_depth,
pass_matrix_by_pointer,
polyfill_dot_4x8_packed,
+ polyfill_pack_unpack_4x8_norm,
disable_polyfill_integer_div_mod,
use_vulkan_memory_model,
depth_range_offsets);
diff --git a/src/tint/lang/spirv/writer/raise/raise.cc b/src/tint/lang/spirv/writer/raise/raise.cc
index 5cd98e5..85df674 100644
--- a/src/tint/lang/spirv/writer/raise/raise.cc
+++ b/src/tint/lang/spirv/writer/raise/raise.cc
@@ -114,6 +114,7 @@
core_polyfills.dot_4x8_packed = options.polyfill_dot_4x8_packed;
core_polyfills.pack_unpack_4x8 = true;
core_polyfills.pack_4xu8_clamp = true;
+ core_polyfills.pack_unpack_4x8_norm = options.polyfill_pack_unpack_4x8_norm;
RUN_TRANSFORM(core::ir::transform::BuiltinPolyfill, module, core_polyfills);
core::ir::transform::ConversionPolyfillConfig conversion_polyfills;