[ir]: polyfill dot4I8Packed, dot4U8Packed when needed

Enable packed_4x8_integer_dot_product end2end tests for Android

Bug: tint:1497
Change-Id: Ie59e3ae541329c2c6883f2a943b6cc944e36abb5
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/168684
Commit-Queue: David Neto <dneto@google.com>
Reviewed-by: James Price <jrprice@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
diff --git a/src/dawn/tests/end2end/Packed4x8IntegerDotProductTests.cpp b/src/dawn/tests/end2end/Packed4x8IntegerDotProductTests.cpp
index 556c851..e187d72 100644
--- a/src/dawn/tests/end2end/Packed4x8IntegerDotProductTests.cpp
+++ b/src/dawn/tests/end2end/Packed4x8IntegerDotProductTests.cpp
@@ -36,10 +36,6 @@
 class Packed4x8IntegerDotProductTests : public DawnTest {};
 
 TEST_P(Packed4x8IntegerDotProductTests, Dot4x8Packed) {
-    // TODO(tint:1497): investigate why the creation of compute pipeline with dot4{U|I}8Packed()
-    // fails on Pixel 4
-    DAWN_SUPPRESS_TEST_IF(IsAndroid());
-
     const char* computeShader = R"(
         struct Buf {
             data1 : i32,
@@ -105,10 +101,6 @@
 }
 
 TEST_P(Packed4x8IntegerDotProductTests, Pack4x8) {
-    // TODO(tint:1497): investigate why the creation of compute pipeline with pack4xI8(),
-    // pack4xU8(), pack4xI8Clamp() or pack4xU8Clamp() fails on Pixel 6
-    DAWN_SUPPRESS_TEST_IF(IsAndroid());
-
     const char* computeShader = R"(
         struct Buf {
             data1 : u32,
@@ -181,10 +173,6 @@
 }
 
 TEST_P(Packed4x8IntegerDotProductTests, Unpack4x8) {
-    // TODO(tint:1497): investigate why the creation of compute pipeline with unpack4xI8() or
-    // unpack4xU8() fails on Pixel 6
-    DAWN_SUPPRESS_TEST_IF(IsAndroid());
-
     const char* computeShader = R"(
         struct Buf {
             data1 : vec4i,
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill.cc b/src/tint/lang/core/ir/transform/builtin_polyfill.cc
index e5c506e..4ff5cbf 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill.cc
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill.cc
@@ -119,6 +119,13 @@
                             }
                         }
                         break;
+                    case core::BuiltinFn::kDot4U8Packed:
+                    case core::BuiltinFn::kDot4I8Packed: {
+                        if (config.dot_4x8_packed) {
+                            worklist.Push(builtin);
+                        }
+                        break;
+                    }
                     case core::BuiltinFn::kPack4XI8:
                     case core::BuiltinFn::kPack4XU8:
                     case core::BuiltinFn::kPack4XI8Clamp:
@@ -167,6 +174,12 @@
                 case core::BuiltinFn::kTextureSampleBaseClampToEdge:
                     replacement = TextureSampleBaseClampToEdge_2d_f32(builtin);
                     break;
+                case core::BuiltinFn::kDot4I8Packed:
+                    replacement = Dot4I8Packed(builtin);
+                    break;
+                case core::BuiltinFn::kDot4U8Packed:
+                    replacement = Dot4U8Packed(builtin);
+                    break;
                 case core::BuiltinFn::kPack4XI8:
                     replacement = Pack4xI8(builtin);
                     break;
@@ -601,6 +614,44 @@
         return result;
     }
 
+    /// Polyfill a `dot4I8Packed()` builtin call as a `dot()` of its two unpacked operands
+    /// @param call the builtin call instruction; the replacement code is inserted before it
+    /// @returns the replacement value: the i32 result of the emitted `dot()` call
+    ir::Value* Dot4I8Packed(ir::CoreBuiltinCall* call) {
+        // Replace `dot4I8Packed(%x,%y)` with:
+        //   %unpacked_x = unpack4xI8(%x);  // expanded inline by Unpack4xI8OnValue()
+        //   %unpacked_y = unpack4xI8(%y);  // expanded inline by Unpack4xI8OnValue()
+        //   %result = dot(%unpacked_x, %unpacked_y);
+        auto* x = call->Args()[0];
+        auto* y = call->Args()[1];
+        auto* unpacked_x = Unpack4xI8OnValue(call, x);
+        auto* unpacked_y = Unpack4xI8OnValue(call, y);
+        ir::Value* result = nullptr;
+        b.InsertBefore(call, [&] {
+            result = b.Call(ty.i32(), core::BuiltinFn::kDot, unpacked_x, unpacked_y)->Result(0);
+        });
+        return result;
+    }
+
+    /// Polyfill a `dot4U8Packed()` builtin call as a `dot()` of its two unpacked operands
+    /// @param call the builtin call instruction; the replacement code is inserted before it
+    /// @returns the replacement value: the u32 result of the emitted `dot()` call
+    ir::Value* Dot4U8Packed(ir::CoreBuiltinCall* call) {
+        // Replace `dot4U8Packed(%x,%y)` with:
+        //   %unpacked_x = unpack4xU8(%x);  // expanded inline by Unpack4xU8OnValue()
+        //   %unpacked_y = unpack4xU8(%y);  // expanded inline by Unpack4xU8OnValue()
+        //   %result = dot(%unpacked_x, %unpacked_y);
+        auto* x = call->Args()[0];
+        auto* y = call->Args()[1];
+        auto* unpacked_x = Unpack4xU8OnValue(call, x);
+        auto* unpacked_y = Unpack4xU8OnValue(call, y);
+        ir::Value* result = nullptr;
+        b.InsertBefore(call, [&] {
+            result = b.Call(ty.u32(), core::BuiltinFn::kDot, unpacked_x, unpacked_y)->Result(0);
+        });
+        return result;
+    }
+
     /// Polyfill a `pack4xI8()` builtin call
     /// @param call the builtin call instruction
     /// @returns the replacement value
@@ -713,17 +764,16 @@
         return result;
     }
 
-    /// Polyfill a `unpack4xI8()` builtin call
-    /// @param call the builtin call instruction
-    /// @returns the replacement value
-    ir::Value* Unpack4xI8(ir::CoreBuiltinCall* call) {
+    /// Emit code for `unpack4xI8` on u32 value `x`, before the given call.
+    /// @param call the instruction that should follow the emitted code
+    /// @param x the u32 value to be unpacked
+    ir::Value* Unpack4xI8OnValue(ir::CoreBuiltinCall* call, ir::Value* x) {
         // Replace `unpack4xI8(%x)` with:
         //   %n       = vec4u(24, 16, 8, 0);
-        //   %x_splat = vec4u(x); // splat the scalar to a vector
+        //   %x_splat = vec4u(%x); // splat the scalar to a vector
         //   %x_vec4i = bitcast<vec4i>(%x_splat << n);
         //   %result  = %x_vec4i >> vec4u(24);
         ir::Value* result = nullptr;
-        auto* x = call->Args()[0];
         b.InsertBefore(call, [&] {
             auto* vec4i = ty.vec4<i32>();
             auto* vec4u = ty.vec4<u32>();
@@ -738,17 +788,23 @@
         return result;
     }
 
-    /// Polyfill a `unpack4xU8()` builtin call
+    /// Polyfill a `unpack4xI8()` builtin call
     /// @param call the builtin call instruction
     /// @returns the replacement value
-    ir::Value* Unpack4xU8(ir::CoreBuiltinCall* call) {
+    ir::Value* Unpack4xI8(ir::CoreBuiltinCall* call) {
+        return Unpack4xI8OnValue(call, call->Args()[0]);
+    }
+
+    /// Emit code for `unpack4xU8` on u32 value `x`, before the given call.
+    /// @param call the instruction that should follow the emitted code
+    /// @param x the u32 value to be unpacked
+    ir::Value* Unpack4xU8OnValue(ir::CoreBuiltinCall* call, ir::Value* x) {
         // Replace `unpack4xU8(%x)` with:
         //   %n       = vec4u(0, 8, 16, 24);
-        //   %x_splat = vec4u(x); // splat the scalar to a vector
+        //   %x_splat = vec4u(%x); // splat the scalar to a vector
         //   %x_vec4u = %x_splat >> n;
         //   %result  = %x_vec4u & vec4u(0xff);
         ir::Value* result = nullptr;
-        auto* x = call->Args()[0];
         b.InsertBefore(call, [&] {
             auto* vec4u = ty.vec4<u32>();
 
@@ -760,6 +816,13 @@
         });
         return result;
     }
+
+    /// Polyfill a `unpack4xU8()` builtin call by unpacking the call's first argument
+    /// @param call the builtin call instruction; the unpacking code is emitted before it
+    /// @returns the replacement value produced by Unpack4xU8OnValue()
+    ir::Value* Unpack4xU8(ir::CoreBuiltinCall* call) {
+        return Unpack4xU8OnValue(call, call->Args()[0]);
+    }
 };
 
 }  // namespace
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill.h b/src/tint/lang/core/ir/transform/builtin_polyfill.h
index 2f90486..cb3c054 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill.h
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill.h
@@ -69,6 +69,8 @@
     bool saturate = false;
     /// Should `textureSampleBaseClampToEdge()` be polyfilled for texture_2d<f32> textures?
     bool texture_sample_base_clamp_to_edge_2d_f32 = false;
+    /// Should `dot4U8Packed()` and `dot4I8Packed()` be polyfilled?
+    bool dot_4x8_packed = false;
     /// Should `pack4xI8()` and `pack4xU8()` be polyfilled?
     bool pack_unpack_4x8 = false;
 };
diff --git a/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc b/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
index 29c7afd..9dfaee0 100644
--- a/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
+++ b/src/tint/lang/core/ir/transform/builtin_polyfill_test.cc
@@ -1622,5 +1622,85 @@
     EXPECT_EQ(expect, str());
 }
 
+TEST_F(IR_BuiltinPolyfillTest, Dot4I8Packed) {
+    Build(core::BuiltinFn::kDot4I8Packed, ty.i32(), Vector{ty.u32(), ty.u32()});
+
+    auto* src = R"(
+%foo = func(%arg:u32, %arg_1:u32):i32 -> %b1 {  # %arg_1: 'arg'
+  %b1 = block {
+    %result:i32 = dot4I8Packed %arg, %arg_1
+    ret %result
+  }
+}
+)";
+    EXPECT_EQ(src, str());  // check the IR produced by Build() before transforming
+
+    auto* expect = R"(
+%foo = func(%arg:u32, %arg_1:u32):i32 -> %b1 {  # %arg_1: 'arg'
+  %b1 = block {
+    %4:vec4<u32> = construct 24u, 16u, 8u, 0u
+    %5:vec4<u32> = construct %arg
+    %6:vec4<u32> = shl %5, %4
+    %7:vec4<i32> = bitcast %6
+    %8:vec4<u32> = construct 24u
+    %9:vec4<i32> = shr %7, %8
+    %10:vec4<u32> = construct 24u, 16u, 8u, 0u
+    %11:vec4<u32> = construct %arg_1
+    %12:vec4<u32> = shl %11, %10
+    %13:vec4<i32> = bitcast %12
+    %14:vec4<u32> = construct 24u
+    %15:vec4<i32> = shr %13, %14
+    %result:i32 = dot %9, %15
+    ret %result
+  }
+}
+)";
+
+    BuiltinPolyfillConfig config;
+    config.dot_4x8_packed = true;  // request the dot4I8Packed / dot4U8Packed polyfill
+    Run(BuiltinPolyfill, config);
+
+    EXPECT_EQ(expect, str());  // call replaced by two expanded unpacks feeding `dot`
+}
+
+TEST_F(IR_BuiltinPolyfillTest, Dot4U8Packed) {
+    Build(core::BuiltinFn::kDot4U8Packed, ty.u32(), Vector{ty.u32(), ty.u32()});
+
+    auto* src = R"(
+%foo = func(%arg:u32, %arg_1:u32):u32 -> %b1 {  # %arg_1: 'arg'
+  %b1 = block {
+    %result:u32 = dot4U8Packed %arg, %arg_1
+    ret %result
+  }
+}
+)";
+    EXPECT_EQ(src, str());  // check the IR produced by Build() before transforming
+
+    auto* expect = R"(
+%foo = func(%arg:u32, %arg_1:u32):u32 -> %b1 {  # %arg_1: 'arg'
+  %b1 = block {
+    %4:vec4<u32> = construct 0u, 8u, 16u, 24u
+    %5:vec4<u32> = construct %arg
+    %6:vec4<u32> = shr %5, %4
+    %7:vec4<u32> = construct 255u
+    %8:vec4<u32> = and %6, %7
+    %9:vec4<u32> = construct 0u, 8u, 16u, 24u
+    %10:vec4<u32> = construct %arg_1
+    %11:vec4<u32> = shr %10, %9
+    %12:vec4<u32> = construct 255u
+    %13:vec4<u32> = and %11, %12
+    %result:u32 = dot %8, %13
+    ret %result
+  }
+}
+)";
+
+    BuiltinPolyfillConfig config;
+    config.dot_4x8_packed = true;  // request the dot4I8Packed / dot4U8Packed polyfill
+    Run(BuiltinPolyfill, config);
+
+    EXPECT_EQ(expect, str());  // call replaced by two expanded unpacks feeding `dot`
+}
+
 }  // namespace
 }  // namespace tint::core::ir::transform
diff --git a/src/tint/lang/spirv/writer/raise/raise.cc b/src/tint/lang/spirv/writer/raise/raise.cc
index 095e47e..bfa4285 100644
--- a/src/tint/lang/spirv/writer/raise/raise.cc
+++ b/src/tint/lang/spirv/writer/raise/raise.cc
@@ -86,6 +86,7 @@
     core_polyfills.insert_bits = core::ir::transform::BuiltinPolyfillLevel::kClampOrRangeCheck;
     core_polyfills.saturate = true;
     core_polyfills.texture_sample_base_clamp_to_edge_2d_f32 = true;
+    core_polyfills.dot_4x8_packed = options.polyfill_dot_4x8_packed;
     core_polyfills.pack_unpack_4x8 = true;
     RUN_TRANSFORM(core::ir::transform::BuiltinPolyfill, module, core_polyfills);