D3D11: Implement polyfill of DP4A

This patch implements the polyfill versions of `dot4I8Packed()` and
`dot4U8Packed()` and uses them on the D3D11 backend. In the next
patch they will also be used on the D3D12 and Vulkan backends where
the DP4A instructions are not supported in native HLSL or SPIRV.

Bug: tint:1497
Test: dawn_end2end_tests

Change-Id: I330127a3a5e4c4cf7bc8b75625e6f2b0a72fc054
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/162323
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: David Neto <dneto@google.com>
diff --git a/src/dawn/native/d3d11/PhysicalDeviceD3D11.cpp b/src/dawn/native/d3d11/PhysicalDeviceD3D11.cpp
index 8493cb8..456d7e6 100644
--- a/src/dawn/native/d3d11/PhysicalDeviceD3D11.cpp
+++ b/src/dawn/native/d3d11/PhysicalDeviceD3D11.cpp
@@ -189,6 +189,7 @@
     EnableFeature(Feature::DualSourceBlending);
     EnableFeature(Feature::Norm16TextureFormats);
     EnableFeature(Feature::AdapterPropertiesMemoryHeaps);
+    EnableFeature(Feature::ChromiumExperimentalDp4a);
 
     // To import multi planar textures, we need to at least tier 2 support.
     if (mDeviceInfo.supportsSharedResourceCapabilityTier2) {
diff --git a/src/dawn/native/d3d11/ShaderModuleD3D11.cpp b/src/dawn/native/d3d11/ShaderModuleD3D11.cpp
index e5a573a..37ebdeb 100644
--- a/src/dawn/native/d3d11/ShaderModuleD3D11.cpp
+++ b/src/dawn/native/d3d11/ShaderModuleD3D11.cpp
@@ -209,6 +209,7 @@
 
     // TODO(dawn:1705): do we need to support it?
     req.hlsl.tintOptions.polyfill_reflect_vec2_f32 = false;
+    req.hlsl.tintOptions.polyfill_dot_4x8_packed = true;
 
     CacheResult<d3d::CompiledShader> compiledShader;
     MaybeError compileError = [&]() -> MaybeError {
diff --git a/src/dawn/tests/end2end/ExperimentalDP4aTests.cpp b/src/dawn/tests/end2end/ExperimentalDP4aTests.cpp
index 0b042ec..71aaf12 100644
--- a/src/dawn/tests/end2end/ExperimentalDP4aTests.cpp
+++ b/src/dawn/tests/end2end/ExperimentalDP4aTests.cpp
@@ -86,8 +86,8 @@
 
         @compute @workgroup_size(1)
         fn main() {
-            var a = 0xFFFFFFFFu;
-            var b = 0xFFFFFFFEu;
+            var a = 0xFFFEFDFCu;
+            var b = 0xFBFAF9F8u;
             var c = 0x01020304u;
             buf.data1 = dot4I8Packed(a, b);
             buf.data2 = dot4U8Packed(a, b);
@@ -137,13 +137,14 @@
     wgpu::CommandBuffer commands = encoder.Finish();
     queue.Submit(1, &commands);
 
-    uint32_t expected[] = {5, 259845, static_cast<uint32_t>(-10), 2550};
+    uint32_t expected[] = {70, 252998, static_cast<uint32_t>(-30), 2530};
     EXPECT_BUFFER_U32_RANGE_EQ(expected, bufferOut, 0, 4);
 }
 
 // DawnTestBase::CreateDeviceImpl always enables allow_unsafe_apis toggle.
 DAWN_INSTANTIATE_TEST_P(ExperimentalDP4aTests,
                         {
+                            D3D11Backend(),
                             D3D12Backend(),
                             D3D12Backend({}, {"use_dxc"}),
                             MetalBackend(),
diff --git a/src/tint/lang/hlsl/writer/ast_printer/ast_printer.cc b/src/tint/lang/hlsl/writer/ast_printer/ast_printer.cc
index d66be8d..c668179 100644
--- a/src/tint/lang/hlsl/writer/ast_printer/ast_printer.cc
+++ b/src/tint/lang/hlsl/writer/ast_printer/ast_printer.cc
@@ -255,6 +255,7 @@
         polyfills.reflect_vec2_f32 = options.polyfill_reflect_vec2_f32;
         polyfills.texture_sample_base_clamp_to_edge_2d_f32 = true;
         polyfills.workgroup_uniform_load = true;
+        polyfills.dot_4x8_packed = options.polyfill_dot_4x8_packed;
         data.Add<ast::transform::BuiltinPolyfill::Config>(polyfills);
         manager.Add<ast::transform::BuiltinPolyfill>();  // Must come before DirectVariableAccess
     }
diff --git a/src/tint/lang/hlsl/writer/common/options.h b/src/tint/lang/hlsl/writer/common/options.h
index 1ff17bb..44cb403 100644
--- a/src/tint/lang/hlsl/writer/common/options.h
+++ b/src/tint/lang/hlsl/writer/common/options.h
@@ -71,6 +71,9 @@
     /// Set to `true` to generate polyfill for `reflect` builtin for vec2<f32>
     bool polyfill_reflect_vec2_f32 = false;
 
+    /// Set to `true` to generate polyfill for `dot4I8Packed` and `dot4U8Packed` builtins
+    bool polyfill_dot_4x8_packed = false;
+
     /// Options used to specify a mapping of binding points to indices into a UBO
     /// from which to load buffer sizes.
     ArrayLengthFromUniformOptions array_length_from_uniform = {};
@@ -102,6 +105,7 @@
                  disable_workgroup_init,
                  truncate_interstage_variables,
                  polyfill_reflect_vec2_f32,
+                 polyfill_dot_4x8_packed,
                  array_length_from_uniform,
                  interstage_locations,
                  root_constant_binding_point,
diff --git a/src/tint/lang/wgsl/ast/transform/builtin_polyfill.cc b/src/tint/lang/wgsl/ast/transform/builtin_polyfill.cc
index 6e38d10..a222570 100644
--- a/src/tint/lang/wgsl/ast/transform/builtin_polyfill.cc
+++ b/src/tint/lang/wgsl/ast/transform/builtin_polyfill.cc
@@ -888,6 +888,58 @@
         return name;
     }
 
+    Symbol Dot4I8Packed() {
+        using vec4i = vec4<i32>;
+        using vec4u = vec4<u32>;
+
+        auto name = b.Symbols().New("tint_dot4_i8_packed");
+
+        auto body = tint::Vector{
+            // const n = vec4u(24, 16, 8, 0);
+            // let a_i8 = bitcast<vec4i>(vec4u(a) << n) >> vec4u(24);
+            // let b_i8 = bitcast<vec4i>(vec4u(b) << n) >> vec4u(24);
+            // return dot(a_i8, b_i8);
+            b.Decl(b.Const("n", b.Call<vec4u>(24_a, 16_a, 8_a, 0_a))),
+            b.Decl(b.Let("a_i8", b.Shr(b.Bitcast<vec4i>(b.Shl(b.Call<vec4u>("a"), "n")),
+                                       b.Call<vec4u>(24_a)))),
+            b.Decl(b.Let("b_i8", b.Shr(b.Bitcast<vec4i>(b.Shl(b.Call<vec4u>("b"), "n")),
+                                       b.Call<vec4u>(24_a)))),
+            b.Return(b.Call("dot", "a_i8", "b_i8")),
+        };
+        b.Func(name,
+               tint::Vector{
+                   b.Param("a", b.ty.u32()),
+                   b.Param("b", b.ty.u32()),
+               },
+               b.ty.i32(), body);
+
+        return name;
+    }
+
+    Symbol Dot4U8Packed() {
+        using vec4u = vec4<u32>;
+        auto name = b.Symbols().New("tint_dot4_u8_packed");
+
+        auto body = tint::Vector{
+            // const n = vec4u(24, 16, 8, 0);
+            // let a_u8 = (vec4u(a) >> n) & vec4u(0xff);
+            // let b_u8 = (vec4u(b) >> n) & vec4u(0xff);
+            // return dot(a_u8, b_u8);
+            b.Decl(b.Const("n", b.Call<vec4u>(24_a, 16_a, 8_a, 0_a))),
+            b.Decl(b.Let("a_u8", b.And(b.Shr(b.Call<vec4u>("a"), "n"), b.Call<vec4u>(0xff_a)))),
+            b.Decl(b.Let("b_u8", b.And(b.Shr(b.Call<vec4u>("b"), "n"), b.Call<vec4u>(0xff_a)))),
+            b.Return(b.Call("dot", "a_u8", "b_u8")),
+        };
+        b.Func(name,
+               tint::Vector{
+                   b.Param("a", b.ty.u32()),
+                   b.Param("b", b.ty.u32()),
+               },
+               b.ty.u32(), body);
+
+        return name;
+    }
+
     ////////////////////////////////////////////////////////////////////////////
     // Inline polyfills
     ////////////////////////////////////////////////////////////////////////////
@@ -1270,6 +1322,22 @@
                         }
                         return Symbol{};
 
+                    case wgsl::BuiltinFn::kDot4I8Packed: {
+                        if (cfg.builtins.dot_4x8_packed) {
+                            return builtin_polyfills.GetOrCreate(builtin,
+                                                                 [&] { return Dot4I8Packed(); });
+                        }
+                        return Symbol{};
+                    }
+
+                    case wgsl::BuiltinFn::kDot4U8Packed: {
+                        if (cfg.builtins.dot_4x8_packed) {
+                            return builtin_polyfills.GetOrCreate(builtin,
+                                                                 [&] { return Dot4U8Packed(); });
+                        }
+                        return Symbol{};
+                    }
+
                     default:
                         return Symbol{};
                 }
diff --git a/src/tint/lang/wgsl/ast/transform/builtin_polyfill.h b/src/tint/lang/wgsl/ast/transform/builtin_polyfill.h
index 0252fce..86059b6 100644
--- a/src/tint/lang/wgsl/ast/transform/builtin_polyfill.h
+++ b/src/tint/lang/wgsl/ast/transform/builtin_polyfill.h
@@ -98,6 +98,8 @@
         bool quantize_to_vec_f16 = false;
         /// Should `workgroupUniformLoad()` be polyfilled?
         bool workgroup_uniform_load = false;
+        /// Should `dot4I8Packed()` and `dot4U8Packed()` be polyfilled?
+        bool dot_4x8_packed = false;
     };
 
     /// Config is consumed by the BuiltinPolyfill transform.
diff --git a/src/tint/lang/wgsl/ast/transform/builtin_polyfill_test.cc b/src/tint/lang/wgsl/ast/transform/builtin_polyfill_test.cc
index e2d0dde..7045762 100644
--- a/src/tint/lang/wgsl/ast/transform/builtin_polyfill_test.cc
+++ b/src/tint/lang/wgsl/ast/transform/builtin_polyfill_test.cc
@@ -4055,6 +4055,83 @@
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Built-in functions in packed_4x8_integer_dot_product
+////////////////////////////////////////////////////////////////////////////////
+DataMap polyfillPacked4x8IntegerDotProduct() {
+    BuiltinPolyfill::Builtins builtins;
+    builtins.dot_4x8_packed = true;
+    DataMap data;
+    data.Add<BuiltinPolyfill::Config>(builtins);
+    return data;
+}
+
+TEST_F(BuiltinPolyfillTest, Dot4I8Packed) {
+    auto* src = R"(
+enable chromium_experimental_dp4a;
+
+fn f() {
+  let v1 = 0x01020304u;
+  let v2 = 0xF1F2F3F4u;
+  _ = dot4I8Packed(v1, v2);
+}
+)";
+
+    auto* expect = R"(
+enable chromium_experimental_dp4a;
+
+fn tint_dot4_i8_packed(a : u32, b : u32) -> i32 {
+  const n = vec4<u32>(24, 16, 8, 0);
+  let a_i8 = (bitcast<vec4<i32>>((vec4<u32>(a) << n)) >> vec4<u32>(24));
+  let b_i8 = (bitcast<vec4<i32>>((vec4<u32>(b) << n)) >> vec4<u32>(24));
+  return dot(a_i8, b_i8);
+}
+
+fn f() {
+  let v1 = 16909060u;
+  let v2 = 4059231220u;
+  _ = tint_dot4_i8_packed(v1, v2);
+}
+)";
+
+    auto got = Run<BuiltinPolyfill>(src, polyfillPacked4x8IntegerDotProduct());
+
+    EXPECT_EQ(expect, str(got));
+}
+
+TEST_F(BuiltinPolyfillTest, Dot4U8Packed) {
+    auto* src = R"(
+enable chromium_experimental_dp4a;
+
+fn f() {
+  let v1 = 0x01020304u;
+  let v2 = 0xF1F2F3F4u;
+  _ = dot4U8Packed(v1, v2);
+}
+)";
+
+    auto* expect = R"(
+enable chromium_experimental_dp4a;
+
+fn tint_dot4_u8_packed(a : u32, b : u32) -> u32 {
+  const n = vec4<u32>(24, 16, 8, 0);
+  let a_u8 = ((vec4<u32>(a) >> n) & vec4<u32>(255));
+  let b_u8 = ((vec4<u32>(b) >> n) & vec4<u32>(255));
+  return dot(a_u8, b_u8);
+}
+
+fn f() {
+  let v1 = 16909060u;
+  let v2 = 4059231220u;
+  _ = tint_dot4_u8_packed(v1, v2);
+}
+)";
+
+    auto got = Run<BuiltinPolyfill>(src, polyfillPacked4x8IntegerDotProduct());
+
+    EXPECT_EQ(expect, str(got));
+}
+
+////////////////////////////////////////////////////////////////////////////////
 // Polyfill combinations
 ////////////////////////////////////////////////////////////////////////////////