Support R16, RG16, RGBA16, R32 Float in T2B compute shader

Bug: 348654098
Change-Id: I4aca6af8c43fdc29c53c2c5a64d5dd837d2568e8
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/195534
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Quyen Le <lehoangquyen@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn/native/BlitTextureToBuffer.cpp b/src/dawn/native/BlitTextureToBuffer.cpp
index 0276475..f49666b 100644
--- a/src/dawn/native/BlitTextureToBuffer.cpp
+++ b/src/dawn/native/BlitTextureToBuffer.cpp
@@ -27,6 +27,7 @@
 
 #include "dawn/native/BlitTextureToBuffer.h"
 
+#include <algorithm>
 #include <array>
 #include <string>
 #include <string_view>
@@ -166,6 +167,12 @@
 }
 )";
 
+constexpr std::string_view kEncodeRG16FloatInU32 = R"(
+fn encodeVectorInU32General(v: vec2f) -> u32 {
+    return pack2x16float(v);
+}
+)";
+
 // Each thread is responsible for reading (packTexelCount) texel and packing them into a 4-byte u32.
 constexpr std::string_view kCommonHead = R"(
 struct Params {
@@ -194,6 +201,9 @@
 override workgroupSizeX: u32;
 override workgroupSizeY: u32;
 
+// Size of one element in the destination buffer this thread will write to.
+override gOutputUnitSize: u32;
+
 @compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn main
 (@builtin(global_invocation_id) id : vec3u) {
 )";
@@ -205,8 +215,8 @@
     return;
 }
 
-let indicesPerRow = params.bytesPerRow / 4;
-let indicesOffset = params.offset / 4;
+let indicesPerRow = params.bytesPerRow / gOutputUnitSize;
+let indicesOffset = params.offset / gOutputUnitSize;
 let dstOffset = indicesOffset + id.x + id.y * indicesPerRow + id.z * indicesPerRow * params.rowsPerImage;
 )";
 
@@ -265,10 +275,10 @@
     return;
 }
 
-let byteOffset = params.offset + id.x * 4
+let byteOffset = params.offset + id.x * gOutputUnitSize
     + id.y * params.bytesPerRow
     + id.z * params.bytesPerRow * params.rowsPerImage;
-let dstOffset = byteOffset / 4;
+let dstOffset = byteOffset / gOutputUnitSize;
 let srcBoundary = params.srcOrigin + params.srcExtent;
 
 // Start coord, End coord
@@ -516,6 +526,54 @@
 }
 )";
 
+// R16: texelByte = 2; each thread reads 1 ~ 2 texels.
+// General packing algorithm is similar to kPackRG8ToU32.
+constexpr std::string_view kPackR16ToU32 = R"(
+// Result bits to store into dst_buf
+var result: u32 = 0u;
+// Storing half texel values
+// later called by pack2x16unorm to convert to u32.
+var v: vec2f;
+
+// dstBuf value is used for starting part.
+var mask: u32 = 0xffffffffu;
+if (!readDstBufAtStart) {
+    // coordS is used
+    mask &= 0xffff0000u;
+    let texel0 = textureLoadGeneral(src_tex, coordS, params.mipLevel).r;
+    v[0] = texel0;
+}
+
+if (coordE.x < srcBoundary.x) {
+    // coordE is used
+    mask &= 0x0000ffffu;
+    let texel1 = textureLoadGeneral(src_tex, coordE, params.mipLevel).r;
+    v[1] = texel1;
+}
+
+if (readDstBufAtStart || readDstBufAtEnd) {
+    let original: u32 = dst_buf[dstOffset];
+    result = (original & mask) | (encodeVectorInU32General(v) & ~mask);
+} else {
+    result = encodeVectorInU32General(v);
+}
+)";
+
+constexpr std::string_view kPackRG16ToU32 = R"(
+    let v: vec2f = textureLoadGeneral(src_tex, coord0, params.mipLevel).rg;
+    let result = encodeVectorInU32General(v);
+)";
+
+// Load RGBA16 and pack to 2 uint4_t
+constexpr std::string_view kLoadRGBA16ToU32 = R"(
+    let v: vec4f = textureLoadGeneral(src_tex, coord0, params.mipLevel);
+    // dstOffset is based on 8 bytes so we need to multiply by 2 to get uint32 offset.
+    let uintOffset = dstOffset << 1;
+    dst_buf[uintOffset] = encodeVectorInU32General(v.rg);
+    dst_buf[uintOffset + 1] = encodeVectorInU32General(v.ba);
+}
+)";
+
 // ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float for GL
 // so we should not use it generally for the emulation.
 // As a result we are using f32 and array<u32> to do all the math and byte manipulation.
@@ -597,9 +655,9 @@
 }
 )";
 
-// Directly loading depth32float values into dst_buf
+// Directly loading R32Float values into dst_buf
 // No bit manipulation and packing is needed.
-constexpr std::string_view kLoadDepth32Float = R"(
+constexpr std::string_view kLoadR32Float = R"(
     dst_buf[dstOffset] = textureLoadGeneral(src_tex, coord0, params.mipLevel).r;
 }
 )";
@@ -718,6 +776,31 @@
             shader += kCommonEnd;
             textureSampleType = wgpu::TextureSampleType::Float;
             break;
+        case wgpu::TextureFormat::R16Float:
+        case wgpu::TextureFormat::RG16Float:
+            AppendFloatTextureHead();
+            shader += kDstBufferU32;
+            shader += kEncodeRG16FloatInU32;
+            shader += kCommonHead;
+            if (format.format == wgpu::TextureFormat::R16Float) {
+                shader += kNonMultipleOf4OffsetStart;
+                shader += kPackR16ToU32;
+            } else {
+                shader += kCommonStart;
+                shader += kPackRG16ToU32;
+            }
+            shader += kCommonEnd;
+            textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
+            break;
+        case wgpu::TextureFormat::RGBA16Float:
+            AppendFloatTextureHead();
+            shader += kDstBufferU32;
+            shader += kEncodeRG16FloatInU32;
+            shader += kCommonHead;
+            shader += kCommonStart;
+            shader += kLoadRGBA16ToU32;
+            textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
+            break;
         case wgpu::TextureFormat::Depth16Unorm:
             AppendFloatTextureHead();
             shader += kDstBufferU32;
@@ -728,11 +811,12 @@
             textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
             break;
         case wgpu::TextureFormat::Depth32Float:
+        case wgpu::TextureFormat::R32Float:
             AppendFloatTextureHead();
             shader += kDstBufferF32;
             shader += kCommonHead;
             shader += kCommonStart;
-            shader += kLoadDepth32Float;
+            shader += kLoadR32Float;
             textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
             break;
         case wgpu::TextureFormat::Stencil8:
@@ -756,7 +840,7 @@
                     shader += kDstBufferF32;
                     shader += kCommonHead;
                     shader += kCommonStart;
-                    shader += kLoadDepth32Float;
+                    shader += kLoadR32Float;
                     textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
                     break;
                 case Aspect::Stencil:
@@ -821,11 +905,16 @@
     computePipelineDescriptor.compute.module = shaderModule.Get();
     computePipelineDescriptor.compute.entryPoint = "main";
 
+    const uint32_t bytesPerTexel = format.GetAspectInfo(src.aspect).block.byteSize;
+    // Size of one unit for a thread to write to. For format < 4 bytes, we always write 4 bytes at a
+    // time.
+    const uint32_t ouputUnitSize = std::max(bytesPerTexel, 4u);
     const uint32_t adjustedWorkGroupSizeY =
         (viewDimension == wgpu::TextureViewDimension::e1D) ? 1 : kWorkgroupSizeY;
-    const std::array<ConstantEntry, 2> constants = {{
+    const std::array<ConstantEntry, 3> constants = {{
         {nullptr, "workgroupSizeX", kWorkgroupSizeX},
         {nullptr, "workgroupSizeY", static_cast<double>(adjustedWorkGroupSizeY)},
+        {nullptr, "gOutputUnitSize", static_cast<double>(ouputUnitSize)},
     }};
     computePipelineDescriptor.compute.constantCount = constants.size();
     computePipelineDescriptor.compute.constants = constants.data();
@@ -851,6 +940,10 @@
         case wgpu::TextureFormat::RGBA8Unorm:
         case wgpu::TextureFormat::BGRA8Unorm:
         case wgpu::TextureFormat::RGB9E5Ufloat:
+        case wgpu::TextureFormat::R16Float:
+        case wgpu::TextureFormat::RG16Float:
+        case wgpu::TextureFormat::RGBA16Float:
+        case wgpu::TextureFormat::R32Float:
         case wgpu::TextureFormat::Depth16Unorm:
         case wgpu::TextureFormat::Depth32Float:
         case wgpu::TextureFormat::Stencil8:
@@ -946,6 +1039,7 @@
                     Align(copyExtent.width, 2 * kWorkgroupSizeX) / (2 * kWorkgroupSizeX);
                 break;
             case 4:
+            case 8:
                 workgroupCountX = Align(copyExtent.width, kWorkgroupSizeX) / kWorkgroupSizeX;
                 break;
             default:
@@ -1064,7 +1158,7 @@
 
         // packTexelCount: number of texel values (1, 2, or 4) one thread packs into the dst
         // buffer
-        params[3] = 4 / bytesPerTexel;
+        params[3] = std::max(1u, 4 / bytesPerTexel);
         // srcExtent: vec3u
         params[4] = copyExtent.width;
         params[5] = copyExtent.height;
@@ -1076,7 +1170,7 @@
         params[9] = rowsPerImage;
         params[10] = shaderStartOffset;
 
-        // These params are only used for R8Snorm and R8Snorm
+        // These params are only used for formats smaller than 4 bytes
         params[11] = (shaderStartOffset % 4) / bytesPerTexel;  // shift
 
         params[16] = bytesPerTexel;
diff --git a/src/dawn/tests/end2end/CopyTests.cpp b/src/dawn/tests/end2end/CopyTests.cpp
index 5a913ac..25c4669 100644
--- a/src/dawn/tests/end2end/CopyTests.cpp
+++ b/src/dawn/tests/end2end/CopyTests.cpp
@@ -427,8 +427,14 @@
 
             if (useMappableBuffer) {
                 const auto* mappedPtr = static_cast<const uint8_t*>(buffer.GetConstMappedRange());
-                EXPECT_EQ(memcmp(mappedPtr + bufferOffset, expected.data(), expected.size()), 0)
-                    << errorMsgSs.str();
+                for (size_t i = 0; i < expected.size(); ++i) {
+                    if (mappedPtr[bufferOffset + i] != expected[i]) {
+                        EXPECT_EQ(mappedPtr[bufferOffset + i], expected[i])
+                            << "with i=" << i << "\n"
+                            << errorMsgSs.str();
+                        break;
+                    }
+                }
             } else {
                 EXPECT_BUFFER_U8_RANGE_EQ(reinterpret_cast<const uint8_t*>(expected.data()), buffer,
                                           bufferOffset, expected.size())
@@ -1726,6 +1732,10 @@
                             wgpu::TextureFormat::R16Sint,
                             wgpu::TextureFormat::R16Float,
 
+                            wgpu::TextureFormat::RG16Uint,
+                            wgpu::TextureFormat::RG16Sint,
+                            wgpu::TextureFormat::RG16Float,
+
                             wgpu::TextureFormat::R32Uint,
                             wgpu::TextureFormat::R32Sint,
                             wgpu::TextureFormat::R32Float,