Support R16, RG16, RGBA16, R32 Float in T2B compute shader
Bug: 348654098
Change-Id: I4aca6af8c43fdc29c53c2c5a64d5dd837d2568e8
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/195534
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Quyen Le <lehoangquyen@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn/native/BlitTextureToBuffer.cpp b/src/dawn/native/BlitTextureToBuffer.cpp
index 0276475..f49666b 100644
--- a/src/dawn/native/BlitTextureToBuffer.cpp
+++ b/src/dawn/native/BlitTextureToBuffer.cpp
@@ -27,6 +27,7 @@
#include "dawn/native/BlitTextureToBuffer.h"
+#include <algorithm>
#include <array>
#include <string>
#include <string_view>
@@ -166,6 +167,12 @@
}
)";
+constexpr std::string_view kEncodeRG16FloatInU32 = R"(
+fn encodeVectorInU32General(v: vec2f) -> u32 {
+ return pack2x16float(v);
+}
+)";
+
// Each thread is responsible for reading (packTexelCount) texel and packing them into a 4-byte u32.
constexpr std::string_view kCommonHead = R"(
struct Params {
@@ -194,6 +201,9 @@
override workgroupSizeX: u32;
override workgroupSizeY: u32;
+// Size of one element in the destination buffer this thread will write to.
+override gOutputUnitSize: u32;
+
@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn main
(@builtin(global_invocation_id) id : vec3u) {
)";
@@ -205,8 +215,8 @@
return;
}
-let indicesPerRow = params.bytesPerRow / 4;
-let indicesOffset = params.offset / 4;
+let indicesPerRow = params.bytesPerRow / gOutputUnitSize;
+let indicesOffset = params.offset / gOutputUnitSize;
let dstOffset = indicesOffset + id.x + id.y * indicesPerRow + id.z * indicesPerRow * params.rowsPerImage;
)";
@@ -265,10 +275,10 @@
return;
}
-let byteOffset = params.offset + id.x * 4
+let byteOffset = params.offset + id.x * gOutputUnitSize
+ id.y * params.bytesPerRow
+ id.z * params.bytesPerRow * params.rowsPerImage;
-let dstOffset = byteOffset / 4;
+let dstOffset = byteOffset / gOutputUnitSize;
let srcBoundary = params.srcOrigin + params.srcExtent;
// Start coord, End coord
@@ -516,6 +526,54 @@
}
)";
+// R16: texelByte = 2; each thread reads 1 ~ 2 texels.
+// General packing algorithm is similar to kPackRG8ToU32.
+constexpr std::string_view kPackR16ToU32 = R"(
+// Result bits to store into dst_buf
+var result: u32 = 0u;
+// Holds up to two texel values that are
+// later encoded into a u32 via encodeVectorInU32General.
+var v: vec2f;
+
+// dstBuf value is used for starting part.
+var mask: u32 = 0xffffffffu;
+if (!readDstBufAtStart) {
+ // coordS is used
+ mask &= 0xffff0000u;
+ let texel0 = textureLoadGeneral(src_tex, coordS, params.mipLevel).r;
+ v[0] = texel0;
+}
+
+if (coordE.x < srcBoundary.x) {
+ // coordE is used
+ mask &= 0x0000ffffu;
+ let texel1 = textureLoadGeneral(src_tex, coordE, params.mipLevel).r;
+ v[1] = texel1;
+}
+
+if (readDstBufAtStart || readDstBufAtEnd) {
+ let original: u32 = dst_buf[dstOffset];
+ result = (original & mask) | (encodeVectorInU32General(v) & ~mask);
+} else {
+ result = encodeVectorInU32General(v);
+}
+)";
+
+constexpr std::string_view kPackRG16ToU32 = R"(
+ let v: vec2f = textureLoadGeneral(src_tex, coord0, params.mipLevel).rg;
+ let result = encodeVectorInU32General(v);
+)";
+
+// Load RGBA16 and pack to 2 u32 values
+constexpr std::string_view kLoadRGBA16ToU32 = R"(
+ let v: vec4f = textureLoadGeneral(src_tex, coord0, params.mipLevel);
+ // dstOffset is in 8-byte (one texel) units; multiply by 2 to get the u32 array index.
+ let uintOffset = dstOffset << 1;
+ dst_buf[uintOffset] = encodeVectorInU32General(v.rg);
+ dst_buf[uintOffset + 1] = encodeVectorInU32General(v.ba);
+}
+)";
+
// ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float for GL
// so we should not use it generally for the emulation.
// As a result we are using f32 and array<u32> to do all the math and byte manipulation.
@@ -597,9 +655,9 @@
}
)";
-// Directly loading depth32float values into dst_buf
+// Directly loading R32Float values into dst_buf
// No bit manipulation and packing is needed.
-constexpr std::string_view kLoadDepth32Float = R"(
+constexpr std::string_view kLoadR32Float = R"(
dst_buf[dstOffset] = textureLoadGeneral(src_tex, coord0, params.mipLevel).r;
}
)";
@@ -718,6 +776,31 @@
shader += kCommonEnd;
textureSampleType = wgpu::TextureSampleType::Float;
break;
+ case wgpu::TextureFormat::R16Float:
+ case wgpu::TextureFormat::RG16Float:
+ AppendFloatTextureHead();
+ shader += kDstBufferU32;
+ shader += kEncodeRG16FloatInU32;
+ shader += kCommonHead;
+ if (format.format == wgpu::TextureFormat::R16Float) {
+ shader += kNonMultipleOf4OffsetStart;
+ shader += kPackR16ToU32;
+ } else {
+ shader += kCommonStart;
+ shader += kPackRG16ToU32;
+ }
+ shader += kCommonEnd;
+ textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
+ break;
+ case wgpu::TextureFormat::RGBA16Float:
+ AppendFloatTextureHead();
+ shader += kDstBufferU32;
+ shader += kEncodeRG16FloatInU32;
+ shader += kCommonHead;
+ shader += kCommonStart;
+ shader += kLoadRGBA16ToU32;
+ textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
+ break;
case wgpu::TextureFormat::Depth16Unorm:
AppendFloatTextureHead();
shader += kDstBufferU32;
@@ -728,11 +811,12 @@
textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
break;
case wgpu::TextureFormat::Depth32Float:
+ case wgpu::TextureFormat::R32Float:
AppendFloatTextureHead();
shader += kDstBufferF32;
shader += kCommonHead;
shader += kCommonStart;
- shader += kLoadDepth32Float;
+ shader += kLoadR32Float;
textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
break;
case wgpu::TextureFormat::Stencil8:
@@ -756,7 +840,7 @@
shader += kDstBufferF32;
shader += kCommonHead;
shader += kCommonStart;
- shader += kLoadDepth32Float;
+ shader += kLoadR32Float;
textureSampleType = wgpu::TextureSampleType::UnfilterableFloat;
break;
case Aspect::Stencil:
@@ -821,11 +905,16 @@
computePipelineDescriptor.compute.module = shaderModule.Get();
computePipelineDescriptor.compute.entryPoint = "main";
+ const uint32_t bytesPerTexel = format.GetAspectInfo(src.aspect).block.byteSize;
+// Size of one unit for a thread to write to. For formats smaller than 4 bytes, we always
+// write 4 bytes at a time.
+ const uint32_t ouputUnitSize = std::max(bytesPerTexel, 4u);
const uint32_t adjustedWorkGroupSizeY =
(viewDimension == wgpu::TextureViewDimension::e1D) ? 1 : kWorkgroupSizeY;
- const std::array<ConstantEntry, 2> constants = {{
+ const std::array<ConstantEntry, 3> constants = {{
{nullptr, "workgroupSizeX", kWorkgroupSizeX},
{nullptr, "workgroupSizeY", static_cast<double>(adjustedWorkGroupSizeY)},
+ {nullptr, "gOutputUnitSize", static_cast<double>(ouputUnitSize)},
}};
computePipelineDescriptor.compute.constantCount = constants.size();
computePipelineDescriptor.compute.constants = constants.data();
@@ -851,6 +940,10 @@
case wgpu::TextureFormat::RGBA8Unorm:
case wgpu::TextureFormat::BGRA8Unorm:
case wgpu::TextureFormat::RGB9E5Ufloat:
+ case wgpu::TextureFormat::R16Float:
+ case wgpu::TextureFormat::RG16Float:
+ case wgpu::TextureFormat::RGBA16Float:
+ case wgpu::TextureFormat::R32Float:
case wgpu::TextureFormat::Depth16Unorm:
case wgpu::TextureFormat::Depth32Float:
case wgpu::TextureFormat::Stencil8:
@@ -946,6 +1039,7 @@
Align(copyExtent.width, 2 * kWorkgroupSizeX) / (2 * kWorkgroupSizeX);
break;
case 4:
+ case 8:
workgroupCountX = Align(copyExtent.width, kWorkgroupSizeX) / kWorkgroupSizeX;
break;
default:
@@ -1064,7 +1158,7 @@
// packTexelCount: number of texel values (1, 2, or 4) one thread packs into the dst
// buffer
- params[3] = 4 / bytesPerTexel;
+ params[3] = std::max(1u, 4 / bytesPerTexel);
// srcExtent: vec3u
params[4] = copyExtent.width;
params[5] = copyExtent.height;
@@ -1076,7 +1170,7 @@
params[9] = rowsPerImage;
params[10] = shaderStartOffset;
- // These params are only used for R8Snorm and R8Snorm
+ // These params are only used for formats smaller than 4 bytes
params[11] = (shaderStartOffset % 4) / bytesPerTexel; // shift
params[16] = bytesPerTexel;
diff --git a/src/dawn/tests/end2end/CopyTests.cpp b/src/dawn/tests/end2end/CopyTests.cpp
index 5a913ac..25c4669 100644
--- a/src/dawn/tests/end2end/CopyTests.cpp
+++ b/src/dawn/tests/end2end/CopyTests.cpp
@@ -427,8 +427,14 @@
if (useMappableBuffer) {
const auto* mappedPtr = static_cast<const uint8_t*>(buffer.GetConstMappedRange());
- EXPECT_EQ(memcmp(mappedPtr + bufferOffset, expected.data(), expected.size()), 0)
- << errorMsgSs.str();
+ for (size_t i = 0; i < expected.size(); ++i) {
+ if (mappedPtr[bufferOffset + i] != expected[i]) {
+ EXPECT_EQ(mappedPtr[bufferOffset + i], expected[i])
+ << "with i=" << i << "\n"
+ << errorMsgSs.str();
+ break;
+ }
+ }
} else {
EXPECT_BUFFER_U8_RANGE_EQ(reinterpret_cast<const uint8_t*>(expected.data()), buffer,
bufferOffset, expected.size())
@@ -1726,6 +1732,10 @@
wgpu::TextureFormat::R16Sint,
wgpu::TextureFormat::R16Float,
+ wgpu::TextureFormat::RG16Uint,
+ wgpu::TextureFormat::RG16Sint,
+ wgpu::TextureFormat::RG16Float,
+
wgpu::TextureFormat::R32Uint,
wgpu::TextureFormat::R32Sint,
wgpu::TextureFormat::R32Float,