| // Copyright 2023 The Dawn & Tint Authors |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // 1. Redistributions of source code must retain the above copyright notice, this |
| // list of conditions and the following disclaimer. |
| // |
| // 2. Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // |
| // 3. Neither the name of the copyright holder nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
| // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "dawn/native/BlitTextureToBuffer.h" |
| |
| #include <array> |
| #include <string> |
| #include <string_view> |
| #include <utility> |
| |
| #include "dawn/common/Assert.h" |
| #include "dawn/native/BindGroup.h" |
| #include "dawn/native/CommandBuffer.h" |
| #include "dawn/native/CommandEncoder.h" |
| #include "dawn/native/ComputePassEncoder.h" |
| #include "dawn/native/ComputePipeline.h" |
| #include "dawn/native/Device.h" |
| #include "dawn/native/InternalPipelineStore.h" |
| #include "dawn/native/PhysicalDevice.h" |
| #include "dawn/native/Queue.h" |
| #include "dawn/native/Sampler.h" |
| #include "dawn/native/utils/WGPUHelpers.h" |
| |
| namespace dawn::native { |
| |
| namespace { |
| |
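| // Workgroup dimensions of the blit compute shader. The Y dimension is overridden to 1 for |
| // 1D textures (see adjustedWorkGroupSizeY below). |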
| constexpr uint32_t kWorkgroupSizeX = 8; |
| constexpr uint32_t kWorkgroupSizeY = 8; |
| |
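| // The destination buffer is bound as array<u32> so that texel formats narrower than 4 bytes |
| // can be packed into whole 32-bit words before being written. |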
| constexpr std::string_view kDstBufferU32 = R"( |
| @group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>; |
| )"; |
| |
| // For Depth32Float we can directly use f32 as the buffer array element type since no packing is needed. |
| constexpr std::string_view kDstBufferF32 = R"( |
| @group(0) @binding(1) var<storage, read_write> dst_buf : array<f32>; |
| )"; |
| |
| constexpr std::string_view kFloatTexture1D = R"( |
| fn textureLoadGeneral(tex: texture_1d<f32>, coords: vec3u, level: u32) -> vec4<f32> { |
| return textureLoad(tex, coords.x, level); |
| } |
| @group(0) @binding(0) var src_tex : texture_1d<f32>; |
| )"; |
| |
| constexpr std::string_view kFloatTexture2D = R"( |
| fn textureLoadGeneral(tex: texture_2d<f32>, coords: vec3u, level: u32) -> vec4<f32> { |
| return textureLoad(tex, coords.xy, level); |
| } |
| @group(0) @binding(0) var src_tex : texture_2d<f32>; |
| )"; |
| |
| constexpr std::string_view kFloatTexture2DArray = R"( |
| fn textureLoadGeneral(tex: texture_2d_array<f32>, coords: vec3u, level: u32) -> vec4<f32> { |
| return textureLoad(tex, coords.xy, coords.z, level); |
| } |
| @group(0) @binding(0) var src_tex : texture_2d_array<f32>; |
| )"; |
| |
| constexpr std::string_view kFloatTexture3D = R"( |
| fn textureLoadGeneral(tex: texture_3d<f32>, coords: vec3u, level: u32) -> vec4<f32> { |
| return textureLoad(tex, coords, level); |
| } |
| @group(0) @binding(0) var src_tex : texture_3d<f32>; |
| )"; |
| |
| // Cube map reference: https://en.wikipedia.org/wiki/Cube_mapping |
| // Function converting an integer texel coordinate into the direction vector used to sample a cube texture. |
| constexpr std::string_view kCubeCoordCommon = R"( |
| fn coordToCubeSampleST(coords: vec3u, size: vec3u) -> vec3<f32> { |
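| // Shift to the texel center, normalize to [0, 1], flip Y, then remap to [-1, 1] before |
| // selecting the cube face direction below. |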
| var st = (vec2f(coords.xy) + vec2f(0.5, 0.5)) / vec2f(size.xy); |
| st.y = 1. - st.y; |
| st = st * 2. - 1.; |
| var sample_coords: vec3f; |
| switch(coords.z) { |
| case 0: { sample_coords = vec3f(1., st.y, -st.x); } // Positive X |
| case 1: { sample_coords = vec3f(-1., st.y, st.x); } // Negative X |
| case 2: { sample_coords = vec3f(st.x, 1., -st.y); } // Positive Y |
| case 3: { sample_coords = vec3f(st.x, -1., st.y); } // Negative Y |
| case 4: { sample_coords = vec3f(st.x, st.y, 1.); } // Positive Z |
| case 5: { sample_coords = vec3f(-st.x, st.y, -1.);} // Negative Z |
| default: { return vec3f(0.); } // Unreachable |
| } |
| return sample_coords; |
| } |
| )"; |
| |
| constexpr std::string_view kFloatTextureCube = R"( |
| @group(1) @binding(0) var default_sampler: sampler; |
| fn textureLoadGeneral(tex: texture_cube<f32>, coords: vec3u, level: u32) -> vec4<f32> { |
| let sample_coords = coordToCubeSampleST(coords, params.levelSize); |
| return textureSampleLevel(tex, default_sampler, sample_coords, f32(level)); |
| } |
| @group(0) @binding(0) var src_tex : texture_cube<f32>; |
| )"; |
| |
| constexpr std::string_view kUintTexture = R"( |
| fn textureLoadGeneral(tex: texture_2d<u32>, coords: vec3u, level: u32) -> vec4<u32> { |
| return textureLoad(tex, coords.xy, level); |
| } |
| @group(0) @binding(0) var src_tex : texture_2d<u32>; |
| )"; |
| |
| constexpr std::string_view kUintTextureArray = R"( |
| fn textureLoadGeneral(tex: texture_2d_array<u32>, coords: vec3u, level: u32) -> vec4<u32> { |
| return textureLoad(tex, coords.xy, coords.z, level); |
| } |
| @group(0) @binding(0) var src_tex : texture_2d_array<u32>; |
| )"; |
| |
| constexpr std::string_view kUintTextureCube = R"( |
| @group(1) @binding(0) var default_sampler: sampler; |
| fn textureLoadGeneral(tex: texture_cube<u32>, coords: vec3u, level: u32) -> vec4<u32> { |
| let sample_coords = coordToCubeSampleST(coords, params.levelSize); |
| return textureSampleLevel(tex, default_sampler, sample_coords, f32(level)); |
| } |
| @group(0) @binding(0) var src_tex : texture_cube<u32>; |
| )"; |
| |
| // Each thread is responsible for reading packTexelCount texels and packing them into a 4-byte u32. |
| constexpr std::string_view kCommonHead = R"( |
| struct Params { |
| // Copy source origin, extent, and mip level |
| srcOrigin: vec3u, |
| // How many texel values one thread needs to pack (1, 2, or 4) |
| packTexelCount: u32, |
| srcExtent: vec3u, |
| mipLevel: u32, |
| // GPUImageDataLayout |
| bytesPerRow: u32, |
| rowsPerImage: u32, |
| offset: u32, |
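| // Texel shift caused by a non-4-byte-aligned offset: (offset % 4) / texelSize. |
| // Only used by the R8Snorm / RG8Snorm path. |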
| shift: u32, |
| // Used for cube sample |
| levelSize: vec3u, |
| pad0: u32, |
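| // The remaining fields are only used by the R8Snorm / RG8Snorm path. |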
| texelSize: u32, |
| numU32PerRowNeedsWriting: u32, |
| readPreviousRow: u32, |
| isCompactImage: u32, |
| }; |
| |
| @group(0) @binding(2) var<uniform> params : Params; |
| |
| override workgroupSizeX: u32; |
| override workgroupSizeY: u32; |
| |
| @compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn main |
| (@builtin(global_invocation_id) id : vec3u) { |
| )"; |
| |
| constexpr std::string_view kCommonStart = R"( |
| let srcBoundary = params.srcOrigin + params.srcExtent; |
| let coord0 = vec3u(id.x * params.packTexelCount, id.y, id.z) + params.srcOrigin; |
| if (any(coord0 >= srcBoundary)) { |
| return; |
| } |
| |
| let indicesPerRow = params.bytesPerRow / 4; |
| let indicesOffset = params.offset / 4; |
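| // Each thread writes one 4-byte element of dst_buf. For example, with offset = 8, |
| // bytesPerRow = 256 and rowsPerImage = 4, the thread with id = (3, 1, 2) writes |
| // dst_buf[2 + 3 + 1 * 64 + 2 * 64 * 4] = dst_buf[581]. |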
| let dstOffset = indicesOffset + id.x + id.y * indicesPerRow + id.z * indicesPerRow * params.rowsPerImage; |
| )"; |
| |
| constexpr std::string_view kCommonEnd = R"( |
| dst_buf[dstOffset] = result; |
| } |
| )"; |
| |
| constexpr std::string_view kPackStencil8ToU32 = R"( |
| // Storing stencil8 texel values |
| var result: u32 = 0xff & textureLoadGeneral(src_tex, coord0, params.mipLevel).r; |
| |
| if (coord0.x + 4u <= srcBoundary.x) { |
| // All 4 texels for this thread are within texture bounds. |
| for (var i = 1u; i < 4u; i += 1u) { |
| let coordi = coord0 + vec3u(i, 0, 0); |
| let ri = 0xff & textureLoadGeneral(src_tex, coordi, params.mipLevel).r; |
| result |= ri << (i * 8u); |
| } |
| } else { |
| // Otherwise, srcExtent.x is not a multiple of 4 and this thread is at the right edge of the copy region. |
| // To preserve the original buffer content, we need to read the existing value from the buffer and pack it together with the texel values. |
| let original: u32 = dst_buf[dstOffset]; |
| result |= original & 0xffffff00; |
| |
| for (var i = 1u; i < 4u; i += 1u) { |
| let coordi = coord0 + vec3u(i, 0, 0); |
| if (coordi.x >= srcBoundary.x) { |
| break; |
| } |
| let ri = 0xff & textureLoadGeneral(src_tex, coordi, params.mipLevel).r; |
| result |= ri << (i * 8u); |
| } |
| } |
| )"; |
| |
| // R8Snorm and RG8Snorm color format T2B copies don't require the offset to be a multiple of 4 bytes, |
| // which makes them more complicated to handle than the other formats. |
| // TODO(dawn:1886): potentially separate the "middle of the image" case |
| // and the "on the edge" case into different shaders and passes for better performance. |
| constexpr std::string_view kNonMultipleOf4OffsetStart = R"( |
| let readPreviousRow: bool = params.readPreviousRow == 1; |
| let isCompactImage: bool = params.isCompactImage == 1; |
| let idBoundary = vec3u(params.numU32PerRowNeedsWriting |
| - select(1u, 0u, |
| params.shift == 0 || |
| // one more thread at end of row |
| !readPreviousRow || |
| // one more thread at end of image |
| (!isCompactImage && id.y == params.srcExtent.y - 1) || |
| // one more thread at end of buffer |
| (id.y == params.srcExtent.y - 1 && id.z == params.srcExtent.z - 1) |
| ) |
| , params.srcExtent.y, params.srcExtent.z); |
| if (any(id >= idBoundary)) { |
| return; |
| } |
| |
| let byteOffset = params.offset + id.x * 4 |
| + id.y * params.bytesPerRow |
| + id.z * params.bytesPerRow * params.rowsPerImage; |
| let dstOffset = byteOffset / 4; |
| let srcBoundary = params.srcOrigin + params.srcExtent; |
| |
| // Start coord, End coord |
| var coordS = vec3u(id.x * params.packTexelCount, id.y, id.z) + params.srcOrigin; |
| var coordE = coordS; |
| coordE.x += params.packTexelCount - 1; |
| |
| if (params.shift > 0) { |
| // Adjust coordS |
| if (id.x == 0) { |
| // Front of a row |
| if (readPreviousRow) { |
| // Needs reading from previous row |
| coordS.x += params.bytesPerRow / params.texelSize - params.shift; |
| if (id.y == 0) { |
| // Front of a layer |
| if (isCompactImage) { |
| // Needs reading from previous layer |
| coordS.y += params.srcExtent.y - 1; |
| if (id.z > 0) { |
| coordS.z -= 1; |
| } |
| } |
| } else { |
| coordS.y -= 1; |
| } |
| } |
| } else { |
| coordS.x -= params.shift; |
| } |
| coordE.x -= params.shift; |
| } |
| |
| let readDstBufAtStart: bool = params.shift > 0 && ( |
| all(id == vec3u(0u)) // start of buffer |
| || (id.x == 0 && (!readPreviousRow // start of non-compact row |
| || (id.y == 0 && !isCompactImage) // start of non-compact image |
| ))); |
| let readDstBufAtEnd: bool = coordE.x >= srcBoundary.x; |
| )"; |
| |
| // R8snorm: texelByte = 1; each thread reads 1 ~ 4 texels. |
| // Different scenarios are listed below: |
| // |
| // * In the middle of the row: reads 4 texels |
| // | x | x+1 | x+2 | x+3 | |
| // |
| // * At the edge of the row: when offset % 4 > 0 |
| // - when copyWidth % bytesPerRow == 0 (compact row), reads 4 texels |
| // e.g. offset = 1; copyWidth = 256; |
| // | 255,y-1 | 0,y | 1,y | 2,y | |
| // - when copyWidth % bytesPerRow > 0 || rowsPerImage > copyHeight (sparse row / sparse image) |
| // One more thread is added to the end of each row, |
| // reads 1 ~ 3 texels, reads dst buf values |
| // e.g. offset = 1; copyWidth = 128; mask = 0xffffff00; |
| // | 127,y-1 | b | b | b | |
| // - when copyWidth % bytesPerRow > 0 && copyWidth + offset % 4 > bytesPerRow (special case) |
| // reads 1 ~ 3 texels, reads dst buf values; mask = 0x0000ff00; |
| // e.g. offset = 1; copyWidth = 255; |
| // | 254,y-1 | b | 0,y | 1,y | |
| // |
| // * At the start of the whole copy: |
| // - when offset % 4 == 0, reads 4 texels |
| // - when offset % 4 > 0, reads 1 ~ 3 texels, reads dst buf values |
| // e.g. offset = 1; mask = 0x000000ff; |
| // | b | 0 | 1 | 2 | |
| // e.g. offset = 1, copyWidth = 2; mask = 0xff0000ff; |
| // | b | 0 | 1 | b | |
| // |
| // * At the end of the whole copy: |
| // - reads 1 ~ 4 texels, reads dst buf values; |
| // e.g. offset = 0; copyWidth = 256; |
| // | 252 | 253 | 254 | 255 | |
| // e.g. offset = 1; copyWidth = 256; mask = 0xffffff00; |
| // | 255 | b | b | b | |
| |
| constexpr std::string_view kPackR8SnormToU32 = R"( |
| // Result bits to store into dst_buf |
| var result: u32 = 0u; |
| // snorm8 texel values, later converted to a u32 with pack4x8snorm. |
| var v: vec4<f32>; |
| |
| // The dst_buf value is used for the leading bytes. |
| var mask: u32 = 0xffffffffu; |
| if (!readDstBufAtStart) { |
| // coordS is used |
| mask &= 0xffffff00u; |
| v[0] = textureLoadGeneral(src_tex, coordS, params.mipLevel).r; |
| } else { |
| // start of buffer, boundary check |
| if (coordE.x >= 1) { |
| if (coordE.x - 1 < srcBoundary.x) { |
| mask &= 0xff00ffffu; |
| v[2] = textureLoadGeneral(src_tex, coordE - vec3u(1, 0, 0), params.mipLevel).r; |
| } |
| |
| if (coordE.x >= 2) { |
| if (coordE.x - 2 < srcBoundary.x) { |
| mask &= 0xffff00ffu; |
| v[1] = textureLoadGeneral(src_tex, coordE - vec3u(2, 0, 0), params.mipLevel).r; |
| } |
| |
| if (coordE.x >= 3) { |
| if (coordE.x - 3 < srcBoundary.x) { |
| mask &= 0xffffff00u; |
| v[0] = textureLoadGeneral(src_tex, coordE - vec3u(3, 0, 0), params.mipLevel).r; |
| } |
| } |
| } |
| } |
| } |
| |
| if (coordE.x < srcBoundary.x) { |
| mask &= 0x00ffffffu; |
| v[3] = textureLoadGeneral(src_tex, coordE, params.mipLevel).r; |
| } else { |
| // coordE is not used |
| // dstBuf value is used for later part. |
| // end of buffer (last thread) / end of non-compact row + x boundary check |
| if (coordE.x - 2 < srcBoundary.x) { |
| mask &= 0xffff00ffu; |
| v[1] = textureLoadGeneral(src_tex, coordE - vec3u(2, 0, 0), params.mipLevel).r; |
| if (coordE.x - 1 < srcBoundary.x) { |
| mask &= 0xff00ffffu; |
| v[2] = textureLoadGeneral(src_tex, coordE - vec3u(1, 0, 0), params.mipLevel).r; |
| } |
| } |
| } |
| |
| if (readDstBufAtStart || readDstBufAtEnd) { |
| let original: u32 = dst_buf[dstOffset]; |
| result = (original & mask) | (pack4x8snorm(v) & ~mask); |
| } else { |
| var coord1: vec3u; |
| var coord2: vec3u; |
| if (coordS.x < coordE.x) { |
| // middle of row |
| coord1 = coordE - vec3u(2, 0, 0); |
| coord2 = coordE - vec3u(1, 0, 0); |
| } else { |
| // start of row |
| switch params.shift { |
| case 0: { |
| coord1 = coordS + vec3u(1, 0, 0); |
| coord2 = coordS + vec3u(2, 0, 0); |
| } |
| case 1: { |
| coord1 = coordE - vec3u(2, 0, 0); |
| coord2 = coordE - vec3u(1, 0, 0); |
| } |
| case 2: { |
| coord1 = coordS + vec3u(1, 0, 0); |
| coord2 = coordE - vec3u(1, 0, 0); |
| } |
| case 3: { |
| coord1 = coordS + vec3u(1, 0, 0); |
| coord2 = coordS + vec3u(2, 0, 0); |
| } |
| default: { |
| return; // unreachable: shift is always in [0, 3] |
| } |
| } |
| } |
| |
| if (coord1.x < srcBoundary.x) { |
| mask &= 0xffff00ffu; |
| v[1] = textureLoadGeneral(src_tex, coord1, params.mipLevel).r; |
| } |
| if (coord2.x < srcBoundary.x) { |
| mask &= 0xff00ffffu; |
| v[2] = textureLoadGeneral(src_tex, coord2, params.mipLevel).r; |
| } |
| |
| let readDstBufAtMid: bool = (params.srcExtent.x + params.shift > params.bytesPerRow) |
| && (params.srcExtent.x < params.bytesPerRow); |
| if (readDstBufAtMid && id.x == 0) { |
| let original: u32 = dst_buf[dstOffset]; |
| result = (original & mask) | (pack4x8snorm(v) & ~mask); |
| } else { |
| result = pack4x8snorm(v); |
| } |
| } |
| )"; |
| |
| // RG8snorm: texelByte = 2; each thread reads 1 ~ 2 texels. |
| // Different scenarios are listed below: |
| // |
| // * In the middle of the row: reads 2 texels |
| // | x | x+1 | |
| // |
| // * At the edge of the row: when offset % 4 > 0 |
| // - when copyWidth % bytesPerRow == 0 (compact row), reads 2 texels |
| // e.g. offset = 2; copyWidth = 128; |
| // | 127,y-1 | 0,y | |
| // - when copyWidth % bytesPerRow > 0 || rowsPerImage > copyHeight (sparse row / sparse image) |
| // One more thread is added to the end of each row, |
| // reads 1 texel, reads dst buf values |
| // e.g. offset = 1; copyWidth = 64; mask = 0xffff0000; |
| // | 63,y-1 | b | |
| // |
| // * At the start of the whole copy: |
| // - when offset % 4 == 0, reads 2 texels |
| // - when offset % 4 > 0, reads 1 texel, reads dst buf values |
| // e.g. offset = 2; mask = 0x0000ffff; |
| // | b | 0 | |
| // |
| // * At the end of the whole copy: |
| // - reads 1 ~ 2 texels, reads dst buf values; |
| // e.g. offset = 0; copyWidth = 128; |
| // | 126 | 127 | |
| // e.g. offset = 1; copyWidth = 128; mask = 0xffff0000; |
| // | 127 | b | |
| |
| constexpr std::string_view kPackRG8SnormToU32 = R"( |
| // Result bits to store into dst_buf |
| var result: u32 = 0u; |
| // snorm8 texel values, later converted to a u32 with pack4x8snorm. |
| var v: vec4<f32>; |
| |
| // The dst_buf value is used for the leading bytes. |
| var mask: u32 = 0xffffffffu; |
| if (!readDstBufAtStart) { |
| // coordS is used |
| mask &= 0xffff0000u; |
| let texel0 = textureLoadGeneral(src_tex, coordS, params.mipLevel).rg; |
| v[0] = texel0.r; |
| v[1] = texel0.g; |
| } |
| |
| if (coordE.x < srcBoundary.x) { |
| // coordE is used |
| mask &= 0x0000ffffu; |
| let texel1 = textureLoadGeneral(src_tex, coordE, params.mipLevel).rg; |
| v[2] = texel1.r; |
| v[3] = texel1.g; |
| } |
| |
| if (readDstBufAtStart || readDstBufAtEnd) { |
| let original: u32 = dst_buf[dstOffset]; |
| result = (original & mask) | (pack4x8snorm(v) & ~mask); |
| } else { |
| result = pack4x8snorm(v); |
| } |
| )"; |
| |
| // The ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float on GL, |
| // so we should not rely on it for this emulation. |
| // As a result we use f32 and array<u32> to do all the math and byte manipulation. |
| // With a 2-byte scalar type (f16, u16) writing to the storage buffer would be simpler. |
| constexpr std::string_view kPackDepth16UnormToU32 = R"( |
| // Result bits to store into dst_buf |
| var result: u32 = 0u; |
| // depth16unorm texel values, later converted to a u32 with pack2x16unorm. |
| var v: vec2<f32>; |
| v[0] = textureLoadGeneral(src_tex, coord0, params.mipLevel).r; |
| |
| let coord1 = coord0 + vec3u(1, 0, 0); |
| if (coord1.x < srcBoundary.x) { |
| // Make sure coord1 is still within the copy boundary. |
| v[1] = textureLoadGeneral(src_tex, coord1, params.mipLevel).r; |
| result = pack2x16unorm(v); |
| } else { |
| // Otherwise, srcExtent.x is not a multiple of 2 and this thread is at the right edge of the copy region. |
| // To preserve the original buffer content, we need to read the existing value from the buffer and pack it together with the texel value. |
| // TODO(dawn:1782): profiling against making a separate pass for this edge case |
| // as it requires reading from dst_buf. |
| let original: u32 = dst_buf[dstOffset]; |
| const mask = 0xffff0000u; |
| result = (original & mask) | (pack2x16unorm(v) & ~mask); |
| } |
| )"; |
| |
| // Load the snorm8 texel values and convert them to a u32 with pack4x8snorm. |
| constexpr std::string_view kPackRGBA8SnormToU32 = R"( |
| let v = textureLoadGeneral(src_tex, coord0, params.mipLevel); |
| let result: u32 = pack4x8snorm(v); |
| )"; |
| |
| // Swizzle the bgra8unorm texel values and convert them to a u32 with pack4x8unorm. |
| constexpr std::string_view kPackBGRA8UnormToU32 = R"( |
| var v: vec4<f32>; |
| |
| let texel0 = textureLoadGeneral(src_tex, coord0, params.mipLevel); |
| v = texel0.bgra; |
| |
| let result: u32 = pack4x8unorm(v); |
| )"; |
| |
| // Storing rgb9e5ufloat texel values |
| // In this format a float is represented as |
| // 2^(exponent - bias) * (mantissa / 2^numMantissaBits) |
| // Packing algorithm is from: |
| // https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt |
| // |
| // Note: multiple bit patterns can represent the same value in this format. |
| // e.g. |
| // 0x0a090807 and 0x0412100e both unpack to |
| // [8.344650268554688e-7, 0.000015735626220703125, 0.000015497207641601562] |
| // So the bytes copied via blit could be different. |
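| // As a worked example of the packing below: v = (1.0, 0.0, 0.0) gives exp_shared = 16 and a red |
| // mantissa of 256, i.e. 0x80000100, which decodes back to 256 / 512 * 2^(16 - 15) = 1.0. |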
| constexpr std::string_view kPackRGB9E5UfloatToU32 = R"( |
| let v = textureLoadGeneral(src_tex, coord0, params.mipLevel); |
| |
| const n = 9; // number of mantissa bits |
| const e_max = 31; // max exponent |
| const b = 15; // exponent bias |
| const sharedexp_max: f32 = (f32((1 << n) - 1) / f32(1 << n)) * (1 << (e_max - b)); |
| |
| let red_c = clamp(v.r, 0.0, sharedexp_max); |
| let green_c = clamp(v.g, 0.0, sharedexp_max); |
| let blue_c = clamp(v.b, 0.0, sharedexp_max); |
| |
| let max_c = max(max(red_c, green_c), blue_c); |
| let exp_shared_p: i32 = max(-b - 1, i32(floor(log2(max_c)))) + 1 + b; |
| let max_s = u32(floor(max_c / exp2(f32(exp_shared_p - b - n)) + 0.5)); |
| var exp_shared = exp_shared_p; |
| if (max_s == (1 << n)) { |
| exp_shared += 1; |
| } |
| |
| let scalar = 1.0 / exp2(f32(exp_shared - b - n)); |
| let red_s = u32(red_c * scalar + 0.5); |
| let green_s = u32(green_c * scalar + 0.5); |
| let blue_s = u32(blue_c * scalar + 0.5); |
| |
| const mask_9 = 0x1ffu; |
| let result = (u32(exp_shared) << 27u) | |
| ((blue_s & mask_9) << 18u) | |
| ((green_s & mask_9) << 9u) | |
| (red_s & mask_9); |
| )"; |
| |
| // Directly load depth32float values into dst_buf. |
| // No bit manipulation or packing is needed. |
| constexpr std::string_view kLoadDepth32Float = R"( |
| dst_buf[dstOffset] = textureLoadGeneral(src_tex, coord0, params.mipLevel).r; |
| } |
| )"; |
| |
| ResultOrError<Ref<ComputePipelineBase>> GetOrCreateTextureToBufferPipeline( |
| DeviceBase* device, |
| const TextureCopy& src, |
| wgpu::TextureViewDimension viewDimension) { |
| InternalPipelineStore* store = device->GetInternalPipelineStore(); |
| |
| const Format& format = src.texture->GetFormat(); |
| |
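| // Pipelines are cached in the internal pipeline store, keyed by (format, view dimension). |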
| auto iter = store->blitTextureToBufferComputePipelines.find({format.format, viewDimension}); |
| if (iter != store->blitTextureToBufferComputePipelines.end()) { |
| return iter->second; |
| } |
| |
| ShaderModuleWGSLDescriptor wgslDesc = {}; |
| ShaderModuleDescriptor shaderModuleDesc = {}; |
| shaderModuleDesc.nextInChain = &wgslDesc; |
| |
| wgpu::TextureSampleType textureSampleType; |
| std::string shader; |
| |
| auto AppendFloatTextureHead = [&]() { |
| switch (viewDimension) { |
| case wgpu::TextureViewDimension::e1D: |
| shader += kFloatTexture1D; |
| break; |
| case wgpu::TextureViewDimension::e2D: |
| shader += kFloatTexture2D; |
| break; |
| case wgpu::TextureViewDimension::e2DArray: |
| shader += kFloatTexture2DArray; |
| break; |
| case wgpu::TextureViewDimension::e3D: |
| shader += kFloatTexture3D; |
| break; |
| case wgpu::TextureViewDimension::Cube: |
| shader += kCubeCoordCommon; |
| shader += kFloatTextureCube; |
| break; |
| default: |
| DAWN_UNREACHABLE(); |
| } |
| }; |
| auto AppendStencilTextureHead = [&]() { |
| switch (viewDimension) { |
| // Stencil formats cannot be used with 1D textures. |
| case wgpu::TextureViewDimension::e2D: |
| shader += kUintTexture; |
| break; |
| case wgpu::TextureViewDimension::e2DArray: |
| shader += kUintTextureArray; |
| break; |
| case wgpu::TextureViewDimension::Cube: |
| shader += kCubeCoordCommon; |
| shader += kUintTextureCube; |
| break; |
| default: |
| DAWN_UNREACHABLE(); |
| } |
| }; |
| |
| switch (format.format) { |
| case wgpu::TextureFormat::R8Snorm: |
| AppendFloatTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kNonMultipleOf4OffsetStart; |
| shader += kPackR8SnormToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Float; |
| break; |
| case wgpu::TextureFormat::RG8Snorm: |
| AppendFloatTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kNonMultipleOf4OffsetStart; |
| shader += kPackRG8SnormToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Float; |
| break; |
| case wgpu::TextureFormat::RGBA8Snorm: |
| AppendFloatTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kPackRGBA8SnormToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Float; |
| break; |
| case wgpu::TextureFormat::BGRA8Unorm: |
| AppendFloatTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kPackBGRA8UnormToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Float; |
| break; |
| case wgpu::TextureFormat::RGB9E5Ufloat: |
| AppendFloatTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kPackRGB9E5UfloatToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Float; |
| break; |
| case wgpu::TextureFormat::Depth16Unorm: |
| AppendFloatTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kPackDepth16UnormToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::UnfilterableFloat; |
| break; |
| case wgpu::TextureFormat::Depth32Float: |
| AppendFloatTextureHead(); |
| shader += kDstBufferF32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kLoadDepth32Float; |
| textureSampleType = wgpu::TextureSampleType::UnfilterableFloat; |
| break; |
| case wgpu::TextureFormat::Stencil8: |
| case wgpu::TextureFormat::Depth24PlusStencil8: |
| // Depth24PlusStencil8 can only be copied with the stencil aspect; this is enforced by validation. |
| AppendStencilTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kPackStencil8ToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Uint; |
| break; |
| case wgpu::TextureFormat::Depth32FloatStencil8: { |
| // Depth32FloatStencil8 is not supported on OpenGL/OpenGLES, where the blit path is |
| // enabled by default, but this case can be hit if the blit path toggle is manually set |
| // on other backends. |
| switch (src.aspect) { |
| case Aspect::Depth: |
| AppendFloatTextureHead(); |
| shader += kDstBufferF32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kLoadDepth32Float; |
| textureSampleType = wgpu::TextureSampleType::UnfilterableFloat; |
| break; |
| case Aspect::Stencil: |
| AppendStencilTextureHead(); |
| shader += kDstBufferU32; |
| shader += kCommonHead; |
| shader += kCommonStart; |
| shader += kPackStencil8ToU32; |
| shader += kCommonEnd; |
| textureSampleType = wgpu::TextureSampleType::Uint; |
| break; |
| default: |
| DAWN_UNREACHABLE(); |
| } |
| break; |
| } |
| default: |
| DAWN_UNREACHABLE(); |
| } |
| |
| wgslDesc.code = shader.c_str(); |
| |
| Ref<ShaderModuleBase> shaderModule; |
| DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc)); |
| |
| Ref<BindGroupLayoutBase> bindGroupLayout0; |
| DAWN_TRY_ASSIGN(bindGroupLayout0, |
| utils::MakeBindGroupLayout( |
| device, |
| { |
| {0, wgpu::ShaderStage::Compute, textureSampleType, viewDimension}, |
| {1, wgpu::ShaderStage::Compute, kInternalStorageBufferBinding}, |
| {2, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform}, |
| }, |
| /* allowInternalBinding */ true)); |
| |
| Ref<PipelineLayoutBase> pipelineLayout; |
| if (viewDimension == wgpu::TextureViewDimension::Cube) { |
| // Cube texture requires an extra sampler to call textureSampleLevel |
| Ref<BindGroupLayoutBase> bindGroupLayout1; |
| DAWN_TRY_ASSIGN(bindGroupLayout1, |
| utils::MakeBindGroupLayout(device, |
| { |
| {0, wgpu::ShaderStage::Compute, |
| wgpu::SamplerBindingType::NonFiltering}, |
| }, |
| /* allowInternalBinding */ true)); |
| |
| std::array<BindGroupLayoutBase*, 2> bindGroupLayouts = {bindGroupLayout0.Get(), |
| bindGroupLayout1.Get()}; |
| |
| PipelineLayoutDescriptor descriptor; |
| descriptor.bindGroupLayoutCount = bindGroupLayouts.size(); |
| descriptor.bindGroupLayouts = bindGroupLayouts.data(); |
| DAWN_TRY_ASSIGN(pipelineLayout, device->CreatePipelineLayout(&descriptor)); |
| } else { |
| DAWN_TRY_ASSIGN(pipelineLayout, utils::MakeBasicPipelineLayout(device, bindGroupLayout0)); |
| } |
| |
| ComputePipelineDescriptor computePipelineDescriptor = {}; |
| computePipelineDescriptor.layout = pipelineLayout.Get(); |
| computePipelineDescriptor.compute.module = shaderModule.Get(); |
| computePipelineDescriptor.compute.entryPoint = "main"; |
| |
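| // A 1D texture only has a single row of texels, so a Y workgroup size of 1 is sufficient. |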
| const uint32_t adjustedWorkGroupSizeY = |
| (viewDimension == wgpu::TextureViewDimension::e1D) ? 1 : kWorkgroupSizeY; |
| const std::array<ConstantEntry, 2> constants = {{ |
| {nullptr, "workgroupSizeX", kWorkgroupSizeX}, |
| {nullptr, "workgroupSizeY", static_cast<double>(adjustedWorkGroupSizeY)}, |
| }}; |
| computePipelineDescriptor.compute.constantCount = constants.size(); |
| computePipelineDescriptor.compute.constants = constants.data(); |
| |
| Ref<ComputePipelineBase> pipeline; |
| DAWN_TRY_ASSIGN(pipeline, device->CreateComputePipeline(&computePipelineDescriptor)); |
| store->blitTextureToBufferComputePipelines.emplace(std::make_pair(format.format, viewDimension), |
| pipeline); |
| return pipeline; |
| } |
| |
| } // anonymous namespace |
| |
| MaybeError BlitTextureToBuffer(DeviceBase* device, |
| CommandEncoder* commandEncoder, |
| const TextureCopy& src, |
| const BufferCopy& dst, |
| const Extent3D& copyExtent) { |
| wgpu::TextureViewDimension textureViewDimension; |
| { |
| if (device->IsCompatibilityMode()) { |
| textureViewDimension = src.texture->GetCompatibilityTextureBindingViewDimension(); |
| } else { |
| wgpu::TextureDimension dimension = src.texture->GetDimension(); |
| switch (dimension) { |
| case wgpu::TextureDimension::Undefined: |
| DAWN_UNREACHABLE(); |
| case wgpu::TextureDimension::e1D: |
| textureViewDimension = wgpu::TextureViewDimension::e1D; |
| break; |
| case wgpu::TextureDimension::e2D: |
| if (src.texture->GetArrayLayers() > 1) { |
| textureViewDimension = wgpu::TextureViewDimension::e2DArray; |
| } else { |
| textureViewDimension = wgpu::TextureViewDimension::e2D; |
| } |
| break; |
| case wgpu::TextureDimension::e3D: |
| textureViewDimension = wgpu::TextureViewDimension::e3D; |
| break; |
| } |
| } |
| } |
| DAWN_ASSERT(textureViewDimension != wgpu::TextureViewDimension::Undefined && |
| textureViewDimension != wgpu::TextureViewDimension::CubeArray); |
| |
| Ref<ComputePipelineBase> pipeline; |
| DAWN_TRY_ASSIGN(pipeline, |
| GetOrCreateTextureToBufferPipeline(device, src, textureViewDimension)); |
| |
| const Format& format = src.texture->GetFormat(); |
| |
| uint32_t bytesPerTexel = format.GetAspectInfo(src.aspect).block.byteSize; |
| uint32_t workgroupCountX = 1; |
| uint32_t workgroupCountY = (textureViewDimension == wgpu::TextureViewDimension::e1D) |
| ? 1 |
| : (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY; |
| uint32_t workgroupCountZ = copyExtent.depthOrArrayLayers; |
| |
| uint32_t numU32PerRowNeedsWriting = 0; |
| bool readPreviousRow = false; |
| if (format.format == wgpu::TextureFormat::R8Snorm || |
| format.format == wgpu::TextureFormat::RG8Snorm) { |
| // Extra bytes contributed by a destination offset that is not 4-byte aligned. |
| uint32_t extraBytes = dst.offset % 4; |
| |
| // Between rows and images: whether the thread at the end of each row needs to read the start of the next row. |
| readPreviousRow = ((copyExtent.width * bytesPerTexel) + extraBytes > dst.bytesPerRow); |
| |
| // numU32PerRowNeedsWriting = bytesPerTexel * copyExtent.width / 4 + (1 or 0) |
| // One more thread is needed when offset % 4 > 0 and the end of the buffer occupies one more |
| // 4-byte word. |
| // e.g. for R8Snorm copyWidth = 256, when offset = 0, 64 u32 needs writing; |
| // when offset = 1, 65 u32 needs writing; |
| // (The first u32 needs to read 3 texels and mix them with the original buffer value; |
| // the last u32 needs to read 1 texel and mix it with the original buffer value.) |
| numU32PerRowNeedsWriting = (bytesPerTexel * copyExtent.width + extraBytes + 3) / 4; |
| workgroupCountX = numU32PerRowNeedsWriting; |
| } else { |
| switch (bytesPerTexel) { |
| case 1: |
| // One thread is responsible for writing four texel values (x, y) ~ (x+3, y). |
| workgroupCountX = |
| (copyExtent.width + 4 * kWorkgroupSizeX - 1) / (4 * kWorkgroupSizeX); |
| break; |
| case 2: |
| // One thread is responsible for writing two texel values (x, y) and (x+1, y). |
| workgroupCountX = |
| (copyExtent.width + 2 * kWorkgroupSizeX - 1) / (2 * kWorkgroupSizeX); |
| break; |
| case 4: |
| workgroupCountX = (copyExtent.width + kWorkgroupSizeX - 1) / kWorkgroupSizeX; |
| break; |
| default: |
| DAWN_UNREACHABLE(); |
| } |
| } |
| |
| Ref<BufferBase> destinationBuffer = dst.buffer; |
| bool useIntermediateCopyBuffer = false; |
| if (bytesPerTexel < 4 && dst.buffer->GetSize() % 4 != 0 && |
| copyExtent.width % (4 / bytesPerTexel) != 0) { |
| // This path is made for OpenGL/GLES, blitting a texture with a width % (4 / texelByteSize) |
| // != 0 to a compact buffer. When we copy the last texel, we inevitably need to access an |
| // out of bounds location given by dst.buffer.size as we use array<u32> in the shader for |
| // the storage buffer. Although the allocated size of dst.buffer is aligned to 4 bytes for |
| // OpenGL/GLES backend, the size of the storage buffer binding for the shader is not. Thus |
| // we make an intermediate buffer aligned to 4 bytes for the compute shader to safely |
| // access, and perform an additional buffer to buffer copy at the end. This path should be |
| // hit rarely. |
| useIntermediateCopyBuffer = true; |
| BufferDescriptor descriptor = {}; |
| descriptor.size = Align(dst.buffer->GetSize(), 4); |
| // TODO(dawn:1485): adding CopyDst usage to add kInternalStorageBuffer usage internally. |
| descriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; |
| DAWN_TRY_ASSIGN(destinationBuffer, device->CreateBuffer(&descriptor)); |
| } |
| |
| // Allow internal usages since we need to use the source as a texture binding |
| // and buffer as a storage binding. |
| auto scope = commandEncoder->MakeInternalUsageScope(); |
| |
| Ref<BufferBase> uniformBuffer; |
| { |
| BufferDescriptor bufferDesc = {}; |
| // The uniform buffer size needs to be a multiple of 16 bytes. |
| bufferDesc.size = sizeof(uint32_t) * 20; |
| bufferDesc.usage = wgpu::BufferUsage::Uniform; |
| bufferDesc.mappedAtCreation = true; |
| DAWN_TRY_ASSIGN(uniformBuffer, device->CreateBuffer(&bufferDesc)); |
| |
| uint32_t* params = |
| static_cast<uint32_t*>(uniformBuffer->GetMappedRange(0, bufferDesc.size)); |
| // srcOrigin: vec3u |
| params[0] = src.origin.x; |
| params[1] = src.origin.y; |
| params[2] = src.origin.z; |
| |
| // packTexelCount: number of texel values (1, 2, or 4) one thread packs into the dst buffer |
| params[3] = 4 / bytesPerTexel; |
| // srcExtent: vec3u |
| params[4] = copyExtent.width; |
| params[5] = copyExtent.height; |
| params[6] = copyExtent.depthOrArrayLayers; |
| |
| params[7] = src.mipLevel; |
| |
| params[8] = dst.bytesPerRow; |
| params[9] = dst.rowsPerImage; |
| params[10] = dst.offset; |
| |
| // These params are only used for R8Snorm and RG8Snorm. |
| params[11] = (dst.offset % 4) / bytesPerTexel; // shift |
| |
| params[16] = bytesPerTexel; |
| params[17] = numU32PerRowNeedsWriting; |
| params[18] = readPreviousRow ? 1 : 0; |
| params[19] = dst.rowsPerImage == copyExtent.height ? 1 : 0; // isCompactImage |
| |
| if (textureViewDimension == wgpu::TextureViewDimension::Cube) { |
| // Cube textures need the mip level size to convert a texel coordinate to a sample direction. |
| auto levelSize = |
| src.texture->GetMipLevelSingleSubresourceVirtualSize(src.mipLevel, Aspect::Color); |
| params[12] = levelSize.width; |
| params[13] = levelSize.height; |
| params[14] = levelSize.depthOrArrayLayers; |
| } |
| |
| DAWN_TRY(uniformBuffer->Unmap()); |
| } |
| |
| TextureViewDescriptor viewDesc = {}; |
| switch (src.aspect) { |
| case Aspect::Color: |
| viewDesc.aspect = wgpu::TextureAspect::All; |
| break; |
| case Aspect::Depth: |
| viewDesc.aspect = wgpu::TextureAspect::DepthOnly; |
| break; |
| case Aspect::Stencil: |
| viewDesc.aspect = wgpu::TextureAspect::StencilOnly; |
| break; |
| default: |
| DAWN_UNREACHABLE(); |
| } |
| |
| viewDesc.dimension = textureViewDimension; |
| viewDesc.baseMipLevel = 0; |
| viewDesc.mipLevelCount = src.texture->GetNumMipLevels(); |
| viewDesc.baseArrayLayer = 0; |
| if (viewDesc.dimension == wgpu::TextureViewDimension::e2DArray || |
| viewDesc.dimension == wgpu::TextureViewDimension::Cube) { |
| viewDesc.arrayLayerCount = src.texture->GetArrayLayers(); |
| } else { |
| viewDesc.arrayLayerCount = 1; |
| } |
| |
| Ref<TextureViewBase> srcView; |
| DAWN_TRY_ASSIGN(srcView, src.texture->CreateView(&viewDesc)); |
| |
| Ref<BindGroupLayoutBase> bindGroupLayout0; |
| DAWN_TRY_ASSIGN(bindGroupLayout0, pipeline->GetBindGroupLayout(0)); |
| Ref<BindGroupBase> bindGroup0; |
| DAWN_TRY_ASSIGN(bindGroup0, utils::MakeBindGroup(device, bindGroupLayout0, |
| { |
| {0, srcView}, |
| {1, destinationBuffer}, |
| {2, uniformBuffer}, |
| }, |
| UsageValidationMode::Internal)); |
| |
| Ref<BindGroupLayoutBase> bindGroupLayout1; |
| Ref<BindGroupBase> bindGroup1; |
| if (textureViewDimension == wgpu::TextureViewDimension::Cube) { |
| // Cube texture requires an extra sampler to call textureSampleLevel |
| DAWN_TRY_ASSIGN(bindGroupLayout1, pipeline->GetBindGroupLayout(1)); |
| |
| SamplerDescriptor samplerDesc = {}; |
| Ref<SamplerBase> sampler; |
| DAWN_TRY_ASSIGN(sampler, device->CreateSampler(&samplerDesc)); |
| |
| DAWN_TRY_ASSIGN(bindGroup1, utils::MakeBindGroup(device, bindGroupLayout1, |
| { |
| {0, sampler}, |
| }, |
| UsageValidationMode::Internal)); |
| } |
| |
| Ref<ComputePassEncoder> pass = commandEncoder->BeginComputePass(); |
| pass->APISetPipeline(pipeline.Get()); |
| pass->APISetBindGroup(0, bindGroup0.Get()); |
| if (textureViewDimension == wgpu::TextureViewDimension::Cube) { |
| pass->APISetBindGroup(1, bindGroup1.Get()); |
| } |
| pass->APIDispatchWorkgroups(workgroupCountX, workgroupCountY, workgroupCountZ); |
| pass->APIEnd(); |
| |
| if (useIntermediateCopyBuffer) { |
| DAWN_ASSERT(destinationBuffer->GetSize() <= dst.buffer->GetAllocatedSize()); |
| commandEncoder->InternalCopyBufferToBufferWithAllocatedSize( |
| destinationBuffer.Get(), 0, dst.buffer.Get(), 0, destinationBuffer->GetSize()); |
| } |
| |
| return {}; |
| } |
| |
| } // namespace dawn::native |