Compat GL/GLES: blit a stencil texture to a buffer using compute

Add compute blit emulation path for Stencil8 textures for
OpenGLES backend.

Bug: dawn:1782, dawn:1835
Change-Id: I4719d339ee78fd5fc524d809417504125d2c0aee
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/133364
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Shrek Shao <shrekshao@google.com>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/src/dawn/native/BlitDepthStencilToBuffer.cpp b/src/dawn/native/BlitDepthStencilToBuffer.cpp
index a6795ff..a37d108 100644
--- a/src/dawn/native/BlitDepthStencilToBuffer.cpp
+++ b/src/dawn/native/BlitDepthStencilToBuffer.cpp
@@ -141,6 +141,73 @@
 }
 )";
 
+constexpr char kBlitStencil8ToBufferShaders[] = R"(
+@group(0) @binding(0) var src_tex : texture_2d_array<u32>;
+@group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>;
+
+struct Params {
+    // copyExtent
+    srcOrigin: vec3u,
+    pad0: u32,
+    srcExtent: vec3u,
+    pad1: u32,
+
+    // GPUImageDataLayout
+    indicesPerRow: u32,
+    rowsPerImage: u32,
+    indicesOffset: u32,
+};
+
+@group(0) @binding(2) var<uniform> params : Params;
+
+override workgroupSizeX: u32;
+override workgroupSizeY: u32;
+
+// Load the stencil value and write to storage buffer.
+// Each thread is responsible for reading 4 u8 values and packing them into 1 u32 value.
+@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn blit_stencil_to_buffer(@builtin(global_invocation_id) id : vec3u) {
+    let srcBoundary = params.srcOrigin + params.srcExtent;
+
+    let coord0 = vec3u(id.x * 4, id.y, id.z) + params.srcOrigin;
+
+    if (any(coord0 >= srcBoundary)) {
+        return;
+    }
+
+    let r0: u32 = 0x000000ff & textureLoad(src_tex, coord0.xy, coord0.z, 0).r;
+
+    let dstOffset = params.indicesOffset + id.x + id.y * params.indicesPerRow + id.z * params.indicesPerRow * params.rowsPerImage;
+
+    var result: u32 = r0;
+
+    let coord4 = coord0 + vec3u(4, 0, 0);
+    if (coord4.x <= srcBoundary.x) {
+        // All 4 texels for this thread are within texture bounds.
+        for (var i = 1u; i < 4u; i = i + 1u) {
+            let coordi = coord0 + vec3u(i, 0, 0);
+            let ri: u32 = 0x000000ff & textureLoad(src_tex, coordi.xy, coordi.z, 0).r;
+            result += ri << (i * 8u);
+        }
+    } else {
+        // Otherwise, srcExtent.x is not a multiply of 4 and this thread is at right edge of the texture
+        // To preserve the original buffer content, we need to read from the buffer and pack it together with other values.
+        let original: u32 = dst_buf[dstOffset];
+        result += original & 0xffffff00;
+
+        for (var i = 1u; i < 4u; i = i + 1u) {
+            let coordi = coord0 + vec3u(i, 0, 0);
+            if (coordi.x >= srcBoundary.x) {
+                break;
+            }
+            let ri: u32 = 0x000000ff & textureLoad(src_tex, coordi.xy, coordi.z, 0).r;
+            result += ri << (i * 8u);
+        }
+    }
+
+    dst_buf[dstOffset] = result;
+}
+)";
+
 ResultOrError<Ref<ComputePipelineBase>> CreateDepthBlitComputePipeline(DeviceBase* device,
                                                                        InternalPipelineStore* store,
                                                                        wgpu::TextureFormat format) {
@@ -149,10 +216,10 @@
     shaderModuleDesc.nextInChain = &wgslDesc;
     switch (format) {
         case wgpu::TextureFormat::Depth16Unorm:
-            wgslDesc.source = kBlitDepth16UnormToBufferShaders;
+            wgslDesc.code = kBlitDepth16UnormToBufferShaders;
             break;
         case wgpu::TextureFormat::Depth32Float:
-            wgslDesc.source = kBlitDepth32FloatToBufferShaders;
+            wgslDesc.code = kBlitDepth32FloatToBufferShaders;
             break;
         default:
             UNREACHABLE();
@@ -233,6 +300,53 @@
     return pipeline;
 }
 
+ResultOrError<Ref<ComputePipelineBase>> GetOrCreateStencil8ToBufferPipeline(DeviceBase* device) {
+    InternalPipelineStore* store = device->GetInternalPipelineStore();
+    if (store->blitStencil8ToBufferComputePipeline != nullptr) {
+        return store->blitStencil8ToBufferComputePipeline;
+    }
+
+    ShaderModuleWGSLDescriptor wgslDesc = {};
+    ShaderModuleDescriptor shaderModuleDesc = {};
+    shaderModuleDesc.nextInChain = &wgslDesc;
+    wgslDesc.code = kBlitStencil8ToBufferShaders;
+
+    Ref<ShaderModuleBase> shaderModule;
+    DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc));
+
+    Ref<BindGroupLayoutBase> bindGroupLayout;
+    DAWN_TRY_ASSIGN(bindGroupLayout,
+                    utils::MakeBindGroupLayout(
+                        device,
+                        {
+                            {0, wgpu::ShaderStage::Compute, wgpu::TextureSampleType::Uint,
+                             wgpu::TextureViewDimension::e2DArray},
+                            {1, wgpu::ShaderStage::Compute, kInternalStorageBufferBinding},
+                            {2, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform},
+                        },
+                        /* allowInternalBinding */ true));
+
+    Ref<PipelineLayoutBase> pipelineLayout;
+    DAWN_TRY_ASSIGN(pipelineLayout, utils::MakeBasicPipelineLayout(device, bindGroupLayout));
+
+    ComputePipelineDescriptor computePipelineDescriptor = {};
+    computePipelineDescriptor.layout = pipelineLayout.Get();
+    computePipelineDescriptor.compute.module = shaderModule.Get();
+    computePipelineDescriptor.compute.entryPoint = "blit_stencil_to_buffer";
+
+    constexpr std::array<ConstantEntry, 2> constants = {{
+        {nullptr, "workgroupSizeX", kWorkgroupSizeX},
+        {nullptr, "workgroupSizeY", kWorkgroupSizeY},
+    }};
+    computePipelineDescriptor.compute.constantCount = constants.size();
+    computePipelineDescriptor.compute.constants = constants.data();
+
+    Ref<ComputePipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(pipeline, device->CreateComputePipeline(&computePipelineDescriptor));
+    store->blitStencil8ToBufferComputePipeline = pipeline;
+    return pipeline;
+}
+
 }  // anonymous namespace
 
 MaybeError BlitDepthToBuffer(DeviceBase* device,
@@ -358,4 +472,113 @@
     return {};
 }
 
+MaybeError BlitStencilToBuffer(DeviceBase* device,
+                               CommandEncoder* commandEncoder,
+                               const TextureCopy& src,
+                               const BufferCopy& dst,
+                               const Extent3D& copyExtent) {
+    const Format& format = src.texture->GetFormat();
+
+    Ref<BufferBase> destinationBuffer = dst.buffer;
+    bool useIntermediateCopyBuffer = false;
+    if (dst.buffer->GetSize() % 4 != 0 && copyExtent.width % 4 != 0) {
+        // This path is made for OpenGL/GLES stencil8 bliting a texture with an width % 4 != 0,
+        // to a compact buffer. When we copy the last texel, we inevitably need to access an
+        // out of bounds location given by dst.buffer.size as we use array<u32> in the shader for
+        // the storage buffer. Although the allocated size of dst.buffer is aligned to 4 bytes for
+        // OpenGL/GLES backend, the size of the storage buffer binding for the shader is not. Thus
+        // we make an intermediate buffer aligned to 4 bytes for the compute shader to safely
+        // access, and perform an additional buffer to buffer copy at the end. This path should be
+        // hit rarely.
+        useIntermediateCopyBuffer = true;
+        BufferDescriptor descriptor = {};
+        descriptor.size = Align(dst.buffer->GetSize(), 4);
+        // TODO(dawn:1485): adding CopyDst usage to add kInternalStorageBuffer usage internally.
+        descriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+        DAWN_TRY_ASSIGN(destinationBuffer, device->CreateBuffer(&descriptor));
+    }
+
+    // Supported format = {Stencil8, Depth24PlusStencil8}
+    // Depth32FloatStencil8 is not supported on OpenGL/OpenGLES where we enabled this workaround.
+    ASSERT(format.format == wgpu::TextureFormat::Stencil8 ||
+           format.format == wgpu::TextureFormat::Depth24PlusStencil8);
+    uint32_t workgroupCountX = (copyExtent.width + 4 * kWorkgroupSizeX - 1) / (4 * kWorkgroupSizeX);
+    uint32_t workgroupCountY = (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY;
+    uint32_t workgroupCountZ = copyExtent.depthOrArrayLayers;
+    Ref<ComputePipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(pipeline, GetOrCreateStencil8ToBufferPipeline(device));
+
+    // Allow internal usages since we need to use the source as a texture binding
+    // and buffer as a storage binding.
+    auto scope = commandEncoder->MakeInternalUsageScope();
+
+    Ref<BindGroupLayoutBase> bindGroupLayout;
+    DAWN_TRY_ASSIGN(bindGroupLayout, pipeline->GetBindGroupLayout(0));
+
+    Ref<BufferBase> uniformBuffer;
+    {
+        BufferDescriptor bufferDesc = {};
+        // Uniform buffer size needs to be multiple of 16 bytes
+        bufferDesc.size = sizeof(uint32_t) * 12;
+        bufferDesc.usage = wgpu::BufferUsage::Uniform;
+        bufferDesc.mappedAtCreation = true;
+        DAWN_TRY_ASSIGN(uniformBuffer, device->CreateBuffer(&bufferDesc));
+
+        uint32_t* params =
+            static_cast<uint32_t*>(uniformBuffer->GetMappedRange(0, bufferDesc.size));
+        // srcOrigin: vec3u
+        params[0] = src.origin.x;
+        params[1] = src.origin.y;
+        // src.origin.z is set at textureView.baseArrayLayer
+        params[2] = 0;
+        // srcExtent: vec3u
+        params[4] = copyExtent.width;
+        params[5] = copyExtent.height;
+        params[6] = copyExtent.depthOrArrayLayers;
+
+        // Turn bytesPerRow, (bytes)offset to use array index as unit
+        // We use array<u32> for stencil8
+        params[8] = dst.bytesPerRow / 4;
+        params[9] = dst.rowsPerImage;
+        params[10] = dst.offset / 4;
+
+        DAWN_TRY(uniformBuffer->Unmap());
+    }
+
+    TextureViewDescriptor viewDesc = {};
+    viewDesc.aspect = wgpu::TextureAspect::StencilOnly;
+    viewDesc.dimension = wgpu::TextureViewDimension::e2DArray;
+    viewDesc.baseMipLevel = src.mipLevel;
+    viewDesc.mipLevelCount = 1;
+    viewDesc.baseArrayLayer = src.origin.z;
+    viewDesc.arrayLayerCount = copyExtent.depthOrArrayLayers;
+
+    Ref<TextureViewBase> srcView;
+    DAWN_TRY_ASSIGN(srcView, src.texture->CreateView(&viewDesc));
+
+    Ref<BindGroupBase> bindGroup;
+    DAWN_TRY_ASSIGN(bindGroup, utils::MakeBindGroup(device, bindGroupLayout,
+                                                    {
+                                                        {0, srcView},
+                                                        {1, destinationBuffer},
+                                                        {2, uniformBuffer},
+                                                    },
+                                                    UsageValidationMode::Internal));
+
+    Ref<ComputePassEncoder> pass = commandEncoder->BeginComputePass();
+    pass->APISetPipeline(pipeline.Get());
+    pass->APISetBindGroup(0, bindGroup.Get());
+    pass->APIDispatchWorkgroups(workgroupCountX, workgroupCountY, workgroupCountZ);
+
+    pass->APIEnd();
+
+    if (useIntermediateCopyBuffer) {
+        ASSERT(destinationBuffer->GetSize() <= dst.buffer->GetAllocatedSize());
+        commandEncoder->InternalCopyBufferToBufferWithAllocatedSize(
+            destinationBuffer.Get(), 0, dst.buffer.Get(), 0, destinationBuffer->GetSize());
+    }
+
+    return {};
+}
+
 }  // namespace dawn::native
diff --git a/src/dawn/native/BlitDepthStencilToBuffer.h b/src/dawn/native/BlitDepthStencilToBuffer.h
index ca60b9e..38f558b 100644
--- a/src/dawn/native/BlitDepthStencilToBuffer.h
+++ b/src/dawn/native/BlitDepthStencilToBuffer.h
@@ -33,6 +33,17 @@
                              const BufferCopy& dst,
                              const Extent3D& copyExtent);
 
+// BlitStencilToBuffer works around OpenGLES issues of copying stencil textures to a buffer.
+// Supported stencil texture format: *stencil8
+// It dispatches a compute shader textureLoad from the stencil texture and writes to the buffer as a
+// storage buffer.
+
+MaybeError BlitStencilToBuffer(DeviceBase* device,
+                               CommandEncoder* commandEncoder,
+                               const TextureCopy& src,
+                               const BufferCopy& dst,
+                               const Extent3D& copyExtent);
+
 }  // namespace dawn::native
 
 #endif  // SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_
diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp
index 92acd77..0068870 100644
--- a/src/dawn/native/Buffer.cpp
+++ b/src/dawn/native/Buffer.cpp
@@ -177,6 +177,9 @@
             device->IsToggleEnabled(Toggle::UseBlitForDepth32FloatTextureToBufferCopy)) {
             mUsage |= kInternalStorageBuffer;
         }
+        if (device->IsToggleEnabled(Toggle::UseBlitForStencilTextureToBufferCopy)) {
+            mUsage |= kInternalStorageBuffer;
+        }
     }
 
     GetObjectTrackingList()->Track(this);
diff --git a/src/dawn/native/CommandEncoder.cpp b/src/dawn/native/CommandEncoder.cpp
index 476248f..1c3e82e 100644
--- a/src/dawn/native/CommandEncoder.cpp
+++ b/src/dawn/native/CommandEncoder.cpp
@@ -1362,8 +1362,26 @@
 
                     return {};
                 }
+            } else if (aspect == Aspect::Stencil) {
+                if (GetDevice()->IsToggleEnabled(Toggle::UseBlitForStencilTextureToBufferCopy)) {
+                    TextureCopy src;
+                    src.texture = source->texture;
+                    src.origin = source->origin;
+                    src.mipLevel = source->mipLevel;
+                    src.aspect = aspect;
+
+                    BufferCopy dst;
+                    dst.buffer = destination->buffer;
+                    dst.bytesPerRow = destination->layout.bytesPerRow;
+                    dst.rowsPerImage = destination->layout.rowsPerImage;
+                    dst.offset = destination->layout.offset;
+                    DAWN_TRY_CONTEXT(BlitStencilToBuffer(GetDevice(), this, src, dst, *copySize),
+                                     "copying stencil aspect from %s to %s using blit workaround.",
+                                     src.texture.Get(), destination->buffer);
+
+                    return {};
+                }
             }
-            // TODO(crbug.com/dawn/1782): implement emulation for stencil
 
             CopyTextureToBufferCmd* t2b =
                 allocator->Allocate<CopyTextureToBufferCmd>(Command::CopyTextureToBuffer);
diff --git a/src/dawn/native/InternalPipelineStore.h b/src/dawn/native/InternalPipelineStore.h
index 3500ed2..97731c2 100644
--- a/src/dawn/native/InternalPipelineStore.h
+++ b/src/dawn/native/InternalPipelineStore.h
@@ -62,6 +62,7 @@
 
     Ref<ComputePipelineBase> blitDepth16UnormToBufferComputePipeline;
     Ref<ComputePipelineBase> blitDepth32FloatToBufferComputePipeline;
+    Ref<ComputePipelineBase> blitStencil8ToBufferComputePipeline;
 
     struct BlitR8ToStencilPipelines {
         Ref<RenderPipelineBase> clearPipeline;
diff --git a/src/dawn/native/Texture.cpp b/src/dawn/native/Texture.cpp
index 0747b2c..7a72628 100644
--- a/src/dawn/native/Texture.cpp
+++ b/src/dawn/native/Texture.cpp
@@ -604,6 +604,12 @@
             AddInternalUsage(wgpu::TextureUsage::TextureBinding);
         }
     }
+    if (mFormat.HasStencil() &&
+        device->IsToggleEnabled(Toggle::UseBlitForStencilTextureToBufferCopy)) {
+        if (mInternalUsage & wgpu::TextureUsage::CopySrc) {
+            AddInternalUsage(wgpu::TextureUsage::TextureBinding);
+        }
+    }
 }
 
 TextureBase::~TextureBase() = default;
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index f4213c5..0d0709c 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -401,6 +401,11 @@
       "Use a blit instead of a copy command to copy depth aspect of a texture to a buffer."
       "Workaround for OpenGLES.",
       "https://crbug.com/dawn/1782", ToggleStage::Device}},
+    {Toggle::UseBlitForStencilTextureToBufferCopy,
+     {"use_blit_for_stencil_texture_to_buffer_copy",
+      "Use a blit instead of a copy command to copy stencil aspect of a texture to a buffer."
+      "Workaround for OpenGLES.",
+      "https://crbug.com/dawn/1782", ToggleStage::Device}},
     {Toggle::D3D12ReplaceAddWithMinusWhenDstFactorIsZeroAndSrcFactorIsDstAlpha,
      {"d3d12_replace_add_with_minus_when_dst_factor_is_zero_and_src_factor_is_dst_alpha",
       "Replace the blending operation 'Add' with 'Minus' when dstBlendFactor is 'Zero' and "
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index 5fa5eff..b0bb95d 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -95,6 +95,7 @@
     UseBlitForDepthTextureToTextureCopyToNonzeroSubresource,
     UseBlitForDepth16UnormTextureToBufferCopy,
     UseBlitForDepth32FloatTextureToBufferCopy,
+    UseBlitForStencilTextureToBufferCopy,
     D3D12ReplaceAddWithMinusWhenDstFactorIsZeroAndSrcFactorIsDstAlpha,
     D3D12PolyfillReflectVec2F32,
     VulkanClearGen12TextureWithCCSAmbiguateOnCreation,
diff --git a/src/dawn/native/opengl/CommandBufferGL.cpp b/src/dawn/native/opengl/CommandBufferGL.cpp
index 87c973d..1271f8f 100644
--- a/src/dawn/native/opengl/CommandBufferGL.cpp
+++ b/src/dawn/native/opengl/CommandBufferGL.cpp
@@ -541,8 +541,6 @@
                     dst.aspect == Aspect::Stencil,
                     "Copies to stencil textures are unsupported on the OpenGL backend.");
 
-                ASSERT(dst.aspect == Aspect::Color);
-
                 buffer->EnsureDataInitialized();
                 SubresourceRange range = GetSubresourcesAffectedByCopy(dst, copy->copySize);
                 if (IsCompleteSubresourceCopiedTo(dst.texture.Get(), copy->copySize,
diff --git a/src/dawn/native/opengl/PhysicalDeviceGL.cpp b/src/dawn/native/opengl/PhysicalDeviceGL.cpp
index e12b442..bdcfc4f 100644
--- a/src/dawn/native/opengl/PhysicalDeviceGL.cpp
+++ b/src/dawn/native/opengl/PhysicalDeviceGL.cpp
@@ -226,6 +226,10 @@
     // For OpenGL ES, use compute shader blit to emulate depth32float texture to buffer copies.
     deviceToggles->Default(Toggle::UseBlitForDepth32FloatTextureToBufferCopy,
                            gl.GetVersion().IsES() && !kIsAngleOnWindows);
+
+    // For OpenGL ES, use compute shader blit to emulate stencil texture to buffer copies.
+    deviceToggles->Default(Toggle::UseBlitForStencilTextureToBufferCopy,
+                           gl.GetVersion().IsES() && !kIsAngleOnWindows);
 }
 
 ResultOrError<Ref<DeviceBase>> PhysicalDevice::CreateDeviceImpl(AdapterBase* adapter,
diff --git a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
index 654369e..b9dd36e 100644
--- a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
+++ b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
@@ -78,6 +78,19 @@
 // Use a non-zero clear depth to better test unorm16 compute emulation path.
 constexpr float kClearDepth = 0.69f;
 
+// Initialize other mip levels with differrent garbage values for better testing
+constexpr float kGarbageDepth = 0.123456789f;
+
+static_assert(kInitDepth != kGarbageDepth);
+static_assert(kClearDepth != kGarbageDepth);
+
+constexpr uint8_t kInitStencil = 1u;
+constexpr uint8_t kClearStencil = 0u;
+constexpr uint8_t kGarbageStencil = 99u;
+
+static_assert(kInitStencil != kGarbageStencil);
+static_assert(kClearStencil != kGarbageStencil);
+
 class DepthStencilCopyTests : public DawnTestWithParams<DepthStencilCopyTestParams> {
   protected:
     void MapAsyncAndWait(const wgpu::Buffer& buffer,
@@ -156,18 +169,6 @@
         return device.CreateTexture(&texDescriptor);
     }
 
-    wgpu::Texture CreateDepthStencilTexture(uint32_t width,
-                                            uint32_t height,
-                                            wgpu::TextureUsage usage,
-                                            uint32_t mipLevelCount = 1) {
-        wgpu::TextureDescriptor texDescriptor = {};
-        texDescriptor.size = {width, height, 1};
-        texDescriptor.format = GetParam().mTextureFormat;
-        texDescriptor.usage = usage;
-        texDescriptor.mipLevelCount = mipLevelCount;
-        return device.CreateTexture(&texDescriptor);
-    }
-
     wgpu::Texture CreateDepthTexture(uint32_t width,
                                      uint32_t height,
                                      wgpu::TextureUsage usage,
@@ -253,12 +254,12 @@
                                                                 uint32_t height,
                                                                 wgpu::TextureUsage usage,
                                                                 uint32_t mipLevel = 0) {
-        wgpu::Texture src = CreateDepthStencilTexture(
+        wgpu::Texture src = CreateTexture(
             width, height, wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc,
             mipLevel + 1);
 
-        wgpu::Texture dst = CreateDepthStencilTexture(
-            width, height, usage | wgpu::TextureUsage::CopyDst, mipLevel + 1);
+        wgpu::Texture dst =
+            CreateTexture(width, height, usage | wgpu::TextureUsage::CopyDst, mipLevel + 1);
 
         InitializeDepthStencilTextureRegion(src, clearDepth, regionDepth, clearStencil,
                                             regionStencil, mipLevel);
@@ -488,7 +489,6 @@
 class DepthCopyTests : public DepthStencilCopyTests {
   public:
     void DoCopyFromDepthTest(uint32_t bufferCopyOffset,
-                             float initDepth,
                              uint32_t textureWidth,
                              uint32_t textureHeight,
                              uint32_t textureArrayLayerCount,
@@ -503,13 +503,12 @@
                                      GetParam().mTextureFormat, wgpu::TextureAspect::DepthOnly);
         wgpu::Buffer destinationBuffer = device.CreateBuffer(&bufferDescriptor);
 
-        DoCopyFromDepthTestWithBuffer(destinationBuffer, bufferCopyOffset, initDepth, textureWidth,
+        DoCopyFromDepthTestWithBuffer(destinationBuffer, bufferCopyOffset, textureWidth,
                                       textureHeight, textureArrayLayerCount, testLevel, true);
     }
 
     void DoCopyFromDepthTestWithBuffer(wgpu::Buffer destinationBuffer,
                                        uint32_t bufferCopyOffset,
-                                       float initDepth,
                                        uint32_t textureWidth,
                                        uint32_t textureHeight,
                                        uint32_t textureArrayLayerCount,
@@ -524,12 +523,8 @@
                           wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc,
                           mipLevelCount, textureArrayLayerCount);
 
-        // Initialize other mip levels with different init values for better testing
-        constexpr float garbageDepth = 0.123456789f;
-        ASSERT(initDepth != garbageDepth);
-
         for (uint32_t level = 0; level < mipLevelCount; level++) {
-            float regionDepth = (level == testLevel) ? initDepth : garbageDepth;
+            float regionDepth = (level == testLevel) ? kInitDepth : kGarbageDepth;
             InitializeDepthStencilTextureRegion(texture, kClearDepth, regionDepth, 0, 0, level, 0,
                                                 textureArrayLayerCount);
         }
@@ -554,10 +549,10 @@
         queue.Submit(1, &commandBuffer);
 
         if (checkBufferContent) {
-            // Expected data pattern is that initDepth value at bottom left corner, while other
+            // Expected data pattern is that kInitDepth value at bottom left corner, while other
             // region is kClearDepth. Data of each layer is the same.
             if (format == wgpu::TextureFormat::Depth16Unorm) {
-                uint16_t expected = FloatToUnorm<uint16_t>(initDepth);
+                uint16_t expected = FloatToUnorm<uint16_t>(kInitDepth);
                 uint16_t cleared = FloatToUnorm<uint16_t>(kClearDepth);
                 std::vector<uint16_t> expectedData(copyWidth * copyHeight, cleared);
                 for (uint32_t y = copyHeight / 2; y < copyHeight; y++) {
@@ -578,7 +573,7 @@
                 std::vector<float> expectedData(copyWidth * copyHeight, kClearDepth);
                 for (uint32_t y = copyHeight / 2; y < copyHeight; y++) {
                     auto rowStart = expectedData.data() + y * copyWidth;
-                    std::fill(rowStart, rowStart + copyWidth / 2, initDepth);
+                    std::fill(rowStart, rowStart + copyWidth / 2, kInitDepth);
                 }
 
                 for (uint32_t z = 0; z < textureArrayLayerCount; ++z) {
@@ -618,7 +613,7 @@
 
     for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
         for (const auto& size : kTestTextureSizes) {
-            DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, size[0], size[1], sizeZ, kTestLevel);
+            DoCopyFromDepthTest(kBufferCopyOffset, size[0], size[1], sizeZ, kTestLevel);
         }
     }
 }
@@ -643,10 +638,10 @@
         1,
         2,
     };
-    for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
-        for (uint32_t offset : kBufferCopyOffsets) {
+    for (uint32_t offset : kBufferCopyOffsets) {
+        for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
             for (const auto& size : kTestTextureSizes) {
-                DoCopyFromDepthTest(offset, kInitDepth, size[0], size[1], sizeZ, kTestLevel);
+                DoCopyFromDepthTest(offset, size[0], size[1], sizeZ, kTestLevel);
             }
         }
     }
@@ -657,8 +652,8 @@
     constexpr uint32_t kBufferCopyOffset = 0;
     constexpr uint32_t kWidth = 9;
     constexpr uint32_t kHeight = 9;
-    DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, 1, 1);
-    DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, 2, 2);
+    DoCopyFromDepthTest(kBufferCopyOffset, kWidth, kHeight, 1, 1);
+    DoCopyFromDepthTest(kBufferCopyOffset, kWidth, kHeight, 2, 2);
 }
 
 // Test buffer content outside of copy extent is preserved.
@@ -703,8 +698,8 @@
 
                 // Don't check copy region content because the buffer doesn't have
                 // wgpu::BufferUsage::CopySrc usage.
-                DoCopyFromDepthTestWithBuffer(buffer, offset, kInitDepth, size[0], size[1], kSizeZ,
-                                              kTestLevel, false);
+                DoCopyFromDepthTestWithBuffer(buffer, offset, size[0], size[1], kSizeZ, kTestLevel,
+                                              false);
 
                 std::vector<uint8_t> expected(bufferDescriptor.size, kOriginalValue);
                 // Get the offset of the end of the copy range (without aligning with 4 bytes)
@@ -762,14 +757,14 @@
             bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead;
             bufferDescriptor.size =
                 offset +
-                // Don't align for 4 bytes to get smallest possible buffer for depth16unorm.
+                // Don't align for 4 bytes to get the smallest possible buffer for depth16unorm.
                 BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false);
             wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor);
 
             // Don't check copy region content because the buffer doesn't have
             // wgpu::BufferUsage::CopySrc usage.
-            DoCopyFromDepthTestWithBuffer(buffer, offset, kInitDepth, size[0], size[1], kSizeZ,
-                                          kTestLevel, false);
+            DoCopyFromDepthTestWithBuffer(buffer, offset, size[0], size[1], kSizeZ, kTestLevel,
+                                          false);
 
             // Unable to check the result since either MapAsync and CopyBufferToBuffer requires size
             // to be multiple of 4 bytes.
@@ -781,10 +776,8 @@
 class DepthCopyFromBufferTests : public DepthStencilCopyTests {
   public:
     void DoTest(uint32_t bufferCopyOffset, bool hasRenderAttachmentUsage) {
-        // TODO(crbug.com/dawn/1237): Depth16Unorm test failed on OpenGL and OpenGLES which says
-        // Invalid format and type combination in glReadPixels
-        DAWN_TEST_UNSUPPORTED_IF(GetParam().mTextureFormat == wgpu::TextureFormat::Depth16Unorm &&
-                                 (IsOpenGL() || IsOpenGLES()));
+        // TODO(crbug.com/dawn/1291): Compute emulation path fails for Angle on Windows.
+        DAWN_SUPPRESS_TEST_IF(IsANGLE() && IsWindows());
 
         constexpr uint32_t kWidth = 8;
         constexpr uint32_t kHeight = 1;
@@ -881,55 +874,90 @@
     void DoCopyFromStencilTest(uint32_t bufferCopyOffset,
                                uint32_t textureWidth,
                                uint32_t textureHeight,
+                               uint32_t textureArrayLayerCount,
                                uint32_t testLevel) {
-        // TODO(crbug.com/dawn/1497): glReadPixels: GL error: HIGH: Invalid format and type
-        // combination.
-        DAWN_SUPPRESS_TEST_IF(IsANGLE());
+        uint32_t copyWidth = textureWidth >> testLevel;
+        uint32_t copyHeight = textureHeight >> testLevel;
+        wgpu::BufferDescriptor bufferDescriptor = {};
+        bufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+        bufferDescriptor.size =
+            bufferCopyOffset +
+            BufferSizeForTextureCopy(copyWidth, copyHeight, textureArrayLayerCount,
+                                     GetParam().mTextureFormat, wgpu::TextureAspect::StencilOnly);
+        wgpu::Buffer destinationBuffer = device.CreateBuffer(&bufferDescriptor);
+
+        DoCopyFromStencilTestWithBuffer(destinationBuffer, bufferCopyOffset, textureWidth,
+                                        textureHeight, textureArrayLayerCount, testLevel, true);
+    }
+    void DoCopyFromStencilTestWithBuffer(wgpu::Buffer destinationBuffer,
+                                         uint32_t bufferCopyOffset,
+                                         uint32_t textureWidth,
+                                         uint32_t textureHeight,
+                                         uint32_t textureArrayLayerCount,
+                                         uint32_t testLevel,
+                                         bool checkBufferContent) {
+        // TODO(crbug.com/dawn/1291): Compute emulation path fails for Angle on Windows.
+        DAWN_SUPPRESS_TEST_IF(IsANGLE() && IsWindows());
+
+        // TODO(crbug.com/dawn/1835): ResourceBarrier state mismatch.
+        DAWN_SUPPRESS_TEST_IF(textureArrayLayerCount > 1 && IsD3D12() &&
+                              IsBackendValidationEnabled());
 
         // TODO(crbug.com/dawn/667): Work around the fact that some platforms are unable to read
         // stencil.
         DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("disable_depth_stencil_read"));
 
         uint32_t mipLevelCount = testLevel + 1;
-        wgpu::Texture depthStencilTexture = CreateDepthStencilTexture(
-            textureWidth, textureHeight,
-            wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc, mipLevelCount);
+        wgpu::Texture depthStencilTexture =
+            CreateTexture(textureWidth, textureHeight,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc,
+                          mipLevelCount, textureArrayLayerCount);
 
-        InitializeDepthStencilTextureRegion(depthStencilTexture, 0.f, 0.3f, 0u, 1u, testLevel);
-
-        std::vector<uint8_t> expectedData = {
-            0u, 0u, 0u, 0u,  //
-            0u, 0u, 0u, 0u,  //
-            1u, 1u, 0u, 0u,  //
-            1u, 1u, 0u, 0u,  //
-        };
+        for (uint32_t level = 0; level < mipLevelCount; level++) {
+            uint8_t regionStencil = (level == testLevel) ? kInitStencil : kGarbageStencil;
+            InitializeDepthStencilTextureRegion(depthStencilTexture, 0.f, 0.3f, kClearStencil,
+                                                regionStencil, testLevel, 0,
+                                                textureArrayLayerCount);
+        }
 
         uint32_t copyWidth = textureWidth >> testLevel;
         uint32_t copyHeight = textureHeight >> testLevel;
-        ASSERT_EQ(expectedData.size(), copyWidth * copyHeight);
-        wgpu::Extent3D copySize = {copyWidth, copyHeight, 1};
+        wgpu::Extent3D copySize = {copyWidth, copyHeight, textureArrayLayerCount};
 
-        constexpr uint32_t kBytesPerRow = kTextureBytesPerRowAlignment;
-        wgpu::BufferDescriptor bufferDescriptor = {};
-        bufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
-        bufferDescriptor.size =
-            bufferCopyOffset + BufferSizeForTextureCopy(copyWidth, copyHeight, 1,
-                                                        GetParam().mTextureFormat,
-                                                        wgpu::TextureAspect::StencilOnly);
-        wgpu::Buffer destinationBuffer = device.CreateBuffer(&bufferDescriptor);
+        // Expected data pattern is that kInitStencil value at bottom left corner, while other
+        // region is kClearStencil.
+
+        wgpu::TextureFormat format = GetParam().mTextureFormat;
+        constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::StencilOnly;
+        uint32_t bytesPerPixel = GetBytesPerPixel(format, aspect);
+        uint32_t bytesPerRow = Align(copyWidth * bytesPerPixel, kTextureBytesPerRowAlignment);
+        uint32_t bytesPerImage = bytesPerRow * copyHeight;
 
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-        wgpu::ImageCopyTexture imageCopyTexture = utils::CreateImageCopyTexture(
-            depthStencilTexture, testLevel, {0, 0, 0}, wgpu::TextureAspect::StencilOnly);
+        wgpu::ImageCopyTexture imageCopyTexture =
+            utils::CreateImageCopyTexture(depthStencilTexture, testLevel, {0, 0, 0}, aspect);
         wgpu::ImageCopyBuffer imageCopyBuffer = utils::CreateImageCopyBuffer(
-            destinationBuffer, bufferCopyOffset, kBytesPerRow, copyHeight);
+            destinationBuffer, bufferCopyOffset, bytesPerRow, copyHeight);
         encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBuffer, &copySize);
         wgpu::CommandBuffer commandBuffer = encoder.Finish();
         queue.Submit(1, &commandBuffer);
 
-        for (uint32_t y = 0; y < copyHeight; ++y) {
-            EXPECT_BUFFER_U8_RANGE_EQ(expectedData.data() + copyWidth * y, destinationBuffer,
-                                      bufferCopyOffset + y * kBytesPerRow, copyWidth);
+        if (checkBufferContent) {
+            std::vector<uint8_t> expectedData(copyWidth * copyHeight, kClearStencil);
+            // std::fill(expectedData.data(), expectedData.data() + expectedData.size(), 0x77);
+            for (uint32_t y = copyHeight / 2; y < copyHeight; y++) {
+                auto rowStart = expectedData.data() + y * copyWidth;
+                std::fill(rowStart, rowStart + copyWidth / 2, kInitStencil);
+            }
+
+            for (uint32_t z = 0; z < textureArrayLayerCount; ++z) {
+                uint32_t bufferOffsetPerArrayLayer = bytesPerImage * z;
+                for (uint32_t y = 0; y < copyHeight; ++y) {
+                    EXPECT_BUFFER_U8_RANGE_EQ(
+                        expectedData.data() + copyWidth * y, destinationBuffer,
+                        bufferCopyOffset + bufferOffsetPerArrayLayer + y * bytesPerRow, copyWidth);
+                }
+            }
         }
     }
 
@@ -946,10 +974,10 @@
         constexpr uint32_t kHeight = 4;
         const bool hasDepth = !utils::IsStencilOnlyFormat(GetParam().mTextureFormat);
 
-        wgpu::Texture depthStencilTexture = CreateDepthStencilTexture(
-            kWidth, kHeight,
-            wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc |
-                wgpu::TextureUsage::CopyDst);
+        wgpu::Texture depthStencilTexture =
+            CreateTexture(kWidth, kHeight,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc |
+                              wgpu::TextureUsage::CopyDst);
 
         if (hasDepth) {
             wgpu::CommandEncoder commandEncoder = device.CreateCommandEncoder();
@@ -1067,21 +1095,51 @@
 
 // Test copying the stencil-only aspect into a buffer.
 TEST_P(StencilCopyTests, FromStencilAspect) {
-    constexpr uint32_t kWidth = 4;
-    constexpr uint32_t kHeight = 4;
     constexpr uint32_t kTestLevel = 0;
     constexpr uint32_t kBufferCopyOffset = 0;
-    DoCopyFromStencilTest(kBufferCopyOffset, kWidth, kHeight, kTestLevel);
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        // Original test parameter
+        {4, 4},
+        // Test compute emulation path for stencil 8
+        {2, 2},
+        {3, 3},
+        // stencil 8 needs bytesPerRow alignment
+        {257, 1},
+    };
+    constexpr uint32_t kTestTextureArrayLayerCounts[] = {
+        1,
+        2,
+    };
+    for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
+        for (const auto& size : kTestTextureSizes) {
+            DoCopyFromStencilTest(kBufferCopyOffset, size[0], size[1], sizeZ, kTestLevel);
+        }
+    }
 }
 
 // Test copying the stencil-only aspect into a buffer at a non-zero offset
 TEST_P(StencilCopyTests, FromStencilAspectAtNonZeroOffset) {
-    constexpr uint32_t kWidth = 4;
-    constexpr uint32_t kHeight = 4;
     constexpr uint32_t kTestLevel = 0;
     constexpr std::array<uint32_t, 2> kBufferCopyOffsets = {4u, 512u};
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        // Original test parameter
+        {4, 4},
+        // Test compute emulation path for stencil 8
+        {2, 2},
+        {3, 3},
+        // stencil 8 needs bytesPerRow alignment
+        {257, 1},
+    };
+    constexpr uint32_t kTestTextureArrayLayerCounts[] = {
+        1,
+        2,
+    };
     for (uint32_t offset : kBufferCopyOffsets) {
-        DoCopyFromStencilTest(offset, kWidth, kHeight, kTestLevel);
+        for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
+            for (const auto& size : kTestTextureSizes) {
+                DoCopyFromStencilTest(offset, size[0], size[1], sizeZ, kTestLevel);
+            }
+        }
     }
 }
 
@@ -1089,9 +1147,123 @@
 TEST_P(StencilCopyTests, FromNonZeroMipStencilAspect) {
     constexpr uint32_t kWidth = 9;
     constexpr uint32_t kHeight = 9;
-    constexpr uint32_t kTestLevel = 1;
     constexpr uint32_t kBufferCopyOffset = 0;
-    DoCopyFromStencilTest(kBufferCopyOffset, kWidth, kHeight, kTestLevel);
+    DoCopyFromStencilTest(kBufferCopyOffset, kWidth, kHeight, 1, 1);
+    DoCopyFromStencilTest(kBufferCopyOffset, kWidth, kHeight, 2, 2);
+}
+
+// Test buffer content outside of copy extent is preserved.
+// This test is made specifially for compute blit for stencil8 emulation path.
+// The texel size is 1 byte, while in the compute shader we have to write 4 byte at a time.
+// When the texture width % 4 != 0, buffer content outside of the copy range is
+// inevitably written. So we need to make sure the original content of the buffer that's outside of
+// the copy extent is still correctly preserved.
+TEST_P(StencilCopyTests, PreserveBufferContent) {
+    constexpr uint32_t kBufferCopyOffsets[] = {0u, 4u, 512u};
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        {1, 1},
+        {1, 2},
+        {3, 3},
+    };
+    constexpr uint32_t kExtraBufferSize[] = {0u, 4u};
+    const uint32_t kSizeZ = 1;
+    constexpr uint32_t kTestLevel = 0;
+
+    wgpu::TextureFormat format = GetParam().mTextureFormat;
+    constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::StencilOnly;
+
+    for (uint32_t extraBufferSize : kExtraBufferSize) {
+        for (uint32_t offset : kBufferCopyOffsets) {
+            for (const auto& size : kTestTextureSizes) {
+                wgpu::BufferDescriptor bufferDescriptor = {};
+                // Add wgpu::BufferUsage::MapRead to check the buffer content with mapAsync
+                bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead;
+                bufferDescriptor.size =
+                    extraBufferSize + offset +
+                    BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect);
+                bufferDescriptor.mappedAtCreation = true;
+                wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor);
+                constexpr uint8_t kOriginalValue = 0xff;
+                {
+                    // Fill the buffer with an original value other than 0 to check they are
+                    // incorrectly overwritten outside of the copy range.
+                    uint8_t* ptr = static_cast<uint8_t*>(buffer.GetMappedRange());
+                    std::fill(ptr, ptr + bufferDescriptor.size, kOriginalValue);
+                    buffer.Unmap();
+                }
+
+                // Don't check copy region content because the buffer doesn't have
+                // wgpu::BufferUsage::CopySrc usage.
+                DoCopyFromStencilTestWithBuffer(buffer, offset, size[0], size[1], kSizeZ,
+                                                kTestLevel, false);
+
+                std::vector<uint8_t> expected(bufferDescriptor.size, kOriginalValue);
+                // Get the offset of the end of the copy range (without aligning with 4 bytes)
+                uint32_t bufferEndOffset =
+                    offset +
+                    BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false);
+                if (bufferDescriptor.size > bufferEndOffset) {
+                    // Cannot use EXPECT_BUFFER_* helper here because it needs to align the copy
+                    // size to a multiple of 4 bytes to call CopyBufferToBuffer. We are checking
+                    // stencil8.
+                    MapAsyncAndWait(buffer, wgpu::MapMode::Read, 0, wgpu::kWholeMapSize);
+                    const uint8_t* ptr = static_cast<const uint8_t*>(buffer.GetConstMappedRange());
+
+                    // Check the content before copy range.
+                    for (uint32_t i = 0; i < offset; i++) {
+                        EXPECT_EQ(ptr[i], kOriginalValue);
+                    }
+
+                    // Check the content after copy range.
+                    uint32_t checkSize = bufferDescriptor.size - bufferEndOffset;
+                    for (uint32_t i = 0; i < checkSize; i++) {
+                        EXPECT_EQ(ptr[bufferEndOffset + i], kOriginalValue);
+                    }
+                    buffer.Unmap();
+                }
+            }
+        }
+    }
+}
+
+// Test compact buffer size edge case.
+// This test is made specifially for compute blit for stencil8 emulation path.
+// When texture width % 4 != 0, the size of the most compact buffer copy
+// target can be something that's not a multiple of 4. We need to make sure access don't go out of
+// bounds in the shader, when still writing to array<u32> in the compute shader.
+TEST_P(StencilCopyTests, BufferCopySizeEdgeCase) {
+    constexpr uint32_t kBufferCopyOffsets[] = {0u, 4u, 512u};
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        // Storage buffer binding requires size of at least 4 bytes.
+        {5, 1}, {6, 1}, {7, 1}, {1, 2}, {2, 2}, {3, 3},
+    };
+    const uint32_t kSizeZ = 1;
+    constexpr uint32_t kTestLevel = 0;
+
+    wgpu::TextureFormat format = GetParam().mTextureFormat;
+    constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::StencilOnly;
+
+    for (uint32_t offset : kBufferCopyOffsets) {
+        for (const auto& size : kTestTextureSizes) {
+            wgpu::BufferDescriptor bufferDescriptor = {};
+            // Add wgpu::BufferUsage::MapRead to check the buffer content with mapAsync
+            bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead;
+            bufferDescriptor.size =
+                offset +
+                // Don't align for 4 bytes to get the smallest possible buffer for depth16unorm.
+                BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false);
+            wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor);
+
+            // Don't check copy region content because the buffer doesn't have
+            // wgpu::BufferUsage::CopySrc usage.
+            DoCopyFromStencilTestWithBuffer(buffer, offset, size[0], size[1], kSizeZ, kTestLevel,
+                                            false);
+
+            // Unable to check the result since either MapAsync and CopyBufferToBuffer requires size
+            // to be multiple of 4 bytes.
+            // Just run and don't crash on ASSERT.
+        }
+    }
 }
 
 // Test copying to the stencil-aspect of a texture
@@ -1121,10 +1293,10 @@
     constexpr uint32_t kMipLevel = 1;
 
     wgpu::Texture depthStencilTexture =
-        CreateDepthStencilTexture(kWidth, kHeight,
-                                  wgpu::TextureUsage::RenderAttachment |
-                                      wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst,
-                                  kMipLevel + 1);
+        CreateTexture(kWidth, kHeight,
+                      wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc |
+                          wgpu::TextureUsage::CopyDst,
+                      kMipLevel + 1);
 
     std::vector<uint8_t> stencilData = {
         7u, 7u,  //