Add workarounds to blit a buffer to a depth/stencil texture

Works around issues on Metal Intel where CopyB2T and WriteTexture
with depth/stencil textures do not work correctly.
Fixes test failures with depth16unorm in the CTS.

Deletes UseTempTextureInStencilTextureToBufferCopy in favor of
the stencil blit. The former supposedly fixes a problem where
the stencil data is not flushed into the real stencil texture
by performing another T2T copy. This only works because the Metal
Intel backend also happens to allocate s8 as d32s8. Copying
the depth aspect as well seems to make the driver remember to
flush the data.
The stencil blit is a better fix for the problem since it entirely
avoids getting the driver into a bad state where the stencil data
is not in sync.

Fixed: dawn:1389
Change-Id: If34b1d58996157036c164a5bc329e38b5e53f67a
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/117910
Reviewed-by: Loko Kung <lokokung@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Austin Eng <enga@chromium.org>
diff --git a/src/dawn/native/BUILD.gn b/src/dawn/native/BUILD.gn
index 95bf264..9f4cb17 100644
--- a/src/dawn/native/BUILD.gn
+++ b/src/dawn/native/BUILD.gn
@@ -199,6 +199,8 @@
     "BindGroupTracker.h",
     "BindingInfo.cpp",
     "BindingInfo.h",
+    "BlitBufferToDepthStencil.cpp",
+    "BlitBufferToDepthStencil.h",
     "Blob.cpp",
     "Blob.h",
     "BlobCache.cpp",
diff --git a/src/dawn/native/BlitBufferToDepthStencil.cpp b/src/dawn/native/BlitBufferToDepthStencil.cpp
new file mode 100644
index 0000000..173919f
--- /dev/null
+++ b/src/dawn/native/BlitBufferToDepthStencil.cpp
@@ -0,0 +1,573 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/native/BlitBufferToDepthStencil.h"
+
+#include <utility>
+
+#include "dawn/common/Assert.h"
+#include "dawn/native/BindGroup.h"
+#include "dawn/native/CommandBuffer.h"
+#include "dawn/native/CommandEncoder.h"
+#include "dawn/native/Device.h"
+#include "dawn/native/InternalPipelineStore.h"
+#include "dawn/native/Queue.h"
+#include "dawn/native/RenderPassEncoder.h"
+#include "dawn/native/RenderPipeline.h"
+
+namespace dawn::native {
+
+namespace {
+
+constexpr char kBlitRG8ToDepthShaders[] = R"(
+
+@vertex fn vert_fullscreen_quad(
+  @builtin(vertex_index) vertex_index : u32
+) -> @builtin(position) vec4<f32> {
+  const pos = array<vec2<f32>, 3>(
+      vec2<f32>(-1.0, -1.0),
+      vec2<f32>( 3.0, -1.0),
+      vec2<f32>(-1.0,  3.0));
+  return vec4<f32>(pos[vertex_index], 0.0, 1.0);
+}
+
+struct Params {
+  origin : vec2<u32>
+};
+
+@group(0) @binding(0) var src_tex : texture_2d<u32>;
+@group(0) @binding(1) var<uniform> params : Params;
+
+@fragment fn blit_to_depth(
+    @builtin(position) position : vec4<f32>
+) -> @builtin(frag_depth) f32 {
+  // Load the source texel.
+  let src_texel = textureLoad(
+    src_tex, vec2<u32>(position.xy) - params.origin, 0u);
+
+  let depth_u16_val = (src_texel.y << 8u) + src_texel.x;
+
+  const one_over_max : f32 = 1.0 / f32(0xFFFFu);
+  return f32(depth_u16_val) * one_over_max;
+}
+
+)";
+
+constexpr char kBlitStencilShaders[] = R"(
+
+struct VertexOutputs {
+  @location(0) @interpolate(flat) stencil_val : u32,
+  @builtin(position) position : vec4<f32>,
+};
+
+// The instance_index here is not used for instancing.
+// It represents the current stencil mask we're testing in the
+// source.
+// This is a cheap way to get the stencil value into the shader
+// since WebGPU doesn't have push constants.
+@vertex fn vert_fullscreen_quad(
+  @builtin(vertex_index) vertex_index : u32,
+  @builtin(instance_index) instance_index: u32,
+) -> VertexOutputs {
+  const pos = array<vec2<f32>, 3>(
+      vec2<f32>(-1.0, -1.0),
+      vec2<f32>( 3.0, -1.0),
+      vec2<f32>(-1.0,  3.0));
+  return VertexOutputs(
+    instance_index,
+    vec4<f32>(pos[vertex_index], 0.0, 1.0),
+  );
+}
+
+struct Params {
+  origin : vec2<u32>
+};
+
+@group(0) @binding(0) var src_tex : texture_2d<u32>;
+@group(0) @binding(1) var<uniform> params : Params;
+
+// Do nothing (but also don't discard). Used for clearing
+// stencil to 0.
+@fragment fn frag_noop() {}
+
+// Discard the fragment if the source texture doesn't
+// have the stencil_val.
+@fragment fn frag_check_src_stencil(input : VertexOutputs) {
+  // Load the source stencil value.
+  let src_val : u32 = textureLoad(
+    src_tex, vec2<u32>(input.position.xy) - params.origin, 0u)[0];
+
+  // Discard it if it doesn't contain the stencil reference.
+  if ((src_val & input.stencil_val) == 0u) {
+    discard;
+  }
+}
+
+)";
+
+ResultOrError<Ref<RenderPipelineBase>> GetOrCreateRG8ToDepth16UnormPipeline(DeviceBase* device) {
+    InternalPipelineStore* store = device->GetInternalPipelineStore();
+    if (store->blitRG8ToDepth16UnormPipeline != nullptr) {
+        return store->blitRG8ToDepth16UnormPipeline;
+    }
+
+    ShaderModuleWGSLDescriptor wgslDesc = {};
+    ShaderModuleDescriptor shaderModuleDesc = {};
+    shaderModuleDesc.nextInChain = &wgslDesc;
+    wgslDesc.source = kBlitRG8ToDepthShaders;
+
+    Ref<ShaderModuleBase> shaderModule;
+    DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc));
+
+    FragmentState fragmentState = {};
+    fragmentState.module = shaderModule.Get();
+    fragmentState.entryPoint = "blit_to_depth";
+
+    DepthStencilState dsState = {};
+    dsState.format = wgpu::TextureFormat::Depth16Unorm;
+    dsState.depthWriteEnabled = true;
+
+    RenderPipelineDescriptor renderPipelineDesc = {};
+    renderPipelineDesc.vertex.module = shaderModule.Get();
+    renderPipelineDesc.vertex.entryPoint = "vert_fullscreen_quad";
+    renderPipelineDesc.depthStencil = &dsState;
+    renderPipelineDesc.fragment = &fragmentState;
+
+    Ref<RenderPipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(pipeline, device->CreateRenderPipeline(&renderPipelineDesc));
+
+    store->blitRG8ToDepth16UnormPipeline = pipeline;
+    return pipeline;
+}
+
+ResultOrError<InternalPipelineStore::BlitR8ToStencilPipelines> GetOrCreateR8ToStencilPipelines(
+    DeviceBase* device,
+    wgpu::TextureFormat format,
+    BindGroupLayoutBase* bgl) {
+    InternalPipelineStore* store = device->GetInternalPipelineStore();
+    {
+        auto it = store->blitR8ToStencilPipelines.find(format);
+        if (it != store->blitR8ToStencilPipelines.end()) {
+            return InternalPipelineStore::BlitR8ToStencilPipelines{it->second};
+        }
+    }
+
+    Ref<PipelineLayoutBase> pipelineLayout;
+    {
+        PipelineLayoutDescriptor plDesc = {};
+        plDesc.bindGroupLayoutCount = 1;
+
+        plDesc.bindGroupLayouts = &bgl;
+        DAWN_TRY_ASSIGN(pipelineLayout, device->CreatePipelineLayout(&plDesc));
+    }
+
+    ShaderModuleWGSLDescriptor wgslDesc = {};
+    ShaderModuleDescriptor shaderModuleDesc = {};
+    shaderModuleDesc.nextInChain = &wgslDesc;
+    wgslDesc.source = kBlitStencilShaders;
+
+    Ref<ShaderModuleBase> shaderModule;
+    DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc));
+
+    FragmentState fragmentState = {};
+    fragmentState.module = shaderModule.Get();
+
+    DepthStencilState dsState = {};
+    dsState.format = format;
+    dsState.depthWriteEnabled = false;
+    dsState.stencilFront.passOp = wgpu::StencilOperation::Replace;
+
+    RenderPipelineDescriptor renderPipelineDesc = {};
+    renderPipelineDesc.layout = pipelineLayout.Get();
+    renderPipelineDesc.vertex.module = shaderModule.Get();
+    renderPipelineDesc.vertex.entryPoint = "vert_fullscreen_quad";
+    renderPipelineDesc.depthStencil = &dsState;
+    renderPipelineDesc.fragment = &fragmentState;
+
+    // Build a pipeline to clear stencil to 0. We need a pipeline, and not just a render pass load
+    // op because the copy region may be a subregion of the stencil texture.
+    Ref<RenderPipelineBase> clearPipeline;
+    fragmentState.entryPoint = "frag_noop";
+    DAWN_TRY_ASSIGN(clearPipeline, device->CreateRenderPipeline(&renderPipelineDesc));
+
+    // Build 8 pipelines masked to replace each bit of the stencil.
+    std::array<Ref<RenderPipelineBase>, 8> setStencilPipelines;
+    fragmentState.entryPoint = "frag_check_src_stencil";
+    for (uint32_t bit = 0; bit < 8; ++bit) {
+        dsState.stencilWriteMask = 1u << bit;
+        DAWN_TRY_ASSIGN(setStencilPipelines[bit],
+                        device->CreateRenderPipeline(&renderPipelineDesc));
+    }
+
+    InternalPipelineStore::BlitR8ToStencilPipelines pipelines{std::move(clearPipeline),
+                                                              std::move(setStencilPipelines)};
+    store->blitR8ToStencilPipelines[format] = pipelines;
+    return pipelines;
+}
+
+// Encodes one render pass per copied layer that blits the RG8Uint data in `dataTexture`
+// into the depth16unorm `dst`, reassembling each 16-bit depth value in the fragment shader.
+MaybeError BlitRG8ToDepth16Unorm(DeviceBase* device,
+                                 CommandEncoder* commandEncoder,
+                                 TextureBase* dataTexture,
+                                 const TextureCopy& dst,
+                                 const Extent3D& copyExtent) {
+    ASSERT(dst.texture->GetFormat().format == wgpu::TextureFormat::Depth16Unorm);
+    ASSERT(dataTexture->GetFormat().format == wgpu::TextureFormat::RG8Uint);
+
+    // Allow internal usages: the destination is used as a render attachment.
+    auto scope = commandEncoder->MakeInternalUsageScope();
+
+    Ref<RenderPipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(pipeline, GetOrCreateRG8ToDepth16UnormPipeline(device));
+    Ref<BindGroupLayoutBase> bgl;
+    DAWN_TRY_ASSIGN(bgl, pipeline->GetBindGroupLayout(0));
+
+    // Build the params buffer (copy dst origin) once; it is identical for every layer.
+    Ref<BufferBase> paramsBuffer;
+    {
+        BufferDescriptor bufferDesc = {};
+        bufferDesc.size = sizeof(uint32_t) * 2;
+        bufferDesc.usage = wgpu::BufferUsage::Uniform;
+        bufferDesc.mappedAtCreation = true;
+        DAWN_TRY_ASSIGN(paramsBuffer, device->CreateBuffer(&bufferDesc));
+
+        uint32_t* params = static_cast<uint32_t*>(paramsBuffer->GetMappedRange(0, bufferDesc.size));
+        params[0] = dst.origin.x;
+        params[1] = dst.origin.y;
+        paramsBuffer->Unmap();
+    }
+
+    for (uint32_t z = 0; z < copyExtent.depthOrArrayLayers; ++z) {
+        Ref<TextureViewBase> srcView;
+        {
+            TextureViewDescriptor viewDesc = {};
+            viewDesc.dimension = wgpu::TextureViewDimension::e2D;
+            viewDesc.baseArrayLayer = z;
+            viewDesc.arrayLayerCount = 1;
+            viewDesc.mipLevelCount = 1;
+            DAWN_TRY_ASSIGN(srcView, dataTexture->CreateView(&viewDesc));
+        }
+
+        Ref<TextureViewBase> dstView;
+        {
+            TextureViewDescriptor viewDesc = {};
+            viewDesc.dimension = wgpu::TextureViewDimension::e2D;
+            viewDesc.baseArrayLayer = dst.origin.z + z;
+            viewDesc.arrayLayerCount = 1;
+            viewDesc.baseMipLevel = dst.mipLevel;
+            viewDesc.mipLevelCount = 1;
+            DAWN_TRY_ASSIGN(dstView, dst.texture->CreateView(&viewDesc));
+        }
+
+        Ref<BindGroupBase> bindGroup;
+        {
+            std::array<BindGroupEntry, 2> bgEntries = {};
+            bgEntries[0].binding = 0;
+            bgEntries[0].textureView = srcView.Get();
+            bgEntries[1].binding = 1;
+            bgEntries[1].buffer = paramsBuffer.Get();
+
+            BindGroupDescriptor bgDesc = {};
+            bgDesc.layout = bgl.Get();
+            bgDesc.entryCount = bgEntries.size();
+            bgDesc.entries = bgEntries.data();
+            DAWN_TRY_ASSIGN(bindGroup,
+                            device->CreateBindGroup(&bgDesc, UsageValidationMode::Internal));
+        }
+
+        RenderPassDepthStencilAttachment dsAttachment;
+        dsAttachment.view = dstView.Get();
+        dsAttachment.depthLoadOp = wgpu::LoadOp::Load;
+        dsAttachment.depthStoreOp = wgpu::StoreOp::Store;
+
+        RenderPassDescriptor rpDesc = {};
+        rpDesc.depthStencilAttachment = &dsAttachment;
+
+        Ref<RenderPassEncoder> pass = AcquireRef(commandEncoder->APIBeginRenderPass(&rpDesc));
+        // Bind the resources.
+        pass->APISetBindGroup(0, bindGroup.Get());
+        // Discard all fragments outside the copy region.
+        pass->APISetScissorRect(dst.origin.x, dst.origin.y, copyExtent.width, copyExtent.height);
+
+        // Draw to perform the blit.
+        pass->APISetPipeline(pipeline.Get());
+        pass->APIDraw(3, 1, 0, 0);
+        pass->APIEnd();
+    }
+    return {};
+}
+
+MaybeError BlitR8ToStencil(DeviceBase* device,
+                           CommandEncoder* commandEncoder,
+                           TextureBase* dataTexture,
+                           const TextureCopy& dst,
+                           const Extent3D& copyExtent) {
+    const Format& format = dst.texture->GetFormat();
+    ASSERT(dst.aspect == Aspect::Stencil);
+
+    // Allow internal usages since we need to use the destination
+    // as a render attachment.
+    auto scope = commandEncoder->MakeInternalUsageScope();
+
+    // This bgl is the same for all the render pipelines.
+    Ref<BindGroupLayoutBase> bgl;
+    {
+        std::array<BindGroupLayoutEntry, 2> bglEntries = {};
+        // Binding 0: the r8uint texture.
+        bglEntries[0].binding = 0;
+        bglEntries[0].visibility = wgpu::ShaderStage::Fragment;
+        bglEntries[0].texture.sampleType = wgpu::TextureSampleType::Uint;
+        // Binding 1: the params buffer.
+        bglEntries[1].binding = 1;
+        bglEntries[1].visibility = wgpu::ShaderStage::Fragment;
+        bglEntries[1].buffer.type = wgpu::BufferBindingType::Uniform;
+        bglEntries[1].buffer.minBindingSize = 2 * sizeof(uint32_t);
+
+        BindGroupLayoutDescriptor bglDesc = {};
+        bglDesc.entryCount = bglEntries.size();
+        bglDesc.entries = bglEntries.data();
+
+        DAWN_TRY_ASSIGN(bgl, device->CreateBindGroupLayout(&bglDesc));
+    }
+
+    InternalPipelineStore::BlitR8ToStencilPipelines pipelines;
+    DAWN_TRY_ASSIGN(pipelines, GetOrCreateR8ToStencilPipelines(device, format.format, bgl.Get()));
+
+    // Build the params buffer, containing the copy dst origin.
+    Ref<BufferBase> paramsBuffer;
+    {
+        BufferDescriptor bufferDesc = {};
+        bufferDesc.size = sizeof(uint32_t) * 2;
+        bufferDesc.usage = wgpu::BufferUsage::Uniform;
+        bufferDesc.mappedAtCreation = true;
+        DAWN_TRY_ASSIGN(paramsBuffer, device->CreateBuffer(&bufferDesc));
+
+        uint32_t* params = static_cast<uint32_t*>(paramsBuffer->GetMappedRange(0, bufferDesc.size));
+        params[0] = dst.origin.x;
+        params[1] = dst.origin.y;
+        paramsBuffer->Unmap();
+    }
+
+    // For each layer, blit the stencil data.
+    for (uint32_t z = 0; z < copyExtent.depthOrArrayLayers; ++z) {
+        Ref<TextureViewBase> srcView;
+        {
+            TextureViewDescriptor viewDesc = {};
+            viewDesc.dimension = wgpu::TextureViewDimension::e2D;
+            viewDesc.baseArrayLayer = z;
+            viewDesc.arrayLayerCount = 1;
+            viewDesc.mipLevelCount = 1;
+            DAWN_TRY_ASSIGN(srcView, dataTexture->CreateView(&viewDesc));
+        }
+
+        Ref<TextureViewBase> dstView;
+        {
+            TextureViewDescriptor viewDesc = {};
+            viewDesc.dimension = wgpu::TextureViewDimension::e2D;
+            viewDesc.baseArrayLayer = dst.origin.z + z;
+            viewDesc.arrayLayerCount = 1;
+            viewDesc.baseMipLevel = dst.mipLevel;
+            viewDesc.mipLevelCount = 1;
+            DAWN_TRY_ASSIGN(dstView, dst.texture->CreateView(&viewDesc));
+        }
+
+        Ref<BindGroupBase> bindGroup;
+        {
+            std::array<BindGroupEntry, 2> bgEntries = {};
+            bgEntries[0].binding = 0;
+            bgEntries[0].textureView = srcView.Get();
+            bgEntries[1].binding = 1;
+            bgEntries[1].buffer = paramsBuffer.Get();
+
+            BindGroupDescriptor bgDesc = {};
+            bgDesc.layout = bgl.Get();
+            bgDesc.entryCount = bgEntries.size();
+            bgDesc.entries = bgEntries.data();
+            DAWN_TRY_ASSIGN(bindGroup,
+                            device->CreateBindGroup(&bgDesc, UsageValidationMode::Internal));
+        }
+
+        RenderPassDepthStencilAttachment dsAttachment;
+        dsAttachment.view = dstView.Get();
+        if (format.HasDepth()) {
+            dsAttachment.depthLoadOp = wgpu::LoadOp::Load;
+            dsAttachment.depthStoreOp = wgpu::StoreOp::Store;
+        }
+        dsAttachment.stencilLoadOp = wgpu::LoadOp::Load;
+        dsAttachment.stencilStoreOp = wgpu::StoreOp::Store;
+
+        RenderPassDescriptor rpDesc = {};
+        rpDesc.depthStencilAttachment = &dsAttachment;
+
+        Ref<RenderPassEncoder> pass = AcquireRef(commandEncoder->APIBeginRenderPass(&rpDesc));
+        // Bind the resources.
+        pass->APISetBindGroup(0, bindGroup.Get());
+        // Discard all fragments outside the copy region.
+        pass->APISetScissorRect(dst.origin.x, dst.origin.y, copyExtent.width, copyExtent.height);
+
+        // Clear the copy region to 0.
+        pass->APISetStencilReference(0);
+        pass->APISetPipeline(pipelines.clearPipeline.Get());
+        pass->APIDraw(3, 1, 0, 0);
+
+        // Perform 8 draws. Each will load the source stencil data, and will
+        // set the corresponding bit in the destination stencil attachment if the
+        // source also has that bit set, using stencil operation `Replace`.
+        // If the source bit is not set, the fragment will be discarded.
+        pass->APISetStencilReference(255);
+        for (uint32_t bit = 0; bit < 8; ++bit) {
+            pass->APISetPipeline(pipelines.setStencilPipelines[bit].Get());
+            // Draw one instance, and use the stencil value as firstInstance.
+            // This is a cheap way to get the stencil value into the shader
+            // since WebGPU doesn't have push constants.
+            pass->APIDraw(3, 1, 0, 1u << bit);
+        }
+        pass->APIEnd();
+    }
+    return {};
+}
+
+}  // anonymous namespace
+
+MaybeError BlitStagingBufferToDepth(DeviceBase* device,
+                                    BufferBase* buffer,
+                                    const TextureDataLayout& src,
+                                    const TextureCopy& dst,
+                                    const Extent3D& copyExtent) {
+    const Format& format = dst.texture->GetFormat();
+    ASSERT(format.format == wgpu::TextureFormat::Depth16Unorm);
+
+    TextureDescriptor dataTextureDesc = {};
+    dataTextureDesc.format = wgpu::TextureFormat::RG8Uint;
+    dataTextureDesc.usage = wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::TextureBinding;
+    dataTextureDesc.size = copyExtent;
+
+    Ref<TextureBase> dataTexture;
+    DAWN_TRY_ASSIGN(dataTexture, device->CreateTexture(&dataTextureDesc));
+    {
+        TextureCopy rg8Dst;
+        rg8Dst.texture = dataTexture.Get();
+        rg8Dst.mipLevel = 0;
+        rg8Dst.origin = {};
+        rg8Dst.aspect = Aspect::Color;
+        DAWN_TRY(device->CopyFromStagingToTexture(buffer, src, rg8Dst, copyExtent));
+    }
+
+    Ref<CommandEncoderBase> commandEncoder;
+    DAWN_TRY_ASSIGN(commandEncoder, device->CreateCommandEncoder());
+
+    DAWN_TRY(
+        BlitRG8ToDepth16Unorm(device, commandEncoder.Get(), dataTexture.Get(), dst, copyExtent));
+
+    Ref<CommandBufferBase> commandBuffer;
+    DAWN_TRY_ASSIGN(commandBuffer, commandEncoder->Finish());
+
+    CommandBufferBase* commands = commandBuffer.Get();
+    device->GetQueue()->APISubmit(1, &commands);
+    return {};
+}
+
+MaybeError BlitBufferToDepth(DeviceBase* device,
+                             CommandEncoder* commandEncoder,
+                             BufferBase* buffer,
+                             const TextureDataLayout& src,
+                             const TextureCopy& dst,
+                             const Extent3D& copyExtent) {
+    const Format& format = dst.texture->GetFormat();
+    ASSERT(format.format == wgpu::TextureFormat::Depth16Unorm);
+
+    TextureDescriptor dataTextureDesc = {};
+    dataTextureDesc.format = wgpu::TextureFormat::RG8Uint;
+    dataTextureDesc.usage = wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::TextureBinding;
+    dataTextureDesc.size = copyExtent;
+
+    Ref<TextureBase> dataTexture;
+    DAWN_TRY_ASSIGN(dataTexture, device->CreateTexture(&dataTextureDesc));
+    {
+        ImageCopyBuffer bufferSrc;
+        bufferSrc.buffer = buffer;
+        bufferSrc.layout = src;
+
+        ImageCopyTexture textureDst;
+        textureDst.texture = dataTexture.Get();
+        commandEncoder->APICopyBufferToTexture(&bufferSrc, &textureDst, &copyExtent);
+    }
+
+    DAWN_TRY(BlitRG8ToDepth16Unorm(device, commandEncoder, dataTexture.Get(), dst, copyExtent));
+    return {};
+}
+
+MaybeError BlitStagingBufferToStencil(DeviceBase* device,
+                                      BufferBase* buffer,
+                                      const TextureDataLayout& src,
+                                      const TextureCopy& dst,
+                                      const Extent3D& copyExtent) {
+    TextureDescriptor dataTextureDesc = {};
+    dataTextureDesc.format = wgpu::TextureFormat::R8Uint;
+    dataTextureDesc.usage = wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::TextureBinding;
+    dataTextureDesc.size = copyExtent;
+
+    Ref<TextureBase> dataTexture;
+    DAWN_TRY_ASSIGN(dataTexture, device->CreateTexture(&dataTextureDesc));
+    {
+        TextureCopy r8Dst;
+        r8Dst.texture = dataTexture.Get();
+        r8Dst.mipLevel = 0;
+        r8Dst.origin = {};
+        r8Dst.aspect = Aspect::Color;
+        DAWN_TRY(device->CopyFromStagingToTexture(buffer, src, r8Dst, copyExtent));
+    }
+
+    Ref<CommandEncoderBase> commandEncoder;
+    DAWN_TRY_ASSIGN(commandEncoder, device->CreateCommandEncoder());
+
+    DAWN_TRY(BlitR8ToStencil(device, commandEncoder.Get(), dataTexture.Get(), dst, copyExtent));
+
+    Ref<CommandBufferBase> commandBuffer;
+    DAWN_TRY_ASSIGN(commandBuffer, commandEncoder->Finish());
+
+    CommandBufferBase* commands = commandBuffer.Get();
+    device->GetQueue()->APISubmit(1, &commands);
+    return {};
+}
+
+MaybeError BlitBufferToStencil(DeviceBase* device,
+                               CommandEncoder* commandEncoder,
+                               BufferBase* buffer,
+                               const TextureDataLayout& src,
+                               const TextureCopy& dst,
+                               const Extent3D& copyExtent) {
+    TextureDescriptor dataTextureDesc = {};
+    dataTextureDesc.format = wgpu::TextureFormat::R8Uint;
+    dataTextureDesc.usage = wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::TextureBinding;
+    dataTextureDesc.size = copyExtent;
+
+    Ref<TextureBase> dataTexture;
+    DAWN_TRY_ASSIGN(dataTexture, device->CreateTexture(&dataTextureDesc));
+    {
+        ImageCopyBuffer bufferSrc;
+        bufferSrc.buffer = buffer;
+        bufferSrc.layout = src;
+
+        ImageCopyTexture textureDst;
+        textureDst.texture = dataTexture.Get();
+        commandEncoder->APICopyBufferToTexture(&bufferSrc, &textureDst, &copyExtent);
+    }
+
+    DAWN_TRY(BlitR8ToStencil(device, commandEncoder, dataTexture.Get(), dst, copyExtent));
+    return {};
+}
+
+}  // namespace dawn::native
diff --git a/src/dawn/native/BlitBufferToDepthStencil.h b/src/dawn/native/BlitBufferToDepthStencil.h
new file mode 100644
index 0000000..ada2d8b
--- /dev/null
+++ b/src/dawn/native/BlitBufferToDepthStencil.h
@@ -0,0 +1,76 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SRC_DAWN_NATIVE_BLITBUFFERTODEPTHSTENCIL_H_
+#define SRC_DAWN_NATIVE_BLITBUFFERTODEPTHSTENCIL_H_
+
+#include "dawn/native/Error.h"
+
+namespace dawn::native {
+
+struct TextureCopy;
+
+// BlitBufferToDepth works around issues where copying from a buffer
+// to depth does not work on some drivers.
+// Currently, only depth16unorm textures can be CopyDst, so only depth16unorm
+// is supported.
+// It does the following:
+//  - Copies buffer data to an rg8uint texture.
+//  - Sets the scissor rect to the copy rect.
+//  - Uploads the copy origin to a uniform buffer.
+//  - For each destination layer:
+//    - Performs a draw to sample the rg8uint data, computes the
+//      floating point depth value, and writes the frag depth.
+
+MaybeError BlitStagingBufferToDepth(DeviceBase* device,
+                                    BufferBase* buffer,
+                                    const TextureDataLayout& src,
+                                    const TextureCopy& dst,
+                                    const Extent3D& copyExtent);
+
+MaybeError BlitBufferToDepth(DeviceBase* device,
+                             CommandEncoder* commandEncoder,
+                             BufferBase* buffer,
+                             const TextureDataLayout& src,
+                             const TextureCopy& dst,
+                             const Extent3D& copyExtent);
+
+// BlitBufferToStencil works around issues where copying from a buffer
+// to stencil does not work on some drivers.
+// It does the following:
+//  - Copies buffer data to an r8uint texture.
+//  - Sets the scissor rect to the copy rect.
+//  - Uploads the copy origin to a uniform buffer.
+//  - For each destination layer:
+//    - Performs a draw to clear stencil to 0.
+//    - Performs 8 draws for each bit of stencil to set the respective
+//      stencil bit to 1, if the source r8 texture also has that bit set.
+//      If the source r8 texture does not, the fragment is discarded.
+
+MaybeError BlitStagingBufferToStencil(DeviceBase* device,
+                                      BufferBase* buffer,
+                                      const TextureDataLayout& src,
+                                      const TextureCopy& dst,
+                                      const Extent3D& copyExtent);
+
+MaybeError BlitBufferToStencil(DeviceBase* device,
+                               CommandEncoder* commandEncoder,
+                               BufferBase* buffer,
+                               const TextureDataLayout& src,
+                               const TextureCopy& dst,
+                               const Extent3D& copyExtent);
+
+}  // namespace dawn::native
+
+#endif  // SRC_DAWN_NATIVE_BLITBUFFERTODEPTHSTENCIL_H_
diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt
index 7f5eb2e..0628bfe 100644
--- a/src/dawn/native/CMakeLists.txt
+++ b/src/dawn/native/CMakeLists.txt
@@ -47,6 +47,8 @@
     "BindGroupTracker.h"
     "BindingInfo.cpp"
     "BindingInfo.h"
+    "BlitBufferToDepthStencil.cpp"
+    "BlitBufferToDepthStencil.h"
     "Blob.cpp"
     "Blob.h"
     "BlobCache.cpp"
diff --git a/src/dawn/native/CommandEncoder.cpp b/src/dawn/native/CommandEncoder.cpp
index df7db6d..d0a58de 100644
--- a/src/dawn/native/CommandEncoder.cpp
+++ b/src/dawn/native/CommandEncoder.cpp
@@ -22,6 +22,7 @@
 #include "dawn/common/Math.h"
 #include "dawn/native/ApplyClearColorValueWithDrawHelper.h"
 #include "dawn/native/BindGroup.h"
+#include "dawn/native/BlitBufferToDepthStencil.h"
 #include "dawn/native/Buffer.h"
 #include "dawn/native/ChainUtils_autogen.h"
 #include "dawn/native/CommandBuffer.h"
@@ -1218,17 +1219,35 @@
             TextureDataLayout srcLayout = source->layout;
             ApplyDefaultTextureDataLayoutOptions(&srcLayout, blockInfo, *copySize);
 
+            TextureCopy dst;
+            dst.texture = destination->texture;
+            dst.origin = destination->origin;
+            dst.mipLevel = destination->mipLevel;
+            dst.aspect = ConvertAspect(destination->texture->GetFormat(), destination->aspect);
+
+            if (dst.aspect == Aspect::Depth &&
+                GetDevice()->IsToggleEnabled(Toggle::UseBlitForBufferToDepthTextureCopy)) {
+                DAWN_TRY_CONTEXT(
+                    BlitBufferToDepth(GetDevice(), this, source->buffer, srcLayout, dst, *copySize),
+                    "copying from %s to depth aspect of %s using blit workaround.", source->buffer,
+                    dst.texture.Get());
+                return {};
+            } else if (dst.aspect == Aspect::Stencil &&
+                       GetDevice()->IsToggleEnabled(Toggle::UseBlitForBufferToStencilTextureCopy)) {
+                DAWN_TRY_CONTEXT(BlitBufferToStencil(GetDevice(), this, source->buffer, srcLayout,
+                                                     dst, *copySize),
+                                 "copying from %s to stencil aspect of %s using blit workaround.",
+                                 source->buffer, dst.texture.Get());
+                return {};
+            }
+
             CopyBufferToTextureCmd* copy =
                 allocator->Allocate<CopyBufferToTextureCmd>(Command::CopyBufferToTexture);
             copy->source.buffer = source->buffer;
             copy->source.offset = srcLayout.offset;
             copy->source.bytesPerRow = srcLayout.bytesPerRow;
             copy->source.rowsPerImage = srcLayout.rowsPerImage;
-            copy->destination.texture = destination->texture;
-            copy->destination.origin = destination->origin;
-            copy->destination.mipLevel = destination->mipLevel;
-            copy->destination.aspect =
-                ConvertAspect(destination->texture->GetFormat(), destination->aspect);
+            copy->destination = dst;
             copy->copySize = *copySize;
 
             return {};
@@ -1277,45 +1296,12 @@
             TextureDataLayout dstLayout = destination->layout;
             ApplyDefaultTextureDataLayoutOptions(&dstLayout, blockInfo, *copySize);
 
-            TextureCopy copySrc;
-            copySrc.texture = source->texture;
-            copySrc.origin = source->origin;
-            copySrc.mipLevel = source->mipLevel;
-            copySrc.aspect = ConvertAspect(source->texture->GetFormat(), source->aspect);
-
-            if (copySrc.aspect == Aspect::Stencil &&
-                GetDevice()->IsToggleEnabled(Toggle::UseTempTextureInStencilTextureToBufferCopy)) {
-                // Encode a copy to an intermediate texture.
-                TextureDescriptor desc = {};
-                desc.format = source->texture->GetFormat().format;
-                desc.usage = wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst;
-                desc.size = *copySize;
-
-                Ref<TextureBase> intermediateTexture;
-                DAWN_TRY_ASSIGN(intermediateTexture, GetDevice()->CreateTexture(&desc));
-
-                // Allocate the intermediate t2t command.
-                Aspect aspect =
-                    ConvertAspect(source->texture->GetFormat(), wgpu::TextureAspect::All);
-                CopyTextureToTextureCmd* t2t =
-                    allocator->Allocate<CopyTextureToTextureCmd>(Command::CopyTextureToTexture);
-                t2t->source = copySrc;
-                t2t->source.aspect = aspect;
-                t2t->destination.texture = intermediateTexture;
-                t2t->destination.origin = {};
-                t2t->destination.mipLevel = 0;
-                t2t->destination.aspect = aspect;
-                t2t->copySize = *copySize;
-
-                // Replace the `copySrc` with the intermediate texture.
-                copySrc.texture = intermediateTexture;
-                copySrc.mipLevel = 0;
-                copySrc.origin = {};
-            }
-
             CopyTextureToBufferCmd* t2b =
                 allocator->Allocate<CopyTextureToBufferCmd>(Command::CopyTextureToBuffer);
-            t2b->source = copySrc;
+            t2b->source.texture = source->texture;
+            t2b->source.origin = source->origin;
+            t2b->source.mipLevel = source->mipLevel;
+            t2b->source.aspect = ConvertAspect(source->texture->GetFormat(), source->aspect);
             t2b->destination.buffer = destination->buffer;
             t2b->destination.offset = dstLayout.offset;
             t2b->destination.bytesPerRow = dstLayout.bytesPerRow;
@@ -1657,4 +1643,17 @@
     return {};
 }
 
+CommandEncoder::InternalUsageScope CommandEncoder::MakeInternalUsageScope() {
+    return InternalUsageScope(this);
+}
+
+CommandEncoder::InternalUsageScope::InternalUsageScope(CommandEncoder* encoder)
+    : mEncoder(encoder), mUsageValidationMode(mEncoder->mUsageValidationMode) {
+    mEncoder->mUsageValidationMode = UsageValidationMode::Internal;
+}
+
+CommandEncoder::InternalUsageScope::~InternalUsageScope() {
+    mEncoder->mUsageValidationMode = mUsageValidationMode;
+}
+
 }  // namespace dawn::native
diff --git a/src/dawn/native/CommandEncoder.h b/src/dawn/native/CommandEncoder.h
index d1e20f3..3b8f5fd 100644
--- a/src/dawn/native/CommandEncoder.h
+++ b/src/dawn/native/CommandEncoder.h
@@ -96,6 +96,27 @@
     ResultOrError<Ref<CommandBufferBase>> Finish(
         const CommandBufferDescriptor* descriptor = nullptr);
 
+    // `InternalUsageScope` is a scoped class that temporarily changes validation such that the
+    // command encoder includes internal resource usages.
+    friend class InternalUsageScope;
+    class [[nodiscard]] InternalUsageScope : public NonMovable {
+      public:
+        ~InternalUsageScope();
+
+      private:
+        // Disable heap allocation
+        void* operator new(size_t) = delete;
+
+        // Only CommandEncoder can make this class.
+        friend class CommandEncoder;
+        InternalUsageScope(CommandEncoder* encoder);
+
+        CommandEncoder* mEncoder;
+        UsageValidationMode mUsageValidationMode;
+    };
+
+    InternalUsageScope MakeInternalUsageScope();
+
   private:
     CommandEncoder(DeviceBase* device, const CommandEncoderDescriptor* descriptor);
     CommandEncoder(DeviceBase* device, ObjectBase::ErrorTag tag);
diff --git a/src/dawn/native/Device.cpp b/src/dawn/native/Device.cpp
index e924d53..12bf8db 100644
--- a/src/dawn/native/Device.cpp
+++ b/src/dawn/native/Device.cpp
@@ -26,6 +26,7 @@
 #include "dawn/native/AttachmentState.h"
 #include "dawn/native/BindGroup.h"
 #include "dawn/native/BindGroupLayout.h"
+#include "dawn/native/BlitBufferToDepthStencil.h"
 #include "dawn/native/BlobCache.h"
 #include "dawn/native/Buffer.h"
 #include "dawn/native/ChainUtils_autogen.h"
@@ -1938,11 +1939,25 @@
     return {};
 }
 
-MaybeError DeviceBase::CopyFromStagingToTexture(const BufferBase* source,
+MaybeError DeviceBase::CopyFromStagingToTexture(BufferBase* source,
                                                 const TextureDataLayout& src,
-                                                TextureCopy* dst,
+                                                const TextureCopy& dst,
                                                 const Extent3D& copySizePixels) {
-    DAWN_TRY(CopyFromStagingToTextureImpl(source, src, dst, copySizePixels));
+    if (dst.aspect == Aspect::Depth &&
+        IsToggleEnabled(Toggle::UseBlitForBufferToDepthTextureCopy)) {
+        DAWN_TRY_CONTEXT(BlitStagingBufferToDepth(this, source, src, dst, copySizePixels),
+                         "copying from staging buffer to depth aspect of %s using blit workaround.",
+                         dst.texture.Get());
+    } else if (dst.aspect == Aspect::Stencil &&
+               IsToggleEnabled(Toggle::UseBlitForBufferToStencilTextureCopy)) {
+        DAWN_TRY_CONTEXT(
+            BlitStagingBufferToStencil(this, source, src, dst, copySizePixels),
+            "copying from staging buffer to stencil aspect of %s using blit workaround.",
+            dst.texture.Get());
+    } else {
+        DAWN_TRY(CopyFromStagingToTextureImpl(source, src, dst, copySizePixels));
+    }
+
     if (GetDynamicUploader()->ShouldFlush()) {
         ForceEventualFlushOfCommands();
     }
diff --git a/src/dawn/native/Device.h b/src/dawn/native/Device.h
index ac8fd44..d04ff48 100644
--- a/src/dawn/native/Device.h
+++ b/src/dawn/native/Device.h
@@ -302,9 +302,9 @@
                                        BufferBase* destination,
                                        uint64_t destinationOffset,
                                        uint64_t size);
-    MaybeError CopyFromStagingToTexture(const BufferBase* source,
+    MaybeError CopyFromStagingToTexture(BufferBase* source,
                                         const TextureDataLayout& src,
-                                        TextureCopy* dst,
+                                        const TextureCopy& dst,
                                         const Extent3D& copySizePixels);
 
     DynamicUploader* GetDynamicUploader() const;
@@ -532,7 +532,7 @@
                                                    uint64_t size) = 0;
     virtual MaybeError CopyFromStagingToTextureImpl(const BufferBase* source,
                                                     const TextureDataLayout& src,
-                                                    TextureCopy* dst,
+                                                    const TextureCopy& dst,
                                                     const Extent3D& copySizePixels) = 0;
 
     wgpu::ErrorCallback mUncapturedErrorCallback = nullptr;
diff --git a/src/dawn/native/InternalPipelineStore.h b/src/dawn/native/InternalPipelineStore.h
index 6234ec0..e2560f1 100644
--- a/src/dawn/native/InternalPipelineStore.h
+++ b/src/dawn/native/InternalPipelineStore.h
@@ -57,6 +57,14 @@
     Ref<ComputePipelineBase> renderValidationPipeline;
     Ref<ShaderModuleBase> renderValidationShader;
     Ref<ComputePipelineBase> dispatchIndirectValidationPipeline;
+
+    Ref<RenderPipelineBase> blitRG8ToDepth16UnormPipeline;
+
+    struct BlitR8ToStencilPipelines {
+        Ref<RenderPipelineBase> clearPipeline;
+        std::array<Ref<RenderPipelineBase>, 8> setStencilPipelines;
+    };
+    std::unordered_map<wgpu::TextureFormat, BlitR8ToStencilPipelines> blitR8ToStencilPipelines;
 };
 
 }  // namespace dawn::native
diff --git a/src/dawn/native/Queue.cpp b/src/dawn/native/Queue.cpp
index acce952..a8d92e3 100644
--- a/src/dawn/native/Queue.cpp
+++ b/src/dawn/native/Queue.cpp
@@ -381,8 +381,8 @@
 
     DeviceBase* device = GetDevice();
 
-    return device->CopyFromStagingToTexture(uploadHandle.stagingBuffer, passDataLayout,
-                                            &textureCopy, writeSizePixel);
+    return device->CopyFromStagingToTexture(uploadHandle.stagingBuffer, passDataLayout, textureCopy,
+                                            writeSizePixel);
 }
 
 void QueueBase::APICopyTextureForBrowser(const ImageCopyTexture* source,
diff --git a/src/dawn/native/Texture.cpp b/src/dawn/native/Texture.cpp
index ad57774..85b1889 100644
--- a/src/dawn/native/Texture.cpp
+++ b/src/dawn/native/Texture.cpp
@@ -562,6 +562,19 @@
     if (applyAlwaysResolveIntoZeroLevelAndLayerToggle) {
         AddInternalUsage(wgpu::TextureUsage::CopyDst);
     }
+
+    if (mFormat.HasStencil() && (mInternalUsage & wgpu::TextureUsage::CopyDst) &&
+        device->IsToggleEnabled(Toggle::UseBlitForBufferToStencilTextureCopy)) {
+        // Add render attachment usage so we can blit to the stencil texture
+        // in a render pass.
+        AddInternalUsage(wgpu::TextureUsage::RenderAttachment);
+    }
+    if (mFormat.HasDepth() && (mInternalUsage & wgpu::TextureUsage::CopyDst) &&
+        device->IsToggleEnabled(Toggle::UseBlitForBufferToDepthTextureCopy)) {
+        // Add render attachment usage so we can blit to the depth texture
+        // in a render pass.
+        AddInternalUsage(wgpu::TextureUsage::RenderAttachment);
+    }
 }
 
 TextureBase::~TextureBase() = default;
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index c39ce05..4b9d44e 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -342,11 +342,17 @@
       "for stencil8 formats if metal_use_combined_depth_stencil_format_for_stencil8 is also "
       "enabled.",
       "https://crbug.com/dawn/1389"}},
-    {Toggle::UseTempTextureInStencilTextureToBufferCopy,
-     {"use_temp_texture_in_stencil_texture_to_buffer_copy",
-      "Use an intermediate temporary texture when copying the stencil aspect of a texture to a "
-      "buffer. Works around an issue where stencil writes from a render pass are not reflected in "
-      "the destination buffer.",
+    {Toggle::UseBlitForBufferToDepthTextureCopy,
+     {"use_blit_for_buffer_to_depth_texture_copy",
+      "Use a blit instead of a copy command to copy buffer data to the depth aspect of a "
+      "texture. Works around an issue where depth writes by copy commands are not visible "
+      "to a render or compute pass.",
+      "https://crbug.com/dawn/1389"}},
+    {Toggle::UseBlitForBufferToStencilTextureCopy,
+     {"use_blit_for_buffer_to_stencil_texture_copy",
+      "Use a blit instead of a copy command to copy buffer data to the stencil aspect of a "
+      "texture. Works around an issue where stencil writes by copy commands are not visible "
+      "to a render or compute pass.",
       "https://crbug.com/dawn/1389"}},
     {Toggle::DisallowDeprecatedAPIs,
      {"disallow_deprecated_apis",
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index b785c5a..665c385 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -84,7 +84,8 @@
     D3D12Allocate2DTexturewithCopyDstAsCommittedResource,
     MetalUseCombinedDepthStencilFormatForStencil8,
     MetalUseBothDepthAndStencilAttachmentsForCombinedDepthStencilFormats,
-    UseTempTextureInStencilTextureToBufferCopy,
+    UseBlitForBufferToDepthTextureCopy,
+    UseBlitForBufferToStencilTextureCopy,
     DisallowDeprecatedAPIs,
 
     // Unresolved issues.
diff --git a/src/dawn/native/d3d12/DeviceD3D12.cpp b/src/dawn/native/d3d12/DeviceD3D12.cpp
index e2752b5..2f4e3eb 100644
--- a/src/dawn/native/d3d12/DeviceD3D12.cpp
+++ b/src/dawn/native/d3d12/DeviceD3D12.cpp
@@ -524,15 +524,15 @@
 
 MaybeError Device::CopyFromStagingToTextureImpl(const BufferBase* source,
                                                 const TextureDataLayout& src,
-                                                TextureCopy* dst,
+                                                const TextureCopy& dst,
                                                 const Extent3D& copySizePixels) {
     CommandRecordingContext* commandContext;
     DAWN_TRY_ASSIGN(commandContext, GetPendingCommandContext(Device::SubmitMode::Passive));
-    Texture* texture = ToBackend(dst->texture.Get());
+    Texture* texture = ToBackend(dst.texture.Get());
 
-    SubresourceRange range = GetSubresourcesAffectedByCopy(*dst, copySizePixels);
+    SubresourceRange range = GetSubresourcesAffectedByCopy(dst, copySizePixels);
 
-    if (IsCompleteSubresourceCopiedTo(texture, copySizePixels, dst->mipLevel)) {
+    if (IsCompleteSubresourceCopiedTo(texture, copySizePixels, dst.mipLevel)) {
         texture->SetIsSubresourceContentInitialized(true, range);
     } else {
         texture->EnsureSubresourceContentInitialized(commandContext, range);
@@ -540,10 +540,10 @@
 
     texture->TrackUsageAndTransitionNow(commandContext, wgpu::TextureUsage::CopyDst, range);
 
-    RecordBufferTextureCopyWithBufferHandle(
-        BufferTextureCopyDirection::B2T, commandContext->GetCommandList(),
-        ToBackend(source)->GetD3D12Resource(), src.offset, src.bytesPerRow, src.rowsPerImage, *dst,
-        copySizePixels);
+    RecordBufferTextureCopyWithBufferHandle(BufferTextureCopyDirection::B2T,
+                                            commandContext->GetCommandList(),
+                                            ToBackend(source)->GetD3D12Resource(), src.offset,
+                                            src.bytesPerRow, src.rowsPerImage, dst, copySizePixels);
 
     return {};
 }
diff --git a/src/dawn/native/d3d12/DeviceD3D12.h b/src/dawn/native/d3d12/DeviceD3D12.h
index f053ede..f39ed40 100644
--- a/src/dawn/native/d3d12/DeviceD3D12.h
+++ b/src/dawn/native/d3d12/DeviceD3D12.h
@@ -111,7 +111,7 @@
 
     MaybeError CopyFromStagingToTextureImpl(const BufferBase* source,
                                             const TextureDataLayout& src,
-                                            TextureCopy* dst,
+                                            const TextureCopy& dst,
                                             const Extent3D& copySizePixels) override;
 
     ResultOrError<ResourceHeapAllocation> AllocateMemory(
diff --git a/src/dawn/native/metal/DeviceMTL.h b/src/dawn/native/metal/DeviceMTL.h
index 2bfa3bb..2b94c3d 100644
--- a/src/dawn/native/metal/DeviceMTL.h
+++ b/src/dawn/native/metal/DeviceMTL.h
@@ -70,7 +70,7 @@
                                            uint64_t size) override;
     MaybeError CopyFromStagingToTextureImpl(const BufferBase* source,
                                             const TextureDataLayout& dataLayout,
-                                            TextureCopy* dst,
+                                            const TextureCopy& dst,
                                             const Extent3D& copySizePixels) override;
 
     uint32_t GetOptimalBytesPerRowAlignment() const override;
diff --git a/src/dawn/native/metal/DeviceMTL.mm b/src/dawn/native/metal/DeviceMTL.mm
index 77de03e..3bc776e 100644
--- a/src/dawn/native/metal/DeviceMTL.mm
+++ b/src/dawn/native/metal/DeviceMTL.mm
@@ -259,9 +259,10 @@
 
 #if DAWN_PLATFORM_IS(MACOS)
     if (gpu_info::IsIntel(vendorId)) {
-        SetToggle(Toggle::UseTempTextureInStencilTextureToBufferCopy, true);
         SetToggle(Toggle::MetalUseBothDepthAndStencilAttachmentsForCombinedDepthStencilFormats,
                   true);
+        SetToggle(Toggle::UseBlitForBufferToStencilTextureCopy, true);
+        SetToggle(Toggle::UseBlitForBufferToDepthTextureCopy, true);
 
         if ([NSProcessInfo.processInfo
                 isOperatingSystemAtLeastVersion:NSOperatingSystemVersion{12, 0, 0}]) {
@@ -496,17 +497,17 @@
 // sets the private storage mode by default for all textures except IOSurfaces on macOS.
 MaybeError Device::CopyFromStagingToTextureImpl(const BufferBase* source,
                                                 const TextureDataLayout& dataLayout,
-                                                TextureCopy* dst,
+                                                const TextureCopy& dst,
                                                 const Extent3D& copySizePixels) {
-    Texture* texture = ToBackend(dst->texture.Get());
+    Texture* texture = ToBackend(dst.texture.Get());
     texture->SynchronizeTextureBeforeUse(GetPendingCommandContext());
     EnsureDestinationTextureInitialized(GetPendingCommandContext(DeviceBase::SubmitMode::Passive),
-                                        texture, *dst, copySizePixels);
+                                        texture, dst, copySizePixels);
 
     RecordCopyBufferToTexture(GetPendingCommandContext(DeviceBase::SubmitMode::Passive),
                               ToBackend(source)->GetMTLBuffer(), source->GetSize(),
                               dataLayout.offset, dataLayout.bytesPerRow, dataLayout.rowsPerImage,
-                              texture, dst->mipLevel, dst->origin, dst->aspect, copySizePixels);
+                              texture, dst.mipLevel, dst.origin, dst.aspect, copySizePixels);
     return {};
 }
 
diff --git a/src/dawn/native/null/DeviceNull.cpp b/src/dawn/native/null/DeviceNull.cpp
index 0d191ab..7288de0 100644
--- a/src/dawn/native/null/DeviceNull.cpp
+++ b/src/dawn/native/null/DeviceNull.cpp
@@ -234,7 +234,7 @@
 
 MaybeError Device::CopyFromStagingToTextureImpl(const BufferBase* source,
                                                 const TextureDataLayout& src,
-                                                TextureCopy* dst,
+                                                const TextureCopy& dst,
                                                 const Extent3D& copySizePixels) {
     return {};
 }
diff --git a/src/dawn/native/null/DeviceNull.h b/src/dawn/native/null/DeviceNull.h
index 6d5fb15..2a8565a 100644
--- a/src/dawn/native/null/DeviceNull.h
+++ b/src/dawn/native/null/DeviceNull.h
@@ -111,7 +111,7 @@
                                            uint64_t size) override;
     MaybeError CopyFromStagingToTextureImpl(const BufferBase* source,
                                             const TextureDataLayout& src,
-                                            TextureCopy* dst,
+                                            const TextureCopy& dst,
                                             const Extent3D& copySizePixels) override;
 
     MaybeError IncrementMemoryUsage(uint64_t bytes);
diff --git a/src/dawn/native/opengl/DeviceGL.cpp b/src/dawn/native/opengl/DeviceGL.cpp
index fabfd84..c1cf4fb 100644
--- a/src/dawn/native/opengl/DeviceGL.cpp
+++ b/src/dawn/native/opengl/DeviceGL.cpp
@@ -423,7 +423,7 @@
 
 MaybeError Device::CopyFromStagingToTextureImpl(const BufferBase* source,
                                                 const TextureDataLayout& src,
-                                                TextureCopy* dst,
+                                                const TextureCopy& dst,
                                                 const Extent3D& copySizePixels) {
     return DAWN_UNIMPLEMENTED_ERROR("Device unable to copy from staging buffer to texture.");
 }
diff --git a/src/dawn/native/opengl/DeviceGL.h b/src/dawn/native/opengl/DeviceGL.h
index 207ffe6..8061db3 100644
--- a/src/dawn/native/opengl/DeviceGL.h
+++ b/src/dawn/native/opengl/DeviceGL.h
@@ -75,7 +75,7 @@
 
     MaybeError CopyFromStagingToTextureImpl(const BufferBase* source,
                                             const TextureDataLayout& src,
-                                            TextureCopy* dst,
+                                            const TextureCopy& dst,
                                             const Extent3D& copySizePixels) override;
 
     uint32_t GetOptimalBytesPerRowAlignment() const override;
diff --git a/src/dawn/native/vulkan/DeviceVk.cpp b/src/dawn/native/vulkan/DeviceVk.cpp
index 9375be3..120801e 100644
--- a/src/dawn/native/vulkan/DeviceVk.cpp
+++ b/src/dawn/native/vulkan/DeviceVk.cpp
@@ -864,7 +864,7 @@
 
 MaybeError Device::CopyFromStagingToTextureImpl(const BufferBase* source,
                                                 const TextureDataLayout& src,
-                                                TextureCopy* dst,
+                                                const TextureCopy& dst,
                                                 const Extent3D& copySizePixels) {
     // There is no need of a barrier to make host writes available and visible to the copy
     // operation for HOST_COHERENT memory. The Vulkan spec for vkQueueSubmit describes that it
@@ -873,22 +873,22 @@
     CommandRecordingContext* recordingContext =
         GetPendingRecordingContext(DeviceBase::SubmitMode::Passive);
 
-    VkBufferImageCopy region = ComputeBufferImageCopyRegion(src, *dst, copySizePixels);
+    VkBufferImageCopy region = ComputeBufferImageCopyRegion(src, dst, copySizePixels);
     VkImageSubresourceLayers subresource = region.imageSubresource;
 
-    SubresourceRange range = GetSubresourcesAffectedByCopy(*dst, copySizePixels);
+    SubresourceRange range = GetSubresourcesAffectedByCopy(dst, copySizePixels);
 
-    if (IsCompleteSubresourceCopiedTo(dst->texture.Get(), copySizePixels, subresource.mipLevel)) {
+    if (IsCompleteSubresourceCopiedTo(dst.texture.Get(), copySizePixels, subresource.mipLevel)) {
         // Since texture has been overwritten, it has been "initialized"
-        dst->texture->SetIsSubresourceContentInitialized(true, range);
+        dst.texture->SetIsSubresourceContentInitialized(true, range);
     } else {
-        ToBackend(dst->texture)->EnsureSubresourceContentInitialized(recordingContext, range);
+        ToBackend(dst.texture)->EnsureSubresourceContentInitialized(recordingContext, range);
     }
     // Insert pipeline barrier to ensure correct ordering with previous memory operations on the
     // texture.
-    ToBackend(dst->texture)
+    ToBackend(dst.texture)
         ->TransitionUsageNow(recordingContext, wgpu::TextureUsage::CopyDst, range);
-    VkImage dstImage = ToBackend(dst->texture)->GetHandle();
+    VkImage dstImage = ToBackend(dst.texture)->GetHandle();
 
     // Dawn guarantees dstImage be in the TRANSFER_DST_OPTIMAL layout after the
     // copy command.
diff --git a/src/dawn/native/vulkan/DeviceVk.h b/src/dawn/native/vulkan/DeviceVk.h
index e6011a6..c42375a 100644
--- a/src/dawn/native/vulkan/DeviceVk.h
+++ b/src/dawn/native/vulkan/DeviceVk.h
@@ -96,7 +96,7 @@
                                            uint64_t size) override;
     MaybeError CopyFromStagingToTextureImpl(const BufferBase* source,
                                             const TextureDataLayout& src,
-                                            TextureCopy* dst,
+                                            const TextureCopy& dst,
                                             const Extent3D& copySizePixels) override;
 
     // Return the fixed subgroup size to use for compute shaders on this device or 0 if none
diff --git a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
index cafb44d..3adb3c7 100644
--- a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
+++ b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
@@ -918,7 +918,8 @@
 
 DAWN_INSTANTIATE_TEST_P(DepthStencilCopyTests,
                         {D3D12Backend(), MetalBackend(),
-                         MetalBackend({"use_temp_texture_in_stencil_texture_to_buffer_copy"}),
+                         MetalBackend({"use_blit_for_buffer_to_depth_texture_copy",
+                                       "use_blit_for_buffer_to_stencil_texture_copy"}),
                          OpenGLBackend(), OpenGLESBackend(),
                          // Test with the vulkan_use_s8 toggle forced on and off.
                          VulkanBackend({"vulkan_use_s8"}, {}),
@@ -938,7 +939,9 @@
                         {D3D12Backend(),
                          D3D12Backend({"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_"
                                        "copy_with_non_zero_buffer_offset"}),
-                         MetalBackend(), OpenGLBackend(), OpenGLESBackend(), VulkanBackend()},
+                         MetalBackend(),
+                         MetalBackend({"use_blit_for_buffer_to_depth_texture_copy"}),
+                         OpenGLBackend(), OpenGLESBackend(), VulkanBackend()},
                         std::vector<wgpu::TextureFormat>(kValidDepthCopyFromBufferFormats.begin(),
                                                          kValidDepthCopyFromBufferFormats.end()));
 
@@ -948,10 +951,10 @@
      D3D12Backend({"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_"
                    "copy_with_non_zero_buffer_offset"}),
      MetalBackend(), MetalBackend({"metal_use_combined_depth_stencil_format_for_stencil8"}),
-     MetalBackend({"use_temp_texture_in_stencil_texture_to_buffer_copy"}),
      MetalBackend(
          {"metal_use_both_depth_and_stencil_attachments_for_combined_depth_stencil_formats"}),
-     OpenGLBackend(), OpenGLESBackend(),
+     MetalBackend({"use_blit_for_buffer_to_stencil_texture_copy"}), OpenGLBackend(),
+     OpenGLESBackend(),
      // Test with the vulkan_use_s8 toggle forced on and off.
      VulkanBackend({"vulkan_use_s8"}, {}), VulkanBackend({}, {"vulkan_use_s8"})},
     std::vector<wgpu::TextureFormat>(utils::kStencilFormats.begin(), utils::kStencilFormats.end()));
diff --git a/src/dawn/tests/end2end/QueueTests.cpp b/src/dawn/tests/end2end/QueueTests.cpp
index 3e843ce..23e6f40 100644
--- a/src/dawn/tests/end2end/QueueTests.cpp
+++ b/src/dawn/tests/end2end/QueueTests.cpp
@@ -775,6 +775,8 @@
                       D3D12Backend({"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_"
                                     "copy_with_non_zero_buffer_offset"}),
                       MetalBackend(),
+                      MetalBackend({"use_blit_for_buffer_to_depth_texture_copy",
+                                    "use_blit_for_buffer_to_stencil_texture_copy"}),
                       OpenGLBackend(),
                       OpenGLESBackend(),
                       VulkanBackend());
diff --git a/src/dawn/tests/end2end/RenderPassTests.cpp b/src/dawn/tests/end2end/RenderPassTests.cpp
index d6606ac..523ab6f 100644
--- a/src/dawn/tests/end2end/RenderPassTests.cpp
+++ b/src/dawn/tests/end2end/RenderPassTests.cpp
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "dawn/tests/DawnTest.h"
+#include <utility>
+#include <vector>
 
+#include "dawn/tests/DawnTest.h"
 #include "dawn/utils/ComboRenderPipelineDescriptor.h"
 #include "dawn/utils/WGPUHelpers.h"
 
@@ -163,8 +165,6 @@
     EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8::kRed, renderTarget, kRTSize - 1, 1);
 }
 
-class RenderPassTest_RegressionDawn1071 : public RenderPassTest {};
-
 DAWN_INSTANTIATE_TEST(RenderPassTest,
                       D3D12Backend(),
                       D3D12Backend({}, {"use_d3d12_render_pass"}),
@@ -175,6 +175,7 @@
 
 // Test that clearing the lower mips of an R8Unorm texture works. This is a regression test for
 // dawn:1071 where Intel Metal devices fail to do that correctly, requiring a workaround.
+class RenderPassTest_RegressionDawn1071 : public RenderPassTest {};
 TEST_P(RenderPassTest_RegressionDawn1071, ClearLowestMipOfR8Unorm) {
     const uint32_t kLastMipLevel = 2;
 
@@ -230,3 +231,137 @@
                       OpenGLBackend(),
                       OpenGLESBackend(),
                       VulkanBackend());
+
+// Test that clearing a depth16unorm texture with multiple subresources works. This is a regression
+// test for dawn:1389 where Intel Metal devices fail to do that correctly, requiring a workaround.
+class RenderPassTest_RegressionDawn1389 : public RenderPassTest {};
+TEST_P(RenderPassTest_RegressionDawn1389, ClearMultisubresourceAfterWriteDepth16Unorm) {
+    // TODO(crbug.com/dawn/1492): Support copying to Depth16Unorm on GL.
+    DAWN_SUPPRESS_TEST_IF(IsOpenGL() || IsOpenGLES());
+
+    // Test all combinations of multi-mip, multi-layer
+    for (uint32_t mipLevelCount : {1, 5}) {
+        for (uint32_t arrayLayerCount : {1, 7}) {
+            // Only clear some of the subresources.
+            const auto& clearedMips =
+                mipLevelCount == 1 ? std::vector<std::pair<uint32_t, uint32_t>>{{0, 1}}
+                                   : std::vector<std::pair<uint32_t, uint32_t>>{{0, 2}, {3, 4}};
+            const auto& clearedLayers =
+                arrayLayerCount == 1 ? std::vector<std::pair<uint32_t, uint32_t>>{{0, 1}}
+                                     : std::vector<std::pair<uint32_t, uint32_t>>{{2, 4}, {6, 7}};
+
+            // Compute the texture size.
+            uint32_t width = 1u << (mipLevelCount - 1);
+            uint32_t height = 1u << (mipLevelCount - 1);
+
+            // Create the texture.
+            wgpu::TextureDescriptor texDesc;
+            texDesc.format = wgpu::TextureFormat::Depth16Unorm;
+            texDesc.usage = wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc |
+                            wgpu::TextureUsage::CopyDst;
+            texDesc.size = {width, height, arrayLayerCount};
+            texDesc.mipLevelCount = mipLevelCount;
+            wgpu::Texture tex = device.CreateTexture(&texDesc);
+
+            // Initialize all subresources with WriteTexture.
+            for (uint32_t level = 0; level < mipLevelCount; ++level) {
+                for (uint32_t layer = 0; layer < arrayLayerCount; ++layer) {
+                    wgpu::ImageCopyTexture imageCopyTexture =
+                        utils::CreateImageCopyTexture(tex, level, {0, 0, layer});
+                    wgpu::Extent3D copySize = {width >> level, height >> level, 1};
+
+                    wgpu::TextureDataLayout textureDataLayout;
+                    textureDataLayout.offset = 0;
+                    textureDataLayout.bytesPerRow = copySize.width * sizeof(uint16_t);
+                    textureDataLayout.rowsPerImage = copySize.height;
+
+                    // Use a distinct value for each subresource.
+                    uint16_t value = level * 10 + layer;
+                    std::vector<uint16_t> data(copySize.width * copySize.height, value);
+                    queue.WriteTexture(&imageCopyTexture, data.data(),
+                                       data.size() * sizeof(uint16_t), &textureDataLayout,
+                                       &copySize);
+                }
+            }
+
+            // Prep a viewDesc for rendering to depth. The base layer and level
+            // will be set later.
+            wgpu::TextureViewDescriptor viewDesc = {};
+            viewDesc.mipLevelCount = 1u;
+            viewDesc.arrayLayerCount = 1u;
+
+            // Overwrite some subresources with a render pass
+            {
+                wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+                for (const auto& clearedMipRange : clearedMips) {
+                    for (const auto& clearedLayerRange : clearedLayers) {
+                        for (uint32_t level = clearedMipRange.first; level < clearedMipRange.second;
+                             ++level) {
+                            for (uint32_t layer = clearedLayerRange.first;
+                                 layer < clearedLayerRange.second; ++layer) {
+                                viewDesc.baseMipLevel = level;
+                                viewDesc.baseArrayLayer = layer;
+
+                                utils::ComboRenderPassDescriptor renderPass(
+                                    {}, tex.CreateView(&viewDesc));
+                                renderPass.UnsetDepthStencilLoadStoreOpsForFormat(texDesc.format);
+                                renderPass.cDepthStencilAttachmentInfo.depthClearValue = 0.8;
+                                renderPass.cDepthStencilAttachmentInfo.depthLoadOp =
+                                    wgpu::LoadOp::Clear;
+                                renderPass.cDepthStencilAttachmentInfo.depthStoreOp =
+                                    wgpu::StoreOp::Store;
+                                encoder.BeginRenderPass(&renderPass).End();
+                            }
+                        }
+                    }
+                }
+                wgpu::CommandBuffer commands = encoder.Finish();
+                queue.Submit(1, &commands);
+            }
+
+            // Iterate all subresources.
+            for (uint32_t level = 0; level < mipLevelCount; ++level) {
+                for (uint32_t layer = 0; layer < arrayLayerCount; ++layer) {
+                    bool cleared = false;
+                    for (const auto& clearedMipRange : clearedMips) {
+                        for (const auto& clearedLayerRange : clearedLayers) {
+                            if (level >= clearedMipRange.first && level < clearedMipRange.second &&
+                                layer >= clearedLayerRange.first &&
+                                layer < clearedLayerRange.second) {
+                                cleared = true;
+                            }
+                        }
+                    }
+                    uint32_t mipWidth = width >> level;
+                    uint32_t mipHeight = height >> level;
+                    if (cleared) {
+                        // Check the subresource is cleared as expected.
+                        std::vector<uint16_t> data(mipWidth * mipHeight, 0xCCCC);
+                        EXPECT_TEXTURE_EQ(data.data(), tex, {0, 0, layer}, {mipWidth, mipHeight},
+                                          level)
+                            << "cleared texture data should have been 0xCCCC at:"
+                            << "\nlayer: " << layer << "\nlevel: " << level;
+                    } else {
+                        // Otherwise, check the other subresources have the original contents.
+                        // Without the workaround, they are 0.
+                        uint16_t value =
+                            level * 10 + layer;  // Compute the expected value for the subresource.
+                        std::vector<uint16_t> data(mipWidth * mipHeight, value);
+                        EXPECT_TEXTURE_EQ(data.data(), tex, {0, 0, layer}, {mipWidth, mipHeight},
+                                          level)
+                            << "written texture data should still be " << value << " at:"
+                            << "\nlayer: " << layer << "\nlevel: " << level;
+                    }
+                }
+            }
+        }
+    }
+}
+
+DAWN_INSTANTIATE_TEST(RenderPassTest_RegressionDawn1389,
+                      D3D12Backend(),
+                      MetalBackend(),
+                      MetalBackend({"use_blit_for_buffer_to_depth_texture_copy"}),
+                      OpenGLBackend(),
+                      OpenGLESBackend(),
+                      VulkanBackend());
diff --git a/src/dawn/tests/end2end/TextureZeroInitTests.cpp b/src/dawn/tests/end2end/TextureZeroInitTests.cpp
index f883ef2..0d33cef 100644
--- a/src/dawn/tests/end2end/TextureZeroInitTests.cpp
+++ b/src/dawn/tests/end2end/TextureZeroInitTests.cpp
@@ -2319,7 +2319,8 @@
                       D3D12Backend({"nonzero_clear_resources_on_creation_for_testing"}),
                       MetalBackend({"nonzero_clear_resources_on_creation_for_testing"}),
                       MetalBackend({"nonzero_clear_resources_on_creation_for_testing",
-                                    "use_temp_texture_in_stencil_texture_to_buffer_copy"}),
+                                    "use_blit_for_buffer_to_depth_texture_copy",
+                                    "use_blit_for_buffer_to_stencil_texture_copy"}),
                       OpenGLBackend({"nonzero_clear_resources_on_creation_for_testing"}),
                       OpenGLESBackend({"nonzero_clear_resources_on_creation_for_testing"}),
                       VulkanBackend({"nonzero_clear_resources_on_creation_for_testing"}));
diff --git a/src/dawn/tests/unittests/native/mocks/DeviceMock.h b/src/dawn/tests/unittests/native/mocks/DeviceMock.h
index 92c4883..4cabb78 100644
--- a/src/dawn/tests/unittests/native/mocks/DeviceMock.h
+++ b/src/dawn/tests/unittests/native/mocks/DeviceMock.h
@@ -40,7 +40,7 @@
                 (override));
     MOCK_METHOD(MaybeError,
                 CopyFromStagingToTextureImpl,
-                (const BufferBase*, const TextureDataLayout&, TextureCopy*, const Extent3D&),
+                (const BufferBase*, const TextureDataLayout&, const TextureCopy&, const Extent3D&),
                 (override));
 
     MOCK_METHOD(uint32_t, GetOptimalBytesPerRowAlignment, (), (const, override));
diff --git a/webgpu-cts/expectations.txt b/webgpu-cts/expectations.txt
index e61ae06..37f4252 100644
--- a/webgpu-cts/expectations.txt
+++ b/webgpu-cts/expectations.txt
@@ -315,14 +315,6 @@
 crbug.com/dawn/1500 [ intel-gen-9 monterey ] webgpu:api,operation,command_buffer,image_copy:origins_and_extents:initMethod="WriteTexture";checkMethod="PartialCopyT2B";* [ RetryOnFailure ]
 
 ################################################################################
-# webgpu:api,operation,resource_init,texture_zero:uninitialized_texture_is_zero
-# Failures on Mac Intel, likely also due to crbug.com/dawn/1083
-################################################################################
-crbug.com/dawn/1389 [ monterey ] webgpu:api,operation,resource_init,texture_zero:uninitialized_texture_is_zero:dimension="2d";readMethod="CopyToBuffer";format="depth16unorm" [ Failure ]
-crbug.com/dawn/1389 [ monterey ] webgpu:api,operation,resource_init,texture_zero:uninitialized_texture_is_zero:dimension="2d";readMethod="CopyToTexture";format="depth16unorm" [ Failure ]
-crbug.com/dawn/1389 [ monterey ] webgpu:api,operation,resource_init,texture_zero:uninitialized_texture_is_zero:dimension="2d";readMethod="DepthTest";format="depth16unorm" [ Failure ]
-
-################################################################################
 # copyToTexture,canvas:color_space_conversion:* fail with swiftshader
 # The other tests about canvas and image bitmap fail with swiftshader on Linux
 # KEEP