D3D12: Workaround buffer-texture copy with depth-stencil on some platforms

On the D3D12 platforms that don't support programmable sample positions,
the source box specifying a portion of the depth texture must all be 0, or
an error and a device lost will occur. This patch adds a workaround for this
issue by splitting the original buffer-texture copy into two copies:
1. copy from the source resource into a temporary buffer at offset 0
2. copy from the temporary buffer at offset 0 into the destination resource.

In the next patch we will fix the corresponding issue in Queue.WriteTexture.

Note that on newer version of D3D12 the restrictions about
D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT and D3D12_TEXTURE_DATA_PITCH_ALIGNMENT
have all been lifted out, so the workaround added in this patch will also be
disabled on the platforms that don't support programmable sample positions
but the restrictions about D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT and
D3D12_TEXTURE_DATA_PITCH_ALIGNMENT are no longer available.

Bug: dawn:727
Test: dawn_end2end_test
Change-Id: I9f1d848a0eeac5bd52c9219af6992a2821307746
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/97182
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Austin Eng <enga@chromium.org>
Kokoro: Kokoro <noreply+kokoro@google.com>
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index 5c90de2..3c1ae30 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -182,9 +182,8 @@
       "Split texture-to-texture copy into two copies: copy from source texture into a temporary "
       "buffer, and copy from the temporary buffer into the destination texture under specific "
       "situations. This workaround is by default enabled on some Intel GPUs which have a driver "
-      "bug "
-      "in the execution of CopyTextureRegion() when we copy with the formats whose texel block "
-      "sizes are less than 4 bytes from a greater mip level to a smaller mip level on D3D12 "
+      "bug in the execution of CopyTextureRegion() when we copy with the formats whose texel "
+      "block sizes are less than 4 bytes from a greater mip level to a smaller mip level on D3D12 "
       "backends.",
       "https://crbug.com/1161355"}},
     {Toggle::EmitHLSLDebugSymbols,
@@ -290,6 +289,12 @@
       "issue. We can workaround this security issue via allocating extra memory and limiting its "
       "access in itself.",
       "https://crbug.com/dawn/949"}},
+    {Toggle::D3D12UseTempBufferInDepthStencilTextureAndBufferCopyWithNonZeroBufferOffset,
+     {"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_copy_with_non_zero_buffer_offset",
+      "Split buffer-texture copy into two copies: do first copy with a temporary buffer at offset "
+      "0, then copy from the temporary buffer to the destination. Now this toggle must be enabled "
+      "on the D3D12 platforms where programmable MSAA is not supported.",
+      "https://crbug.com/dawn/727"}},
     // Comment to separate the }} so it is clearer what to copy-paste to add a toggle.
 }};
 }  // anonymous namespace
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index ceb8bf7..3ea47dc 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -75,6 +75,7 @@
     D3D12DontSetClearValueOnDepthTextureCreation,
     D3D12AlwaysUseTypelessFormatsForCastableTexture,
     D3D12AllocateExtraMemoryFor2DArrayTexture,
+    D3D12UseTempBufferInDepthStencilTextureAndBufferCopyWithNonZeroBufferOffset,
 
     EnumCount,
     InvalidEnum = EnumCount,
diff --git a/src/dawn/native/d3d12/CommandBufferD3D12.cpp b/src/dawn/native/d3d12/CommandBufferD3D12.cpp
index 08e2bcb..08aad34 100644
--- a/src/dawn/native/d3d12/CommandBufferD3D12.cpp
+++ b/src/dawn/native/d3d12/CommandBufferD3D12.cpp
@@ -225,6 +225,87 @@
     return {};
 }
 
+bool ShouldCopyUsingTemporaryBuffer(DeviceBase* device,
+                                    const BufferCopy& bufferCopy,
+                                    const TextureCopy& textureCopy) {
+    // Currently we only need the workaround for some D3D12 platforms.
+    if (device->IsToggleEnabled(
+            Toggle::D3D12UseTempBufferInDepthStencilTextureAndBufferCopyWithNonZeroBufferOffset)) {
+        if ((ToBackend(textureCopy.texture)->GetD3D12ResourceFlags() &
+             D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL) &&
+            bufferCopy.offset % D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT > 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+MaybeError RecordBufferTextureCopyWithTemporaryBuffer(CommandRecordingContext* recordingContext,
+                                                      BufferTextureCopyDirection copyDirection,
+                                                      const BufferCopy& bufferCopy,
+                                                      const TextureCopy& textureCopy,
+                                                      const Extent3D& copySize) {
+    dawn::native::Format format = textureCopy.texture->GetFormat();
+    const TexelBlockInfo& blockInfo = format.GetAspectInfo(textureCopy.aspect).block;
+
+    // Create tempBuffer
+    // The size of temporary buffer isn't needed to be a multiple of 4 because we don't
+    // need to set mappedAtCreation to be true.
+    auto tempBufferSize = ComputeRequiredBytesInCopy(blockInfo, copySize, bufferCopy.bytesPerRow,
+                                                     bufferCopy.rowsPerImage);
+
+    BufferDescriptor tempBufferDescriptor;
+    tempBufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    tempBufferDescriptor.size = tempBufferSize.AcquireSuccess();
+    Device* device = ToBackend(textureCopy.texture->GetDevice());
+    Ref<BufferBase> tempBufferBase;
+    DAWN_TRY_ASSIGN(tempBufferBase, device->CreateBuffer(&tempBufferDescriptor));
+    // D3D12 aligns the entire buffer to at least 64KB, so the virtual address of tempBuffer will
+    // always be aligned to D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT (512).
+    Ref<Buffer> tempBuffer = ToBackend(std::move(tempBufferBase));
+    ASSERT(tempBuffer->GetVA() % D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT == 0);
+
+    BufferCopy tempBufferCopy;
+    tempBufferCopy.buffer = tempBuffer;
+    tempBufferCopy.offset = 0;
+    tempBufferCopy.bytesPerRow = bufferCopy.bytesPerRow;
+    tempBufferCopy.rowsPerImage = bufferCopy.rowsPerImage;
+
+    tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopyDst);
+
+    ID3D12GraphicsCommandList* commandList = recordingContext->GetCommandList();
+    switch (copyDirection) {
+        case BufferTextureCopyDirection::B2T: {
+            commandList->CopyBufferRegion(tempBuffer->GetD3D12Resource(), 0,
+                                          ToBackend(bufferCopy.buffer)->GetD3D12Resource(),
+                                          bufferCopy.offset, tempBufferDescriptor.size);
+            tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopySrc);
+            RecordBufferTextureCopy(BufferTextureCopyDirection::B2T,
+                                    recordingContext->GetCommandList(), tempBufferCopy, textureCopy,
+                                    copySize);
+            break;
+        }
+        case BufferTextureCopyDirection::T2B: {
+            RecordBufferTextureCopy(BufferTextureCopyDirection::T2B,
+                                    recordingContext->GetCommandList(), tempBufferCopy, textureCopy,
+                                    copySize);
+            tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopySrc);
+            commandList->CopyBufferRegion(ToBackend(bufferCopy.buffer)->GetD3D12Resource(),
+                                          bufferCopy.offset, tempBuffer->GetD3D12Resource(), 0,
+                                          tempBufferDescriptor.size);
+            break;
+        }
+        default:
+            UNREACHABLE();
+            break;
+    }
+
+    // Save tempBuffer into recordingContext
+    recordingContext->AddToTempBuffers(std::move(tempBuffer));
+
+    return {};
+}
+
 void RecordNumWorkgroupsForDispatch(ID3D12GraphicsCommandList* commandList,
                                     ComputePipeline* pipeline,
                                     DispatchCmd* dispatch) {
@@ -733,6 +814,12 @@
                 texture->TrackUsageAndTransitionNow(commandContext, wgpu::TextureUsage::CopyDst,
                                                     subresources);
 
+                if (ShouldCopyUsingTemporaryBuffer(GetDevice(), copy->source, copy->destination)) {
+                    DAWN_TRY(RecordBufferTextureCopyWithTemporaryBuffer(
+                        commandContext, BufferTextureCopyDirection::B2T, copy->source,
+                        copy->destination, copy->copySize));
+                    break;
+                }
                 RecordBufferTextureCopy(BufferTextureCopyDirection::B2T, commandList, copy->source,
                                         copy->destination, copy->copySize);
 
@@ -760,6 +847,12 @@
                                                     subresources);
                 buffer->TrackUsageAndTransitionNow(commandContext, wgpu::BufferUsage::CopyDst);
 
+                if (ShouldCopyUsingTemporaryBuffer(GetDevice(), copy->destination, copy->source)) {
+                    DAWN_TRY(RecordBufferTextureCopyWithTemporaryBuffer(
+                        commandContext, BufferTextureCopyDirection::T2B, copy->destination,
+                        copy->source, copy->copySize));
+                    break;
+                }
                 RecordBufferTextureCopy(BufferTextureCopyDirection::T2B, commandList,
                                         copy->destination, copy->source, copy->copySize);
 
diff --git a/src/dawn/native/d3d12/D3D12Info.cpp b/src/dawn/native/d3d12/D3D12Info.cpp
index 6bbb865..3d3470c 100644
--- a/src/dawn/native/d3d12/D3D12Info.cpp
+++ b/src/dawn/native/d3d12/D3D12Info.cpp
@@ -44,6 +44,12 @@
                           "ID3D12Device::CheckFeatureSupport"));
     info.resourceHeapTier = featureOptions.ResourceHeapTier;
 
+    D3D12_FEATURE_DATA_D3D12_OPTIONS2 featureOptions2 = {};
+    if (SUCCEEDED(adapter.GetDevice()->CheckFeatureSupport(
+            D3D12_FEATURE_D3D12_OPTIONS2, &featureOptions2, sizeof(featureOptions2)))) {
+        info.programmableSamplePositionsTier = featureOptions2.ProgrammableSamplePositionsTier;
+    }
+
     D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureOptions3 = {};
     if (SUCCEEDED(adapter.GetDevice()->CheckFeatureSupport(
             D3D12_FEATURE_D3D12_OPTIONS3, &featureOptions3, sizeof(featureOptions3)))) {
diff --git a/src/dawn/native/d3d12/D3D12Info.h b/src/dawn/native/d3d12/D3D12Info.h
index d985f55..f81e28d 100644
--- a/src/dawn/native/d3d12/D3D12Info.h
+++ b/src/dawn/native/d3d12/D3D12Info.h
@@ -35,6 +35,7 @@
     bool supportsSharedResourceCapabilityTier1;
     bool supportsDP4a;
     bool supportsCastingFullyTypedFormat;
+    uint32_t programmableSamplePositionsTier;
 };
 
 ResultOrError<D3D12DeviceInfo> GatherDeviceInfo(const Adapter& adapter);
diff --git a/src/dawn/native/d3d12/DeviceD3D12.cpp b/src/dawn/native/d3d12/DeviceD3D12.cpp
index 498fef2..5663ca2 100644
--- a/src/dawn/native/d3d12/DeviceD3D12.cpp
+++ b/src/dawn/native/d3d12/DeviceD3D12.cpp
@@ -640,6 +640,12 @@
     SetToggle(Toggle::D3D12AlwaysUseTypelessFormatsForCastableTexture,
               !GetDeviceInfo().supportsCastingFullyTypedFormat);
 
+    // The restriction on the source box specifying a portion of the depth stencil texture in
+    // CopyTextureRegion() is only available on the D3D12 platforms which doesn't support
+    // programmable sample positions.
+    SetToggle(Toggle::D3D12UseTempBufferInDepthStencilTextureAndBufferCopyWithNonZeroBufferOffset,
+              GetDeviceInfo().programmableSamplePositionsTier == 0);
+
     // Disable optimizations when using FXC
     // See https://crbug.com/dawn/1203
     SetToggle(Toggle::FxcOptimizations, false);
@@ -673,6 +679,8 @@
     // Currently this workaround is needed on any D3D12 backend for some particular situations.
     // But we may need to limit it if D3D12 runtime fixes the bug on its new release. See
     // https://crbug.com/dawn/1289 for more information.
+    // TODO(dawn:1289): Unset this toggle when we skip the split on the buffer-texture copy
+    // on the platforms where UnrestrictedBufferTextureCopyPitchSupported is true.
     SetToggle(Toggle::D3D12SplitBufferTextureCopyForRowsPerImagePaddings, true);
 
     // This workaround is only needed on Intel Gen12LP with driver prior to 30.0.101.1960.
diff --git a/src/dawn/native/d3d12/TextureD3D12.cpp b/src/dawn/native/d3d12/TextureD3D12.cpp
index 5f64920..41bc0a0 100644
--- a/src/dawn/native/d3d12/TextureD3D12.cpp
+++ b/src/dawn/native/d3d12/TextureD3D12.cpp
@@ -706,6 +706,10 @@
     return mResourceAllocation.GetD3D12Resource();
 }
 
+D3D12_RESOURCE_FLAGS Texture::GetD3D12ResourceFlags() const {
+    return mD3D12ResourceFlags;
+}
+
 DXGI_FORMAT Texture::GetD3D12CopyableSubresourceFormat(Aspect aspect) const {
     ASSERT(GetFormat().aspects & aspect);
 
diff --git a/src/dawn/native/d3d12/TextureD3D12.h b/src/dawn/native/d3d12/TextureD3D12.h
index c67aff2..3e49bc2 100644
--- a/src/dawn/native/d3d12/TextureD3D12.h
+++ b/src/dawn/native/d3d12/TextureD3D12.h
@@ -58,6 +58,7 @@
     DXGI_FORMAT GetD3D12Format() const;
     ID3D12Resource* GetD3D12Resource() const;
     DXGI_FORMAT GetD3D12CopyableSubresourceFormat(Aspect aspect) const;
+    D3D12_RESOURCE_FLAGS GetD3D12ResourceFlags() const;
 
     D3D12_RENDER_TARGET_VIEW_DESC GetRTVDescriptor(const Format& format,
                                                    uint32_t mipLevel,
diff --git a/src/dawn/tests/end2end/CopyTests.cpp b/src/dawn/tests/end2end/CopyTests.cpp
index 8b89e37..ed7fb59 100644
--- a/src/dawn/tests/end2end/CopyTests.cpp
+++ b/src/dawn/tests/end2end/CopyTests.cpp
@@ -1028,9 +1028,6 @@
     // depth.
     DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("disable_depth_read"));
 
-    // TODO(crbug.com/dawn/727): currently this test fails on many D3D12 drivers.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12());
-
     constexpr wgpu::TextureFormat kFormat = wgpu::TextureFormat::Depth32Float;
     constexpr uint32_t kPixelsPerRow = 4u;
 
diff --git a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
index d6431c8..1bcd451 100644
--- a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
+++ b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
@@ -543,10 +543,6 @@
 
 // Test copying the depth-only aspect into a buffer at a non-zero offset.
 TEST_P(DepthCopyTests, FromDepthAspectToBufferAtNonZeroOffset) {
-    // TODO(crbug.com/dawn/727): currently this test fails on many D3D12 drivers as there is a bug
-    // in the implementation of texture-to-buffer copies with depth stencil textures on D3D12.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12());
-
     constexpr float kInitDepth = 0.2f;
     constexpr uint32_t kWidth = 4;
     constexpr uint32_t kHeight = 4;
@@ -661,10 +657,6 @@
 // Test copying the depth-only aspect from a buffer at a non-zero offset.
 TEST_P(DepthCopyFromBufferTests, BufferToRenderableDepthAspectAtNonZeroOffset) {
     constexpr std::array<uint32_t, 2> kBufferCopyOffsets = {8, 512};
-
-    // TODO(crbug.com/dawn/727): currently this test fails on many D3D12 drivers as there is a bug
-    // in the implementation of texture-to-buffer copies with depth stencil textures on D3D12.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12());
     constexpr bool kIsRenderable = true;
     for (uint32_t offset : kBufferCopyOffsets) {
         DoTest(offset, kIsRenderable);
@@ -875,10 +867,6 @@
 
 // Test copying the stencil-only aspect into a buffer at a non-zero offset
 TEST_P(StencilCopyTests, FromStencilAspectAtNonZeroOffset) {
-    // TODO(crbug.com/dawn/727): currently this test fails on many D3D12 drivers as there is a bug
-    // in the implementation of texture-to-buffer copies with depth stencil textures on D3D12.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12());
-
     constexpr uint32_t kWidth = 4;
     constexpr uint32_t kHeight = 4;
     constexpr uint32_t kTestLevel = 0;
@@ -905,10 +893,6 @@
 
 // Test copying to the stencil-aspect of a texture at non-zero offset
 TEST_P(StencilCopyTests, ToStencilAspectAtNonZeroOffset) {
-    // TODO(crbug.com/dawn/727): currently this test fails on many D3D12 drivers as there is a bug
-    // in the implementation of texture-to-buffer copies with depth stencil textures on D3D12.
-    DAWN_SUPPRESS_TEST_IF(IsD3D12());
-
     constexpr std::array<uint32_t, 2> kBufferCopyOffsets = {8, 512};
     for (uint32_t offset : kBufferCopyOffsets) {
         DoCopyToStencilTest(offset);
@@ -924,19 +908,26 @@
                                                          utils::kDepthAndStencilFormats.end()));
 
 DAWN_INSTANTIATE_TEST_P(DepthCopyTests,
-                        {D3D12Backend(), MetalBackend(), OpenGLBackend(), OpenGLESBackend(),
-                         VulkanBackend()},
+                        {D3D12Backend(),
+                         D3D12Backend({"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_"
+                                       "copy_with_non_zero_buffer_offset"}),
+                         MetalBackend(), OpenGLBackend(), OpenGLESBackend(), VulkanBackend()},
                         std::vector<wgpu::TextureFormat>(kValidDepthCopyTextureFormats.begin(),
                                                          kValidDepthCopyTextureFormats.end()));
 
 DAWN_INSTANTIATE_TEST_P(DepthCopyFromBufferTests,
-                        {D3D12Backend(), MetalBackend(), OpenGLBackend(), OpenGLESBackend(),
-                         VulkanBackend()},
+                        {D3D12Backend(),
+                         D3D12Backend({"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_"
+                                       "copy_with_non_zero_buffer_offset"}),
+                         MetalBackend(), OpenGLBackend(), OpenGLESBackend(), VulkanBackend()},
                         std::vector<wgpu::TextureFormat>(kValidDepthCopyFromBufferFormats.begin(),
                                                          kValidDepthCopyFromBufferFormats.end()));
 
 DAWN_INSTANTIATE_TEST_P(StencilCopyTests,
-                        {D3D12Backend(), MetalBackend(), OpenGLBackend(), OpenGLESBackend(),
+                        {D3D12Backend(),
+                         D3D12Backend({"d3d12_use_temp_buffer_in_depth_stencil_texture_and_buffer_"
+                                       "copy_with_non_zero_buffer_offset"}),
+                         MetalBackend(), OpenGLBackend(), OpenGLESBackend(),
                          // Test with the vulkan_use_s8 toggle forced on and off.
                          VulkanBackend({"vulkan_use_s8"}, {}),
                          VulkanBackend({}, {"vulkan_use_s8"})},