Align offset to 4 bytes in writeTexture on depth stencil textures

This patch fixes a bug in the allocation of the internal staging buffer
for Queue::WriteTexture(): the buffer offset must be a multiple of
4 bytes when calling Queue::WriteTexture() on depth-stencil textures,
as required by the Vulkan spec.

BUG=dawn:1213
TEST=dawn_end2end_tests

Change-Id: Ia2d073ef12d48baff42fca97005c1185c9560f1c
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/71605
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
diff --git a/src/dawn_native/Queue.cpp b/src/dawn_native/Queue.cpp
index 44adbcf..0fb7ab2 100644
--- a/src/dawn_native/Queue.cpp
+++ b/src/dawn_native/Queue.cpp
@@ -80,6 +80,7 @@
             uint32_t optimallyAlignedBytesPerRow,
             uint32_t alignedRowsPerImage,
             const TextureDataLayout& dataLayout,
+            bool hasDepthOrStencil,
             const TexelBlockInfo& blockInfo,
             const Extent3D& writeSizePixel) {
             uint64_t newDataSizeBytes;
@@ -97,6 +98,13 @@
             uint64_t offsetAlignment =
                 std::max(optimalOffsetAlignment, uint64_t(blockInfo.byteSize));
 
+            // For depth-stencil textures, the buffer offset must be a multiple of 4, as required
+            // by the WebGPU and Vulkan specs.
+            if (hasDepthOrStencil) {
+                constexpr uint64_t kOffsetAlignmentForDepthStencil = 4;
+                offsetAlignment = std::max(offsetAlignment, kOffsetAlignmentForDepthStencil);
+            }
+
             UploadHandle uploadHandle;
             DAWN_TRY_ASSIGN(uploadHandle, device->GetDynamicUploader()->Allocate(
                                               newDataSizeBytes, device->GetPendingCommandSerial(),
@@ -315,8 +323,8 @@
                                            const void* data,
                                            const TextureDataLayout& dataLayout,
                                            const Extent3D& writeSizePixel) {
-        const TexelBlockInfo& blockInfo =
-            destination.texture->GetFormat().GetAspectInfo(destination.aspect).block;
+        const Format& format = destination.texture->GetFormat();
+        const TexelBlockInfo& blockInfo = format.GetAspectInfo(destination.aspect).block;
 
         // We are only copying the part of the data that will appear in the texture.
         // Note that validating texture copy range ensures that writeSizePixel->width and
@@ -334,7 +342,8 @@
         DAWN_TRY_ASSIGN(uploadHandle,
                         UploadTextureDataAligningBytesPerRowAndOffset(
                             GetDevice(), data, alignedBytesPerRow, optimallyAlignedBytesPerRow,
-                            alignedRowsPerImage, dataLayout, blockInfo, writeSizePixel));
+                            alignedRowsPerImage, dataLayout, format.HasDepthOrStencil(), blockInfo,
+                            writeSizePixel));
 
         TextureDataLayout passDataLayout = dataLayout;
         passDataLayout.offset = uploadHandle.startOffset;
@@ -345,7 +354,7 @@
         textureCopy.texture = destination.texture;
         textureCopy.mipLevel = destination.mipLevel;
         textureCopy.origin = destination.origin;
-        textureCopy.aspect = ConvertAspect(destination.texture->GetFormat(), destination.aspect);
+        textureCopy.aspect = ConvertAspect(format, destination.aspect);
 
         DeviceBase* device = GetDevice();
 
diff --git a/src/tests/end2end/QueueTests.cpp b/src/tests/end2end/QueueTests.cpp
index d2288e0..014eb52 100644
--- a/src/tests/end2end/QueueTests.cpp
+++ b/src/tests/end2end/QueueTests.cpp
@@ -632,6 +632,77 @@
     DoSimpleWriteTextureTest(64, 1);
 }
 
+// Regression test for a bug in the allocation of the internal staging buffer, which incorrectly
+// copied depth-stencil data to an internal offset that was not a multiple of 4.
+TEST_P(QueueWriteTextureTests, WriteStencilAspectWithSourceOffsetUnalignedTo4) {
+    // Copies to a single aspect are unsupported on OpenGL.
+    DAWN_SUPPRESS_TEST_IF(IsOpenGL() || IsOpenGLES());
+
+    wgpu::TextureDescriptor textureDescriptor;
+    textureDescriptor.format = wgpu::TextureFormat::Depth24PlusStencil8;
+    textureDescriptor.usage = wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst;
+    textureDescriptor.size = {1, 1, 1};
+    wgpu::Texture dstTexture1 = device.CreateTexture(&textureDescriptor);
+    wgpu::Texture dstTexture2 = device.CreateTexture(&textureDescriptor);
+
+    wgpu::BufferDescriptor bufferDescriptor;
+    bufferDescriptor.size = 8u;
+    bufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::Buffer outputBuffer = device.CreateBuffer(&bufferDescriptor);
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+
+    constexpr wgpu::Extent3D kWriteSize = {1, 1, 1};
+    constexpr uint8_t kData[] = {1, 2};
+    constexpr uint32_t kBytesPerRowForWriteTexture = 1u;
+
+    std::vector<uint8_t> expectedData(8, 0);
+
+    // In the first call of queue.writeTexture(), Dawn will allocate a new staging buffer in its
+    // internal ring buffer and write the user data into it at the offset 0.
+    {
+        constexpr uint32_t kDataOffset1 = 0u;
+        wgpu::TextureDataLayout textureDataLayout =
+            utils::CreateTextureDataLayout(kDataOffset1, kBytesPerRowForWriteTexture);
+        wgpu::ImageCopyTexture imageCopyTexture = utils::CreateImageCopyTexture(
+            dstTexture1, 0, {0, 0, 0}, wgpu::TextureAspect::StencilOnly);
+        queue.WriteTexture(&imageCopyTexture, kData, sizeof(kData), &textureDataLayout,
+                           &kWriteSize);
+
+        constexpr uint32_t kOutputBufferOffset1 = 0u;
+        wgpu::ImageCopyBuffer imageCopyBuffer = utils::CreateImageCopyBuffer(
+            outputBuffer, kOutputBufferOffset1, kTextureBytesPerRowAlignment);
+        encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBuffer, &kWriteSize);
+
+        expectedData[kOutputBufferOffset1] = kData[kDataOffset1];
+    }
+
+    // In the second call of queue.writeTexture(), Dawn will still use the same staging buffer
+    // allocated in the first call, whose first 2 bytes were already used by that first call.
+    // Dawn should write the user data at an offset of 4 bytes because the destination texture
+    // aspect is stencil.
+    {
+        constexpr uint32_t kDataOffset2 = 1u;
+        wgpu::TextureDataLayout textureDataLayout =
+            utils::CreateTextureDataLayout(kDataOffset2, kBytesPerRowForWriteTexture);
+        wgpu::ImageCopyTexture imageCopyTexture = utils::CreateImageCopyTexture(
+            dstTexture2, 0, {0, 0, 0}, wgpu::TextureAspect::StencilOnly);
+        queue.WriteTexture(&imageCopyTexture, kData, sizeof(kData), &textureDataLayout,
+                           &kWriteSize);
+
+        constexpr uint32_t kOutputBufferOffset2 = 4u;
+        wgpu::ImageCopyBuffer imageCopyBuffer = utils::CreateImageCopyBuffer(
+            outputBuffer, kOutputBufferOffset2, kTextureBytesPerRowAlignment);
+        encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBuffer, &kWriteSize);
+
+        expectedData[kOutputBufferOffset2] = kData[kDataOffset2];
+    }
+
+    wgpu::CommandBuffer commandBuffer = encoder.Finish();
+    queue.Submit(1, &commandBuffer);
+
+    EXPECT_BUFFER_U8_RANGE_EQ(expectedData.data(), outputBuffer, 0, 8);
+}
+
 DAWN_INSTANTIATE_TEST(QueueWriteTextureTests,
                       D3D12Backend(),
                       MetalBackend(),