Reset query set on Vulkan Backend

We now reset the query for each query command recorded outside a render
pass, whether or not it is rewritten, so we no longer need to track query
availability on the command encoder. The availability on the query set is
enough for resolving sparse queries.

But we still need to track query availability per render pass, both for
query rewrite checking and for resetting queries per render pass.
Because the reset command must be called outside a render pass, we need
to reset all queries used in a render pass together before the render
pass begins, based on that tracking. Add availability tracking to the
pass resource usage tracker (we only need it for render passes) so that
the Vulkan backend can use it.

Bug: dawn:434

Change-Id: Ie1b413ff54f62f3b84fe612e4abe45872c387e81
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/45440
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Commit-Queue: Hao Li <hao.x.li@intel.com>
diff --git a/src/dawn_native/CommandEncoder.cpp b/src/dawn_native/CommandEncoder.cpp
index 8d06963..55fc227 100644
--- a/src/dawn_native/CommandEncoder.cpp
+++ b/src/dawn_native/CommandEncoder.cpp
@@ -467,16 +467,7 @@
         }
 
         // Set the query at queryIndex to available for resolving in query set.
-        querySet->SetQueryAvailability(queryIndex, 1);
-
-        // Gets the iterator for that querySet or create a new vector of bool set to false
-        // if the querySet wasn't registered.
-        auto it = mQueryAvailabilityMap.emplace(querySet, querySet->GetQueryCount()).first;
-        it->second[queryIndex] = 1;
-    }
-
-    const QueryAvailabilityMap& CommandEncoder::GetQueryAvailabilityMap() const {
-        return mQueryAvailabilityMap;
+        querySet->SetQueryAvailability(queryIndex, true);
     }
 
     // Implementation of the API's command recording methods
diff --git a/src/dawn_native/CommandEncoder.h b/src/dawn_native/CommandEncoder.h
index c422e98..a8bf6a0 100644
--- a/src/dawn_native/CommandEncoder.h
+++ b/src/dawn_native/CommandEncoder.h
@@ -22,13 +22,10 @@
 #include "dawn_native/ObjectBase.h"
 #include "dawn_native/PassResourceUsage.h"
 
-#include <map>
 #include <string>
 
 namespace dawn_native {
 
-    using QueryAvailabilityMap = std::map<QuerySetBase*, std::vector<bool>>;
-
     class CommandEncoder final : public ObjectBase {
       public:
         CommandEncoder(DeviceBase* device, const CommandEncoderDescriptor* descriptor);
@@ -38,7 +35,6 @@
 
         void TrackUsedQuerySet(QuerySetBase* querySet);
         void TrackQueryAvailability(QuerySetBase* querySet, uint32_t queryIndex);
-        const QueryAvailabilityMap& GetQueryAvailabilityMap() const;
 
         // Dawn API
         ComputePassEncoder* APIBeginComputePass(const ComputePassDescriptor* descriptor);
@@ -84,7 +80,6 @@
         std::set<BufferBase*> mTopLevelBuffers;
         std::set<TextureBase*> mTopLevelTextures;
         std::set<QuerySetBase*> mUsedQuerySets;
-        QueryAvailabilityMap mQueryAvailabilityMap;
 
         uint64_t mDebugGroupStackSize = 0;
     };
diff --git a/src/dawn_native/PassResourceUsage.h b/src/dawn_native/PassResourceUsage.h
index f0aa43a..772f8c2 100644
--- a/src/dawn_native/PassResourceUsage.h
+++ b/src/dawn_native/PassResourceUsage.h
@@ -42,6 +42,9 @@
 
         std::vector<TextureBase*> textures;
         std::vector<PassTextureUsage> textureUsages;
+
+        std::vector<QuerySetBase*> querySets;
+        std::vector<std::vector<bool>> queryAvailabilities;
     };
 
     using PerPassUsages = std::vector<PassResourceUsage>;
diff --git a/src/dawn_native/PassResourceUsageTracker.cpp b/src/dawn_native/PassResourceUsageTracker.cpp
index 5816280..81d72ed 100644
--- a/src/dawn_native/PassResourceUsageTracker.cpp
+++ b/src/dawn_native/PassResourceUsageTracker.cpp
@@ -17,6 +17,7 @@
 #include "dawn_native/Buffer.h"
 #include "dawn_native/EnumMaskIterator.h"
 #include "dawn_native/Format.h"
+#include "dawn_native/QuerySet.h"
 #include "dawn_native/Texture.h"
 
 #include <utility>
@@ -65,6 +66,23 @@
                              const wgpu::TextureUsage& addedUsage) { *storedUsage |= addedUsage; });
     }
 
+    void PassResourceUsageTracker::TrackQueryAvailability(QuerySetBase* querySet,
+                                                          uint32_t queryIndex) {
+        // Query availability only needs to be tracked per pass on render passes: it is used to
+        // check query overwrites within a render pass and to reset query sets on Vulkan.
+        DAWN_ASSERT(mPassType == PassType::Render);
+        DAWN_ASSERT(querySet != nullptr);
+
+        // Get the iterator for that querySet, or create a new vector of bools set to false if
+        // the querySet wasn't registered yet, then mark the query at queryIndex as written.
+        auto it = mQueryAvailabilities.emplace(querySet, querySet->GetQueryCount()).first;
+        it->second[queryIndex] = true;
+    }
+
+    const QueryAvailabilityMap& PassResourceUsageTracker::GetQueryAvailabilityMap() const {
+        return mQueryAvailabilities;
+    }
+
     // Returns the per-pass usage for use by backends for APIs with explicit barriers.
     PassResourceUsage PassResourceUsageTracker::AcquireResourceUsage() {
         PassResourceUsage result;
@@ -73,6 +91,8 @@
         result.bufferUsages.reserve(mBufferUsages.size());
         result.textures.reserve(mTextureUsages.size());
         result.textureUsages.reserve(mTextureUsages.size());
+        result.querySets.reserve(mQueryAvailabilities.size());
+        result.queryAvailabilities.reserve(mQueryAvailabilities.size());
 
         for (auto& it : mBufferUsages) {
             result.buffers.push_back(it.first);
@@ -84,8 +104,14 @@
             result.textureUsages.push_back(std::move(it.second));
         }
 
+        for (auto& it : mQueryAvailabilities) {
+            result.querySets.push_back(it.first);
+            result.queryAvailabilities.push_back(std::move(it.second));
+        }
+
         mBufferUsages.clear();
         mTextureUsages.clear();
+        mQueryAvailabilities.clear();
 
         return result;
     }
diff --git a/src/dawn_native/PassResourceUsageTracker.h b/src/dawn_native/PassResourceUsageTracker.h
index cfcaa22..cd54f8c 100644
--- a/src/dawn_native/PassResourceUsageTracker.h
+++ b/src/dawn_native/PassResourceUsageTracker.h
@@ -24,8 +24,11 @@
 namespace dawn_native {
 
     class BufferBase;
+    class QuerySetBase;
     class TextureBase;
 
+    using QueryAvailabilityMap = std::map<QuerySetBase*, std::vector<bool>>;
+
     // Helper class to encapsulate the logic of tracking per-resource usage during the
     // validation of command buffer passes. It is used both to know if there are validation
     // errors, and to get a list of resources used per pass for backends that need the
@@ -36,6 +39,8 @@
         void BufferUsedAs(BufferBase* buffer, wgpu::BufferUsage usage);
         void TextureViewUsedAs(TextureViewBase* texture, wgpu::TextureUsage usage);
         void AddTextureUsage(TextureBase* texture, const PassTextureUsage& textureUsage);
+        void TrackQueryAvailability(QuerySetBase* querySet, uint32_t queryIndex);
+        const QueryAvailabilityMap& GetQueryAvailabilityMap() const;
 
         // Returns the per-pass usage for use by backends for APIs with explicit barriers.
         PassResourceUsage AcquireResourceUsage();
@@ -44,6 +49,10 @@
         PassType mPassType;
         std::map<BufferBase*, wgpu::BufferUsage> mBufferUsages;
         std::map<TextureBase*, PassTextureUsage> mTextureUsages;
+        // Dedicated to track the availability of the queries used on render pass. The same query
+        // cannot be written twice in same render pass, so each render pass also need to have its
+        // own query availability map for validation.
+        QueryAvailabilityMap mQueryAvailabilities;
     };
 
 }  // namespace dawn_native
diff --git a/src/dawn_native/RenderPassEncoder.cpp b/src/dawn_native/RenderPassEncoder.cpp
index 3c9e527..4c48bb4 100644
--- a/src/dawn_native/RenderPassEncoder.cpp
+++ b/src/dawn_native/RenderPassEncoder.cpp
@@ -80,19 +80,14 @@
     void RenderPassEncoder::TrackQueryAvailability(QuerySetBase* querySet, uint32_t queryIndex) {
         DAWN_ASSERT(querySet != nullptr);
 
-        // Gets the iterator for that querySet or create a new vector of bool set to false
-        // if the querySet wasn't registered.
-        auto it = mQueryAvailabilityMap.emplace(querySet, querySet->GetQueryCount()).first;
-        it->second[queryIndex] = 1;
+        // Track the query availability with true on render pass for rewrite validation and query
+        // reset on render pass on Vulkan
+        mUsageTracker.TrackQueryAvailability(querySet, queryIndex);
 
         // Track it again on command encoder for zero-initializing when resolving unused queries.
         mCommandEncoder->TrackQueryAvailability(querySet, queryIndex);
     }
 
-    const QueryAvailabilityMap& RenderPassEncoder::GetQueryAvailabilityMap() const {
-        return mQueryAvailabilityMap;
-    }
-
     void RenderPassEncoder::APIEndPass() {
         if (mEncodingContext->TryEncode(this, [&](CommandAllocator* allocator) -> MaybeError {
                 if (IsValidationEnabled()) {
@@ -254,9 +249,7 @@
                 }
 
                 DAWN_TRY(ValidateQueryIndexOverwrite(mOcclusionQuerySet.Get(), queryIndex,
-                                                     GetQueryAvailabilityMap()));
-
-                mCommandEncoder->TrackUsedQuerySet(mOcclusionQuerySet.Get());
+                                                     mUsageTracker.GetQueryAvailabilityMap()));
             }
 
             // Record the current query index for endOcclusionQuery.
@@ -283,6 +276,7 @@
             }
 
             TrackQueryAvailability(mOcclusionQuerySet.Get(), mCurrentOcclusionQueryIndex);
+
             mOcclusionQueryActive = false;
 
             EndOcclusionQueryCmd* cmd =
@@ -299,8 +293,8 @@
             if (IsValidationEnabled()) {
                 DAWN_TRY(GetDevice()->ValidateObject(querySet));
                 DAWN_TRY(ValidateTimestampQuery(querySet, queryIndex));
-                DAWN_TRY(
-                    ValidateQueryIndexOverwrite(querySet, queryIndex, GetQueryAvailabilityMap()));
+                DAWN_TRY(ValidateQueryIndexOverwrite(querySet, queryIndex,
+                                                     mUsageTracker.GetQueryAvailabilityMap()));
             }
 
             TrackQueryAvailability(querySet, queryIndex);
diff --git a/src/dawn_native/RenderPassEncoder.h b/src/dawn_native/RenderPassEncoder.h
index 08a24f4..a8bf460 100644
--- a/src/dawn_native/RenderPassEncoder.h
+++ b/src/dawn_native/RenderPassEncoder.h
@@ -37,9 +37,6 @@
                                             CommandEncoder* commandEncoder,
                                             EncodingContext* encodingContext);
 
-        void TrackQueryAvailability(QuerySetBase* querySet, uint32_t queryIndex);
-        const QueryAvailabilityMap& GetQueryAvailabilityMap() const;
-
         void APIEndPass();
 
         void APISetStencilReference(uint32_t reference);
@@ -65,6 +62,8 @@
                           ErrorTag errorTag);
 
       private:
+        void TrackQueryAvailability(QuerySetBase* querySet, uint32_t queryIndex);
+
         // For render and compute passes, the encoding context is borrowed from the command encoder.
         // Keep a reference to the encoder to make sure the context isn't freed.
         Ref<CommandEncoder> mCommandEncoder;
@@ -72,11 +71,6 @@
         uint32_t mRenderTargetWidth;
         uint32_t mRenderTargetHeight;
 
-        // This map is to indicate the availability of the queries used in render pass. The same
-        // query cannot be written twice in same render pass, so each render pass also need to have
-        // its own query availability map.
-        QueryAvailabilityMap mQueryAvailabilityMap;
-
         // The resources for occlusion query
         Ref<QuerySetBase> mOcclusionQuerySet;
         uint32_t mCurrentOcclusionQueryIndex = 0;
diff --git a/src/dawn_native/vulkan/CommandBufferVk.cpp b/src/dawn_native/vulkan/CommandBufferVk.cpp
index 0986db8..47fac44 100644
--- a/src/dawn_native/vulkan/CommandBufferVk.cpp
+++ b/src/dawn_native/vulkan/CommandBufferVk.cpp
@@ -369,13 +369,36 @@
             return {};
         }
 
-        void ResetUsedQuerySets(Device* device,
-                                VkCommandBuffer commands,
-                                const std::set<QuerySetBase*>& usedQuerySets) {
-            // TODO(hao.x.li@intel.com): Reset the queries based on the used indexes.
-            for (QuerySetBase* querySet : usedQuerySets) {
-                device->fn.CmdResetQueryPool(commands, ToBackend(querySet)->GetHandle(), 0,
-                                             querySet->GetQueryCount());
+        // Resets the queries of |querySet| that are marked true in |availability|, issuing one
+        // CmdResetQueryPool per contiguous run. Resets must be recorded outside a render pass.
+        void ResetUsedQuerySetsOnRenderPass(Device* device,
+                                            VkCommandBuffer commands,
+                                            QuerySetBase* querySet,
+                                            const std::vector<bool>& availability) {
+            ASSERT(availability.size() == querySet->GetQueryAvailability().size());
+
+            auto currentIt = availability.begin();
+            auto lastIt = availability.end();
+            // Traverse the used queries, i.e. those whose availability is true.
+            while (currentIt != lastIt) {
+                auto firstTrueIt = std::find(currentIt, lastIt, true);
+                // No more used queries need to be reset
+                if (firstTrueIt == lastIt) {
+                    break;
+                }
+
+                auto nextFalseIt = std::find(firstTrueIt, lastIt, false);
+
+                uint32_t queryIndex = std::distance(availability.begin(), firstTrueIt);
+                uint32_t queryCount = std::distance(firstTrueIt, nextFalseIt);
+
+                // Reset the queries in [firstTrueIt, nextFalseIt), where nextFalseIt is at most
+                // lastIt
+                device->fn.CmdResetQueryPool(commands, ToBackend(querySet)->GetHandle(), queryIndex,
+                                             queryCount);
+
+                // Continue the search from the next false entry
+                currentIt = nextFalseIt;
             }
         }
 
@@ -425,7 +448,7 @@
                     destination->GetHandle(), resolveDestinationOffset, sizeof(uint64_t),
                     VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 
-                // Set current interator to next false
+                // Set current iterator to next false
                 currentIt = nextFalseIt;
             }
         }
@@ -504,7 +527,8 @@
         Device* device = ToBackend(GetDevice());
         VkCommandBuffer commands = recordingContext->commandBuffer;
 
-        // Records the necessary barriers for the resource usage pre-computed by the frontend
+        // Records the necessary barriers for the resource usage pre-computed by the frontend.
+        // And resets the used query sets which are rewritten on the render pass.
         auto PrepareResourcesForRenderPass = [](Device* device,
                                                 CommandRecordingContext* recordingContext,
                                                 const PassResourceUsage& usages) {
@@ -546,6 +570,13 @@
                                               bufferBarriers.data(), imageBarriers.size(),
                                               imageBarriers.data());
             }
+
+            // Reset all query set used on current render pass together before beginning render pass
+            // because the reset command must be called outside render pass
+            for (size_t i = 0; i < usages.querySets.size(); ++i) {
+                ResetUsedQuerySetsOnRenderPass(device, recordingContext->commandBuffer,
+                                               usages.querySets[i], usages.queryAvailabilities[i]);
+            }
         };
 
         // TODO(jiawei.shao@intel.com): move the resource lazy clearing inside the barrier tracking
@@ -568,9 +599,6 @@
         const std::vector<PassResourceUsage>& passResourceUsages = GetResourceUsages().perPass;
         size_t nextPassNumber = 0;
 
-        // QuerySet must be reset between uses.
-        ResetUsedQuerySets(device, commands, GetResourceUsages().usedQuerySets);
-
         Command type;
         while (mCommands.NextCommandId(&type)) {
             switch (type) {
@@ -772,10 +800,15 @@
                     QuerySet* querySet = ToBackend(cmd->querySet.Get());
                     Buffer* destination = ToBackend(cmd->destination.Get());
 
+                    // TODO(hao.x.li@intel.com): Clear the resolve region of the buffer to 0 if at
+                    // least one query is unavailable for the resolving and the resolve buffer has
+                    // been initialized or fully used.
+
                     destination->EnsureDataInitializedAsDestination(
                         recordingContext, cmd->destinationOffset,
                         cmd->queryCount * sizeof(uint64_t));
-                    destination->TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
+                    destination->TransitionUsageNow(recordingContext,
+                                                    wgpu::BufferUsage::QueryResolve);
 
                     RecordResolveQuerySetCmd(commands, device, querySet, cmd->firstQuery,
                                              cmd->queryCount, destination, cmd->destinationOffset);
@@ -786,6 +819,10 @@
                 case Command::WriteTimestamp: {
                     WriteTimestampCmd* cmd = mCommands.NextCommand<WriteTimestampCmd>();
 
+                    // The query must be reset between uses.
+                    device->fn.CmdResetQueryPool(commands, ToBackend(cmd->querySet)->GetHandle(),
+                                                 cmd->queryIndex, 1);
+
                     RecordWriteTimestampCmd(recordingContext, device, cmd);
                     break;
                 }
@@ -960,6 +997,10 @@
                 case Command::WriteTimestamp: {
                     WriteTimestampCmd* cmd = mCommands.NextCommand<WriteTimestampCmd>();
 
+                    // The query must be reset between uses.
+                    device->fn.CmdResetQueryPool(commands, ToBackend(cmd->querySet)->GetHandle(),
+                                                 cmd->queryIndex, 1);
+
                     RecordWriteTimestampCmd(recordingContext, device, cmd);
                     break;
                 }
diff --git a/src/tests/end2end/QueryTests.cpp b/src/tests/end2end/QueryTests.cpp
index 1f6f37a..8537c53 100644
--- a/src/tests/end2end/QueryTests.cpp
+++ b/src/tests/end2end/QueryTests.cpp
@@ -34,7 +34,8 @@
 };
 
 // Clear the content of the result buffer into 0xFFFFFFFF.
-constexpr static uint64_t kSentinelValue = ~uint64_t(0);
+constexpr static uint64_t kSentinelValue = ~uint64_t(0u);
+constexpr static uint64_t kZero = 0u;
 
 class OcclusionExpectation : public detail::Expectation {
   public:
@@ -78,6 +79,7 @@
     void SetUp() override {
         DawnTest::SetUp();
 
+        // Create basic render pipeline
         vsModule = utils::CreateShaderModule(device, R"(
             [[builtin(vertex_index)]] var<in> VertexIndex : u32;
             [[builtin(position)]] var<out> Position : vec4<f32>;
@@ -94,6 +96,12 @@
             [[stage(fragment)]] fn main() -> void {
                 fragColor = vec4<f32>(0.0, 1.0, 0.0, 1.0);
             })");
+
+        utils::ComboRenderPipelineDescriptor2 descriptor;
+        descriptor.vertex.module = vsModule;
+        descriptor.cFragment.module = fsModule;
+
+        pipeline = device.CreateRenderPipeline2(&descriptor);
     }
 
     struct ScissorRect {
@@ -121,6 +129,8 @@
     void TestOcclusionQueryWithDepthStencilTest(bool depthTestEnabled,
                                                 bool stencilTestEnabled,
                                                 OcclusionExpectation::Result expected) {
+        constexpr uint32_t kQueryCount = 1;
+
         utils::ComboRenderPipelineDescriptor2 descriptor;
         descriptor.vertex.module = vsModule;
         descriptor.cFragment.module = fsModule;
@@ -135,7 +145,7 @@
         depthStencil->stencilBack.compare =
             stencilTestEnabled ? wgpu::CompareFunction::Never : wgpu::CompareFunction::Always;
 
-        wgpu::RenderPipeline pipeline = device.CreateRenderPipeline2(&descriptor);
+        wgpu::RenderPipeline renderPipeline = device.CreateRenderPipeline2(&descriptor);
 
         wgpu::Texture renderTarget = CreateRenderTexture(wgpu::TextureFormat::RGBA8Unorm);
         wgpu::TextureView renderTargetView = renderTarget.CreateView();
@@ -154,7 +164,7 @@
 
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
         wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass);
-        pass.SetPipeline(pipeline);
+        pass.SetPipeline(renderPipeline);
         pass.SetStencilReference(0);
         pass.BeginOcclusionQuery(0);
         pass.Draw(3);
@@ -170,11 +180,7 @@
 
     void TestOcclusionQueryWithScissorTest(ScissorRect rect,
                                            OcclusionExpectation::Result expected) {
-        utils::ComboRenderPipelineDescriptor2 descriptor;
-        descriptor.vertex.module = vsModule;
-        descriptor.cFragment.module = fsModule;
-
-        wgpu::RenderPipeline pipeline = device.CreateRenderPipeline2(&descriptor);
+        constexpr uint32_t kQueryCount = 1;
 
         wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
         wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
@@ -204,18 +210,19 @@
     wgpu::ShaderModule vsModule;
     wgpu::ShaderModule fsModule;
 
+    wgpu::RenderPipeline pipeline;
+
     constexpr static unsigned int kRTSize = 4;
-    constexpr static uint32_t kQueryCount = 1;
 };
 
 // Test creating query set with the type of Occlusion
 TEST_P(OcclusionQueryTests, QuerySetCreation) {
-    CreateOcclusionQuerySet(kQueryCount);
+    CreateOcclusionQuerySet(1);
 }
 
 // Test destroying query set
 TEST_P(OcclusionQueryTests, QuerySetDestroy) {
-    wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
+    wgpu::QuerySet querySet = CreateOcclusionQuerySet(1);
     querySet.Destroy();
 }
 
@@ -257,6 +264,189 @@
     TestOcclusionQueryWithScissorTest({0, 0, 2, 1}, OcclusionExpectation::Result::Zero);
 }
 
+// Test begin occlusion query with same query index on different render pass
+TEST_P(OcclusionQueryTests, Rewrite) {
+    constexpr uint32_t kQueryCount = 1;
+
+    wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
+    wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+    // Set all bits in buffer to check 0 is correctly written if there is no sample passed the
+    // occlusion testing
+    queue.WriteBuffer(destination, 0, &kSentinelValue, sizeof(kSentinelValue));
+
+    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, kRTSize, kRTSize);
+    renderPass.renderPassInfo.occlusionQuerySet = querySet;
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+
+    // Begin occlusion without draw call
+    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    pass.BeginOcclusionQuery(0);
+    pass.EndOcclusionQuery();
+    pass.EndPass();
+
+    // Begin occlusion with same query index with draw call
+    wgpu::RenderPassEncoder rewritePass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    rewritePass.SetPipeline(pipeline);
+    rewritePass.BeginOcclusionQuery(0);
+    rewritePass.Draw(3);
+    rewritePass.EndOcclusionQuery();
+    rewritePass.EndPass();
+
+    encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    EXPECT_BUFFER(destination, 0, sizeof(uint64_t),
+                  new OcclusionExpectation(OcclusionExpectation::Result::NonZero));
+}
+
+// Test resolving occlusion query correctly if the queries are written sparsely, which also tests
+// the query resetting at the start of render passes on Vulkan backend.
+TEST_P(OcclusionQueryTests, ResolveSparseQueries) {
+    // TODO(hao.x.li@intel.com): Clear the resolve region of the buffer to 0 if there is at least
+    // one query not written and the resolve buffer has been initialized or fully used.
+    DAWN_SKIP_TEST_IF(IsVulkan());
+
+    // TODO(hao.x.li@intel.com): Investigate why this fails on D3D12 on Nvidia when run after the
+    // previous occlusion tests: the unwritten queries should resolve to 0, but the occlusion
+    // result of the previous tests is read back instead.
+    DAWN_SKIP_TEST_IF(IsD3D12() && IsNvidia());
+
+    constexpr uint32_t kQueryCount = 7;
+
+    wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
+    wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+    // Set sentinel values to check the queries are resolved correctly if the queries are
+    // written sparsely.
+    std::vector<uint64_t> sentinelValues(kQueryCount, kSentinelValue);
+    queue.WriteBuffer(destination, 0, sentinelValues.data(), kQueryCount * sizeof(uint64_t));
+
+    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, kRTSize, kRTSize);
+    renderPass.renderPassInfo.occlusionQuerySet = querySet;
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    pass.SetPipeline(pipeline);
+
+    // Write queries sparsely for testing the query resetting on Vulkan and resolving unwritten
+    // queries to 0.
+    // 0 - not written (tests starting with not written).
+    // 1 - written (tests combining multiple written, although other tests already do it).
+    // 2 - written.
+    // 3 - not written (tests skipping over not written in the middle).
+    // 4 - not written.
+    // 5 - written (tests another written query in the middle).
+    // 6 - not written (tests the last query not being written).
+    pass.BeginOcclusionQuery(1);
+    pass.Draw(3);
+    pass.EndOcclusionQuery();
+    pass.BeginOcclusionQuery(2);
+    pass.Draw(3);
+    pass.EndOcclusionQuery();
+    pass.BeginOcclusionQuery(5);
+    pass.Draw(3);
+    pass.EndOcclusionQuery();
+    pass.EndPass();
+
+    encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    // The query at index 0 should be resolved to 0.
+    EXPECT_BUFFER_U64_RANGE_EQ(&kZero, destination, 0, 1);
+    EXPECT_BUFFER(destination, sizeof(uint64_t), 2 * sizeof(uint64_t),
+                  new OcclusionExpectation(OcclusionExpectation::Result::NonZero));
+    // The queries at index 3 and 4 should be resolved to 0.
+    std::vector<uint64_t> zeros(2, kZero);
+    EXPECT_BUFFER_U64_RANGE_EQ(zeros.data(), destination, 3 * sizeof(uint64_t), 2);
+    EXPECT_BUFFER(destination, 5 * sizeof(uint64_t), sizeof(uint64_t),
+                  new OcclusionExpectation(OcclusionExpectation::Result::NonZero));
+    // The query at index 6 should be resolved to 0.
+    EXPECT_BUFFER_U64_RANGE_EQ(&kZero, destination, 6 * sizeof(uint64_t), 1);
+}
+
+// Test resolving occlusion query to 0 if all queries are not written
+TEST_P(OcclusionQueryTests, ResolveWithoutWritten) {
+    // TODO(hao.x.li@intel.com): Clear the resolve region of the buffer to 0 if there is at least
+    // one query not written and the resolve buffer has been initialized or fully used.
+    DAWN_SKIP_TEST_IF(IsVulkan());
+
+    // TODO(hao.x.li@intel.com): Investigate why this fails on D3D12 on Nvidia when run after the
+    // previous occlusion tests: the query should resolve to 0, but the occlusion result of the
+    // previous tests is read back instead.
+    DAWN_SKIP_TEST_IF(IsD3D12() && IsNvidia());
+
+    constexpr uint32_t kQueryCount = 1;
+
+    wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
+    wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+    // Set sentinel values to check 0 is correctly written if resolving query set without
+    // any written.
+    queue.WriteBuffer(destination, 0, &kSentinelValue, sizeof(kSentinelValue));
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    EXPECT_BUFFER_U64_RANGE_EQ(&kZero, destination, 0, 1);
+}
+
+// Test resolving occlusion query to the destination buffer with offset
+TEST_P(OcclusionQueryTests, ResolveToBufferWithOffset) {
+    constexpr uint32_t kQueryCount = 2;
+
+    wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
+
+    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, kRTSize, kRTSize);
+    renderPass.renderPassInfo.occlusionQuerySet = querySet;
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    pass.SetPipeline(pipeline);
+    pass.BeginOcclusionQuery(0);
+    pass.Draw(3);
+    pass.EndOcclusionQuery();
+    pass.EndPass();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    // Resolve the query result to first slot in the buffer, other slots should not be written.
+    {
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+        // Set sentinel values to check the query is resolved to the correct slot of the buffer.
+        std::vector<uint64_t> sentinelValues(kQueryCount, kSentinelValue);
+        queue.WriteBuffer(destination, 0, sentinelValues.data(), kQueryCount * sizeof(uint64_t));
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.ResolveQuerySet(querySet, 0, 1, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER(destination, 0, sizeof(uint64_t),
+                      new OcclusionExpectation(OcclusionExpectation::Result::NonZero));
+        EXPECT_BUFFER_U64_RANGE_EQ(&kSentinelValue, destination, sizeof(uint64_t), 1);
+    }
+
+    // Resolve the query result to second slot in the buffer, the first one should not be written.
+    {
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+        // Set sentinel values to check the query is resolved to the correct slot of the buffer.
+        std::vector<uint64_t> sentinelValues(kQueryCount, kSentinelValue);
+        queue.WriteBuffer(destination, 0, sentinelValues.data(), kQueryCount * sizeof(uint64_t));
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.ResolveQuerySet(querySet, 0, 1, destination, sizeof(uint64_t));
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER_U64_RANGE_EQ(&kSentinelValue, destination, 0, 1);
+        EXPECT_BUFFER(destination, sizeof(uint64_t), sizeof(uint64_t),
+                      new OcclusionExpectation(OcclusionExpectation::Result::NonZero));
+    }
+}
+
 DAWN_INSTANTIATE_TEST(OcclusionQueryTests, D3D12Backend(), MetalBackend(), VulkanBackend());
 
 class PipelineStatisticsQueryTests : public QueryTests {
@@ -358,65 +548,149 @@
 
     constexpr uint32_t kQueryCount = 2;
 
-    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
-    wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+    // Write timestamp with different query indexes
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
 
-    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-    encoder.WriteTimestamp(querySet, 0);
-    encoder.WriteTimestamp(querySet, 1);
-    encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
-    wgpu::CommandBuffer commands = encoder.Finish();
-    queue.Submit(1, &commands);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.WriteTimestamp(querySet, 0);
+        encoder.WriteTimestamp(querySet, 1);
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
 
-    EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
+
+    // Write timestamp with same query index outside pass on same encoder
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.WriteTimestamp(querySet, 0);
+        encoder.WriteTimestamp(querySet, 1);
+        encoder.WriteTimestamp(querySet, 0);
+        encoder.WriteTimestamp(querySet, 1);
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
 }
 
 // Test calling timestamp query from render pass encoder
 TEST_P(TimestampQueryTests, TimestampOnRenderPass) {
     constexpr uint32_t kQueryCount = 2;
 
-    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
-    wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+    // Write timestamps to different query indexes
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
 
-    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
-    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
-    pass.WriteTimestamp(querySet, 0);
-    pass.WriteTimestamp(querySet, 1);
-    pass.EndPass();
-    encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
-    wgpu::CommandBuffer commands = encoder.Finish();
-    queue.Submit(1, &commands);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
+        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+        pass.WriteTimestamp(querySet, 0);
+        pass.WriteTimestamp(querySet, 1);
+        pass.EndPass();
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
 
-    EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
+
+    // Write timestamps to the same query indexes outside and then inside the render pass.
+    // Rewriting a query inside a single render pass is not tested because it is not allowed.
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.WriteTimestamp(querySet, 0);
+        encoder.WriteTimestamp(querySet, 1);
+
+        utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
+        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+        pass.WriteTimestamp(querySet, 0);
+        pass.WriteTimestamp(querySet, 1);
+        pass.EndPass();
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
 }
 
 // Test calling timestamp query from compute pass encoder
 TEST_P(TimestampQueryTests, TimestampOnComputePass) {
     constexpr uint32_t kQueryCount = 2;
 
-    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
-    wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+    // Write timestamps to different query indexes
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
 
-    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
-    pass.WriteTimestamp(querySet, 0);
-    pass.WriteTimestamp(querySet, 1);
-    pass.EndPass();
-    encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
-    wgpu::CommandBuffer commands = encoder.Finish();
-    queue.Submit(1, &commands);
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+        pass.WriteTimestamp(querySet, 0);
+        pass.WriteTimestamp(querySet, 1);
+        pass.EndPass();
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
 
-    EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
+
+    // Write timestamps to the same query indexes both outside and inside the compute pass
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        encoder.WriteTimestamp(querySet, 0);
+        encoder.WriteTimestamp(querySet, 1);
+
+        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+        pass.WriteTimestamp(querySet, 0);
+        pass.WriteTimestamp(querySet, 1);
+        pass.EndPass();
+
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
+
+    // Write timestamps to the same query indexes twice inside the compute pass
+    {
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+        pass.WriteTimestamp(querySet, 0);
+        pass.WriteTimestamp(querySet, 1);
+        pass.WriteTimestamp(querySet, 0);
+        pass.WriteTimestamp(querySet, 1);
+        pass.EndPass();
+
+        encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
+    }
 }
 
 // Test resolving timestamp query from another different encoder
 TEST_P(TimestampQueryTests, ResolveFromAnotherEncoder) {
-    // TODO(hao.x.li@intel.com): Fix queries reset on Vulkan backend, it does not allow to resolve
-    // unissued queries. Currently we reset the whole query set at the beginning of command buffer
-    // creation.
-    DAWN_SKIP_TEST_IF(IsVulkan());
-
     constexpr uint32_t kQueryCount = 2;
 
     wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
@@ -438,12 +712,7 @@
 
 // Test resolving timestamp query correctly if the queries are written sparsely
 TEST_P(TimestampQueryTests, ResolveSparseQueries) {
-    // TODO(hao.x.li@intel.com): Fix queries reset and sparsely resolving on Vulkan backend,
-    // otherwise its validation layer reports unissued queries resolving error
-    DAWN_SKIP_TEST_IF(IsVulkan() && IsBackendValidationEnabled());
-
     constexpr uint32_t kQueryCount = 4;
-    constexpr uint64_t kZero = 0;
 
     wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
     wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
@@ -469,10 +738,6 @@
 
 // Test resolving timestamp query to 0 if all queries are not written
 TEST_P(TimestampQueryTests, ResolveWithoutWritten) {
-    // TODO(hao.x.li@intel.com): Fix queries reset and sparsely resolving on Vulkan backend,
-    // otherwise its validation layer reports unissued queries resolving error
-    DAWN_SKIP_TEST_IF(IsVulkan() && IsBackendValidationEnabled());
-
     constexpr uint32_t kQueryCount = 2;
 
     wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);