Query API: Test the accuracy of timestamp compute shader

Because the uint64 is not supported on all GPU drivers, we use uint32
and float to simulate the multiplication of uint64, but there is
accuracy loss between the results and the expected results computed by
uint64. This test checks that the accuracy loss is less than 0.2%.

Bug: dawn:434
Change-Id: I6f5c842b6915f101441886bdfa4f9feb2827d174
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/34120
Commit-Queue: Hao Li <hao.x.li@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn_native/BUILD.gn b/src/dawn_native/BUILD.gn
index 7bd9e78..e57c7e1 100644
--- a/src/dawn_native/BUILD.gn
+++ b/src/dawn_native/BUILD.gn
@@ -235,6 +235,8 @@
     "PooledResourceMemoryAllocator.h",
     "ProgrammablePassEncoder.cpp",
     "ProgrammablePassEncoder.h",
+    "QueryHelper.cpp",
+    "QueryHelper.h",
     "QuerySet.cpp",
     "QuerySet.h",
     "Queue.cpp",
diff --git a/src/dawn_native/CMakeLists.txt b/src/dawn_native/CMakeLists.txt
index 93f81e2..d65ead6 100644
--- a/src/dawn_native/CMakeLists.txt
+++ b/src/dawn_native/CMakeLists.txt
@@ -122,6 +122,8 @@
     "PooledResourceMemoryAllocator.h"
     "ProgrammablePassEncoder.cpp"
     "ProgrammablePassEncoder.h"
+    "QueryHelper.cpp"
+    "QueryHelper.h"
     "QuerySet.cpp"
     "QuerySet.h"
     "Queue.cpp"
diff --git a/src/dawn_native/ComputePassEncoder.h b/src/dawn_native/ComputePassEncoder.h
index 6ae796a..0f99462 100644
--- a/src/dawn_native/ComputePassEncoder.h
+++ b/src/dawn_native/ComputePassEncoder.h
@@ -32,7 +32,7 @@
 
         void EndPass();
 
-        void Dispatch(uint32_t x, uint32_t y, uint32_t z);
+        void Dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1);
         void DispatchIndirect(BufferBase* indirectBuffer, uint64_t indirectOffset);
         void SetPipeline(ComputePipelineBase* pipeline);
 
diff --git a/src/dawn_native/InternalPipelineStore.h b/src/dawn_native/InternalPipelineStore.h
index e99d5d2..5e3462b 100644
--- a/src/dawn_native/InternalPipelineStore.h
+++ b/src/dawn_native/InternalPipelineStore.h
@@ -28,6 +28,9 @@
         Ref<RenderPipelineBase> copyTextureForBrowserPipeline;
         Ref<ShaderModuleBase> copyTextureForBrowserVS;
         Ref<ShaderModuleBase> copyTextureForBrowserFS;
+
+        Ref<ComputePipelineBase> timestampComputePipeline;
+        Ref<ShaderModuleBase> timestampCS;
     };
 }  // namespace dawn_native
 
diff --git a/src/dawn_native/QueryHelper.cpp b/src/dawn_native/QueryHelper.cpp
new file mode 100644
index 0000000..d2e729e
--- /dev/null
+++ b/src/dawn_native/QueryHelper.cpp
@@ -0,0 +1,190 @@
+// Copyright 2020 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn_native/QueryHelper.h"
+
+#include "dawn_native/BindGroup.h"
+#include "dawn_native/BindGroupLayout.h"
+#include "dawn_native/Buffer.h"
+#include "dawn_native/CommandEncoder.h"
+#include "dawn_native/ComputePassEncoder.h"
+#include "dawn_native/ComputePipeline.h"
+#include "dawn_native/Device.h"
+#include "dawn_native/InternalPipelineStore.h"
+
+namespace dawn_native {
+
+    namespace {
+
+        // Assert the offsets in dawn_native::TimestampParams are same with the ones in the shader
+        static_assert(offsetof(dawn_native::TimestampParams, inputByteOffset) == 0, "");
+        static_assert(offsetof(dawn_native::TimestampParams, outputByteOffset) == 4, "");
+        static_assert(offsetof(dawn_native::TimestampParams, count) == 8, "");
+        static_assert(offsetof(dawn_native::TimestampParams, period) == 12, "");
+
+        static const char sConvertTimestampsToNanoseconds[] = R"(
+            struct Timestamp {
+                [[offset(0)]] low  : u32;
+                [[offset(4)]] high : u32;
+            };
+
+            [[block]] struct TimestampArr {
+                [[offset(0)]] t : [[stride(8)]] array<Timestamp>;
+            };
+
+            [[block]] struct AvailabilityArr {
+                [[offset(0)]] v : [[stride(4)]] array<u32>;
+            };
+
+            [[block]] struct TimestampParams {
+                [[offset(0)]]  inputByteOffset  : u32;
+                [[offset(4)]]  outputByteOffset : u32;
+                [[offset(8)]]  count            : u32;
+                [[offset(12)]] period           : f32;
+            };
+
+            [[set(0), binding(0)]]
+                var<storage_buffer> input : [[access(read)]] TimestampArr;
+            [[set(0), binding(1)]]
+                var<storage_buffer> availability : [[access(read)]] AvailabilityArr;
+            [[set(0), binding(2)]]
+                var<storage_buffer> output : [[access(read_write)]] TimestampArr;
+            [[set(0), binding(3)]] var<uniform> params : TimestampParams;
+
+            [[builtin(global_invocation_id)]] var<in> GlobalInvocationID : vec3<u32>;
+
+            const sizeofTimestamp : u32 = 8u;
+
+            [[stage(compute), workgroup_size(8, 1, 1)]]
+            fn main() -> void {
+                if (GlobalInvocationID.x >= params.count) { return; }
+
+                var inputIndex : u32 = GlobalInvocationID.x +
+                                       params.inputByteOffset / sizeofTimestamp;
+                var outputIndex : u32 = GlobalInvocationID.x +
+                                        params.outputByteOffset / sizeofTimestamp;
+
+                var timestamp : Timestamp = input.t[inputIndex];
+
+                # Return 0 for the unavailable value.
+                if (availability.v[inputIndex] == 0u) {
+                    output.t[outputIndex].low = 0u;
+                    output.t[outputIndex].high = 0u;
+                    return;
+                }
+
+                # Multiply input values by the period and store into output.
+                var period : f32 = params.period;
+                var w : u32 = 0u;
+
+                # If the product of low 32-bits and the period does not exceed the maximum of u32,
+                # directly do the multiplication, otherwise, use two u32 to represent the high
+                # 16-bits and low 16-bits of this u32, then multiply them by the period separately.
+                if (timestamp.low <= u32(f32(0xFFFFFFFFu) / period)) {
+                    output.t[outputIndex].low = u32(round(f32(timestamp.low) * period));
+                } else {
+                    var lo : u32 = timestamp.low & 0xFFFF;
+                    var hi : u32 = timestamp.low >> 16;
+
+                    var t0 : u32 = u32(round(f32(lo) * period));
+                    var t1 : u32 = u32(round(f32(hi) * period)) + (t0 >> 16);
+                    w = t1 >> 16;
+
+                    var result : u32 = t1 << 16;
+                    result = result | (t0 & 0xFFFF);
+                    output.t[outputIndex].low = result;
+                }
+
+                # Get the nearest integer to the float result. For high 32-bits, the round
+                # function will greatly help reduce the accuracy loss of the final result.
+                output.t[outputIndex].high = u32(round(f32(timestamp.high) * period)) + w;
+            }
+        )";
+
+        ComputePipelineBase* GetOrCreateTimestampComputePipeline(DeviceBase* device) {
+            InternalPipelineStore* store = device->GetInternalPipelineStore();
+
+            if (store->timestampComputePipeline == nullptr) {
+                // Create compute shader module if not cached before.
+                if (store->timestampCS == nullptr) {
+                    ShaderModuleDescriptor descriptor;
+                    ShaderModuleWGSLDescriptor wgslDesc;
+                    wgslDesc.source = sConvertTimestampsToNanoseconds;
+                    descriptor.nextInChain = reinterpret_cast<ChainedStruct*>(&wgslDesc);
+
+                    store->timestampCS = AcquireRef(device->CreateShaderModule(&descriptor));
+                }
+
+                // Create ComputePipeline.
+                ComputePipelineDescriptor computePipelineDesc = {};
+                // Generate the layout based on shader module.
+                computePipelineDesc.layout = nullptr;
+                computePipelineDesc.computeStage.module = store->timestampCS.Get();
+                computePipelineDesc.computeStage.entryPoint = "main";
+
+                store->timestampComputePipeline =
+                    AcquireRef(device->CreateComputePipeline(&computePipelineDesc));
+            }
+
+            return store->timestampComputePipeline.Get();
+        }
+
+    }  // anonymous namespace
+
+    void EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder,
+                                              BufferBase* input,
+                                              BufferBase* availability,
+                                              BufferBase* output,
+                                              BufferBase* params) {
+        DeviceBase* device = encoder->GetDevice();
+
+        ComputePipelineBase* pipeline = GetOrCreateTimestampComputePipeline(device);
+
+        // Prepare bind group layout.
+        Ref<BindGroupLayoutBase> layout = AcquireRef(pipeline->GetBindGroupLayout(0));
+
+        // Prepare bind group descriptor
+        std::array<BindGroupEntry, 4> bindGroupEntries = {};
+        BindGroupDescriptor bgDesc = {};
+        bgDesc.layout = layout.Get();
+        bgDesc.entryCount = 4;
+        bgDesc.entries = bindGroupEntries.data();
+
+        // Set bind group entries.
+        bindGroupEntries[0].binding = 0;
+        bindGroupEntries[0].buffer = input;
+        bindGroupEntries[0].size = input->GetSize();
+        bindGroupEntries[1].binding = 1;
+        bindGroupEntries[1].buffer = availability;
+        bindGroupEntries[1].size = availability->GetSize();
+        bindGroupEntries[2].binding = 2;
+        bindGroupEntries[2].buffer = output;
+        bindGroupEntries[2].size = output->GetSize();
+        bindGroupEntries[3].binding = 3;
+        bindGroupEntries[3].buffer = params;
+        bindGroupEntries[3].size = params->GetSize();
+
+        // Create bind group after all binding entries are set.
+        Ref<BindGroupBase> bindGroup = AcquireRef(device->CreateBindGroup(&bgDesc));
+
+        // Create compute encoder and issue dispatch.
+        ComputePassDescriptor passDesc = {};
+        Ref<ComputePassEncoder> pass = AcquireRef(encoder->BeginComputePass(&passDesc));
+        pass->SetPipeline(pipeline);
+        pass->SetBindGroup(0, bindGroup.Get());
+        pass->Dispatch(static_cast<uint32_t>(ceil((input->GetSize() / sizeof(uint64_t) + 7) / 8)));
+        pass->EndPass();
+    }
+
+}  // namespace dawn_native
diff --git a/src/dawn_native/QueryHelper.h b/src/dawn_native/QueryHelper.h
new file mode 100644
index 0000000..733475b
--- /dev/null
+++ b/src/dawn_native/QueryHelper.h
@@ -0,0 +1,41 @@
+// Copyright 2020 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DAWNNATIVE_QUERYHELPER_H_
+#define DAWNNATIVE_QUERYHELPER_H_
+
+#include "dawn_native/ObjectBase.h"
+
+namespace dawn_native {
+
+    class BufferBase;
+    class DeviceBase;
+    class CommandEncoder;
+
+    struct TimestampParams {
+        uint32_t inputByteOffset;
+        uint32_t outputByteOffset;
+        uint32_t count;
+        float period;
+    };
+
+    void EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder,
+                                              BufferBase* input,
+                                              BufferBase* availability,
+                                              BufferBase* output,
+                                              BufferBase* params);
+
+}  // namespace dawn_native
+
+#endif  // DAWNNATIVE_QUERYHELPER_H_
diff --git a/src/tests/BUILD.gn b/src/tests/BUILD.gn
index 0c5fc86..c76c9b2 100644
--- a/src/tests/BUILD.gn
+++ b/src/tests/BUILD.gn
@@ -401,7 +401,10 @@
     }
   }
 
-  sources += [ "white_box/InternalResourceUsageTests.cpp" ]
+  sources += [
+    "white_box/InternalResourceUsageTests.cpp",
+    "white_box/QueryInternalShaderTests.cpp",
+  ]
 
   if (dawn_enable_d3d12) {
     sources += [
diff --git a/src/tests/white_box/QueryInternalShaderTests.cpp b/src/tests/white_box/QueryInternalShaderTests.cpp
new file mode 100644
index 0000000..996e1c7
--- /dev/null
+++ b/src/tests/white_box/QueryInternalShaderTests.cpp
@@ -0,0 +1,203 @@
+// Copyright 2020 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/DawnTest.h"
+
+#include "dawn_native/Buffer.h"
+#include "dawn_native/CommandEncoder.h"
+#include "dawn_native/QueryHelper.h"
+#include "utils/WGPUHelpers.h"
+
+namespace {
+
+    void EncodeConvertTimestampsToNanoseconds(wgpu::CommandEncoder encoder,
+                                              wgpu::Buffer input,
+                                              wgpu::Buffer availability,
+                                              wgpu::Buffer output,
+                                              wgpu::Buffer params) {
+        dawn_native::EncodeConvertTimestampsToNanoseconds(
+            reinterpret_cast<dawn_native::CommandEncoder*>(encoder.Get()),
+            reinterpret_cast<dawn_native::BufferBase*>(input.Get()),
+            reinterpret_cast<dawn_native::BufferBase*>(availability.Get()),
+            reinterpret_cast<dawn_native::BufferBase*>(output.Get()),
+            reinterpret_cast<dawn_native::BufferBase*>(params.Get()));
+    }
+
+    class InternalShaderExpectation : public detail::Expectation {
+      public:
+        ~InternalShaderExpectation() override = default;
+
+        InternalShaderExpectation(const uint64_t* values, const unsigned int count) {
+            mExpected.assign(values, values + count);
+        }
+
+        // Expect the actual results are approximately equal to the expected values.
+        testing::AssertionResult Check(const void* data, size_t size) override {
+            DAWN_ASSERT(size == sizeof(uint64_t) * mExpected.size());
+            constexpr static float kErrorToleranceRatio = 0.002f;
+
+            const uint64_t* actual = static_cast<const uint64_t*>(data);
+            for (size_t i = 0; i < mExpected.size(); ++i) {
+                if (mExpected[i] == 0 && actual[i] != 0) {
+                    return testing::AssertionFailure()
+                           << "Expected data[" << i << "] to be 0, actual " << actual[i]
+                           << std::endl;
+                }
+
+                if (abs(static_cast<int64_t>(mExpected[i] - actual[i])) >
+                    mExpected[i] * kErrorToleranceRatio) {
+                    return testing::AssertionFailure()
+                           << "Expected data[" << i << "] to be " << mExpected[i] << ", actual "
+                           << actual[i] << ". Error rate is larger than " << kErrorToleranceRatio
+                           << std::endl;
+                }
+            }
+
+            return testing::AssertionSuccess();
+        }
+
+      private:
+        std::vector<uint64_t> mExpected;
+    };
+
+}  // anonymous namespace
+
+class QueryInternalShaderTests : public DawnTest {};
+
+// Test the accuracy of timestamp compute shader which uses unsigned 32-bit integers to simulate
+// unsigned 64-bit integers (timestamps) multiplied by float (period).
+// The arguments pass to timestamp internal pipeline:
+// - The input buffer passes the original timestamps resolved from query set (created by manual
+//   here).
+// - The availability buffer passes the data of which slot in input buffer is an initialized
+//   timestamp.
+// - The output buffer stores the converted results, expect 0 for unavailable timestamps and
+//   nanoseconds for available timestamps in an expected error rate.
+// - The params buffer passes the offset of input and output buffers, the count of timestamps and
+//   the timestamp period (here use GPU frequency (HZ) on Intel D3D12 to calculate the period in
+//   ns for testing).
+TEST_P(QueryInternalShaderTests, TimestampComputeShader) {
+    DAWN_SKIP_TEST_IF(UsesWire());
+
+    // TODO(crbug.com/tint/255, crbug.com/tint/256, crbug.com/tint/400, crbug.com/tint/417):
+    // There is no builtin support for doing the runtime array.
+    DAWN_SKIP_TEST_IF(HasToggleEnabled("use_tint_generator"));
+
+    constexpr uint32_t kTimestampCount = 10u;
+    // A gpu frequency on Intel D3D12 (ticks/second)
+    constexpr uint64_t kGPUFrequency = 12000048u;
+    constexpr uint64_t kNsPerSecond = 1000000000u;
+    // Timestamp period in nanoseconds
+    constexpr float kPeriod = static_cast<float>(kNsPerSecond) / kGPUFrequency;
+    constexpr uint64_t kOne = 1u;
+
+    // Original timestamp values for testing
+    std::array<uint64_t, kTimestampCount> timestamps;
+    timestamps[0] = 0;            // not written at beginning
+    timestamps[1] = 10079569507;  // t0
+    timestamps[2] = 10394415012;  // t1
+    timestamps[3] = 0;            // not written between timestamps
+    timestamps[4] = 11713454943;  // t2
+    timestamps[5] = 38912556941;  // t3 (big value)
+    timestamps[6] = 10080295766;  // t4 (reset)
+    timestamps[7] = 12159966783;  // t5 (after reset)
+    timestamps[8] = 12651224612;  // t6
+    timestamps[9] = 39872473956;  // t7
+
+    // Expected results: Timestamp value * kNsPerSecond / kGPUFrequency
+    std::array<uint64_t, kTimestampCount> expected;
+    // The availablility state of each timestamp
+    std::array<uint32_t, kTimestampCount> availabilities;
+
+    for (size_t i = 0; i < kTimestampCount; i++) {
+        if (timestamps[i] == 0) {
+            // Not a timestamp value, keep original value
+            expected[i] = 0u;
+            availabilities[i] = 0u;
+        } else {
+            // Maybe the timestamp * 10^9 is larger than the maximum of uint64, so cast the delta
+            // value to double (higher precision than float)
+            expected[i] = static_cast<uint64_t>(static_cast<double>(timestamps[i]) * kNsPerSecond /
+                                                kGPUFrequency);
+            availabilities[i] = 1u;
+        }
+    }
+
+    // The input storage buffer
+    wgpu::Buffer inputBuffer =
+        utils::CreateBufferFromData(device, timestamps.data(), sizeof(timestamps),
+                                    wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);
+    EXPECT_BUFFER_U64_RANGE_EQ(timestamps.data(), inputBuffer, 0, kTimestampCount);
+
+    // To indicate which value is available
+    wgpu::Buffer availabilityBuffer = utils::CreateBufferFromData(
+        device, availabilities.data(), sizeof(availabilities), wgpu::BufferUsage::Storage);
+
+    // The output storage buffer
+    wgpu::BufferDescriptor outputDesc;
+    outputDesc.size = kTimestampCount * sizeof(uint64_t);
+    outputDesc.usage =
+        wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+    wgpu::Buffer outputBuffer = device.CreateBuffer(&outputDesc);
+
+    std::array<uint64_t, kTimestampCount> ones;
+    ones.fill(kOne);
+
+    // Convert timestamps to output buffer with offset 0
+    {
+        queue.WriteBuffer(outputBuffer, 0, ones.data(), sizeof(ones));
+
+        constexpr uint32_t kOffset = 0u;
+        // The params uniform buffer
+        dawn_native::TimestampParams params = {kOffset, kOffset, kTimestampCount, kPeriod};
+        wgpu::Buffer paramsBuffer = utils::CreateBufferFromData(device, &params, sizeof(params),
+                                                                wgpu::BufferUsage::Uniform);
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+
+        EncodeConvertTimestampsToNanoseconds(encoder, inputBuffer, availabilityBuffer, outputBuffer,
+                                             paramsBuffer);
+
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER(outputBuffer, kOffset, kTimestampCount * sizeof(uint64_t),
+                      new InternalShaderExpectation(expected.data(), kTimestampCount));
+    }
+
+    // Convert timestamps to output buffer with offset 8 from input buffer with offset 8
+    {
+        queue.WriteBuffer(outputBuffer, 0, ones.data(), sizeof(ones));
+
+        constexpr uint32_t kOffset = 8u;
+        // The params uniform buffer
+        dawn_native::TimestampParams params = {kOffset, kOffset, kTimestampCount, kPeriod};
+        wgpu::Buffer paramsBuffer = utils::CreateBufferFromData(device, &params, sizeof(params),
+                                                                wgpu::BufferUsage::Uniform);
+
+        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+
+        EncodeConvertTimestampsToNanoseconds(encoder, inputBuffer, availabilityBuffer, outputBuffer,
+                                             paramsBuffer);
+
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER_U64_RANGE_EQ(&kOne, outputBuffer, 0, 1);
+        EXPECT_BUFFER(outputBuffer, kOffset, (kTimestampCount - 1) * sizeof(uint64_t),
+                      new InternalShaderExpectation(expected.data() + 1, kTimestampCount - 1));
+    }
+}
+
+DAWN_INSTANTIATE_TEST(QueryInternalShaderTests, D3D12Backend(), MetalBackend(), VulkanBackend());