Metal: Skip timestamp period estimation on Intel Iris Plus 655

On macOS with Intel Iris Plus Graphics 655, sampling CPU/GPU timestamps
to estimate the timestamp period causes the GPU to overheat due to a
driver bug. Skip the estimation on this device and use the default value
set at device initialization.

Also fix the link to the Mesa PCI IDs header.

Bug: 342701242
Change-Id: Ib00068893ad6f8a706ba9a8f329b1b3920c84e95
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/192870
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Hao Li <hao.x.li@intel.com>
diff --git a/src/dawn/common/GPUInfo.cpp b/src/dawn/common/GPUInfo.cpp
index c3d6ca7..129d079 100644
--- a/src/dawn/common/GPUInfo.cpp
+++ b/src/dawn/common/GPUInfo.cpp
@@ -38,12 +38,13 @@
 namespace {
 // Intel
 // Referenced from the following Mesa source code:
-// https://github.com/mesa3d/mesa/blob/main/include/pci_ids/iris_pci_ids.h
+// https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/include/pci_ids/iris_pci_ids.h
 // gen9
-const std::array<uint32_t, 25> Skylake = {{0x1902, 0x1906, 0x190A, 0x190B, 0x190E, 0x1912, 0x1913,
-                                           0x1915, 0x1916, 0x1917, 0x191A, 0x191B, 0x191D, 0x191E,
-                                           0x1921, 0x1923, 0x1926, 0x1927, 0x192A, 0x192B, 0x192D,
-                                           0x1932, 0x193A, 0x193B, 0x193D}};
+const std::array<PCIDeviceID, 25> Skylake = {
+    {0x1902, 0x1906, 0x190A, 0x190B, 0x190E, 0x1912, 0x1913, 0x1915, 0x1916,
+     0x1917, 0x191A, 0x191B, 0x191D, 0x191E, 0x1921, 0x1923, 0x1926, 0x1927,
+     0x192A, 0x192B, 0x192D, 0x1932, 0x193A, 0x193B, 0x193D}};
+const std::array<PCIDeviceID, 2> IrisPlus655 = {{0x3EA5, 0x3EA8}};
 
 // According to Intel graphics driver version schema, build number is generated from the
 // last two fields.
@@ -118,4 +119,8 @@
     return std::find(Skylake.cbegin(), Skylake.cend(), deviceId) != Skylake.cend();
 }
 
+bool IsIrisPlus655(PCIDeviceID deviceId) {
+    return std::find(IrisPlus655.cbegin(), IrisPlus655.cend(), deviceId) != IrisPlus655.cend();
+}
+
 }  // namespace dawn::gpu_info
diff --git a/src/dawn/common/GPUInfo.h b/src/dawn/common/GPUInfo.h
index 1ad6238..8a35361 100644
--- a/src/dawn/common/GPUInfo.h
+++ b/src/dawn/common/GPUInfo.h
@@ -73,6 +73,7 @@
 
 // Intel architectures
 bool IsSkylake(PCIDeviceID deviceId);
+bool IsIrisPlus655(PCIDeviceID deviceId);
 
 }  // namespace dawn::gpu_info
 #endif  // SRC_DAWN_COMMON_GPUINFO_H_
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index 2d9d36f..d5cd62e 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -340,6 +340,13 @@
       "This toggle is enabled by default on Metal backend where GPU counters cannot be stored to"
       "sampleBufferAttachments on empty blit encoder.",
       "https://crbug.com/dawn/1473", ToggleStage::Device}},
+    {Toggle::MetalDisableTimestampPeriodEstimation,
+     {"metal_disable_timestamp_period_estimation",
+      "Calling sampleTimestamps:gpuTimestamp: from MTLDevice to estimate timestamp period leads to "
+      "GPU overheating on some specific Intel GPUs due to driver issue, such as Intel Iris "
+      "Plus Graphics 655. Enable this workaround to skip timestamp period estimation and use a "
+      "default value instead on the specific GPUs.",
+      "https://crbug.com/342701242", ToggleStage::Device}},
     {Toggle::VulkanSplitCommandBufferOnComputePassAfterRenderPass,
      {"vulkan_split_command_buffer_on_compute_pass_after_render_pass",
       "Splits any command buffer where a compute pass is recorded after a render pass. This "
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index cb343ee..81db67e 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -95,6 +95,7 @@
     D3D12UseTempBufferInTextureToTextureCopyBetweenDifferentDimensions,
     ApplyClearBigIntegerColorValueWithDraw,
     MetalUseMockBlitEncoderForWriteTimestamp,
+    MetalDisableTimestampPeriodEstimation,
     VulkanSplitCommandBufferOnComputePassAfterRenderPass,
     DisableSubAllocationFor2DTextureWithCopyDstOrRenderAttachment,
     MetalUseCombinedDepthStencilFormatForStencil8,
diff --git a/src/dawn/native/metal/DeviceMTL.mm b/src/dawn/native/metal/DeviceMTL.mm
index 73c810e..d2f669e 100644
--- a/src/dawn/native/metal/DeviceMTL.mm
+++ b/src/dawn/native/metal/DeviceMTL.mm
@@ -171,17 +171,20 @@
         // an accurate value by the following calculations.
         mTimestampPeriod = gpu_info::IsIntel(GetPhysicalDevice()->GetVendorId()) ? 83.333f : 1.0f;
 
-        // Initialize kalman filter parameters
-        mKalmanInfo = std::make_unique<KalmanInfo>();
-        mKalmanInfo->filterValue = 0.0f;
-        mKalmanInfo->kalmanGain = 0.5f;
-        mKalmanInfo->R = 0.0001f;  // The smaller this value is, the smaller the error of measured
-                                   // value is, the more we can trust the measured value.
-        mKalmanInfo->P = 1.0f;
-
         if (@available(macOS 10.15, iOS 14.0, *)) {
-            // Sample CPU timestamp and GPU timestamp for first time at device creation
-            [*mMtlDevice sampleTimestamps:&mCpuTimestamp gpuTimestamp:&mGpuTimestamp];
+            if (!IsToggleEnabled(Toggle::MetalDisableTimestampPeriodEstimation)) {
+                // Initialize kalman filter parameters
+                mKalmanInfo = std::make_unique<KalmanInfo>();
+                mKalmanInfo->filterValue = 0.0f;
+                mKalmanInfo->kalmanGain = 0.5f;
+                mKalmanInfo->R =
+                    0.0001f;  // The smaller this value is, the smaller the error of measured
+                              // value is, the more we can trust the measured value.
+                mKalmanInfo->P = 1.0f;
+
+                // Sample CPU timestamp and GPU timestamp for first time at device creation
+                [*mMtlDevice sampleTimestamps:&mCpuTimestamp gpuTimestamp:&mGpuTimestamp];
+            }
         }
     }
 
@@ -307,9 +310,10 @@
 MaybeError Device::TickImpl() {
     DAWN_TRY(ToBackend(GetQueue())->SubmitPendingCommandBuffer());
 
-    // Just run timestamp period calculation when timestamp feature is enabled and timestamp
-    // conversion is not disabled.
-    if (mIsTimestampQueryEnabled && !IsToggleEnabled(Toggle::DisableTimestampQueryConversion)) {
+    // Just run timestamp period estimation when timestamp feature is enabled and timestamp
+    // conversion is not disabled and the estimation is not disabled.
+    if (mIsTimestampQueryEnabled && !IsToggleEnabled(Toggle::DisableTimestampQueryConversion) &&
+        !IsToggleEnabled(Toggle::MetalDisableTimestampPeriodEstimation)) {
         if (@available(macOS 10.15, iOS 14.0, *)) {
             UpdateTimestampPeriod(GetMTLDevice(), mKalmanInfo.get(), &mCpuTimestamp, &mGpuTimestamp,
                                   &mTimestampPeriod);
diff --git a/src/dawn/native/metal/PhysicalDeviceMTL.mm b/src/dawn/native/metal/PhysicalDeviceMTL.mm
index 665d991..13d73cc 100644
--- a/src/dawn/native/metal/PhysicalDeviceMTL.mm
+++ b/src/dawn/native/metal/PhysicalDeviceMTL.mm
@@ -490,6 +490,16 @@
         deviceToggles->Default(Toggle::MetalUseMockBlitEncoderForWriteTimestamp, true);
     }
 
+    // sampleTimestamps:gpuTimestamp: on MTLDevice is available on macOS 10.15+ / iOS 14.0+ and is
+    // used to capture CPU and GPU timestamps for estimating the GPU timestamp period, but the call
+    // causes GPU overheating on Intel Iris Plus Graphics 655 due to a driver bug. Skip the
+    // timestamp sampling on that device as a workaround. The availability check must match the one
+    // guarding the sampling in DeviceMTL.mm. See https://crbug.com/342701242 for more details.
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        deviceToggles->Default(Toggle::MetalDisableTimestampPeriodEstimation,
+                               gpu_info::IsIrisPlus655(deviceId));
+    }
+
 #if DAWN_PLATFORM_IS(MACOS)
     if (gpu_info::IsIntel(vendorId)) {
         deviceToggles->Default(