[Vulkan] Retry the fence wait on VK_TIMEOUT when the maximum timeout is requested

Change-Id: Ie628e47a3556239c4eccad84936eb19ace65e188
Bug: 40073661, 344798087
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/211014
Commit-Queue: Colin Blundell <blundell@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn/native/vulkan/QueueVk.cpp b/src/dawn/native/vulkan/QueueVk.cpp
index aa87b5a..0e40ea5 100644
--- a/src/dawn/native/vulkan/QueueVk.cpp
+++ b/src/dawn/native/vulkan/QueueVk.cpp
@@ -27,6 +27,7 @@
 
 #include "dawn/native/vulkan/QueueVk.h"
 
+#include <limits>
 #include <optional>
 #include <utility>
 
@@ -520,32 +521,54 @@
 ResultOrError<bool> Queue::WaitForQueueSerial(ExecutionSerial serial, Nanoseconds timeout) {
     Device* device = ToBackend(GetDevice());
     VkDevice vkDevice = device->GetVkDevice();
-    VkResult waitResult = mFencesInFlight.Use([&](auto fencesInFlight) {
-        // Search from for the first fence >= serial.
-        VkFence waitFence = VK_NULL_HANDLE;
-        for (auto it = fencesInFlight->begin(); it != fencesInFlight->end(); ++it) {
-            if (it->second >= serial) {
-                waitFence = it->first;
-                break;
+    // If the client has passed a finite timeout, the function will eventually return due to
+    // either (1) the fences being signaled, (2) the timeout being reached, or (3) the device
+    // being lost. If the client has passed an infinite timeout, this function might hang forever
+    // if the fences are never signaled (which matches the semantics that the client has
+    // specified).
+    // TODO(crbug.com/344798087): Handle the issue of timeouts in a more general way further up the
+    // stack.
+    while (1) {
+        VkResult waitResult = mFencesInFlight.Use([&](auto fencesInFlight) {
+            // Search for the first fence whose serial is >= the requested serial.
+            VkFence waitFence = VK_NULL_HANDLE;
+            for (auto it = fencesInFlight->begin(); it != fencesInFlight->end(); ++it) {
+                if (it->second >= serial) {
+                    waitFence = it->first;
+                    break;
+                }
             }
+            if (waitFence == VK_NULL_HANDLE) {
+                // Fence not found. This serial must have already completed.
+                // Return a VK_SUCCESS status.
+                DAWN_ASSERT(serial <= GetCompletedCommandSerial());
+                return VkResult::WrapUnsafe(VK_SUCCESS);
+            }
+            // Wait for that single fence, passing the caller's timeout as a uint64_t.
+            return VkResult::WrapUnsafe(
+                INJECT_ERROR_OR_RUN(device->fn.WaitForFences(vkDevice, 1, &*waitFence, true,
+                                                             static_cast<uint64_t>(timeout)),
+                                    VK_ERROR_DEVICE_LOST));
+        });
+        if (waitResult == VK_TIMEOUT) {
+            // There is evidence that `VK_TIMEOUT` can get returned even when the
+            // client has specified an infinite timeout (e.g., due to signals). Retry
+            // waiting on the fence in this case in order to satisfy the semantics
+            // that the function should return only when either (a) the fences are
+            // signaled or (b) the passed-in timeout is reached. Note that this can
+            // result in this function busy-looping forever in this case, but the
+            // client has explicitly requested this behavior by passing in an infinite
+            // timeout.
+            // TODO(crbug.com/344798087): Handle the issue of timeouts in a more general way further
+            // up the stack.
+            if (static_cast<uint64_t>(timeout) == std::numeric_limits<uint64_t>::max()) {
+                continue;
+            }
+            return false;
         }
-        if (waitFence == VK_NULL_HANDLE) {
-            // Fence not found. This serial must have already completed.
-            // Return a VK_SUCCESS status.
-            DAWN_ASSERT(serial <= GetCompletedCommandSerial());
-            return VkResult::WrapUnsafe(VK_SUCCESS);
-        }
-        // Wait for the fence.
-        return VkResult::WrapUnsafe(
-            INJECT_ERROR_OR_RUN(device->fn.WaitForFences(vkDevice, 1, &*waitFence, true,
-                                                         static_cast<uint64_t>(timeout)),
-                                VK_ERROR_DEVICE_LOST));
-    });
-    if (waitResult == VK_TIMEOUT) {
-        return false;
+        DAWN_TRY(CheckVkSuccess(::VkResult(waitResult), "vkWaitForFences"));
+        return true;
     }
-    DAWN_TRY(CheckVkSuccess(::VkResult(waitResult), "vkWaitForFences"));
-    return true;
 }
 
 }  // namespace dawn::native::vulkan