[dawn][native] Make a separate entry point for ExecutionQueue for callbacks.

- We have seen lock inversion issues w.r.t the lock in ExecutionQueue
  and the device lock before (see 417802523). In order to avoid this
  as the device-lock is a temporary solution for multithreading, this
  change introduces a separate function that we can use to implement
  new code without running into the lock ordering issue.
- This change also depends on the fact that the device lock is now
  recursively acquireable and makes all implementations (modulo Metal)
  explicitly acquire the device lock for CheckAndUpdateCompletedSerials.
  We may be able to remove some of those locks, as we go forwards hence
  the TODOs.
- This change reverts the change in:
  https://dawn-review.googlesource.com/c/dawn/+/246534 so that we can
  guarantee callback ordering. For context, if we call the callbacks
  like we did before outside of the lock after moving them into a
  temporary vector, two threads updating serials 1 and 2 respectively
  can race and each remove one task, then in the non-critical section,
  serial 2's thread can run first resulting in what appears to be
  out of order callback execution. This design has some restrictions
  documented in the comments, but may be alleviated in the future if
  needed by using a recursive mutex for the waiting tasks in
  ExecutionQueue.

Bug: 412761228, 412761367, 40643114, 42240396
Change-Id: I61b217db066d09f22756c5a242968cffc9c9f19c
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/249099
Commit-Queue: Loko Kung <lokokung@google.com>
Reviewed-by: Kai Ninomiya <kainino@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/dawn/native/EventManager.cpp b/src/dawn/native/EventManager.cpp
index 6162896..0ef0401 100644
--- a/src/dawn/native/EventManager.cpp
+++ b/src/dawn/native/EventManager.cpp
@@ -162,32 +162,31 @@
         auto waitSerial = queueAndSerial.second;
 
         auto* device = queue->GetDevice();
-        {
-            auto deviceGuard = device->GetGuard();
-            [[maybe_unused]] bool error = device->ConsumedError(
-                [&]() -> MaybeError {
-                    if (waitSerial > queue->GetLastSubmittedCommandSerial()) {
-                        // Serial has not been submitted yet. Submit it now.
-                        DAWN_TRY(queue->EnsureCommandsFlushed(waitSerial));
+        [[maybe_unused]] bool error;
+        error = device->ConsumedError(
+            [&]() -> MaybeError {
+                auto deviceGuard = device->GetGuard();
+
+                if (waitSerial > queue->GetLastSubmittedCommandSerial()) {
+                    // Serial has not been submitted yet. Submit it now.
+                    DAWN_TRY(queue->EnsureCommandsFlushed(waitSerial));
+                }
+                // Check the completed serial.
+                if (waitSerial > queue->GetCompletedCommandSerial()) {
+                    if (timeout > Nanoseconds(0)) {
+                        // Wait on the serial if it hasn't passed yet.
+                        [[maybe_unused]] bool waitResult = false;
+                        DAWN_TRY_ASSIGN(waitResult, queue->WaitForQueueSerial(waitSerial, timeout));
                     }
-                    // Check the completed serial.
-                    if (waitSerial > queue->GetCompletedCommandSerial()) {
-                        if (timeout > Nanoseconds(0)) {
-                            // Wait on the serial if it hasn't passed yet.
-                            [[maybe_unused]] bool waitResult = false;
-                            DAWN_TRY_ASSIGN(waitResult,
-                                            queue->WaitForQueueSerial(waitSerial, timeout));
-                        }
-                        // Update completed serials.
-                        DAWN_TRY(queue->CheckPassedSerials());
-                    }
-                    return {};
-                }(),
-                "waiting for work in %s.", queue.Get());
-        }
-        // TODO(crbug.com/421945313): Checking and updating serials cannot hold the device-wide lock
-        // because it may cause user callbacks to fire.
-        queue->UpdateCompletedSerial(queue->GetCompletedCommandSerial());
+                }
+                return {};
+            }(),
+            "waiting for work in %s.", queue.Get());
+
+        // Updating completed serial cannot hold the device-wide lock because it may cause user
+        // callbacks to fire.
+        error = device->ConsumedError(queue->UpdateCompletedSerial(),
+                                      "updating completed serial in %s", queue.Get());
     }
 }
 
diff --git a/src/dawn/native/ExecutionQueue.cpp b/src/dawn/native/ExecutionQueue.cpp
index 4645106..b0df0d0 100644
--- a/src/dawn/native/ExecutionQueue.cpp
+++ b/src/dawn/native/ExecutionQueue.cpp
@@ -56,10 +56,16 @@
 
     // Atomically set mCompletedSerial to completedSerial if completedSerial is larger.
     FetchMax(mCompletedSerial, uint64_t(completedSerial));
+    return {};
+}
 
-    // TODO(crbug.com/421945313): We should call |UpdateCompletedSerial| here also, but since some
-    // backends rely on the device lock for safe use of |CheckAndUpdateCompletedSerials|, we
-    // separate that call out for now.
+MaybeError ExecutionQueueBase::UpdateCompletedSerial() {
+    ExecutionSerial completedSerial;
+    DAWN_TRY_ASSIGN(completedSerial, CheckAndUpdateCompletedSerials());
+
+    DAWN_ASSERT(completedSerial <=
+                ExecutionSerial(mLastSubmittedSerial.load(std::memory_order_acquire)));
+    UpdateCompletedSerialTo(completedSerial);
     return {};
 }
 
@@ -71,20 +77,20 @@
     mWaitingTasks->Enqueue(std::move(task), serial);
 }
 
-void ExecutionQueueBase::UpdateCompletedSerial(ExecutionSerial completedSerial) {
+void ExecutionQueueBase::UpdateCompletedSerialTo(ExecutionSerial completedSerial) {
     // Atomically set mCompletedSerial to completedSerial if completedSerial is larger.
     FetchMax(mCompletedSerial, uint64_t(completedSerial));
 
-    std::vector<Task> pending;
     mWaitingTasks.Use([&](auto tasks) {
+        // Note that the callbacks need to be called while holding this lock to ensure that callback
+        // ordering is enforced. At the moment, that means that re-entrant callbacks that call async
+        // APIs will deadlock. In the future we may consider making this a recursive lock or
+        // providing some other mechanism in order to enable re-entrant async APIs.
         for (auto task : tasks->IterateUpTo(completedSerial)) {
-            pending.push_back(std::move(task));
+            task();
         }
         tasks->ClearUpTo(completedSerial);
     });
-    for (auto task : pending) {
-        task();
-    }
 }
 
 MaybeError ExecutionQueueBase::EnsureCommandsFlushed(ExecutionSerial serial) {
@@ -115,7 +121,7 @@
     // thread-safe yet. Two threads calling destroy would race setting these serials.
     ExecutionSerial completed =
         ExecutionSerial(mLastSubmittedSerial.fetch_add(1u, std::memory_order_release) + 1);
-    UpdateCompletedSerial(completed);
+    UpdateCompletedSerialTo(completed);
 }
 
 void ExecutionQueueBase::IncrementLastSubmittedCommandSerial() {
diff --git a/src/dawn/native/ExecutionQueue.h b/src/dawn/native/ExecutionQueue.h
index 13692f9..ff20f78 100644
--- a/src/dawn/native/ExecutionQueue.h
+++ b/src/dawn/native/ExecutionQueue.h
@@ -58,8 +58,14 @@
     // Whether the execution queue has scheduled commands to be submitted or executing.
     bool HasScheduledCommands() const;
 
-    // Check for passed fences and set the new completed serial.
+    // Check for passed fences and set the new completed serial. Note that the two functions below
+    // effectively do the same thing initially, however, |UpdateCompletedSerials| requires
+    // that the device-wide lock is NOT held since it may additionally trigger user callbacks. Note
+    // that for the purpose of going forwards, |CheckPassedSerials| should not be used anymore since
+    // it assumes that the device-wide lock IS held.
+    // TODO(crbug.com/42240396): Remove |CheckPassedSerials| in favor of |UpdateCompletedSerial|.
     MaybeError CheckPassedSerials();
+    MaybeError UpdateCompletedSerial();
 
     // For the commands being internally recorded in backend, that were not urgent to submit, this
     // method makes them to be submitted as soon as possible in next ticks.
@@ -97,13 +103,6 @@
     // example, until the client has explictly issued a submission.
     enum class SubmitMode { Normal, Passive };
 
-    // Currently, the queue has two paths for serial updating, one is via DeviceBase::Tick which
-    // calls into the backend specific polling mechanisms implemented in
-    // CheckAndUpdateCompletedSerials. Alternatively, the backend can actively call
-    // UpdateCompletedSerial when a new serial is complete to make forward progress proactively.
-    // TODO(crbug.com/421945313): This shouldn't need to be public once we fix lock ordering.
-    void UpdateCompletedSerial(ExecutionSerial completedSerial);
-
     // Tracks whether we are in a submit to avoid submit reentrancy. Reentrancy could otherwise
     // happen when allocating resources or staging memory during submission (for workarounds, or
     // emulation) and the heuristics ask for an early submit to happen (which would cause a
@@ -112,6 +111,13 @@
     // at which point this member can be private.
     bool mInSubmit = false;
 
+  protected:
+    // Currently, the queue has two paths for serial updating, one is via DeviceBase::Tick which
+    // calls into the backend specific polling mechanisms implemented in
+    // CheckAndUpdateCompletedSerials. Alternatively, the backend can actively call
+    // UpdateCompletedSerial when a new serial is complete to make forward progress proactively.
+    void UpdateCompletedSerialTo(ExecutionSerial completedSerial);
+
   private:
     // Each backend should implement to check their passed fences if there are any and return a
     // completed serial. Return 0 should indicate no fences to check.
diff --git a/src/dawn/native/d3d11/QueueD3D11.cpp b/src/dawn/native/d3d11/QueueD3D11.cpp
index abe671d..87d4837 100644
--- a/src/dawn/native/d3d11/QueueD3D11.cpp
+++ b/src/dawn/native/d3d11/QueueD3D11.cpp
@@ -408,6 +408,9 @@
 }
 
 ResultOrError<ExecutionSerial> MonitoredFenceQueue::CheckAndUpdateCompletedSerials() {
+    // TODO(crbug.com/40643114): Revisit whether this lock is needed for this backend.
+    auto deviceGuard = GetDevice()->GetGuard();
+
     ExecutionSerial completedSerial = ExecutionSerial(mFence->GetCompletedValue());
     if (completedSerial == ExecutionSerial(UINT64_MAX)) [[unlikely]] {
         // GetCompletedValue returns UINT64_MAX if the device was removed.
@@ -466,6 +469,9 @@
 }
 
 ResultOrError<ExecutionSerial> SystemEventQueue::CheckAndUpdateCompletedSerials() {
+    // TODO(crbug.com/40643114): Revisit whether this lock is needed for this backend.
+    auto deviceGuard = GetDevice()->GetGuard();
+
     ExecutionSerial completedSerial;
     std::vector<SystemEventReceiver> returnedReceivers;
     // Check for completed events in the pending list.
@@ -658,6 +664,9 @@
 }
 
 ResultOrError<ExecutionSerial> DelayFlushQueue::CheckAndUpdateCompletedSerials() {
+    // TODO(crbug.com/40643114): Revisit whether this lock is needed for this backend.
+    auto deviceGuard = GetDevice()->GetGuard();
+
     ExecutionSerial completedSerial;
     {
         auto commandContext = GetScopedPendingCommandContext(SubmitMode::Passive);
diff --git a/src/dawn/native/d3d12/QueueD3D12.cpp b/src/dawn/native/d3d12/QueueD3D12.cpp
index 2c6d610..652a2bc 100644
--- a/src/dawn/native/d3d12/QueueD3D12.cpp
+++ b/src/dawn/native/d3d12/QueueD3D12.cpp
@@ -170,6 +170,9 @@
 }
 
 ResultOrError<ExecutionSerial> Queue::CheckAndUpdateCompletedSerials() {
+    // TODO(crbug.com/40643114): Revisit whether this lock is needed for this backend.
+    auto deviceGuard = GetDevice()->GetGuard();
+
     ExecutionSerial completedSerial = ExecutionSerial(mFence->GetCompletedValue());
     if (completedSerial == ExecutionSerial(UINT64_MAX)) [[unlikely]] {
         // GetCompletedValue returns UINT64_MAX if the device was removed.
diff --git a/src/dawn/native/metal/QueueMTL.mm b/src/dawn/native/metal/QueueMTL.mm
index e1d959f..9bb90ba 100644
--- a/src/dawn/native/metal/QueueMTL.mm
+++ b/src/dawn/native/metal/QueueMTL.mm
@@ -166,7 +166,7 @@
         TRACE_EVENT_ASYNC_END0(platform, GPUWork, "DeviceMTL::SubmitPendingCommandBuffer",
                                uint64_t(pendingSerial));
 
-        this->UpdateCompletedSerial(pendingSerial);
+        this->UpdateCompletedSerialTo(pendingSerial);
         this->UpdateWaitingEvents(pendingSerial);
     }];
 
diff --git a/src/dawn/native/opengl/QueueGL.cpp b/src/dawn/native/opengl/QueueGL.cpp
index 68fcd29..d97c180 100644
--- a/src/dawn/native/opengl/QueueGL.cpp
+++ b/src/dawn/native/opengl/QueueGL.cpp
@@ -281,6 +281,9 @@
 }
 
 ResultOrError<ExecutionSerial> Queue::CheckAndUpdateCompletedSerials() {
+    // TODO(crbug.com/40643114): Revisit whether this lock is needed for this backend.
+    auto deviceGuard = GetDevice()->GetGuard();
+
     return mFencesInFlight.Use([&](auto fencesInFlight) -> ResultOrError<ExecutionSerial> {
         ExecutionSerial fenceSerial{0};
         while (!fencesInFlight->empty()) {
diff --git a/src/dawn/native/vulkan/QueueVk.cpp b/src/dawn/native/vulkan/QueueVk.cpp
index 225426c..7652951 100644
--- a/src/dawn/native/vulkan/QueueVk.cpp
+++ b/src/dawn/native/vulkan/QueueVk.cpp
@@ -124,6 +124,9 @@
 }
 
 ResultOrError<ExecutionSerial> Queue::CheckAndUpdateCompletedSerials() {
+    // TODO(crbug.com/40643114): Revisit whether this lock is needed for this backend.
+    auto deviceGuard = GetDevice()->GetGuard();
+
     Device* device = ToBackend(GetDevice());
     return mFencesInFlight.Use([&](auto fencesInFlight) -> ResultOrError<ExecutionSerial> {
         ExecutionSerial fenceSerial(0);