[dawn][native] Refactors ExecutionQueue to be an ApiObject.

- For a while I was avoiding changing the class hierarchy to try to
  keep the ExecutionQueue separate from the Device, but with the
  locking issues, it was too hard to keep it separate and by
  changing it, it becomes a lot easier to manage the callbacks and
  ensure that we take the device lock in places to ensure that we
  don't ever run into lock inversion issues. This also lets us
  introduce a toggle so that we can incrementally implement thread
  safety across different backends.
- Using the new toggle, we revert some of the changes from
  https://dawn-review.googlesource.com/c/dawn/+/252918 for backends
  other than Metal.

Bug: 430401631, 412761228
Include-Ci-Only-Tests: true
Change-Id: I2a0b04287789f42157b44cd48d99d0d988171799
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/251875
Commit-Queue: Loko Kung <lokokung@google.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Brandon Jones <bajones@chromium.org>
diff --git a/src/dawn/native/EventManager.cpp b/src/dawn/native/EventManager.cpp
index dd1b6e6..f14c3ef 100644
--- a/src/dawn/native/EventManager.cpp
+++ b/src/dawn/native/EventManager.cpp
@@ -161,32 +161,8 @@
         }
         auto waitSerial = queueAndSerial.second;
 
-        auto* device = queue->GetDevice();
-        [[maybe_unused]] bool error;
-        error = device->ConsumedError(
-            [&]() -> MaybeError {
-                auto deviceGuard = device->GetGuard();
-
-                if (waitSerial > queue->GetLastSubmittedCommandSerial()) {
-                    // Serial has not been submitted yet. Submit it now.
-                    DAWN_TRY(queue->EnsureCommandsFlushed(waitSerial));
-                }
-                // Check the completed serial.
-                if (waitSerial > queue->GetCompletedCommandSerial()) {
-                    if (timeout > Nanoseconds(0)) {
-                        // Wait on the serial if it hasn't passed yet.
-                        [[maybe_unused]] bool waitResult = false;
-                        DAWN_TRY_ASSIGN(waitResult, queue->WaitForQueueSerial(waitSerial, timeout));
-                    }
-                }
-                return {};
-            }(),
-            "waiting for work in %s.", queue.Get());
-
-        // Updating completed serial cannot hold the device-wide lock because it may cause user
-        // callbacks to fire.
-        error = device->ConsumedError(queue->UpdateCompletedSerial(),
-                                      "updating completed serial in %s", queue.Get());
+        [[maybe_unused]] bool hadError = queue->GetDevice()->ConsumedError(
+            queue->WaitForQueueSerial(waitSerial, timeout), "waiting for work in %s.", queue.Get());
     }
 }
 
@@ -351,9 +327,9 @@
 
     if (const auto* queueAndSerial = event->GetIfQueueAndSerial()) {
         if (auto q = queueAndSerial->queue.Promote()) {
-            q->TrackSerialTask(queueAndSerial->completionSerial, [this, event]() {
+            q->TrackSerialTask(queueAndSerial->completionSerial, [this, q, event]() {
                 // If this is executed, we can be sure that the raw pointer to this EventManager is
-                // valid because the Queue is alive and:
+                // valid because the Queue is kept alive by the lambda capture and:
                 //   Queue -[refs]->
                 //     Device -[refs]->
                 //       Adapter -[refs]->
diff --git a/src/dawn/native/ExecutionQueue.cpp b/src/dawn/native/ExecutionQueue.cpp
index 695c756..707eadd 100644
--- a/src/dawn/native/ExecutionQueue.cpp
+++ b/src/dawn/native/ExecutionQueue.cpp
@@ -32,6 +32,7 @@
 #include <vector>
 
 #include "dawn/common/Atomic.h"
+#include "dawn/native/Device.h"
 
 namespace dawn::native {
 
@@ -47,23 +48,61 @@
     return ExecutionSerial(mCompletedSerial.load(std::memory_order_acquire));
 }
 
-ResultOrError<bool> ExecutionQueueBase::WaitForQueueSerial(ExecutionSerial serial,
-                                                           Nanoseconds timeout) {
-    ExecutionSerial completedSerial;
-    DAWN_TRY_ASSIGN(completedSerial, WaitForQueueSerialImpl(serial, timeout));
+MaybeError ExecutionQueueBase::WaitForQueueSerial(ExecutionSerial waitSerial, Nanoseconds timeout) {
+    // We currently have two differing implementations for this function depending on whether the
+    // backend supports thread safe waits. Note that while currently only the Metal backend
+    // explicitly enables thread safe wait, the main blocking backend is D3D11 which is using the
+    // value of |mCompletedSerial| within it's implementation of |CheckAndUpdateCompletedSerials|.
+    if (GetDevice()->IsToggleEnabled(Toggle::WaitIsThreadSafe)) {
+        {
+            auto deviceGuard = GetDevice()->GetGuard();
+            if (waitSerial > GetLastSubmittedCommandSerial()) {
+                // Serial has not been submitted yet. Submit it now.
+                DAWN_TRY(EnsureCommandsFlushed(waitSerial));
+            }
+        }
 
-    if (completedSerial == kWaitSerialTimeout) {
-        return false;
+        // Serial is already complete.
+        if (waitSerial <= GetCompletedCommandSerial()) {
+            return {};
+        }
+
+        if (timeout > Nanoseconds(0)) {
+            // Wait on the serial if it hasn't passed yet.
+            ExecutionSerial completedSerial = kWaitSerialTimeout;
+            DAWN_TRY_ASSIGN(completedSerial, WaitForQueueSerialImpl(waitSerial, timeout));
+            UpdateCompletedSerialTo(completedSerial);
+            return {};
+        }
+        return UpdateCompletedSerial();
+    } else {
+        // Otherwise, we need to acquire the device lock first.
+        auto deviceGuard = GetDevice()->GetGuard();
+        if (waitSerial > GetLastSubmittedCommandSerial()) {
+            // Serial has not been submitted yet. Submit it now.
+            DAWN_TRY(EnsureCommandsFlushed(waitSerial));
+        }
+
+        // Serial is already complete.
+        if (waitSerial <= GetCompletedCommandSerial()) {
+            return UpdateCompletedSerial();
+        }
+
+        if (timeout > Nanoseconds(0)) {
+            // Wait on the serial if it hasn't passed yet.
+            ExecutionSerial completedSerial = kWaitSerialTimeout;
+            DAWN_TRY_ASSIGN(completedSerial, WaitForQueueSerialImpl(waitSerial, timeout));
+
+            // It's critical to update the completed serial right away. If fences are processed
+            // by another thread before CheckAndUpdateCompletedSerials() runs on the current
+            // thread, the fence list will be empty, preventing the current thread from
+            // determining the true latest serial. Preemptively updating mCompletedSerial
+            // ensures CheckAndUpdateCompletedSerials() returns an accurate value, preventing
+            // stale data.
+            FetchMax(mCompletedSerial, uint64_t(completedSerial));
+        }
+        return UpdateCompletedSerial();
     }
-
-    // It's critical to update the completed serial right away. If fences are processed by another
-    // thread before CheckAndUpdateCompletedSerials() runs on the current thread,
-    // the fence list will be empty, preventing the current thread from determining the true latest
-    // serial. Pre-emptively updating mCompletedSerial ensures CheckAndUpdateCompletedSerials()
-    // returns an accurate value, preventing stale data.
-    FetchMax(mCompletedSerial, uint64_t(completedSerial));
-
-    return true;
 }
 
 MaybeError ExecutionQueueBase::WaitForIdleForDestruction() {
diff --git a/src/dawn/native/ExecutionQueue.h b/src/dawn/native/ExecutionQueue.h
index 720dd45..85c7a65 100644
--- a/src/dawn/native/ExecutionQueue.h
+++ b/src/dawn/native/ExecutionQueue.h
@@ -35,6 +35,7 @@
 #include "dawn/common/SerialMap.h"
 #include "dawn/native/Error.h"
 #include "dawn/native/IntegerTypes.h"
+#include "dawn/native/ObjectBase.h"
 #include "partition_alloc/pointers/raw_ptr.h"
 
 namespace dawn::native {
@@ -43,7 +44,7 @@
 // update of the various ExecutionSerials related to that work.
 // TODO(dawn:831, dawn:1413): Make usage of the ExecutionQueue thread-safe. Right now it is
 // only partially safe - where observation of the last-submitted and pending serials is atomic.
-class ExecutionQueueBase {
+class ExecutionQueueBase : public ApiObjectBase {
   public:
     using Task = std::function<void()>;
 
@@ -59,10 +60,9 @@
     bool HasScheduledCommands() const;
 
     // Check for passed fences and set the new completed serial. Note that the two functions below
-    // effectively do the same thing initially, however, |UpdateCompletedSerials| requires
-    // that the device-wide lock is NOT held since it may additionally trigger user callbacks. Note
-    // that for the purpose of going forwards, |CheckPassedSerials| should not be used anymore since
-    // it assumes that the device-wide lock IS held.
+    // effectively do the same thing initially, however, |UpdateCompletedSerials| may additionally
+    // trigger user callbacks. Note that for the purpose of going forwards, |CheckPassedSerials|
+    // should not be used anymore.
     // TODO(crbug.com/42240396): Remove |CheckPassedSerials| in favor of |UpdateCompletedSerial|.
     MaybeError CheckPassedSerials();
     MaybeError UpdateCompletedSerial();
@@ -90,9 +90,8 @@
     // called since the driver already closed all resources.
     MaybeError WaitForIdleForDestruction();
 
-    // Wait at most `timeout` synchronously for the ExecutionSerial to pass. Returns true
-    // if the serial passed.
-    ResultOrError<bool> WaitForQueueSerial(ExecutionSerial serial, Nanoseconds timeout);
+    // Wait at most `timeout` synchronously for the ExecutionSerial to pass.
+    MaybeError WaitForQueueSerial(ExecutionSerial serial, Nanoseconds timeout);
 
     // Tracks a new task to complete when |serial| is reached.
     void TrackSerialTask(ExecutionSerial serial, Task&& task);
@@ -111,6 +110,8 @@
     bool mInSubmit = false;
 
   protected:
+    using ApiObjectBase::ApiObjectBase;
+
     static constexpr ExecutionSerial kWaitSerialTimeout = kBeginningOfGPUTime;
 
     // Currently, the queue has two paths for serial updating, one is via DeviceBase::Tick which
@@ -132,20 +133,22 @@
     // Backend specific wait for idle function.
     virtual MaybeError WaitForIdleForDestructionImpl() = 0;
 
-    // mCompletedSerial tracks the last completed command serial that the fence has returned.
-    // mLastSubmittedSerial tracks the last submitted command serial.
-    // During device removal, the serials could be artificially incremented
-    // to make it appear as if commands have been compeleted.
-    std::atomic<uint64_t> mCompletedSerial = static_cast<uint64_t>(kBeginningOfGPUTime);
-    std::atomic<uint64_t> mLastSubmittedSerial = static_cast<uint64_t>(kBeginningOfGPUTime);
-
-    MutexProtected<SerialMap<ExecutionSerial, Task>> mWaitingTasks;
-
     // Indicates whether the backend has pending commands to be submitted as soon as possible.
     virtual bool HasPendingCommands() const = 0;
 
     // Submit any pending commands that are enqueued.
     virtual MaybeError SubmitPendingCommandsImpl() = 0;
+
+    // |mCompletedSerial| tracks the last completed command serial that the fence has returned. This
+    // is currently implicitly guarded by the lock for |mWaitingTasks| since we only update this
+    // value when holding that lock.
+    // |mLastSubmittedSerial| tracks the last submitted command serial.
+    // During device removal, the serials could be artificially incremented to make it appear as if
+    // commands have been completed.
+    std::atomic<uint64_t> mCompletedSerial = static_cast<uint64_t>(kBeginningOfGPUTime);
+    std::atomic<uint64_t> mLastSubmittedSerial = static_cast<uint64_t>(kBeginningOfGPUTime);
+
+    MutexProtected<SerialMap<ExecutionSerial, Task>> mWaitingTasks;
 };
 
 }  // namespace dawn::native
diff --git a/src/dawn/native/Queue.cpp b/src/dawn/native/Queue.cpp
index 0da902c..6c0b446 100644
--- a/src/dawn/native/Queue.cpp
+++ b/src/dawn/native/Queue.cpp
@@ -131,12 +131,12 @@
 // QueueBase
 
 QueueBase::QueueBase(DeviceBase* device, const QueueDescriptor* descriptor)
-    : ApiObjectBase(device, descriptor->label) {
+    : ExecutionQueueBase(device, descriptor->label) {
     GetObjectTrackingList()->Track(this);
 }
 
 QueueBase::QueueBase(DeviceBase* device, ObjectBase::ErrorTag tag, StringView label)
-    : ApiObjectBase(device, tag, label) {}
+    : ExecutionQueueBase(device, tag, label) {}
 
 QueueBase::~QueueBase() {
     DAWN_ASSERT(mTasksInFlight->Empty());
diff --git a/src/dawn/native/Queue.h b/src/dawn/native/Queue.h
index 8371e65..9c8fc13 100644
--- a/src/dawn/native/Queue.h
+++ b/src/dawn/native/Queue.h
@@ -62,9 +62,7 @@
     ExecutionSerial mSerial = kMaxExecutionSerial;
 };
 
-class QueueBase : public ApiObjectBase,
-                  public ExecutionQueueBase,
-                  public WeakRefSupport<QueueBase> {
+class QueueBase : public ExecutionQueueBase, public WeakRefSupport<QueueBase> {
   public:
     ~QueueBase() override;
 
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index 38b3fc2..107842b 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -691,6 +691,11 @@
      {"blob_cache_hash_validation",
       "Enable hash validation when loading/storing from/to the blob cache",
       "https://crbug.com/429938352", ToggleStage::Device}},
+    {Toggle::WaitIsThreadSafe,
+     {"wait_is_thread_safe",
+      "WaitFor* functions are thread-safe and can be called without the device-lock if implicit "
+      "synchronization is enabled.",
+      "https://crbug.com/412761228", ToggleStage::Device}},
     {Toggle::NoWorkaroundSampleMaskBecomesZeroForAllButLastColorTarget,
      {"no_workaround_sample_mask_becomes_zero_for_all_but_last_color_target",
       "MacOS 12.0+ Intel has a bug where the sample mask is only applied for the last color "
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index 2320424..3739230 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -166,6 +166,9 @@
     EnableShaderPrint,
     BlobCacheHashValidation,
 
+    // Once all backends have been updated to be thread safe for waiting, we can remove this toggle.
+    WaitIsThreadSafe,
+
     // Unresolved issues.
     NoWorkaroundSampleMaskBecomesZeroForAllButLastColorTarget,
     NoWorkaroundIndirectBaseVertexNotApplied,
diff --git a/src/dawn/native/metal/PhysicalDeviceMTL.mm b/src/dawn/native/metal/PhysicalDeviceMTL.mm
index a89ac1c..b657a02 100644
--- a/src/dawn/native/metal/PhysicalDeviceMTL.mm
+++ b/src/dawn/native/metal/PhysicalDeviceMTL.mm
@@ -546,6 +546,9 @@
     deviceToggles->Default(
         Toggle::EnableIntegerRangeAnalysisInRobustness,
         platform->IsFeatureEnabled(platform::Features::kWebGPUEnableRangeAnalysisForRobustness));
+
+    // Metal waiting is already thread safe.
+    deviceToggles->Default(Toggle::WaitIsThreadSafe, true);
 }
 
 MaybeError PhysicalDevice::InitializeImpl() {