Add an option to enable implicit device synchronization.

1. Auto-generate mutex locking code in DawnNative's ProcTable's
functions. Using a mutex owned by the related Device.
  - Unless the function/class has "no autolock" attribute (new) in
  dawn.json. In which cases, no locking code will be auto-generated.
  - Currently Instance, Adapter, Surface, Encoder classes and
  Reference/Release method have "no autolock".

2. Added Feature::ImplicitDeviceSynchronization to enable Device's
mutex.
  - If this feature is disabled, lock/unlock Device's mutex is no-op.
  Auto-generated locking code will have no effect. This is the default.
  - This approach is used instead of generating two sets of ProcTable
  because it's cleaner and the internal code doesn't need to care about
  whether there is a mutex being locked or not. Furthermore, if there
  were two sets of ProcTable, and user used dawnProcSetProcs() to set
  global ProcTable, it would affect all other modules using different
  Devices. Even though those modules don't need thread safety.

Bug: dawn:1662
Change-Id: I75f0d28959f333318e4159916b259131581f79f5
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/119940
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Quyen Le <lehoangquyen@chromium.org>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/dawn.json b/dawn.json
index 268b835..6b4804f 100644
--- a/dawn.json
+++ b/dawn.json
@@ -77,6 +77,7 @@
     },
     "adapter": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "get instance",
@@ -579,6 +580,7 @@
     },
     "command encoder": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "finish",
@@ -792,6 +794,7 @@
     },
     "compute pass encoder": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "insert debug marker",
@@ -1203,10 +1206,12 @@
             },
             {
                 "name": "tick",
+                "no autolock": true,
                 "tags": ["dawn"]
             },
             {
                 "name": "set uncaptured error callback",
+                "no autolock": true,
                 "args": [
                     {"name": "callback", "type": "error callback"},
                     {"name": "userdata", "type": "void", "annotation": "*"}
@@ -1214,6 +1219,7 @@
             },
             {
                 "name": "set logging callback",
+                "no autolock": true,
                 "tags": ["dawn"],
                 "args": [
                     {"name": "callback", "type": "logging callback"},
@@ -1222,6 +1228,7 @@
             },
             {
                 "name": "set device lost callback",
+                "no autolock": true,
                 "args": [
                     {"name": "callback", "type": "device lost callback"},
                     {"name": "userdata", "type": "void", "annotation": "*"}
@@ -1469,7 +1476,8 @@
             {"value": 1003, "name": "dawn multi planar formats", "tags": ["dawn"]},
             {"value": 1004, "name": "dawn native", "tags": ["dawn", "native"]},
             {"value": 1005, "name": "chromium experimental dp4a", "tags": ["dawn"]},
-            {"value": 1006, "name": "timestamp query inside passes", "tags": ["dawn"]}
+            {"value": 1006, "name": "timestamp query inside passes", "tags": ["dawn"]},
+            {"value": 1007, "name": "implicit device synchronization", "tags": ["dawn", "native"]}
         ]
     },
     "filter mode": {
@@ -1527,6 +1535,7 @@
     },
     "instance": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "create surface",
@@ -1856,6 +1865,7 @@
 
     "render bundle encoder": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "set pipeline",
@@ -2027,6 +2037,7 @@
     },
     "render pass encoder": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "set pipeline",
@@ -2482,6 +2493,7 @@
     },
     "surface": {
         "category": "object",
+        "no autolock": true,
         "methods": [
             {
                 "name": "get preferred format",
diff --git a/docs/dawn/codegen.md b/docs/dawn/codegen.md
index 359fdde..1b0868d 100644
--- a/docs/dawn/codegen.md
+++ b/docs/dawn/codegen.md
@@ -76,6 +76,8 @@
    - `"name"` a string
    - `"return_type"` (default to no return type) a string that's the name of the return type.
    - `"arguments"` a **record**, so an array of **record members**
+   - `"no autolock"`: a boolean flag (default is false) indicates that the method's generated code won't automatically do thread synchronization. This flag can only be true on device or device child objects currently.
+ - `"no autolock"`: a boolean flag (default is false) to indicate that the object's generated code won't automatically do thread synchronization. This will override individual method's `"no autolock"` flag. This flag can only be true on device or device child objects currently.
 
 **`"constant"`**
  - `"type"`: a string, the name of the base data type
diff --git a/generator/dawn_json_generator.py b/generator/dawn_json_generator.py
index 705948c..5b2d5f9 100644
--- a/generator/dawn_json_generator.py
+++ b/generator/dawn_json_generator.py
@@ -189,8 +189,8 @@
         self.handle_type = handle_type
 
 
-Method = namedtuple('Method',
-                    ['name', 'return_type', 'arguments', 'json_data'])
+Method = namedtuple(
+    'Method', ['name', 'return_type', 'arguments', 'autolock', 'json_data'])
 
 
 class ObjectType(Type):
@@ -337,11 +337,16 @@
 
 
 def link_object(obj, types):
+    # Disable method's autolock if obj's "no autolock" = True
+    obj_scoped_autolock_enabled = not obj.json_data.get('no autolock', False)
+
     def make_method(json_data):
         arguments = linked_record_members(json_data.get('args', []), types)
+        autolock_enabled = obj_scoped_autolock_enabled and not json_data.get(
+            'no autolock', False)
         return Method(Name(json_data['name']),
-                      types[json_data.get('returns',
-                                          'void')], arguments, json_data)
+                      types[json_data.get('returns', 'void')], arguments,
+                      autolock_enabled, json_data)
 
     obj.methods = [make_method(m) for m in obj.json_data.get('methods', [])]
     obj.methods.sort(key=lambda method: method.name.canonical_case())
@@ -769,9 +774,9 @@
 def c_methods(params, typ):
     return typ.methods + [
         x for x in [
-            Method(Name('reference'), params['types']['void'], [],
+            Method(Name('reference'), params['types']['void'], [], False,
                    {'tags': ['dawn', 'emscripten']}),
-            Method(Name('release'), params['types']['void'], [],
+            Method(Name('release'), params['types']['void'], [], False,
                    {'tags': ['dawn', 'emscripten']}),
         ] if item_is_enabled(params['enabled_tags'], x.json_data)
         and not item_is_disabled(params['disabled_tags'], x.json_data)
diff --git a/generator/templates/dawn/native/ProcTable.cpp b/generator/templates/dawn/native/ProcTable.cpp
index f9980ff..7b1922b 100644
--- a/generator/templates/dawn/native/ProcTable.cpp
+++ b/generator/templates/dawn/native/ProcTable.cpp
@@ -56,6 +56,17 @@
                     {% endif %}
                 {%- endfor-%}
 
+                {% if method.autolock %}
+                    {% if type.name.get() != "device" %}
+                        auto device = self->GetDevice();
+                    {% else %}
+                        auto device = self;
+                    {% endif %}
+                    auto deviceLock(device->GetScopedLock());
+                {% else %}
+                    // This method is specified to not use AutoLock in json script.
+                {% endif %}
+
                 {% if method.return_type.name.canonical_case() != "void" %}
                     auto result =
                 {%- endif %}
diff --git a/src/dawn/common/BUILD.gn b/src/dawn/common/BUILD.gn
index df20a6c..808e8b7 100644
--- a/src/dawn/common/BUILD.gn
+++ b/src/dawn/common/BUILD.gn
@@ -246,6 +246,8 @@
       "Log.h",
       "Math.cpp",
       "Math.h",
+      "Mutex.cpp",
+      "Mutex.h",
       "NSRef.h",
       "NonCopyable.h",
       "Numeric.h",
diff --git a/src/dawn/common/CMakeLists.txt b/src/dawn/common/CMakeLists.txt
index 45de3dc..c78d6b0 100644
--- a/src/dawn/common/CMakeLists.txt
+++ b/src/dawn/common/CMakeLists.txt
@@ -52,6 +52,8 @@
     "Log.h"
     "Math.cpp"
     "Math.h"
+    "Mutex.cpp"
+    "Mutex.h"
     "NSRef.h"
     "NonCopyable.h"
     "Numeric.h"
diff --git a/src/dawn/common/Mutex.cpp b/src/dawn/common/Mutex.cpp
new file mode 100644
index 0000000..b8b891f
--- /dev/null
+++ b/src/dawn/common/Mutex.cpp
@@ -0,0 +1,49 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/common/Mutex.h"
+
+namespace dawn {
+Mutex::~Mutex() = default;
+
+void Mutex::Lock() {
+#if defined(DAWN_ENABLE_ASSERTS)
+    auto currentThread = std::this_thread::get_id();
+    ASSERT(mOwner.load(std::memory_order_acquire) != currentThread);
+#endif  // DAWN_ENABLE_ASSERTS
+
+    mNativeMutex.lock();
+
+#if defined(DAWN_ENABLE_ASSERTS)
+    mOwner.store(currentThread, std::memory_order_release);
+#endif  // DAWN_ENABLE_ASSERTS
+}
+void Mutex::Unlock() {
+#if defined(DAWN_ENABLE_ASSERTS)
+    ASSERT(IsLockedByCurrentThread());
+    mOwner.store(std::thread::id(), std::memory_order_release);
+#endif  // DAWN_ENABLE_ASSERTS
+
+    mNativeMutex.unlock();
+}
+
+bool Mutex::IsLockedByCurrentThread() {
+#if defined(DAWN_ENABLE_ASSERTS)
+    return mOwner.load(std::memory_order_acquire) == std::this_thread::get_id();
+#else
+    // This is not supported.
+    abort();
+#endif
+}
+}  // namespace dawn
diff --git a/src/dawn/common/Mutex.h b/src/dawn/common/Mutex.h
new file mode 100644
index 0000000..c5a3a8c
--- /dev/null
+++ b/src/dawn/common/Mutex.h
@@ -0,0 +1,72 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SRC_DAWN_COMMON_MUTEX_H_
+#define SRC_DAWN_COMMON_MUTEX_H_
+
+#include <atomic>
+#include <mutex>
+#include <thread>
+#include <utility>
+
+#include "dawn/common/Assert.h"
+#include "dawn/common/NonCopyable.h"
+#include "dawn/common/RefCounted.h"
+
+namespace dawn {
+class Mutex : public RefCounted, NonCopyable {
+  public:
+    template <typename MutexRef>
+    struct AutoLockBase : NonMovable {
+        AutoLockBase() = delete;
+        explicit AutoLockBase(MutexRef mutex) : mMutex(std::move(mutex)) {
+            if (mMutex != nullptr) {
+                mMutex->Lock();
+            }
+        }
+        ~AutoLockBase() {
+            if (mMutex != nullptr) {
+                mMutex->Unlock();
+            }
+        }
+
+      private:
+        MutexRef mMutex;
+    };
+
+    // This scoped lock won't keep the mutex alive.
+    using AutoLock = AutoLockBase<Mutex*>;
+    // This scoped lock will keep the mutex alive until out of scope.
+    using AutoLockAndHoldRef = AutoLockBase<Ref<Mutex>>;
+
+    ~Mutex() override;
+
+    void Lock();
+    void Unlock();
+
+    // This method is only enabled when DAWN_ENABLE_ASSERTS is turned on.
+    // Thus it should only be wrapped inside ASSERT() macro.
+    // i.e. ASSERT(mutex.IsLockedByCurrentThread())
+    bool IsLockedByCurrentThread();
+
+  private:
+#if defined(DAWN_ENABLE_ASSERTS)
+    std::atomic<std::thread::id> mOwner;
+#endif  // DAWN_ENABLE_ASSERTS
+    std::mutex mNativeMutex;
+};
+
+}  // namespace dawn
+
+#endif
diff --git a/src/dawn/common/RefCounted.cpp b/src/dawn/common/RefCounted.cpp
index 14bd0b1..b5d2441 100644
--- a/src/dawn/common/RefCounted.cpp
+++ b/src/dawn/common/RefCounted.cpp
@@ -15,6 +15,11 @@
 #include "dawn/common/RefCounted.h"
 
 #include <cstddef>
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#include <sanitizer/tsan_interface.h>
+#endif
+#endif
 
 #include "dawn/common/Assert.h"
 
@@ -68,6 +73,14 @@
         // Note that on ARM64 this will generate a `dmb ish` instruction which is a global
         // memory barrier, when an acquire load on mRefCount (using the `ldar` instruction)
         // should be enough and could end up being faster.
+
+        // https://github.com/google/sanitizers/issues/1415 There is false positive bug in TSAN
+        // when using standalone fence.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+        __tsan_acquire(&mRefCount);
+#endif
+#endif
         std::atomic_thread_fence(std::memory_order_acquire);
         return true;
     }
@@ -95,6 +108,16 @@
     }
 }
 
+void RefCounted::ReleaseAndLockBeforeDestroy() {
+    if (mRefCount.Decrement()) {
+        LockAndDeleteThis();
+    }
+}
+
 void RefCounted::DeleteThis() {
     delete this;
 }
+
+void RefCounted::LockAndDeleteThis() {
+    DeleteThis();
+}
diff --git a/src/dawn/common/RefCounted.h b/src/dawn/common/RefCounted.h
index b5d74f7..839b47d 100644
--- a/src/dawn/common/RefCounted.h
+++ b/src/dawn/common/RefCounted.h
@@ -46,18 +46,25 @@
     uint64_t GetRefCountPayload() const;
 
     void Reference();
+    // Release() is called by internal code, so it's assumed that there is already a thread
+    // synchronization in place for destruction.
     void Release();
 
     void APIReference() { Reference(); }
-    void APIRelease() { Release(); }
+    // APIRelease() can be called without any synchronization guarantees so we need to use a Release
+    // method that will call LockAndDeleteThis() on destruction.
+    void APIRelease() { ReleaseAndLockBeforeDestroy(); }
 
   protected:
     virtual ~RefCounted();
 
-    // A Derived class may override this if they require a custom deleter.
-    virtual void DeleteThis();
+    void ReleaseAndLockBeforeDestroy();
 
-  private:
+    // A Derived class may override these if they require a custom deleter.
+    virtual void DeleteThis();
+    // This calls DeleteThis() by default.
+    virtual void LockAndDeleteThis();
+
     RefCount mRefCount;
 };
 
diff --git a/src/dawn/native/Adapter.cpp b/src/dawn/native/Adapter.cpp
index 0a6d93e..a83d836 100644
--- a/src/dawn/native/Adapter.cpp
+++ b/src/dawn/native/Adapter.cpp
@@ -42,6 +42,7 @@
 
     EnableFeature(Feature::DawnNative);
     EnableFeature(Feature::DawnInternalUsages);
+    EnableFeature(Feature::ImplicitDeviceSynchronization);
     InitializeSupportedFeaturesImpl();
 
     DAWN_TRY_CONTEXT(
diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp
index cc4c30e..1a72c90 100644
--- a/src/dawn/native/Buffer.cpp
+++ b/src/dawn/native/Buffer.cpp
@@ -43,9 +43,14 @@
 
   private:
     void FinishImpl() override {
-        ASSERT(mSerial != kMaxExecutionSerial);
-        TRACE_EVENT1(mPlatform, General, "Buffer::TaskInFlight::Finished", "serial",
-                     uint64_t(mSerial));
+        {
+            // This is called from a callback, and no lock will be held by default. Hence, we need
+            // to lock the mutex now because mSerial might be changed by another thread.
+            auto deviceLock(buffer->GetDevice()->GetScopedLock());
+            ASSERT(mSerial != kMaxExecutionSerial);
+            TRACE_EVENT1(mPlatform, General, "Buffer::TaskInFlight::Finished", "serial",
+                         uint64_t(mSerial));
+        }
         buffer->CallbackOnMapRequestCompleted(id, WGPUBufferMapAsyncStatus_Success);
     }
     void HandleDeviceLossImpl() override {
@@ -597,9 +602,14 @@
 
 void BufferBase::CallbackOnMapRequestCompleted(MapRequestID mapID,
                                                WGPUBufferMapAsyncStatus status) {
-    if (mapID == mLastMapID && status == WGPUBufferMapAsyncStatus_Success &&
-        mState == BufferState::PendingMap) {
-        mState = BufferState::Mapped;
+    {
+        // This is called from a callback, and no lock will be held by default. Hence, we need to
+        // lock the mutex now because this will modify the buffer's states.
+        auto deviceLock(GetDevice()->GetScopedLock());
+        if (mapID == mLastMapID && status == WGPUBufferMapAsyncStatus_Success &&
+            mState == BufferState::PendingMap) {
+            mState = BufferState::Mapped;
+        }
     }
 
     auto cb = PrepareMappingCallback(mapID, status);
diff --git a/src/dawn/native/CommandEncoder.cpp b/src/dawn/native/CommandEncoder.cpp
index a696c41..0279bdc 100644
--- a/src/dawn/native/CommandEncoder.cpp
+++ b/src/dawn/native/CommandEncoder.cpp
@@ -1047,14 +1047,21 @@
                 descriptor.dimension = wgpu::TextureDimension::e2D;
                 descriptor.mipLevelCount = 1;
 
+                // We are creating new resources. Need to lock the Device.
+                // TODO(crbug.com/dawn/1618): In future, all temp resources should be created at
+                // Command Submit time, so the locking would be removed from here at that point.
                 Ref<TextureBase> temporaryResolveTexture;
-                DAWN_TRY_ASSIGN(temporaryResolveTexture, device->CreateTexture(&descriptor));
-
-                TextureViewDescriptor viewDescriptor = {};
                 Ref<TextureViewBase> temporaryResolveView;
-                DAWN_TRY_ASSIGN(
-                    temporaryResolveView,
-                    device->CreateTextureView(temporaryResolveTexture.Get(), &viewDescriptor));
+                {
+                    auto deviceLock(GetDevice()->GetScopedLock());
+
+                    DAWN_TRY_ASSIGN(temporaryResolveTexture, device->CreateTexture(&descriptor));
+
+                    TextureViewDescriptor viewDescriptor = {};
+                    DAWN_TRY_ASSIGN(
+                        temporaryResolveView,
+                        device->CreateTextureView(temporaryResolveTexture.Get(), &viewDescriptor));
+                }
 
                 // Save the temporary and given render targets together for copying after
                 // the render pass ends.
@@ -1200,6 +1207,11 @@
 
             if (dst.aspect == Aspect::Depth &&
                 GetDevice()->IsToggleEnabled(Toggle::UseBlitForBufferToDepthTextureCopy)) {
+                // The below function might create new resources. Need to lock the Device.
+                // TODO(crbug.com/dawn/1618): In future, all temp resources should be created at
+                // Command Submit time, so the locking would be removed from here at that point.
+                auto deviceLock(GetDevice()->GetScopedLock());
+
                 DAWN_TRY_CONTEXT(
                     BlitBufferToDepth(GetDevice(), this, source->buffer, srcLayout, dst, *copySize),
                     "copying from %s to depth aspect of %s using blit workaround.", source->buffer,
@@ -1207,6 +1219,11 @@
                 return {};
             } else if (dst.aspect == Aspect::Stencil &&
                        GetDevice()->IsToggleEnabled(Toggle::UseBlitForBufferToStencilTextureCopy)) {
+                // The below function might create new resources. Need to lock the Device.
+                // TODO(crbug.com/dawn/1618): In future, all temp resources should be created at
+                // Command Submit time, so the locking would be removed from here at that point.
+                auto deviceLock(GetDevice()->GetScopedLock());
+
                 DAWN_TRY_CONTEXT(BlitBufferToStencil(GetDevice(), this, source->buffer, srcLayout,
                                                      dst, *copySize),
                                  "copying from %s to stencil aspect of %s using blit workaround.",
@@ -1375,6 +1392,11 @@
 
             // Use a blit to copy the depth aspect.
             if (blitDepth) {
+                // This function might create new resources. Need to lock the Device.
+                // TODO(crbug.com/dawn/1618): In future, all temp resources should be created at
+                // Command Submit time, so the locking would be removed from here at that point.
+                auto deviceLock(GetDevice()->GetScopedLock());
+
                 DAWN_TRY_CONTEXT(BlitDepthToDepth(GetDevice(), this, src, dst, *copySize),
                                  "copying depth aspect from %s to %s using blit workaround.",
                                  source->texture, destination->texture);
@@ -1586,6 +1608,9 @@
 }
 
 CommandBufferBase* CommandEncoder::APIFinish(const CommandBufferDescriptor* descriptor) {
+    // This function will create new object, need to lock the Device.
+    auto deviceLock(GetDevice()->GetScopedLock());
+
     Ref<CommandBufferBase> commandBuffer;
     if (GetDevice()->ConsumedError(Finish(descriptor), &commandBuffer)) {
         return CommandBufferBase::MakeError(GetDevice());
diff --git a/src/dawn/native/ComputePassEncoder.cpp b/src/dawn/native/ComputePassEncoder.cpp
index 63505c7..7fbf82b 100644
--- a/src/dawn/native/ComputePassEncoder.cpp
+++ b/src/dawn/native/ComputePassEncoder.cpp
@@ -238,6 +238,10 @@
 ComputePassEncoder::TransformIndirectDispatchBuffer(Ref<BufferBase> indirectBuffer,
                                                     uint64_t indirectOffset) {
     DeviceBase* device = GetDevice();
+    // This function creates new resources, need to lock the Device.
+    // TODO(crbug.com/dawn/1618): In future, all temp resources should be created at Command Submit
+    // time, so the locking would be removed from here at that point.
+    auto deviceLock(GetDevice()->GetScopedLock());
 
     const bool shouldDuplicateNumWorkgroups =
         device->ShouldDuplicateNumWorkgroupsForDispatchIndirect(
diff --git a/src/dawn/native/Device.cpp b/src/dawn/native/Device.cpp
index 0079baa..047131a 100644
--- a/src/dawn/native/Device.cpp
+++ b/src/dawn/native/Device.cpp
@@ -92,6 +92,10 @@
 };
 
 namespace {
+bool IsMutexLockedByCurrentThreadIfNeeded(const Ref<Mutex>& mutex) {
+    return mutex == nullptr || mutex->IsLockedByCurrentThread();
+}
+
 struct LoggingCallbackTask : CallbackTask {
   public:
     LoggingCallbackTask() = delete;
@@ -277,6 +281,13 @@
                         CreateShaderModule(&descriptor));
     }
 
+    if (HasFeature(Feature::ImplicitDeviceSynchronization)) {
+        mMutex = AcquireRef(new Mutex);
+    } else {
+        mMutex = nullptr;
+    }
+
+    // mAdapter is not set for mock test devices.
     // TODO(crbug.com/dawn/1702): using a mock adapter could avoid the null checking.
     if (mAdapter != nullptr) {
         mAdapter->GetInstance()->AddDevice(this);
@@ -286,30 +297,38 @@
 }
 
 void DeviceBase::WillDropLastExternalRef() {
-    // DeviceBase uses RefCountedWithExternalCount to break refcycles.
-    //
-    // DeviceBase holds multiple Refs to various API objects (pipelines, buffers, etc.) which are
-    // used to implement various device-level facilities. These objects are cached on the device,
-    // so we want to keep them around instead of making transient allocations. However, many of
-    // the objects also hold a Ref<Device> back to their parent device.
-    //
-    // In order to break this cycle and prevent leaks, when the application drops the last external
-    // ref and WillDropLastExternalRef is called, the device clears out any member refs to API
-    // objects that hold back-refs to the device - thus breaking any reference cycles.
-    //
-    // Currently, this is done by calling Destroy on the device to cease all in-flight work and
-    // drop references to internal objects. We may want to lift this in the future, but it would
-    // make things more complex because there might be pending tasks which hold a ref back to the
-    // device - either directly or indirectly. We would need to ensure those tasks don't create new
-    // reference cycles, and we would need to continuously try draining the pending tasks to clear
-    // out all remaining refs.
-    Destroy();
+    {
+        // This will be invoked by API side, so we need to lock.
+        // Note: we cannot hold the lock when flushing the callbacks so have to limit the scope of
+        // the lock.
+        auto deviceLock(GetScopedLock());
+
+        // DeviceBase uses RefCountedWithExternalCount to break refcycles.
+        //
+        // DeviceBase holds multiple Refs to various API objects (pipelines, buffers, etc.) which
+        // are used to implement various device-level facilities. These objects are cached on the
+        // device, so we want to keep them around instead of making transient allocations. However,
+        // many of the objects also hold a Ref<Device> back to their parent device.
+        //
+        // In order to break this cycle and prevent leaks, when the application drops the last
+        // external ref and WillDropLastExternalRef is called, the device clears out any member refs
+        // to API objects that hold back-refs to the device - thus breaking any reference cycles.
+        //
+        // Currently, this is done by calling Destroy on the device to cease all in-flight work and
+        // drop references to internal objects. We may want to lift this in the future, but it would
+        // make things more complex because there might be pending tasks which hold a ref back to
+        // the device - either directly or indirectly. We would need to ensure those tasks don't
+        // create new reference cycles, and we would need to continuously try draining the pending
+        // tasks to clear out all remaining refs.
+        Destroy();
+    }
 
     // Flush last remaining callback tasks.
     do {
-        mCallbackTaskManager->Flush();
+        FlushCallbackTaskQueue();
     } while (!mCallbackTaskManager->IsEmpty());
 
+    auto deviceLock(GetScopedLock());
     // Drop te device's reference to the queue. Because the application dropped the last external
     // references, they can no longer get the queue from APIGetQueue().
     mQueue = nullptr;
@@ -570,7 +589,8 @@
     // resetting) the resources pointed by such pointer may be freed. Flush all deferred
     // callback tasks to guarantee we are never going to use the previous callback after
     // this call.
-    mCallbackTaskManager->Flush();
+    FlushCallbackTaskQueue();
+    auto deviceLock(GetScopedLock());
     if (IsLost()) {
         return;
     }
@@ -584,7 +604,8 @@
     // resetting) the resources pointed by such pointer may be freed. Flush all deferred
     // callback tasks to guarantee we are never going to use the previous callback after
     // this call.
-    mCallbackTaskManager->Flush();
+    FlushCallbackTaskQueue();
+    auto deviceLock(GetScopedLock());
     if (IsLost()) {
         return;
     }
@@ -598,7 +619,8 @@
     // resetting) the resources pointed by such pointer may be freed. Flush all deferred
     // callback tasks to guarantee we are never going to use the previous callback after
     // this call.
-    mCallbackTaskManager->Flush();
+    FlushCallbackTaskQueue();
+    auto deviceLock(GetScopedLock());
     if (IsLost()) {
         return;
     }
@@ -851,6 +873,7 @@
 
 Ref<ComputePipelineBase> DeviceBase::AddOrGetCachedComputePipeline(
     Ref<ComputePipelineBase> computePipeline) {
+    ASSERT(IsMutexLockedByCurrentThreadIfNeeded(mMutex));
     auto [cachedPipeline, inserted] = mCaches->computePipelines.insert(computePipeline.Get());
     if (inserted) {
         computePipeline->SetIsCachedReference();
@@ -862,6 +885,7 @@
 
 Ref<RenderPipelineBase> DeviceBase::AddOrGetCachedRenderPipeline(
     Ref<RenderPipelineBase> renderPipeline) {
+    ASSERT(IsMutexLockedByCurrentThreadIfNeeded(mMutex));
     auto [cachedPipeline, inserted] = mCaches->renderPipelines.insert(renderPipeline.Get());
     if (inserted) {
         renderPipeline->SetIsCachedReference();
@@ -1267,15 +1291,23 @@
     // Tick may trigger callbacks which drop a ref to the device itself. Hold a Ref to ourselves
     // to avoid deleting |this| in the middle of this function call.
     Ref<DeviceBase> self(this);
-    if (ConsumedError(Tick())) {
-        mCallbackTaskManager->Flush();
-        return false;
+    bool tickError;
+    {
+        // Note: we cannot hold the lock when flushing the callbacks so have to limit the scope of
+        // the lock here.
+        auto deviceLock(GetScopedLock());
+        tickError = ConsumedError(Tick());
     }
 
     // We have to check callback tasks in every APITick because it is not related to any global
     // serials.
-    mCallbackTaskManager->Flush();
+    FlushCallbackTaskQueue();
 
+    if (tickError) {
+        return false;
+    }
+
+    auto deviceLock(GetScopedLock());
     // We don't throw an error when device is lost. This allows pending callbacks to be
     // executed even after the Device is lost/destroyed.
     if (IsLost()) {
@@ -1806,6 +1838,26 @@
     mToggles.ForceSet(toggle, isEnabled);
 }
 
+void DeviceBase::FlushCallbackTaskQueue() {
+    // Callbacks might cause re-entrances. Mutex shouldn't be locked. So we expect there is no
+    // locked mutex before entering this method.
+    ASSERT(mMutex == nullptr || !mMutex->IsLockedByCurrentThread());
+
+    Ref<CallbackTaskManager> callbackTaskManager;
+
+    {
+        // This is a data race with the assignment to InstanceBase's callback queue manager in
+        // WillDropLastExternalRef(). Need to protect with a lock and keep the old
+        // mCallbackTaskManager alive.
+        // TODO(crbug.com/dawn/752): In future, all devices should use InstanceBase's callback queue
+        // manager from the start. So we won't need to care about data race at that point.
+        auto deviceLock(GetScopedLock());
+        callbackTaskManager = mCallbackTaskManager;
+    }
+
+    callbackTaskManager->Flush();
+}
+
 const CombinedLimits& DeviceBase::GetLimits() const {
     return mLimits;
 }
@@ -1836,7 +1888,15 @@
             // CreateComputePipelineAsyncTaskImpl::Run() when the front-end pipeline cache is
             // thread-safe.
             ASSERT(mPipeline != nullptr);
-            mPipeline = mPipeline->GetDevice()->AddOrGetCachedComputePipeline(mPipeline);
+            {
+                // This is called inside a callback, and no lock will be held by default so we have
+                // to lock now to protect the cache.
+                // Note: we don't lock inside AddOrGetCachedComputePipeline() to avoid deadlock
+                // because many places calling that method might already have the lock held. For
+                // example, APICreateComputePipeline()
+                auto deviceLock(mPipeline->GetDevice()->GetScopedLock());
+                mPipeline = mPipeline->GetDevice()->AddOrGetCachedComputePipeline(mPipeline);
+            }
 
             CreateComputePipelineAsyncCallbackTask::FinishImpl();
         }
@@ -1861,6 +1921,12 @@
             // CreateRenderPipelineAsyncTaskImpl::Run() when the front-end pipeline cache is
             // thread-safe.
             if (mPipeline.Get() != nullptr) {
+                // This is called inside a callback, and no lock will be held by default so we have
+                // to lock now to protect the cache.
+                // Note: we don't lock inside AddOrGetCachedRenderPipeline() to avoid deadlock
+                // because many places calling that method might already have the lock held. For
+                // example, APICreateRenderPipeline()
+                auto deviceLock(mPipeline->GetDevice()->GetScopedLock());
                 mPipeline = mPipeline->GetDevice()->AddOrGetCachedRenderPipeline(mPipeline);
             }
 
@@ -1974,6 +2040,14 @@
     return {};
 }
 
+Mutex::AutoLockAndHoldRef DeviceBase::GetScopedLockSafeForDelete() {
+    return Mutex::AutoLockAndHoldRef(mMutex);
+}
+
+Mutex::AutoLock DeviceBase::GetScopedLock() {
+    return Mutex::AutoLock(mMutex.Get());
+}
+
 IgnoreLazyClearCountScope::IgnoreLazyClearCountScope(DeviceBase* device)
     : mDevice(device), mLazyClearCountForTesting(device->mLazyClearCountForTesting) {}
 
diff --git a/src/dawn/native/Device.h b/src/dawn/native/Device.h
index 2de7089..b47b98c 100644
--- a/src/dawn/native/Device.h
+++ b/src/dawn/native/Device.h
@@ -16,12 +16,12 @@
 #define SRC_DAWN_NATIVE_DEVICE_H_
 
 #include <memory>
-#include <mutex>
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
+#include "dawn/common/Mutex.h"
 #include "dawn/native/CacheKey.h"
 #include "dawn/native/Commands.h"
 #include "dawn/native/ComputePipeline.h"
@@ -424,6 +424,13 @@
     // method makes them to be submitted as soon as possbile in next ticks.
     virtual void ForceEventualFlushOfCommands() = 0;
 
+    // It is guaranteed that the wrapped mutex will outlive the Device (if the Device is deleted
+    // before the AutoLockAndHoldRef).
+    [[nodiscard]] Mutex::AutoLockAndHoldRef GetScopedLockSafeForDelete();
+    // This lock won't guarantee the wrapped mutex will be alive if the Device is deleted before the
+    // AutoLock. It would crash if such thing happens.
+    [[nodiscard]] Mutex::AutoLock GetScopedLock();
+
     // In the 'Normal' mode, currently recorded commands in the backend normally will be actually
     // submitted in the next Tick. However in the 'Passive' mode, the submission will be postponed
     // as late as possible, for example, until the client has explictly issued a submission.
@@ -480,6 +487,7 @@
     virtual void SetLabelImpl();
 
     virtual MaybeError TickImpl() = 0;
+    void FlushCallbackTaskQueue();
 
     ResultOrError<Ref<BindGroupLayoutBase>> CreateEmptyBindGroupLayout();
 
@@ -595,6 +603,9 @@
     std::unique_ptr<dawn::platform::WorkerTaskPool> mWorkerTaskPool;
     std::string mLabel;
     CacheKey mDeviceCacheKey;
+
+    // This pointer is non-null if Feature::ImplicitDeviceSynchronization is turned on.
+    Ref<Mutex> mMutex = nullptr;
 };
 
 ResultOrError<Ref<PipelineLayoutBase>> ValidateLayoutAndGetComputePipelineDescriptorWithDefaults(
diff --git a/src/dawn/native/EncodingContext.cpp b/src/dawn/native/EncodingContext.cpp
index 8628e28..0682a5a 100644
--- a/src/dawn/native/EncodingContext.cpp
+++ b/src/dawn/native/EncodingContext.cpp
@@ -87,6 +87,9 @@
             mError = std::move(error);
         }
     } else {
+        // EncodingContext is unprotected from multiple threads by default, but this code will
+        // modify Device's internal states so we need to lock the device now.
+        auto deviceLock(mDevice->GetScopedLock());
         mDevice->HandleError(std::move(error));
     }
 }
diff --git a/src/dawn/native/Features.cpp b/src/dawn/native/Features.cpp
index f1098bc..3d6cdce 100644
--- a/src/dawn/native/Features.cpp
+++ b/src/dawn/native/Features.cpp
@@ -97,6 +97,11 @@
       "https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/dawn/features/"
       "dawn_native.md",
       FeatureInfo::FeatureState::Stable}},
+    {Feature::ImplicitDeviceSynchronization,
+     {"implicit-device-sync",
+      "Public API methods (except encoding) will have implicit device synchronization. So they "
+      "will be safe to be used on multiple threads.",
+      "https://bugs.chromium.org/p/dawn/issues/detail?id=1662", FeatureInfo::FeatureState::Stable}},
 }};
 
 Feature FromAPIFeature(wgpu::FeatureName feature) {
@@ -139,6 +144,8 @@
             return Feature::RG11B10UfloatRenderable;
         case wgpu::FeatureName::BGRA8UnormStorage:
             return Feature::BGRA8UnormStorage;
+        case wgpu::FeatureName::ImplicitDeviceSynchronization:
+            return Feature::ImplicitDeviceSynchronization;
     }
     return Feature::InvalidEnum;
 }
@@ -177,6 +184,8 @@
             return wgpu::FeatureName::RG11B10UfloatRenderable;
         case Feature::BGRA8UnormStorage:
             return wgpu::FeatureName::BGRA8UnormStorage;
+        case Feature::ImplicitDeviceSynchronization:
+            return wgpu::FeatureName::ImplicitDeviceSynchronization;
 
         case Feature::EnumCount:
             break;
diff --git a/src/dawn/native/Features.h b/src/dawn/native/Features.h
index 064d55d..44d1e7f 100644
--- a/src/dawn/native/Features.h
+++ b/src/dawn/native/Features.h
@@ -45,6 +45,7 @@
     DawnInternalUsages,
     MultiPlanarFormats,
     DawnNative,
+    ImplicitDeviceSynchronization,
 
     EnumCount,
     InvalidEnum = EnumCount,
diff --git a/src/dawn/native/ObjectBase.cpp b/src/dawn/native/ObjectBase.cpp
index 6a2f151..be9331c 100644
--- a/src/dawn/native/ObjectBase.cpp
+++ b/src/dawn/native/ObjectBase.cpp
@@ -96,6 +96,11 @@
     RefCounted::DeleteThis();
 }
 
+void ApiObjectBase::LockAndDeleteThis() {
+    auto deviceLock(GetDevice()->GetScopedLockSafeForDelete());
+    DeleteThis();
+}
+
 ApiObjectList* ApiObjectBase::GetObjectTrackingList() {
     ASSERT(GetDevice() != nullptr);
     return GetDevice()->GetObjectTrackingList(GetType());
diff --git a/src/dawn/native/ObjectBase.h b/src/dawn/native/ObjectBase.h
index c1268bb..3bb77d2 100644
--- a/src/dawn/native/ObjectBase.h
+++ b/src/dawn/native/ObjectBase.h
@@ -107,6 +107,7 @@
     // and they should ensure that their overriding versions call this underlying version
     // somewhere.
     void DeleteThis() override;
+    void LockAndDeleteThis() override;
 
     // Returns the list where this object may be tracked for future destruction. This can be
     // overrided to create hierarchical object tracking ownership:
diff --git a/src/dawn/tests/BUILD.gn b/src/dawn/tests/BUILD.gn
index d0b70f5..fff8168 100644
--- a/src/dawn/tests/BUILD.gn
+++ b/src/dawn/tests/BUILD.gn
@@ -293,6 +293,7 @@
     "unittests/LimitsTests.cpp",
     "unittests/LinkedListTests.cpp",
     "unittests/MathTests.cpp",
+    "unittests/MutexTests.cpp",
     "unittests/NumericTests.cpp",
     "unittests/ObjectBaseTests.cpp",
     "unittests/PerStageTests.cpp",
diff --git a/src/dawn/tests/unittests/MutexTests.cpp b/src/dawn/tests/unittests/MutexTests.cpp
new file mode 100644
index 0000000..6a4398b
--- /dev/null
+++ b/src/dawn/tests/unittests/MutexTests.cpp
@@ -0,0 +1,97 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/common/Mutex.h"
+#include "gtest/gtest.h"
+
+namespace {
+#if defined(DAWN_ENABLE_ASSERTS)
+constexpr bool kAssertEnabled = true;
+#else
+constexpr bool kAssertEnabled = false;
+#endif
+}  // namespace
+
+class MutexTest : public ::testing::Test {
+  protected:
+    void SetUp() override {
+        // IsLockedByCurrentThread() requires DAWN_ENABLE_ASSERTS flag enabled.
+        if (!kAssertEnabled) {
+            GTEST_SKIP() << "DAWN_ENABLE_ASSERTS is not enabled";
+        }
+    }
+
+    dawn::Mutex mMutex;
+};
+
+// Simple Lock() then Unlock() test.
+TEST_F(MutexTest, SimpleLockUnlock) {
+    mMutex.Lock();
+    EXPECT_TRUE(mMutex.IsLockedByCurrentThread());
+    mMutex.Unlock();
+    EXPECT_FALSE(mMutex.IsLockedByCurrentThread());
+}
+
+// Test AutoLock automatically locks the mutex and unlocks it when out of scope.
+TEST_F(MutexTest, AutoLock) {
+    {
+        dawn::Mutex::AutoLock autoLock(&mMutex);
+        EXPECT_TRUE(mMutex.IsLockedByCurrentThread());
+    }
+    EXPECT_FALSE(mMutex.IsLockedByCurrentThread());
+}
+
+// Test AutoLockAndHoldRef will keep the mutex alive
+TEST_F(MutexTest, AutoLockAndHoldRef) {
+    auto* mutex = new dawn::Mutex();
+    EXPECT_EQ(mutex->GetRefCountForTesting(), 1u);
+    {
+        dawn::Mutex::AutoLockAndHoldRef autoLock(mutex);
+        EXPECT_TRUE(mutex->IsLockedByCurrentThread());
+        EXPECT_EQ(mutex->GetRefCountForTesting(), 2u);
+
+        mutex->Release();
+        EXPECT_EQ(mutex->GetRefCountForTesting(), 1u);
+    }
+}
+
+using MutexDeathTest = MutexTest;
+
+// Test that Unlock() call on unlocked mutex will cause assertion failure.
+TEST_F(MutexDeathTest, UnlockWhenNotLocked) {
+    ASSERT_DEATH({ mMutex.Unlock(); }, "");
+}
+
+// Double Lock() calls should be cause assertion failure
+TEST_F(MutexDeathTest, DoubleLockCalls) {
+    mMutex.Lock();
+    EXPECT_TRUE(mMutex.IsLockedByCurrentThread());
+    ASSERT_DEATH({ mMutex.Lock(); }, "");
+    mMutex.Unlock();
+}
+
+// Lock() then use AutoLock should cause assertion failure.
+TEST_F(MutexDeathTest, LockThenAutoLock) {
+    mMutex.Lock();
+    EXPECT_TRUE(mMutex.IsLockedByCurrentThread());
+    ASSERT_DEATH({ dawn::Mutex::AutoLock autoLock(&mMutex); }, "");
+    mMutex.Unlock();
+}
+
+// Use AutoLock then call Lock() should cause assertion failure.
+TEST_F(MutexDeathTest, AutoLockThenLock) {
+    dawn::Mutex::AutoLock autoLock(&mMutex);
+    EXPECT_TRUE(mMutex.IsLockedByCurrentThread());
+    ASSERT_DEATH({ mMutex.Lock(); }, "");
+}
diff --git a/src/dawn/wire/SupportedFeatures.cpp b/src/dawn/wire/SupportedFeatures.cpp
index d3b0849..c4d02c5 100644
--- a/src/dawn/wire/SupportedFeatures.cpp
+++ b/src/dawn/wire/SupportedFeatures.cpp
@@ -24,6 +24,7 @@
         case WGPUFeatureName_Force32:
         case WGPUFeatureName_DawnNative:
         case WGPUFeatureName_DawnShaderFloat16:  // Deprecated
+        case WGPUFeatureName_ImplicitDeviceSynchronization:
             return false;
         case WGPUFeatureName_Depth32FloatStencil8:
         case WGPUFeatureName_TimestampQuery: