RefCounted: use more precise barriers

This improves the DrawCallPerfRun/Vulkan_NoReuseBindGroups benchmark by
2% on an Intel processor, and should be a bigger improvement on ARM,
where the weaker memory model lets relaxed and release operations
compile to cheaper instructions than full barriers.

The change was inspired by the Boost documentation at
https://www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html
as well as Chromium's base::AtomicRefCount implementation and Rust's
std::sync::Arc implementation.

BUG=dawn:304

Change-Id: I7ca71f34af20fd267cf2efc63871ff330b1dcc7c
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/14482
Commit-Queue: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Austin Eng <enga@chromium.org>
Reviewed-by: David Turner <digit@google.com>
diff --git a/src/dawn_native/RefCounted.cpp b/src/dawn_native/RefCounted.cpp
index 6782c14..9b4c1c2 100644
--- a/src/dawn_native/RefCounted.cpp
+++ b/src/dawn_native/RefCounted.cpp
@@ -43,14 +43,36 @@
 
     void RefCounted::Reference() {
         ASSERT((mRefCount & ~kPayloadMask) != 0);
-        mRefCount += kRefCountIncrement;
+
+        // The relaxed ordering guarantees only the atomicity of the update, which is enough here
+        // because the reference we are copying from still exists and makes sure other threads
+        // don't delete `this`.
+        // See the explanation in the Boost documentation:
+        //     https://www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html
+        mRefCount.fetch_add(kRefCountIncrement, std::memory_order_relaxed);
     }
 
     void RefCounted::Release() {
         ASSERT((mRefCount & ~kPayloadMask) != 0);
 
-        mRefCount -= kRefCountIncrement;
-        if (mRefCount < kRefCountIncrement) {
+        // The release ordering here is to make sure all accesses to the object on a thread A
+        // happen-before the object is deleted on a thread B. The release memory order ensures
+        // that all accesses on thread A happen-before A's refcount decrease, the atomic
+        // read-modify-write chain makes A's decrease visible to the final decrease on B, and
+        // the acquire fence in the destruction case makes sure the refcount decrease in B
+        // happens-before the `delete this`.
+        //
+        // See the explanation in the Boost documentation:
+        //     https://www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html
+        uint64_t previousRefCount =
+            mRefCount.fetch_sub(kRefCountIncrement, std::memory_order_release);
+
+        // Check that the previous reference count was strictly less than 2, ignoring payload bits.
+        if (previousRefCount < 2 * kRefCountIncrement) {
+            // Note that on ARM64 this will generate a `dmb ish` instruction, which is a full
+            // memory barrier, even though an acquire load on mRefCount (an `ldar` instruction)
+            // would be sufficient and could end up being faster.
+            std::atomic_thread_fence(std::memory_order_acquire);
             delete this;
         }
     }
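
For illustration, below is a minimal standalone sketch of the same barrier
scheme, with Dawn's payload bits and ASSERTs stripped out. The class name
SimpleRefCounted and the starting refcount of 1 are illustrative assumptions
for this sketch, not part of Dawn's API.

    #include <atomic>
    #include <cstdint>
    #include <thread>

    class SimpleRefCounted {
      public:
        void Reference() {
            // Relaxed: only atomicity is needed. The caller already holds a
            // reference, so no other thread can delete `this` concurrently.
            mRefCount.fetch_add(1, std::memory_order_relaxed);
        }

        void Release() {
            // Release: all prior accesses to the object on this thread
            // happen-before the decrement becomes visible to other threads.
            if (mRefCount.fetch_sub(1, std::memory_order_release) == 1) {
                // Acquire: makes every other thread's accesses (sequenced
                // before their release decrements) happen-before the delete.
                std::atomic_thread_fence(std::memory_order_acquire);
                delete this;
            }
        }

      protected:
        virtual ~SimpleRefCounted() = default;

      private:
        // The creating thread starts with one reference (an assumption of
        // this sketch).
        std::atomic<uint64_t> mRefCount{1};
    };

    int main() {
        SimpleRefCounted* obj = new SimpleRefCounted();
        obj->Reference();  // Two references now exist.

        std::thread other([obj] { obj->Release(); });
        obj->Release();  // The Release() whose fetch_sub returns 1 deletes obj.
        other.join();
    }

This keeps the acquire cost out of the common path: every Release() pays only
for the release decrement, and just the final one executes the acquire fence.
Using fetch_sub with std::memory_order_acq_rel instead would also be correct,
but would impose acquire ordering on every decrement.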