Make perf test tracing thread-safe

The Metal driver calls command buffer completion callbacks on a
separate thread so enqueueing these trace events needs to be made
thread-safe. In the future, Dawn will probably have other threads
that also require thread-safe tracing.

In this CL, each thread records trace events into its own buffer,
and all buffers concatenated when trace events are acquired.

Bug: dawn:254, dawn:208
Change-Id: I0f1abd404568d838091066a8f27a3bf98fa764b9
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/13080
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Commit-Queue: Austin Eng <enga@chromium.org>
diff --git a/src/tests/perf_tests/DawnPerfTest.cpp b/src/tests/perf_tests/DawnPerfTest.cpp
index c9d683a..20068ee 100644
--- a/src/tests/perf_tests/DawnPerfTest.cpp
+++ b/src/tests/perf_tests/DawnPerfTest.cpp
@@ -65,6 +65,7 @@
             }
             value["ph"] = &phase[0];
             value["id"] = traceEvent.id;
+            value["tid"] = traceEvent.threadId;
             value["ts"] = microseconds;
             value["pid"] = "Dawn";
 
@@ -265,6 +266,9 @@
     }
 
     // Wait for all GPU commands to complete.
+    // TODO(enga): When Dawn has multiple backgrounds threads, add a Device::WaitForIdleForTesting()
+    // which waits for all threads to stop doing work. When we output results, there should
+    // be no additional incoming trace events.
     while (signaledFenceValue != fence.GetCompletedValue()) {
         mTest->WaitABit();
     }
@@ -273,6 +277,9 @@
 }
 
 void DawnPerfTestBase::OutputResults() {
+    // TODO(enga): When Dawn has multiple backgrounds threads, add a Device::WaitForIdleForTesting()
+    // which waits for all threads to stop doing work. When we output results, there should
+    // be no additional incoming trace events.
     DawnPerfTestPlatform* platform =
         reinterpret_cast<DawnPerfTestPlatform*>(gTestEnv->GetInstance()->GetPlatform());
 
diff --git a/src/tests/perf_tests/DawnPerfTestPlatform.cpp b/src/tests/perf_tests/DawnPerfTestPlatform.cpp
index 850b930..016211f 100644
--- a/src/tests/perf_tests/DawnPerfTestPlatform.cpp
+++ b/src/tests/perf_tests/DawnPerfTestPlatform.cpp
@@ -15,6 +15,7 @@
 #include "tests/perf_tests/DawnPerfTestPlatform.h"
 
 #include "common/Assert.h"
+#include "common/HashUtils.h"
 #include "dawn_platform/tracing/TraceEvent.h"
 #include "tests/perf_tests/DawnPerfTest.h"
 #include "utils/Timer.h"
@@ -67,6 +68,22 @@
     return mTimer->GetAbsoluteTime() - origin;
 }
 
+std::vector<DawnPerfTestPlatform::TraceEvent>* DawnPerfTestPlatform::GetLocalTraceEventBuffer() {
+    // Cache the pointer to the vector in thread_local storage
+    thread_local std::vector<TraceEvent>* traceEventBuffer = nullptr;
+
+    if (traceEventBuffer == nullptr) {
+        auto buffer = std::make_unique<std::vector<TraceEvent>>();
+        traceEventBuffer = buffer.get();
+
+        // Add a new buffer to the map
+        std::lock_guard<std::mutex> guard(mTraceEventBufferMapMutex);
+        mTraceEventBuffers[std::this_thread::get_id()] = std::move(buffer);
+    }
+
+    return traceEventBuffer;
+}
+
 // TODO(enga): Simplify this API.
 uint64_t DawnPerfTestPlatform::AddTraceEvent(char phase,
                                              const unsigned char* categoryGroupEnabled,
@@ -90,8 +107,13 @@
     const TraceCategoryInfo* info =
         reinterpret_cast<const TraceCategoryInfo*>(categoryGroupEnabled);
 
-    mTraceEventBuffer.emplace_back(phase, info->category, name, id, timestamp);
-    return static_cast<uint64_t>(mTraceEventBuffer.size());
+    std::vector<TraceEvent>* buffer = GetLocalTraceEventBuffer();
+    buffer->emplace_back(phase, info->category, name, id, timestamp);
+
+    size_t hash = 0;
+    HashCombine(&hash, buffer->size());
+    HashCombine(&hash, std::this_thread::get_id());
+    return static_cast<uint64_t>(hash);
 }
 
 void DawnPerfTestPlatform::EnableTraceEventRecording(bool enable) {
@@ -99,7 +121,27 @@
 }
 
 std::vector<DawnPerfTestPlatform::TraceEvent> DawnPerfTestPlatform::AcquireTraceEventBuffer() {
-    std::vector<DawnPerfTestPlatform::TraceEvent> buffer = mTraceEventBuffer;
-    mTraceEventBuffer.clear();
-    return buffer;
+    std::vector<TraceEvent> traceEventBuffer;
+    {
+        // AcquireTraceEventBuffer should only be called when Dawn is completely idle. There should
+        // be no threads inserting trace events.
+        // Right now, this is safe because AcquireTraceEventBuffer is called after waiting on a
+        // fence for all GPU commands to finish executing. When Dawn has multiple background threads
+        // for other work (creation, validation, submission, residency, etc), we will need to ensure
+        // all work on those threads is stopped as well.
+        std::lock_guard<std::mutex> guard(mTraceEventBufferMapMutex);
+        for (auto it = mTraceEventBuffers.begin(); it != mTraceEventBuffers.end(); ++it) {
+            std::ostringstream stream;
+            stream << it->first;
+            std::string threadId = stream.str();
+
+            std::transform(it->second->begin(), it->second->end(),
+                           std::back_inserter(traceEventBuffer), [&threadId](TraceEvent ev) {
+                               ev.threadId = threadId;
+                               return ev;
+                           });
+            it->second->clear();
+        }
+    }
+    return traceEventBuffer;
 }
diff --git a/src/tests/perf_tests/DawnPerfTestPlatform.h b/src/tests/perf_tests/DawnPerfTestPlatform.h
index b835e34..3fef9e9 100644
--- a/src/tests/perf_tests/DawnPerfTestPlatform.h
+++ b/src/tests/perf_tests/DawnPerfTestPlatform.h
@@ -18,6 +18,9 @@
 #include <dawn_platform/DawnPlatform.h>
 
 #include <memory>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
 #include <vector>
 
 namespace utils {
@@ -44,6 +47,7 @@
         dawn_platform::TraceCategory category;
         const char* name = nullptr;
         uint64_t id = 0;
+        std::string threadId;
         double timestamp = 0;
     };
 
@@ -59,6 +63,8 @@
 
     double MonotonicallyIncreasingTime() override;
 
+    std::vector<TraceEvent>* GetLocalTraceEventBuffer();
+
     uint64_t AddTraceEvent(char phase,
                            const unsigned char* categoryGroupEnabled,
                            const char* name,
@@ -74,7 +80,12 @@
     std::unique_ptr<utils::Timer> mTimer;
 
     // Trace event record.
-    std::vector<TraceEvent> mTraceEventBuffer;
+    // Each uses their own trace event buffer, but the PerfTestPlatform owns all of them in
+    // this map. The map stores all of them so we can iterate through them and flush when
+    // AcquireTraceEventBuffer is called.
+    std::unordered_map<std::thread::id, std::unique_ptr<std::vector<TraceEvent>>>
+        mTraceEventBuffers;
+    std::mutex mTraceEventBufferMapMutex;
 };
 
 #endif  // TESTS_PERFTESTS_DAWNPERFTESTPLATFORM_H_