Surface Vulkan validation messages in errors

This change ensures that when errors are raised from WebGPU, including
lost context errors, they can include any associated Vulkan validation
layer messages in the message text if backend validation is enabled.
This will allow these messages to be surfaced in places like when
running CTS tests on browser build infrastructure.

Because of how Vulkan messages are raised at the instance level, some
routing of the messages to the appropriate device needed to be added.
This is based on appending message IDs to object labels and then parsing
them out of the message when it comes in.

This change also makes Vulkan validation messages cause a device loss,
which is inline with the behavior of the D3D12 backend.

Bug: dawn:1396
Change-Id: I2b309ba5475a338e7e99d4b4daaa7c3d5de61011
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/88460
Reviewed-by: Kai Ninomiya <kainino@chromium.org>
Commit-Queue: Brandon Jones <bajones@chromium.org>
diff --git a/src/dawn/native/vulkan/BackendVk.cpp b/src/dawn/native/vulkan/BackendVk.cpp
index 1f34bcb..bc8efa9 100644
--- a/src/dawn/native/vulkan/BackendVk.cpp
+++ b/src/dawn/native/vulkan/BackendVk.cpp
@@ -23,6 +23,7 @@
 #include "dawn/native/Instance.h"
 #include "dawn/native/VulkanBackend.h"
 #include "dawn/native/vulkan/AdapterVk.h"
+#include "dawn/native/vulkan/DeviceVk.h"
 #include "dawn/native/vulkan/UtilsVulkan.h"
 #include "dawn/native/vulkan/VulkanError.h"
 
@@ -108,10 +109,33 @@
 OnDebugUtilsCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
                      VkDebugUtilsMessageTypeFlagsEXT /* messageTypes */,
                      const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
-                     void* /* pUserData */) {
+                     void* pUserData) {
     if (ShouldReportDebugMessage(pCallbackData->pMessageIdName, pCallbackData->pMessage)) {
-        dawn::WarningLog() << pCallbackData->pMessage;
-        ASSERT((messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) == 0);
+        if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
+            dawn::ErrorLog() << pCallbackData->pMessage;
+
+            if (pUserData != nullptr) {
+                // Look through all the object labels attached to the debug message and try to parse
+                // a device debug prefix out of one of them. If a debug prefix is found and matches
+                // a registered device, forward the message on to it.
+                for (uint32_t i = 0; i < pCallbackData->objectCount; ++i) {
+                    const VkDebugUtilsObjectNameInfoEXT& object = pCallbackData->pObjects[i];
+                    std::string deviceDebugPrefix =
+                        GetDeviceDebugPrefixFromDebugName(object.pObjectName);
+                    if (deviceDebugPrefix.empty()) {
+                        continue;
+                    }
+
+                    VulkanInstance* instance = reinterpret_cast<VulkanInstance*>(pUserData);
+                    if (instance->HandleDeviceMessage(std::move(deviceDebugPrefix),
+                                                      pCallbackData->pMessage)) {
+                        return VK_FALSE;
+                    }
+                }
+            }
+        } else {
+            dawn::WarningLog() << pCallbackData->pMessage;
+        }
     }
     return VK_FALSE;
 }
@@ -133,6 +157,8 @@
 VulkanInstance::VulkanInstance() = default;
 
 VulkanInstance::~VulkanInstance() {
+    ASSERT(mMessageListenerDevices.empty());
+
     if (mDebugUtilsMessenger != VK_NULL_HANDLE) {
         mFunctions.DestroyDebugUtilsMessengerEXT(mInstance, mDebugUtilsMessenger, nullptr);
         mDebugUtilsMessenger = VK_NULL_HANDLE;
@@ -373,13 +399,31 @@
     createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
                              VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT;
     createInfo.pfnUserCallback = OnDebugUtilsCallback;
-    createInfo.pUserData = nullptr;
+    createInfo.pUserData = this;
 
     return CheckVkSuccess(mFunctions.CreateDebugUtilsMessengerEXT(mInstance, &createInfo, nullptr,
                                                                   &*mDebugUtilsMessenger),
                           "vkCreateDebugUtilsMessengerEXT");
 }
 
+void VulkanInstance::StartListeningForDeviceMessages(Device* device) {
+    std::lock_guard<std::mutex> lock(mMessageListenerDevicesMutex);
+    mMessageListenerDevices.insert({device->GetDebugPrefix(), device});
+}
+void VulkanInstance::StopListeningForDeviceMessages(Device* device) {
+    std::lock_guard<std::mutex> lock(mMessageListenerDevicesMutex);
+    mMessageListenerDevices.erase(device->GetDebugPrefix());
+}
+bool VulkanInstance::HandleDeviceMessage(std::string deviceDebugPrefix, std::string message) {
+    std::lock_guard<std::mutex> lock(mMessageListenerDevicesMutex);
+    auto it = mMessageListenerDevices.find(deviceDebugPrefix);
+    if (it != mMessageListenerDevices.end()) {
+        it->second->OnDebugMessage(std::move(message));
+        return true;
+    }
+    return false;
+}
+
 Backend::Backend(InstanceBase* instance) : BackendConnection(instance, wgpu::BackendType::Vulkan) {}
 
 Backend::~Backend() = default;
diff --git a/src/dawn/native/vulkan/BackendVk.h b/src/dawn/native/vulkan/BackendVk.h
index 8567d3a..7b3e430 100644
--- a/src/dawn/native/vulkan/BackendVk.h
+++ b/src/dawn/native/vulkan/BackendVk.h
@@ -15,6 +15,9 @@
 #ifndef SRC_DAWN_NATIVE_VULKAN_BACKENDVK_H_
 #define SRC_DAWN_NATIVE_VULKAN_BACKENDVK_H_
 
+#include <mutex>
+#include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "dawn/native/BackendConnection.h"
@@ -32,6 +35,8 @@
     SwiftShader,
 };
 
+class Device;
+
 // VulkanInstance holds the reference to the Vulkan library, the VkInstance, VkPhysicalDevices
 // on that instance, Vulkan functions loaded from the library, and global information
 // gathered from the instance. VkPhysicalDevices bound to the VkInstance are bound to the GPU
@@ -50,6 +55,12 @@
     const VulkanGlobalInfo& GetGlobalInfo() const;
     const std::vector<VkPhysicalDevice>& GetPhysicalDevices() const;
 
+    // TODO(dawn:831): This set of functions guards may need to be adjusted when Dawn is updated
+    // to support multithreading.
+    void StartListeningForDeviceMessages(Device* device);
+    void StopListeningForDeviceMessages(Device* device);
+    bool HandleDeviceMessage(std::string deviceDebugPrefix, std::string message);
+
   private:
     VulkanInstance();
 
@@ -66,6 +77,11 @@
     VkDebugUtilsMessengerEXT mDebugUtilsMessenger = VK_NULL_HANDLE;
 
     std::vector<VkPhysicalDevice> mPhysicalDevices;
+
+    // Devices keep the VulkanInstance alive, so as long as devices remove themselves from this
+    // map on destruction the pointers it contains should remain valid.
+    std::unordered_map<std::string, Device*> mMessageListenerDevices;
+    std::mutex mMessageListenerDevicesMutex;
 };
 
 class Backend : public BackendConnection {
diff --git a/src/dawn/native/vulkan/DeviceVk.cpp b/src/dawn/native/vulkan/DeviceVk.cpp
index 00cc9d4..8c0ed3b 100644
--- a/src/dawn/native/vulkan/DeviceVk.cpp
+++ b/src/dawn/native/vulkan/DeviceVk.cpp
@@ -14,11 +14,13 @@
 
 #include "dawn/native/vulkan/DeviceVk.h"
 
+#include "dawn/common/Log.h"
 #include "dawn/common/Platform.h"
 #include "dawn/native/BackendConnection.h"
 #include "dawn/native/ChainUtils_autogen.h"
 #include "dawn/native/Error.h"
 #include "dawn/native/ErrorData.h"
+#include "dawn/native/Instance.h"
 #include "dawn/native/VulkanBackend.h"
 #include "dawn/native/vulkan/AdapterVk.h"
 #include "dawn/native/vulkan/BackendVk.h"
@@ -53,7 +55,7 @@
 }
 
 Device::Device(Adapter* adapter, const DeviceDescriptor* descriptor)
-    : DeviceBase(adapter, descriptor) {
+    : DeviceBase(adapter, descriptor), mDebugPrefix(GetNextDeviceDebugPrefix()) {
     InitTogglesFromDriver();
 }
 
@@ -103,6 +105,8 @@
 
     SetLabelImpl();
 
+    ToBackend(GetAdapter())->GetVulkanInstance()->StartListeningForDeviceMessages(this);
+
     return DeviceBase::Initialize(Queue::Create(this, &descriptor->defaultQueue));
 }
 
@@ -200,6 +204,8 @@
         DAWN_TRY(SubmitPendingCommands());
     }
 
+    DAWN_TRY(CheckDebugLayerAndGenerateErrors());
+
     return {};
 }
 
@@ -876,6 +882,33 @@
     return mComputeSubgroupSize;
 }
 
+void Device::OnDebugMessage(std::string message) {
+    mDebugMessages.push_back(std::move(message));
+}
+
+MaybeError Device::CheckDebugLayerAndGenerateErrors() {
+    if (!GetAdapter()->GetInstance()->IsBackendValidationEnabled() || mDebugMessages.empty()) {
+        return {};
+    }
+
+    auto error = DAWN_INTERNAL_ERROR("The Vulkan validation layer reported uncaught errors.");
+
+    AppendDebugLayerMessages(error.get());
+
+    return error;
+}
+
+void Device::AppendDebugLayerMessages(ErrorData* error) {
+    if (!GetAdapter()->GetInstance()->IsBackendValidationEnabled()) {
+        return;
+    }
+
+    while (!mDebugMessages.empty()) {
+        error->AppendBackendMessage(std::move(mDebugMessages.back()));
+        mDebugMessages.pop_back();
+    }
+}
+
 MaybeError Device::WaitForIdleForDestruction() {
     // Immediately tag the recording context as unused so we don't try to submit it in Tick.
     // Move the mRecordingContext.used to mUnusedCommands so it can be cleaned up in
@@ -947,6 +980,8 @@
     // Enough of the Device's initialization happened that we can now do regular robust
     // deinitialization.
 
+    ToBackend(GetAdapter())->GetVulkanInstance()->StopListeningForDeviceMessages(this);
+
     // Immediately tag the recording context as unused so we don't try to submit it in Tick.
     mRecordingContext.used = false;
     if (mRecordingContext.commandPool != VK_NULL_HANDLE) {
diff --git a/src/dawn/native/vulkan/DeviceVk.h b/src/dawn/native/vulkan/DeviceVk.h
index 8473ecf..db23623 100644
--- a/src/dawn/native/vulkan/DeviceVk.h
+++ b/src/dawn/native/vulkan/DeviceVk.h
@@ -17,6 +17,7 @@
 
 #include <memory>
 #include <queue>
+#include <string>
 #include <utility>
 #include <vector>
 
@@ -105,6 +106,11 @@
 
     void SetLabelImpl() override;
 
+    void OnDebugMessage(std::string message);
+
+    // Used to associate this device with validation layer messages.
+    const char* GetDebugPrefix() { return mDebugPrefix.c_str(); }
+
   private:
     Device(Adapter* adapter, const DeviceDescriptor* descriptor);
 
@@ -152,6 +158,9 @@
     void ApplyDepthStencilFormatToggles();
     void ApplyUseZeroInitializeWorkgroupMemoryExtensionToggle();
 
+    MaybeError CheckDebugLayerAndGenerateErrors();
+    void AppendDebugLayerMessages(ErrorData* error) override;
+
     void DestroyImpl() override;
     MaybeError WaitForIdleForDestruction() override;
 
@@ -185,6 +194,10 @@
     // Fences in the unused list aren't reset yet.
     std::vector<VkFence> mUnusedFences;
 
+    // For capturing messages generated by the Vulkan debug layer.
+    const std::string mDebugPrefix;
+    std::vector<std::string> mDebugMessages;
+
     MaybeError PrepareRecordingContext();
     void RecycleCompletedCommands();
 
diff --git a/src/dawn/native/vulkan/UtilsVulkan.cpp b/src/dawn/native/vulkan/UtilsVulkan.cpp
index a64e659..fe03424 100644
--- a/src/dawn/native/vulkan/UtilsVulkan.cpp
+++ b/src/dawn/native/vulkan/UtilsVulkan.cpp
@@ -26,6 +26,9 @@
 
 namespace dawn::native::vulkan {
 
+constexpr char kDeviceDebugPrefix[] = "DawnDbg=";
+constexpr char kDeviceDebugSeparator[] = ";";
+
 #define VK_OBJECT_TYPE_GETTER(object, objectType)         \
     template <>                                           \
     VkObjectType GetVkObjectType<object>(object handle) { \
@@ -216,20 +219,44 @@
         objectNameInfo.objectType = objectType;
         objectNameInfo.objectHandle = objectHandle;
 
-        if (label.empty() || !device->IsToggleEnabled(Toggle::UseUserDefinedLabelsInBackend)) {
-            objectNameInfo.pObjectName = prefix;
-            device->fn.SetDebugUtilsObjectNameEXT(device->GetVkDevice(), &objectNameInfo);
-            return;
+        std::ostringstream objectNameStream;
+        // Prefix with the device's message ID so that if this label appears in a validation
+        // message it can be parsed out and the message can be associated with the right device.
+        objectNameStream << device->GetDebugPrefix() << kDeviceDebugSeparator << prefix;
+        if (!label.empty() && device->IsToggleEnabled(Toggle::UseUserDefinedLabelsInBackend)) {
+            objectNameStream << "_" << label;
         }
-
-        std::string objectName = prefix;
-        objectName += "_";
-        objectName += label;
+        std::string objectName = objectNameStream.str();
         objectNameInfo.pObjectName = objectName.c_str();
         device->fn.SetDebugUtilsObjectNameEXT(device->GetVkDevice(), &objectNameInfo);
     }
 }
 
+std::string GetNextDeviceDebugPrefix() {
+    static uint64_t nextDeviceDebugId = 1;
+    std::ostringstream objectName;
+    objectName << kDeviceDebugPrefix << nextDeviceDebugId++;
+    return objectName.str();
+}
+
+std::string GetDeviceDebugPrefixFromDebugName(const char* debugName) {
+    if (debugName == nullptr) {
+        return {};
+    }
+
+    if (strncmp(debugName, kDeviceDebugPrefix, sizeof(kDeviceDebugPrefix) - 1) != 0) {
+        return {};
+    }
+
+    const char* separator = strstr(debugName + sizeof(kDeviceDebugPrefix), kDeviceDebugSeparator);
+    if (separator == nullptr) {
+        return {};
+    }
+
+    size_t length = separator - debugName;
+    return std::string(debugName, length);
+}
+
 VkSpecializationInfo* GetVkSpecializationInfo(
     const ProgrammableStage& programmableStage,
     VkSpecializationInfo* specializationInfo,
diff --git a/src/dawn/native/vulkan/UtilsVulkan.h b/src/dawn/native/vulkan/UtilsVulkan.h
index 572b2a7..7c63b1d 100644
--- a/src/dawn/native/vulkan/UtilsVulkan.h
+++ b/src/dawn/native/vulkan/UtilsVulkan.h
@@ -141,6 +141,9 @@
                          label);
 }
 
+std::string GetNextDeviceDebugPrefix();
+std::string GetDeviceDebugPrefixFromDebugName(const char* debugName);
+
 // Returns nullptr or &specializationInfo
 // specializationInfo, specializationDataEntries, specializationMapEntries needs to
 // be alive at least until VkSpecializationInfo is passed into Vulkan Create*Pipelines